Ranged-export: Remove CachingBlockstore

The improvements in the range-export code avoid reading most blocks
twice, although they allow some blocks to be written to disk multiple times.

The cache hit-rate went down from being close to 50% to a maximum of 12% at
the very end of the export. The reason is that most CIDs are never read twice
since they are correctly tracked in the CID set.

These numbers do not support the maintenance of the CachingBlockstore
code. Additional testing shows that removing it has similar memory-usage
behaviour and about 5 minutes faster execution (around 10%).

Less code to maintain and fewer options to mess up.
This commit is contained in:
Hector Sanjuan 2023-02-06 12:01:05 +01:00
parent fa93c23813
commit 1bb698619c
5 changed files with 4 additions and 136 deletions

View File

@ -403,7 +403,6 @@ func (m *MsgUuidMapType) UnmarshalJSON(b []byte) error {
type ChainExportConfig struct { type ChainExportConfig struct {
WriteBufferSize int WriteBufferSize int
NumWorkers int NumWorkers int
CacheSize int
IncludeMessages bool IncludeMessages bool
IncludeReceipts bool IncludeReceipts bool
IncludeStateRoots bool IncludeStateRoots bool

View File

@ -1,118 +0,0 @@
package store
import (
"context"
"fmt"
"sync/atomic"
lru "github.com/hashicorp/golang-lru"
blocks "github.com/ipfs/go-block-format"
"github.com/ipfs/go-cid"
"github.com/filecoin-project/lotus/blockstore"
)
// CachingBlockstore wraps a blockstore.Blockstore with an in-memory ARC
// cache of read blocks. Reads (Get/View/Has) are served from the cache
// when possible; writes pass straight through to the backing store.
// Hit/read/byte counters are maintained for periodic stats logging.
type CachingBlockstore struct {
	cache  *lru.ARCCache         // recently read blocks, keyed by cid.Cid
	blocks blockstore.Blockstore // backing store all operations delegate to
	reads  int64                 // updated atomically
	hits   int64                 // updated atomically
	bytes  int64                 // updated atomically
}
// NewCachingBlockstore wraps the given blockstore with an ARC cache
// holding up to cacheSize block objects. It returns an error only when
// the cache cannot be constructed (e.g. a non-positive size).
func NewCachingBlockstore(blocks blockstore.Blockstore, cacheSize int) (*CachingBlockstore, error) {
	arc, err := lru.NewARC(cacheSize)
	if err != nil {
		return nil, fmt.Errorf("new arc: %w", err)
	}
	cs := &CachingBlockstore{
		cache:  arc,
		blocks: blocks,
	}
	return cs, nil
}
// DeleteBlock removes the block identified by c from both the cache and
// the backing blockstore.
//
// The cache eviction is required for correctness: Has and Get consult
// the cache first, so leaving a deleted block cached would let them
// serve stale data (Has in particular assumes cached entries are never
// deleted from the backing store).
func (cs *CachingBlockstore) DeleteBlock(ctx context.Context, c cid.Cid) error {
	// Evict first so a concurrent reader cannot re-observe the block
	// after the backing delete has completed.
	cs.cache.Remove(c)
	return cs.blocks.DeleteBlock(ctx, c)
}
// GetSize returns the size in bytes of the block identified by c,
// delegating directly to the backing blockstore (the cache is not
// consulted, so this may hit disk even for cached blocks).
func (cs *CachingBlockstore) GetSize(ctx context.Context, c cid.Cid) (int, error) {
	return cs.blocks.GetSize(ctx, c)
}
// Put writes blk to the backing blockstore. The block is not inserted
// into the cache; blocks are only cached on read (see Get/View).
func (cs *CachingBlockstore) Put(ctx context.Context, blk blocks.Block) error {
	return cs.blocks.Put(ctx, blk)
}
// PutMany writes the given blocks to the backing blockstore. As with
// Put, none of them are added to the cache.
func (cs *CachingBlockstore) PutMany(ctx context.Context, blks []blocks.Block) error {
	return cs.blocks.PutMany(ctx, blks)
}
// AllKeysChan returns a channel yielding every CID in the backing
// blockstore; pure delegation, the cache plays no role here.
func (cs *CachingBlockstore) AllKeysChan(ctx context.Context) (<-chan cid.Cid, error) {
	return cs.blocks.AllKeysChan(ctx)
}
// HashOnRead toggles read-time hash verification on the backing
// blockstore. Note that cache hits in Get/View bypass the backing store
// and therefore are not re-verified.
func (cs *CachingBlockstore) HashOnRead(enabled bool) {
	cs.blocks.HashOnRead(enabled)
}
// DeleteMany removes the blocks identified by cids from both the cache
// and the backing blockstore.
//
// Evicting from the cache is required for correctness: Has and Get
// consult the cache first, so a deleted-but-still-cached block would be
// served as if it still existed in the backing store.
func (cs *CachingBlockstore) DeleteMany(ctx context.Context, cids []cid.Cid) error {
	for _, c := range cids {
		cs.cache.Remove(c)
	}
	return cs.blocks.DeleteMany(ctx, cids)
}
// Get returns the block identified by c, serving it from the in-memory
// ARC cache when possible and falling back to the backing blockstore on
// a miss. Missed blocks are added to the cache after a successful read,
// and the bytes-read counter is bumped by the block's payload size.
// Aggregate stats are logged every 100000 reads.
//
// NOTE(review): the logging interval here (100000) differs from the one
// in View (1000000) — confirm whether that asymmetry is intentional.
func (cs *CachingBlockstore) Get(ctx context.Context, c cid.Cid) (blocks.Block, error) {
	reads := atomic.AddInt64(&cs.reads, 1)
	if reads%100000 == 0 {
		hits := atomic.LoadInt64(&cs.hits)
		by := atomic.LoadInt64(&cs.bytes)
		log.Infow("CachingBlockstore stats", "reads", reads, "cache_len", cs.cache.Len(), "hit_rate", float64(hits)/float64(reads), "bytes_read", by)
	}
	v, hit := cs.cache.Get(c)
	if hit {
		atomic.AddInt64(&cs.hits, 1)
		return v.(blocks.Block), nil
	}
	blk, err := cs.blocks.Get(ctx, c)
	if err != nil {
		return nil, err
	}
	atomic.AddInt64(&cs.bytes, int64(len(blk.RawData())))
	cs.cache.Add(c, blk)
	// err is necessarily nil at this point; return nil explicitly
	// rather than the stale variable.
	return blk, nil
}
// View invokes callback with the raw bytes of the block identified by
// c. Cached blocks are served without touching the backing store; on a
// miss the block is fetched via Get (not the backing View, so the bytes
// can be retained in the cache), cached, and then passed to callback.
// Aggregate stats are logged every 1000000 reads.
func (cs *CachingBlockstore) View(ctx context.Context, c cid.Cid, callback func([]byte) error) error {
	total := atomic.AddInt64(&cs.reads, 1)
	if total%1000000 == 0 {
		h := atomic.LoadInt64(&cs.hits)
		b := atomic.LoadInt64(&cs.bytes)
		log.Infow("CachingBlockstore stats", "reads", total, "cache_len", cs.cache.Len(), "hit_rate", float64(h)/float64(total), "bytes_read", b)
	}
	if cached, ok := cs.cache.Get(c); ok {
		atomic.AddInt64(&cs.hits, 1)
		return callback(cached.(blocks.Block).RawData())
	}
	blk, err := cs.blocks.Get(ctx, c)
	if err != nil {
		return err
	}
	atomic.AddInt64(&cs.bytes, int64(len(blk.RawData())))
	cs.cache.Add(c, blk)
	return callback(blk.RawData())
}
// Has reports whether the block identified by c exists, answering from
// the cache when possible and otherwise asking the backing blockstore.
func (cs *CachingBlockstore) Has(ctx context.Context, c cid.Cid) (bool, error) {
	atomic.AddInt64(&cs.reads, 1)
	// A cached entry is proof of existence: the original author relies
	// on the backing blockstore never deleting blocks.
	if found := cs.cache.Contains(c); found {
		return true, nil
	}
	return cs.blocks.Has(ctx, c)
}

View File

@ -508,8 +508,7 @@ func (cs *ChainStore) ExportRange(
w io.Writer, w io.Writer,
head, tail *types.TipSet, head, tail *types.TipSet,
messages, receipts, stateroots bool, messages, receipts, stateroots bool,
workers int, workers int) error {
cacheSize int) error {
h := &car.CarHeader{ h := &car.CarHeader{
Roots: head.Cids(), Roots: head.Cids(),
@ -520,11 +519,6 @@ func (cs *ChainStore) ExportRange(
return xerrors.Errorf("failed to write car header: %s", err) return xerrors.Errorf("failed to write car header: %s", err)
} }
cacheStore, err := NewCachingBlockstore(cs.UnionStore(), cacheSize)
if err != nil {
return err
}
start := time.Now() start := time.Now()
log.Infow("walking snapshot range", log.Infow("walking snapshot range",
"head", head.Key(), "head", head.Key(),
@ -544,7 +538,7 @@ func (cs *ChainStore) ExportRange(
includeReceipts: receipts, includeReceipts: receipts,
} }
pw, err := newWalkScheduler(ctx, cacheStore, cfg, w) pw, err := newWalkScheduler(ctx, cs.UnionStore(), cfg, w)
if err != nil { if err != nil {
return err return err
} }

View File

@ -1182,11 +1182,6 @@ var ChainExportRangeCmd = &cli.Command{
Usage: "specify the number of workers", Usage: "specify the number of workers",
Value: 1, Value: 1,
}, },
&cli.IntFlag{
Name: "cache-size",
Usage: "specify the size of the cache (in objects) to use while exporting",
Value: 100_000,
},
&cli.IntFlag{ &cli.IntFlag{
Name: "write-buffer", Name: "write-buffer",
Usage: "specify write buffer size", Usage: "specify write buffer size",
@ -1243,7 +1238,6 @@ var ChainExportRangeCmd = &cli.Command{
if err := api.ChainExportRangeInternal(ctx, head.Key(), tail.Key(), lapi.ChainExportConfig{ if err := api.ChainExportRangeInternal(ctx, head.Key(), tail.Key(), lapi.ChainExportConfig{
WriteBufferSize: cctx.Int("write-buffer"), WriteBufferSize: cctx.Int("write-buffer"),
NumWorkers: cctx.Int("workers"), NumWorkers: cctx.Int("workers"),
CacheSize: cctx.Int("cache-size"),
IncludeMessages: cctx.Bool("messages"), IncludeMessages: cctx.Bool("messages"),
IncludeReceipts: cctx.Bool("receipts"), IncludeReceipts: cctx.Bool("receipts"),
IncludeStateRoots: cctx.Bool("stateroots"), IncludeStateRoots: cctx.Bool("stateroots"),
@ -1256,7 +1250,6 @@ var ChainExportRangeCmd = &cli.Command{
stream, err := api.ChainExportRange(ctx, head.Key(), tail.Key(), lapi.ChainExportConfig{ stream, err := api.ChainExportRange(ctx, head.Key(), tail.Key(), lapi.ChainExportConfig{
WriteBufferSize: cctx.Int("write-buffer"), WriteBufferSize: cctx.Int("write-buffer"),
NumWorkers: cctx.Int("workers"), NumWorkers: cctx.Int("workers"),
CacheSize: cctx.Int("cache-size"),
IncludeMessages: cctx.Bool("messages"), IncludeMessages: cctx.Bool("messages"),
IncludeReceipts: cctx.Bool("receipts"), IncludeReceipts: cctx.Bool("receipts"),
IncludeStateRoots: cctx.Bool("stateroots"), IncludeStateRoots: cctx.Bool("stateroots"),

View File

@ -630,7 +630,7 @@ func (a ChainAPI) ChainExportRangeInternal(ctx context.Context, head, tail types
bw, bw,
headTs, tailTs, headTs, tailTs,
cfg.IncludeMessages, cfg.IncludeReceipts, cfg.IncludeStateRoots, cfg.IncludeMessages, cfg.IncludeReceipts, cfg.IncludeStateRoots,
cfg.NumWorkers, cfg.CacheSize, cfg.NumWorkers,
); err != nil { ); err != nil {
return fmt.Errorf("exporting chain range: %w", err) return fmt.Errorf("exporting chain range: %w", err)
} }
@ -658,7 +658,7 @@ func (a ChainAPI) ChainExportRange(ctx context.Context, head, tail types.TipSetK
headTs, headTs,
tailTs, tailTs,
cfg.IncludeMessages, cfg.IncludeReceipts, cfg.IncludeStateRoots, cfg.IncludeMessages, cfg.IncludeReceipts, cfg.IncludeStateRoots,
cfg.NumWorkers, cfg.CacheSize, cfg.NumWorkers,
) )
bw.Flush() //nolint:errcheck // it is a write to a pipe bw.Flush() //nolint:errcheck // it is a write to a pipe
w.CloseWithError(err) //nolint:errcheck // it is a pipe w.CloseWithError(err) //nolint:errcheck // it is a pipe