Ranged-export: Remove CachingBlockstore

The improvements in the range-export code avoid reading most blocks twice, while
also allowing some blocks to be written to disk multiple times.

The cache hit rate dropped from close to 50% to a maximum of 12% at the very
end of the export. The reason is that most CIDs are never read twice, since
they are correctly tracked in the CID set.
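
For illustration only (this is not the walker code touched by this commit, just a minimal sketch of the pattern the CID set enables): once visited CIDs are tracked, each block is fetched at most once, so a read cache in front of the blockstore has very little left to do. The helper signatures below are made up for the sketch.

package sketch

import (
    "context"
    "fmt"

    blocks "github.com/ipfs/go-block-format"
    "github.com/ipfs/go-cid"
)

// walkOnce visits each CID at most once. get stands in for a blockstore read
// and links for child-CID extraction; both are hypothetical helpers.
func walkOnce(ctx context.Context, roots []cid.Cid,
    get func(context.Context, cid.Cid) (blocks.Block, error),
    links func(blocks.Block) []cid.Cid) error {
    seen := cid.NewSet()
    queue := append([]cid.Cid(nil), roots...)
    for len(queue) > 0 {
        c := queue[0]
        queue = queue[1:]
        if !seen.Visit(c) { // Visit reports false when c was already seen
            continue
        }
        blk, err := get(ctx, c)
        if err != nil {
            return fmt.Errorf("getting %s: %w", c, err)
        }
        queue = append(queue, links(blk)...)
    }
    return nil
}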

These numbers do not justify maintaining the CachingBlockstore
code. Additional testing shows that removing it has similar memory-usage
behaviour and shaves about 5 minutes off the execution time (around 10%).

Less code to maintain and fewer options to mess up.
Hector Sanjuan 2023-02-06 12:01:05 +01:00
parent fa93c23813
commit 1bb698619c
5 changed files with 4 additions and 136 deletions

View File

@@ -403,7 +403,6 @@ func (m *MsgUuidMapType) UnmarshalJSON(b []byte) error {
type ChainExportConfig struct {
WriteBufferSize int
NumWorkers int
- CacheSize int
IncludeMessages bool
IncludeReceipts bool
IncludeStateRoots bool
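
For orientation, a hedged sketch of filling in this config after the change. The field values are invented, and lapi is assumed to be the usual alias for the github.com/filecoin-project/lotus/api package, as used in the CLI code further down:

cfg := lapi.ChainExportConfig{
    WriteBufferSize:   1 << 20, // illustrative: 1 MiB write buffer
    NumWorkers:        8,       // illustrative worker count
    IncludeMessages:   true,
    IncludeReceipts:   true,
    IncludeStateRoots: true,
    // CacheSize is gone: deduplication is handled by the walk's CID set.
}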

View File

@@ -1,118 +0,0 @@
package store
import (
"context"
"fmt"
"sync/atomic"
lru "github.com/hashicorp/golang-lru"
blocks "github.com/ipfs/go-block-format"
"github.com/ipfs/go-cid"
"github.com/filecoin-project/lotus/blockstore"
)
type CachingBlockstore struct {
cache *lru.ARCCache
blocks blockstore.Blockstore
reads int64 // updated atomically
hits int64 // updated atomically
bytes int64 // updated atomically
}
func NewCachingBlockstore(blocks blockstore.Blockstore, cacheSize int) (*CachingBlockstore, error) {
cache, err := lru.NewARC(cacheSize)
if err != nil {
return nil, fmt.Errorf("new arc: %w", err)
}
return &CachingBlockstore{
cache: cache,
blocks: blocks,
}, nil
}
func (cs *CachingBlockstore) DeleteBlock(ctx context.Context, c cid.Cid) error {
return cs.blocks.DeleteBlock(ctx, c)
}
func (cs *CachingBlockstore) GetSize(ctx context.Context, c cid.Cid) (int, error) {
return cs.blocks.GetSize(ctx, c)
}
func (cs *CachingBlockstore) Put(ctx context.Context, blk blocks.Block) error {
return cs.blocks.Put(ctx, blk)
}
func (cs *CachingBlockstore) PutMany(ctx context.Context, blks []blocks.Block) error {
return cs.blocks.PutMany(ctx, blks)
}
func (cs *CachingBlockstore) AllKeysChan(ctx context.Context) (<-chan cid.Cid, error) {
return cs.blocks.AllKeysChan(ctx)
}
func (cs *CachingBlockstore) HashOnRead(enabled bool) {
cs.blocks.HashOnRead(enabled)
}
func (cs *CachingBlockstore) DeleteMany(ctx context.Context, cids []cid.Cid) error {
return cs.blocks.DeleteMany(ctx, cids)
}
func (cs *CachingBlockstore) Get(ctx context.Context, c cid.Cid) (blocks.Block, error) {
reads := atomic.AddInt64(&cs.reads, 1)
if reads%100000 == 0 {
hits := atomic.LoadInt64(&cs.hits)
by := atomic.LoadInt64(&cs.bytes)
log.Infow("CachingBlockstore stats", "reads", reads, "cache_len", cs.cache.Len(), "hit_rate", float64(hits)/float64(reads), "bytes_read", by)
}
v, hit := cs.cache.Get(c)
if hit {
atomic.AddInt64(&cs.hits, 1)
return v.(blocks.Block), nil
}
blk, err := cs.blocks.Get(ctx, c)
if err != nil {
return nil, err
}
atomic.AddInt64(&cs.bytes, int64(len(blk.RawData())))
cs.cache.Add(c, blk)
return blk, err
}
func (cs *CachingBlockstore) View(ctx context.Context, c cid.Cid, callback func([]byte) error) error {
reads := atomic.AddInt64(&cs.reads, 1)
if reads%1000000 == 0 {
hits := atomic.LoadInt64(&cs.hits)
by := atomic.LoadInt64(&cs.bytes)
log.Infow("CachingBlockstore stats", "reads", reads, "cache_len", cs.cache.Len(), "hit_rate", float64(hits)/float64(reads), "bytes_read", by)
}
v, hit := cs.cache.Get(c)
if hit {
atomic.AddInt64(&cs.hits, 1)
return callback(v.(blocks.Block).RawData())
}
blk, err := cs.blocks.Get(ctx, c)
if err != nil {
return err
}
atomic.AddInt64(&cs.bytes, int64(len(blk.RawData())))
cs.cache.Add(c, blk)
return callback(blk.RawData())
}
func (cs *CachingBlockstore) Has(ctx context.Context, c cid.Cid) (bool, error) {
atomic.AddInt64(&cs.reads, 1)
// Safe to query cache since blockstore never deletes
if cs.cache.Contains(c) {
return true, nil
}
return cs.blocks.Has(ctx, c)
}

View File

@@ -508,8 +508,7 @@ func (cs *ChainStore) ExportRange(
w io.Writer,
head, tail *types.TipSet,
messages, receipts, stateroots bool,
- workers int,
- cacheSize int) error {
+ workers int) error {
h := &car.CarHeader{
Roots: head.Cids(),
@@ -520,11 +519,6 @@ func (cs *ChainStore) ExportRange(
return xerrors.Errorf("failed to write car header: %s", err)
}
- cacheStore, err := NewCachingBlockstore(cs.UnionStore(), cacheSize)
- if err != nil {
- return err
- }
start := time.Now()
log.Infow("walking snapshot range",
"head", head.Key(),
@@ -544,7 +538,7 @@ func (cs *ChainStore) ExportRange(
includeReceipts: receipts,
}
- pw, err := newWalkScheduler(ctx, cacheStore, cfg, w)
+ pw, err := newWalkScheduler(ctx, cs.UnionStore(), cfg, w)
if err != nil {
return err
}
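
A hedged sketch of a call site with the new signature. The leading context argument is assumed (it sits above the lines shown in the first hunk of this file), and the values are illustrative:

// f is any io.Writer, for example an *os.File; head and tail are *types.TipSet.
if err := cs.ExportRange(ctx, f, head, tail,
    true, // messages
    true, // receipts
    true, // stateroots
    4,    // workers
); err != nil {
    return xerrors.Errorf("exporting chain range: %w", err)
}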

View File

@@ -1182,11 +1182,6 @@ var ChainExportRangeCmd = &cli.Command{
Usage: "specify the number of workers",
Value: 1,
},
- &cli.IntFlag{
- Name: "cache-size",
- Usage: "specify the size of the cache (in objects) to use while exporting",
- Value: 100_000,
- },
&cli.IntFlag{
Name: "write-buffer",
Usage: "specify write buffer size",
@@ -1243,7 +1238,6 @@ var ChainExportRangeCmd = &cli.Command{
if err := api.ChainExportRangeInternal(ctx, head.Key(), tail.Key(), lapi.ChainExportConfig{
WriteBufferSize: cctx.Int("write-buffer"),
NumWorkers: cctx.Int("workers"),
- CacheSize: cctx.Int("cache-size"),
IncludeMessages: cctx.Bool("messages"),
IncludeReceipts: cctx.Bool("receipts"),
IncludeStateRoots: cctx.Bool("stateroots"),
@@ -1256,7 +1250,6 @@ var ChainExportRangeCmd = &cli.Command{
stream, err := api.ChainExportRange(ctx, head.Key(), tail.Key(), lapi.ChainExportConfig{
WriteBufferSize: cctx.Int("write-buffer"),
NumWorkers: cctx.Int("workers"),
- CacheSize: cctx.Int("cache-size"),
IncludeMessages: cctx.Bool("messages"),
IncludeReceipts: cctx.Bool("receipts"),
IncludeStateRoots: cctx.Bool("stateroots"),

View File

@@ -630,7 +630,7 @@ func (a ChainAPI) ChainExportRangeInternal(ctx context.Context, head, tail types
bw,
headTs, tailTs,
cfg.IncludeMessages, cfg.IncludeReceipts, cfg.IncludeStateRoots,
- cfg.NumWorkers, cfg.CacheSize,
+ cfg.NumWorkers,
); err != nil {
return fmt.Errorf("exporting chain range: %w", err)
}
@@ -658,7 +658,7 @@ func (a ChainAPI) ChainExportRange(ctx context.Context, head, tail types.TipSetK
headTs,
tailTs,
cfg.IncludeMessages, cfg.IncludeReceipts, cfg.IncludeStateRoots,
- cfg.NumWorkers, cfg.CacheSize,
+ cfg.NumWorkers,
)
bw.Flush() //nolint:errcheck // it is a write to a pipe
w.CloseWithError(err) //nolint:errcheck // it is a pipe
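
The Flush and CloseWithError calls above are the tail end of the usual io.Pipe streaming pattern: run the export in a goroutine, write through a buffered writer, and hand any error to the reader by closing the write end. A generic sketch of that pattern, not the Lotus code itself:

package sketch

import (
    "bufio"
    "io"
)

// stream runs produce in a goroutine and returns the read end of a pipe.
// Any error from produce reaches the reader through CloseWithError.
func stream(produce func(w io.Writer) error) io.ReadCloser {
    r, w := io.Pipe()
    go func() {
        bw := bufio.NewWriter(w)
        err := produce(bw)
        bw.Flush()            //nolint:errcheck // it is a write to a pipe
        w.CloseWithError(err) //nolint:errcheck // it is a pipe
    }()
    return r
}

The caller then copies from the returned reader until EOF or the propagated error.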