Ranged-export: Remove CachingBlockstore

The improvements in the range-export code avoid reading most blocks twice, while
also allowing some blocks to be written to disk multiple times.

The cache hit rate dropped from close to 50% to a maximum of 12% at the very
end of the export. The reason is that most CIDs are never read twice, since
they are correctly tracked in the CID set.
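
For illustration only (this is not the walker code touched by this commit, just a minimal sketch of the pattern the CID set enables): once visited CIDs are tracked, each block is fetched at most once, so a read cache in front of the blockstore has very little left to do. The helper signatures below are made up for the sketch.

package sketch

import (
    "context"
    "fmt"

    blocks "github.com/ipfs/go-block-format"
    "github.com/ipfs/go-cid"
)

// walkOnce visits each CID at most once. get stands in for a blockstore read
// and links for child-CID extraction; both are hypothetical helpers.
func walkOnce(ctx context.Context, roots []cid.Cid,
    get func(context.Context, cid.Cid) (blocks.Block, error),
    links func(blocks.Block) []cid.Cid) error {
    seen := cid.NewSet()
    queue := append([]cid.Cid(nil), roots...)
    for len(queue) > 0 {
        c := queue[0]
        queue = queue[1:]
        if !seen.Visit(c) { // Visit reports false when c was already seen
            continue
        }
        blk, err := get(ctx, c)
        if err != nil {
            return fmt.Errorf("getting %s: %w", c, err)
        }
        queue = append(queue, links(blk)...)
    }
    return nil
}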

These numbers do not justify maintaining the CachingBlockstore
code. Additional testing shows that removing it has similar memory-usage
behaviour and shaves about 5 minutes off the execution time (around 10%).

Less code to maintain and fewer options to mess up.
Hector Sanjuan 2023-02-06 12:01:05 +01:00
parent fa93c23813
commit 1bb698619c
5 changed files with 4 additions and 136 deletions

View File

@@ -403,7 +403,6 @@ func (m *MsgUuidMapType) UnmarshalJSON(b []byte) error {
type ChainExportConfig struct {
WriteBufferSize int
NumWorkers int
- CacheSize int
IncludeMessages bool
IncludeReceipts bool
IncludeStateRoots bool
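
For orientation, a hedged sketch of filling in this config after the change. The field values are invented, and lapi is assumed to be the usual alias for the github.com/filecoin-project/lotus/api package, as used in the CLI code further down:

cfg := lapi.ChainExportConfig{
    WriteBufferSize:   1 << 20, // illustrative: 1 MiB write buffer
    NumWorkers:        8,       // illustrative worker count
    IncludeMessages:   true,
    IncludeReceipts:   true,
    IncludeStateRoots: true,
    // CacheSize is gone: deduplication is handled by the walk's CID set.
}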

View File

@@ -1,118 +0,0 @@
package store
import (
"context"
"fmt"
"sync/atomic"
lru "github.com/hashicorp/golang-lru"
blocks "github.com/ipfs/go-block-format"
"github.com/ipfs/go-cid"
"github.com/filecoin-project/lotus/blockstore"
)
type CachingBlockstore struct {
cache *lru.ARCCache
blocks blockstore.Blockstore
reads int64 // updated atomically
hits int64 // updated atomically
bytes int64 // updated atomically
}
func NewCachingBlockstore(blocks blockstore.Blockstore, cacheSize int) (*CachingBlockstore, error) {
cache, err := lru.NewARC(cacheSize)
if err != nil {
return nil, fmt.Errorf("new arc: %w", err)
}
return &CachingBlockstore{
cache: cache,
blocks: blocks,
}, nil
}
func (cs *CachingBlockstore) DeleteBlock(ctx context.Context, c cid.Cid) error {
return cs.blocks.DeleteBlock(ctx, c)
}
func (cs *CachingBlockstore) GetSize(ctx context.Context, c cid.Cid) (int, error) {
return cs.blocks.GetSize(ctx, c)
}
func (cs *CachingBlockstore) Put(ctx context.Context, blk blocks.Block) error {
return cs.blocks.Put(ctx, blk)
}
func (cs *CachingBlockstore) PutMany(ctx context.Context, blks []blocks.Block) error {
return cs.blocks.PutMany(ctx, blks)
}
func (cs *CachingBlockstore) AllKeysChan(ctx context.Context) (<-chan cid.Cid, error) {
return cs.blocks.AllKeysChan(ctx)
}
func (cs *CachingBlockstore) HashOnRead(enabled bool) {
cs.blocks.HashOnRead(enabled)
}
func (cs *CachingBlockstore) DeleteMany(ctx context.Context, cids []cid.Cid) error {
return cs.blocks.DeleteMany(ctx, cids)
}
func (cs *CachingBlockstore) Get(ctx context.Context, c cid.Cid) (blocks.Block, error) {
reads := atomic.AddInt64(&cs.reads, 1)
if reads%100000 == 0 {
hits := atomic.LoadInt64(&cs.hits)
by := atomic.LoadInt64(&cs.bytes)
log.Infow("CachingBlockstore stats", "reads", reads, "cache_len", cs.cache.Len(), "hit_rate", float64(hits)/float64(reads), "bytes_read", by)
}
v, hit := cs.cache.Get(c)
if hit {
atomic.AddInt64(&cs.hits, 1)
return v.(blocks.Block), nil
}
blk, err := cs.blocks.Get(ctx, c)
if err != nil {
return nil, err
}
atomic.AddInt64(&cs.bytes, int64(len(blk.RawData())))
cs.cache.Add(c, blk)
return blk, err
}
func (cs *CachingBlockstore) View(ctx context.Context, c cid.Cid, callback func([]byte) error) error {
reads := atomic.AddInt64(&cs.reads, 1)
if reads%1000000 == 0 {
hits := atomic.LoadInt64(&cs.hits)
by := atomic.LoadInt64(&cs.bytes)
log.Infow("CachingBlockstore stats", "reads", reads, "cache_len", cs.cache.Len(), "hit_rate", float64(hits)/float64(reads), "bytes_read", by)
}
v, hit := cs.cache.Get(c)
if hit {
atomic.AddInt64(&cs.hits, 1)
return callback(v.(blocks.Block).RawData())
}
blk, err := cs.blocks.Get(ctx, c)
if err != nil {
return err
}
atomic.AddInt64(&cs.bytes, int64(len(blk.RawData())))
cs.cache.Add(c, blk)
return callback(blk.RawData())
}
func (cs *CachingBlockstore) Has(ctx context.Context, c cid.Cid) (bool, error) {
atomic.AddInt64(&cs.reads, 1)
// Safe to query cache since blockstore never deletes
if cs.cache.Contains(c) {
return true, nil
}
return cs.blocks.Has(ctx, c)
}

View File

@@ -508,8 +508,7 @@ func (cs *ChainStore) ExportRange(
w io.Writer,
head, tail *types.TipSet,
messages, receipts, stateroots bool,
- workers int,
- cacheSize int) error {
+ workers int) error {
h := &car.CarHeader{
Roots: head.Cids(),
@@ -520,11 +519,6 @@ func (cs *ChainStore) ExportRange(
return xerrors.Errorf("failed to write car header: %s", err)
}
- cacheStore, err := NewCachingBlockstore(cs.UnionStore(), cacheSize)
- if err != nil {
- return err
- }
start := time.Now()
log.Infow("walking snapshot range",
"head", head.Key(),
@@ -544,7 +538,7 @@ func (cs *ChainStore) ExportRange(
includeReceipts: receipts,
}
- pw, err := newWalkScheduler(ctx, cacheStore, cfg, w)
+ pw, err := newWalkScheduler(ctx, cs.UnionStore(), cfg, w)
if err != nil {
return err
}
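
A hedged sketch of a call site with the new signature. The leading context argument is assumed (it sits above the lines shown in the first hunk of this file), and the values are illustrative:

// f is any io.Writer, for example an *os.File; head and tail are *types.TipSet.
if err := cs.ExportRange(ctx, f, head, tail,
    true, // messages
    true, // receipts
    true, // stateroots
    4,    // workers
); err != nil {
    return xerrors.Errorf("exporting chain range: %w", err)
}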

View File

@@ -1182,11 +1182,6 @@ var ChainExportRangeCmd = &cli.Command{
Usage: "specify the number of workers",
Value: 1,
},
- &cli.IntFlag{
- Name: "cache-size",
- Usage: "specify the size of the cache (in objects) to use while exporting",
- Value: 100_000,
- },
&cli.IntFlag{
Name: "write-buffer",
Usage: "specify write buffer size",
@@ -1243,7 +1238,6 @@ var ChainExportRangeCmd = &cli.Command{
if err := api.ChainExportRangeInternal(ctx, head.Key(), tail.Key(), lapi.ChainExportConfig{
WriteBufferSize: cctx.Int("write-buffer"),
NumWorkers: cctx.Int("workers"),
- CacheSize: cctx.Int("cache-size"),
IncludeMessages: cctx.Bool("messages"),
IncludeReceipts: cctx.Bool("receipts"),
IncludeStateRoots: cctx.Bool("stateroots"),
@@ -1256,7 +1250,6 @@ var ChainExportRangeCmd = &cli.Command{
stream, err := api.ChainExportRange(ctx, head.Key(), tail.Key(), lapi.ChainExportConfig{
WriteBufferSize: cctx.Int("write-buffer"),
NumWorkers: cctx.Int("workers"),
- CacheSize: cctx.Int("cache-size"),
IncludeMessages: cctx.Bool("messages"),
IncludeReceipts: cctx.Bool("receipts"),
IncludeStateRoots: cctx.Bool("stateroots"),

View File

@@ -630,7 +630,7 @@ func (a ChainAPI) ChainExportRangeInternal(ctx context.Context, head, tail types
bw,
headTs, tailTs,
cfg.IncludeMessages, cfg.IncludeReceipts, cfg.IncludeStateRoots,
- cfg.NumWorkers, cfg.CacheSize,
+ cfg.NumWorkers,
); err != nil {
return fmt.Errorf("exporting chain range: %w", err)
}
@@ -658,7 +658,7 @@ func (a ChainAPI) ChainExportRange(ctx context.Context, head, tail types.TipSetK
headTs,
tailTs,
cfg.IncludeMessages, cfg.IncludeReceipts, cfg.IncludeStateRoots,
- cfg.NumWorkers, cfg.CacheSize,
+ cfg.NumWorkers,
)
bw.Flush() //nolint:errcheck // it is a write to a pipe
w.CloseWithError(err) //nolint:errcheck // it is a pipe
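
The Flush and CloseWithError calls above are the tail end of the usual io.Pipe streaming pattern: run the export in a goroutine, write through a buffered writer, and hand any error to the reader by closing the write end. A generic sketch of that pattern, not the Lotus code itself:

package sketch

import (
    "bufio"
    "io"
)

// stream runs produce in a goroutine and returns the read end of a pipe.
// Any error from produce reaches the reader through CloseWithError.
func stream(produce func(w io.Writer) error) io.ReadCloser {
    r, w := io.Pipe()
    go func() {
        bw := bufio.NewWriter(w)
        err := produce(bw)
        bw.Flush()            //nolint:errcheck // it is a write to a pipe
        w.CloseWithError(err) //nolint:errcheck // it is a pipe
    }()
    return r
}

The caller then copies from the returned reader until EOF or the propagated error.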