lotus/blockstore/splitstore/splitstore_prune.go
ZenGround0 0c91b0dc10
feat:chain:splitstore chain prune (#9056)
* Splitstore chain prune
* Protect on reification for simpler logic and sound cold compact protect
* Recovery from checkpoint during chain prune
* Splitstore (discard and universal mode) running in itests
* Add pause and restart functions to itest block miner
* Add config options to itest full nodes
* Add FsRepo support for itest full nodes

Co-authored-by: zenground0 <ZenGround0@users.noreply.github.com>
2022-08-05 16:34:16 -04:00

565 lines
14 KiB
Go

package splitstore
import (
"bytes"
"os"
"runtime"
"sync"
"sync/atomic"
"time"
cid "github.com/ipfs/go-cid"
ipld "github.com/ipfs/go-ipld-format"
cbg "github.com/whyrusleeping/cbor-gen"
"go.opencensus.io/stats"
"golang.org/x/xerrors"
"github.com/filecoin-project/lotus/api"
bstore "github.com/filecoin-project/lotus/blockstore"
"github.com/filecoin-project/lotus/build"
"github.com/filecoin-project/lotus/chain/types"
"github.com/filecoin-project/lotus/metrics"
)
// Prune option keys.
// NOTE(review): PruneChain as visible in this file takes an api.PruneOpts struct and does not
// reference these keys directly — confirm whether they are still consumed elsewhere.
var (
// PruneOnlineGC is a prune option that instructs PruneChain to use online gc for reclaiming space;
// there is no value associated with this option.
PruneOnlineGC = "splitstore.PruneOnlineGC"
// PruneMovingGC is a prune option that instructs PruneChain to use moving gc for reclaiming space;
// the value associated with this option is the path of the new coldstore.
PruneMovingGC = "splitstore.PruneMovingGC"
// PruneRetainState is a prune option that instructs PruneChain as to how many finalities worth
// of state to retain in the coldstore.
// The value is an integer:
// - if it is -1 then all state objects reachable from the chain will be retained in the coldstore.
// this is useful for garbage collecting side-chains and other garbage in archival nodes.
// This is the (safe) default.
// - if it is 0 then no state objects that are unreachable within the compaction boundary will
// be retained in the coldstore.
// - if it is a positive integer, then it's the number of finalities past the compaction boundary
// for which chain-reachable state objects are retained.
PruneRetainState = "splitstore.PruneRetainState"
)
// PruneChain starts a prune of the chain state held in the coldstore, as directed by opts.
//
// opts.MovingGC requests a full GC pass (bstore.WithFullGC) once dead objects have been purged.
// opts.RetainState selects which state is kept, measured as depth from the current head:
//   - negative: state is retained at every depth;
//   - zero: state is retained up to CompactionBoundary epochs deep;
//   - positive: state is retained up to CompactionBoundary plus RetainState finalities deep.
//
// An error is returned if the coldstore cannot be iterated; otherwise the result of
// pruneChain is returned.
func (s *SplitStore) PruneChain(opts api.PruneOpts) error {
	// garbage collection step, run against the coldstore after the purge
	var gcOpts []bstore.BlockstoreGCOption
	if opts.MovingGC {
		gcOpts = append(gcOpts, bstore.WithFullGC(true))
	}
	doGC := func() error { return s.gcBlockstore(s.cold, gcOpts) }

	// state retention predicate over the depth of a block below the head
	retainState := opts.RetainState
	var retainStateP func(int64) bool
	if retainState < 0 {
		retainStateP = func(int64) bool { return true }
	} else if retainState > 0 {
		retainStateP = func(depth int64) bool {
			return depth <= int64(CompactionBoundary)+retainState*int64(build.Finality)
		}
	} else {
		retainStateP = func(depth int64) bool {
			return depth <= int64(CompactionBoundary)
		}
	}

	// dead object collection iterates over every key in the coldstore
	_, iterable := s.cold.(bstore.BlockstoreIterator)
	if !iterable {
		return xerrors.Errorf("coldstore does not support efficient iteration")
	}

	return s.pruneChain(retainStateP, doGC)
}
// pruneChain claims the compaction flag, performs the preflight checks and, if they pass,
// launches the prune in a background goroutine.
//
// It returns an error, without starting anything, when another compaction/prune/warmup
// holds the flag, when the splitstore is closing, or when no compaction has happened yet.
// A nil return only means the prune was started; failures of the prune itself are logged
// by prune and not reported to the caller.
func (s *SplitStore) pruneChain(retainStateP func(int64) bool, doGC func() error) error {
	// hold off head changes (and thus compaction) during setup
	s.headChangeMx.Lock()
	defer s.headChangeMx.Unlock()

	// claim the compaction flag; bail if somebody else already holds it
	if claimed := atomic.CompareAndSwapInt32(&s.compacting, 0, 1); !claimed {
		return xerrors.Errorf("compaction, prune or warmup in progress")
	}
	releaseFlag := func() { atomic.StoreInt32(&s.compacting, 0) }

	switch {
	case atomic.LoadInt32(&s.closing) == 1:
		// shutting down; do not start new work
		releaseFlag()
		return errClosing
	case s.compactionIndex == 0:
		// prune requires at least one prior compaction
		releaseFlag()
		return xerrors.Errorf("splitstore has not compacted yet")
	}

	// snapshot the head we prune from
	head := s.chain.GetHeaviestTipSet()

	// open the protecting transaction and hand off to the background worker
	s.beginTxnProtect()
	s.compactType = cold
	go func() {
		// deferred calls run in reverse order: the transaction ends first,
		// then the compaction flag is released
		defer releaseFlag()
		defer s.endTxnProtect()

		log.Info("pruning splitstore")
		began := time.Now()
		s.prune(head, retainStateP, doGC)
		log.Infow("prune done", "took", time.Since(began))
	}()

	return nil
}
// prune waits for active views to finish (s.viewWait) and then runs doPrune.
// An error from doPrune is logged here and not propagated.
func (s *SplitStore) prune(curTs *types.TipSet, retainStateP func(int64) bool, doGC func() error) {
	log.Debug("waiting for active views to complete")
	waitStart := time.Now()
	s.viewWait()
	log.Debugw("waiting for active views done", "took", time.Since(waitStart))

	if err := s.doPrune(curTs, retainStateP, doGC); err != nil {
		log.Errorf("PRUNE ERROR: %s", err)
	}
}
// doPrune performs the prune proper:
//
//  0. applies the registered protectors;
//  1. marks everything reachable from curTs (walkChainDeep), retaining state and receipts
//     only where retainStateP(depth) holds, then protects transactional references;
//  2. iterates the coldstore and writes every unmarked key to the dead set file;
//  3. purges the dead objects inside a critical section, with a checkpoint file for recovery;
//  4. cleans up the checkpoint and dead set files, ends the transaction, runs doGC and
//     persists the incremented prune index.
//
// checkClosing is consulted between phases so that shutdown aborts the prune early.
// Failures to close/remove the temporary files and failures of doGC are only logged;
// all other failures are returned.
func (s *SplitStore) doPrune(curTs *types.TipSet, retainStateP func(int64) bool, doGC func() error) error {
	currentEpoch := curTs.Height()
	log.Infow("running prune", "currentEpoch", currentEpoch, "baseEpoch", s.baseEpoch)

	markSet, err := s.markSetEnv.New("live", s.markSetSize)
	if err != nil {
		return xerrors.Errorf("error creating mark set: %w", err)
	}
	defer markSet.Close() //nolint:errcheck
	defer s.debug.Flush()

	if err := s.checkClosing(); err != nil {
		return err
	}

	// 0. track all protected references at beginning of compaction; anything added later should
	//    be transactionally protected by the write
	log.Info("protecting references with registered protectors")
	err = s.applyProtectors()
	if err != nil {
		return err
	}

	// 1. mark reachable objects by walking the chain from the current epoch; we keep all messages
	//    and chain headers; state and receipts are retained only if it is within retention policy scope
	log.Info("marking reachable objects")
	startMark := time.Now()

	// incremented atomically because the callback is invoked from the walker's worker goroutines
	count := new(int64)
	err = s.walkChainDeep(curTs, retainStateP,
		func(c cid.Cid) error {
			if isUnitaryObject(c) {
				return errStopWalk
			}

			mark, err := markSet.Has(c)
			if err != nil {
				return xerrors.Errorf("error checking markset: %w", err)
			}

			// already marked: no need to descend into this object again
			if mark {
				return errStopWalk
			}

			atomic.AddInt64(count, 1)
			return markSet.Mark(c)
		})

	if err != nil {
		return xerrors.Errorf("error marking: %w", err)
	}

	log.Infow("marking done", "took", time.Since(startMark), "marked", count)

	if err := s.checkClosing(); err != nil {
		return err
	}

	// 1.1 protect transactional refs
	err = s.protectTxnRefs(markSet)
	if err != nil {
		return xerrors.Errorf("error protecting transactional refs: %w", err)
	}

	if err := s.checkClosing(); err != nil {
		return err
	}

	// 2. iterate through the coldstore to collect dead objects
	log.Info("collecting dead objects")
	startCollect := time.Now()

	deadw, err := NewColdSetWriter(s.deadSetPath())
	if err != nil {
		return xerrors.Errorf("error creating coldset: %w", err)
	}
	// safety net for early returns; the writer is closed explicitly below on the happy path
	defer deadw.Close() //nolint:errcheck

	// some stats for logging
	var liveCnt, deadCnt int
	err = s.cold.(bstore.BlockstoreIterator).ForEachKey(func(c cid.Cid) error {
		// was it marked?
		mark, err := markSet.Has(c)
		if err != nil {
			return xerrors.Errorf("error checking mark set for %s: %w", c, err)
		}

		if mark {
			liveCnt++
			return nil
		}

		// Note: a possible optimization here is to also purge objects that are in the hotstore
		//       but this needs special care not to purge genesis state, so we don't bother (yet)

		// it's dead in the coldstore, mark it as candidate for purge
		if err := deadw.Write(c); err != nil {
			return xerrors.Errorf("error writing cid to coldstore: %w", err)
		}
		deadCnt++

		return nil
	})

	if err != nil {
		return xerrors.Errorf("error dead objects: %w", err)
	}

	if err := deadw.Close(); err != nil {
		return xerrors.Errorf("error closing deadset: %w", err)
	}

	stats.Record(s.ctx, metrics.SplitstoreCompactionDead.M(int64(deadCnt)))

	log.Infow("dead collection done", "took", time.Since(startCollect))
	log.Infow("prune stats", "live", liveCnt, "dead", deadCnt)

	if err := s.checkClosing(); err != nil {
		return err
	}

	// now that we have collected dead objects, check for missing references from transactional i/o
	// this is carried over from hot compaction for completeness
	s.waitForMissingRefs(markSet)

	if err := s.checkClosing(); err != nil {
		return err
	}

	deadr, err := NewColdSetReader(s.deadSetPath())
	if err != nil {
		return xerrors.Errorf("error opening deadset: %w", err)
	}
	defer deadr.Close() //nolint:errcheck

	// 3. Purge dead objects with checkpointing for recovery.
	// This is the critical section of prune, whereby any dead object not in the markSet is
	// considered already deleted.
	// We delete dead objects in batches, holding the transaction lock, where we check the markSet
	// again for new references created by the caller.
	// After each batch we write a checkpoint to disk; if the process is interrupted before completion
	// the process will continue from the checkpoint in the next recovery.
	if err := s.beginCriticalSection(markSet); err != nil {
		return xerrors.Errorf("error beginning critical section: %w", err)
	}

	if err := s.checkClosing(); err != nil {
		return err
	}

	checkpoint, err := NewCheckpoint(s.pruneCheckpointPath())
	if err != nil {
		return xerrors.Errorf("error creating checkpoint: %w", err)
	}
	defer checkpoint.Close() //nolint:errcheck

	log.Info("purging dead objects from the coldstore")
	startPurge := time.Now()
	err = s.purge(deadr, checkpoint, markSet)
	if err != nil {
		return xerrors.Errorf("error purging dead objects: %w", err)
	}
	log.Infow("purging dead objects from coldstore done", "took", time.Since(startPurge))

	s.endCriticalSection()

	// the purge completed, so the recovery artifacts are no longer needed; failures here are
	// logged but do not fail the prune
	if err := checkpoint.Close(); err != nil {
		log.Warnf("error closing checkpoint: %s", err)
	}
	if err := os.Remove(s.pruneCheckpointPath()); err != nil {
		log.Warnf("error removing checkpoint: %s", err)
	}
	if err := deadr.Close(); err != nil {
		log.Warnf("error closing deadset: %s", err)
	}
	if err := os.Remove(s.deadSetPath()); err != nil {
		log.Warnf("error removing deadset: %s", err)
	}

	// we are done; do some housekeeping
	s.endTxnProtect()
	err = doGC()
	if err != nil {
		log.Warnf("error garbage collecting cold store: %s", err)
	}

	// persist the prune index that was just incremented (not the compaction index) under
	// pruneIndexKey, so the stored value reflects the number of completed prunes
	s.pruneIndex++
	err = s.ds.Put(s.ctx, pruneIndexKey, int64ToBytes(s.pruneIndex))
	if err != nil {
		return xerrors.Errorf("error saving prune index: %w", err)
	}

	return nil
}
// completePrune resumes a purge from the files left on disk: it opens the prune checkpoint
// and the dead set, recovers the "live" mark set, and runs completePurge from the
// checkpointed position. On success it ends the mark set's critical section, resets
// compactType to none and removes the checkpoint and dead set files; problems closing or
// removing those files are only logged.
func (s *SplitStore) completePrune() error {
	checkpointPath := s.pruneCheckpointPath()
	checkpoint, resumeFrom, err := OpenCheckpoint(checkpointPath)
	if err != nil {
		return xerrors.Errorf("error opening checkpoint: %w", err)
	}
	defer checkpoint.Close() //nolint:errcheck

	deadSet, err := NewColdSetReader(s.deadSetPath())
	if err != nil {
		return xerrors.Errorf("error opening deadset: %w", err)
	}
	defer deadSet.Close() //nolint:errcheck

	live, err := s.markSetEnv.Recover("live")
	if err != nil {
		return xerrors.Errorf("error recovering markset: %w", err)
	}
	defer live.Close() //nolint:errcheck

	// PURGE!
	s.compactType = cold
	log.Info("purging dead objects from the coldstore")
	purgeBegan := time.Now()
	if err := s.completePurge(deadSet, checkpoint, resumeFrom, live); err != nil {
		return xerrors.Errorf("error purgin dead objects: %w", err)
	}
	log.Infow("purging dead objects from the coldstore done", "took", time.Since(purgeBegan))

	live.EndCriticalSection()
	s.compactType = none

	// recovery artifacts are no longer needed; cleanup is best effort
	if closeErr := checkpoint.Close(); closeErr != nil {
		log.Warnf("error closing checkpoint: %s", closeErr)
	}
	if rmErr := os.Remove(s.pruneCheckpointPath()); rmErr != nil {
		log.Warnf("error removing checkpoint: %s", rmErr)
	}
	if closeErr := deadSet.Close(); closeErr != nil {
		log.Warnf("error closing deadset: %s", closeErr)
	}
	if rmErr := os.Remove(s.deadSetPath()); rmErr != nil {
		log.Warnf("error removing deadset: %s", rmErr)
	}

	return nil
}
// like walkChain but performs a deep walk, using parallel walking with walkObjectLax,
// whereby all extant messages are retained and state roots are retained if they satisfy
// the given predicate.
// missing references are ignored, as we expect to have plenty for snapshot syncs.
//
// The calling goroutine walks block headers from ts back through their parents, calling f
// on each header cid; message, receipt and state root cids are handed to a pool of worker
// goroutines which traverse them with walkObjectLax(c, f), so f is invoked concurrently.
// retainStateP is called with the depth of the block below ts (ts.Height() - header height).
// The first error observed from checkClosing, a block walk or a worker is returned.
func (s *SplitStore) walkChainDeep(ts *types.TipSet, retainStateP func(int64) bool,
f func(cid.Cid) error) error {
// visited is only touched from the calling goroutine (in push and walkBlock)
visited := cid.NewSet()
toWalk := ts.Cids()
walkCnt := 0
// worker pool size: half the CPUs, but never fewer than 2
workers := runtime.NumCPU() / 2
if workers < 2 {
workers = 2
}
var wg sync.WaitGroup
workch := make(chan cid.Cid, 16*workers)
// each worker sends at most one error before exiting, so a buffer of `workers`
// guarantees that the send never blocks
errch := make(chan error, workers)
// workch is closed exactly once: explicitly after the walk on success, or through
// this deferred call on an early return, so that the workers terminate
var once sync.Once
defer once.Do(func() { close(workch) })
// push queues an object root for the workers unless it was already visited; while
// waiting for room in workch it also watches for a worker error and returns it
push := func(c cid.Cid) error {
if !visited.Visit(c) {
return nil
}
select {
case workch <- c:
return nil
case err := <-errch:
return err
}
}
// worker drains workch, walking each object; it reports the first failure on errch
// and stops
worker := func() {
defer wg.Done()
for c := range workch {
err := s.walkObjectLax(c, f)
if err != nil {
errch <- xerrors.Errorf("error walking object (cid: %s): %w", c, err)
return
}
}
}
for i := 0; i < workers; i++ {
wg.Add(1)
go worker()
}
baseEpoch := ts.Height()
minEpoch := baseEpoch // for progress report
log.Infof("walking at epoch %d", minEpoch)
// walkBlock handles a single block header: calls f on it, decodes it, queues the
// objects it references and schedules its parents for the next round
walkBlock := func(c cid.Cid) error {
if !visited.Visit(c) {
return nil
}
walkCnt++
// NOTE(review): any non-nil error from f aborts the walk here, including errStopWalk
// should f return it for a block header — confirm that cannot happen
if err := f(c); err != nil {
return err
}
var hdr types.BlockHeader
err := s.view(c, func(data []byte) error {
return hdr.UnmarshalCBOR(bytes.NewBuffer(data))
})
if err != nil {
return xerrors.Errorf("error unmarshaling block header (cid: %s): %w", c, err)
}
// log progress whenever the lowest epoch seen so far is a multiple of 10,000
if hdr.Height < minEpoch {
minEpoch = hdr.Height
if minEpoch%10_000 == 0 {
log.Infof("walking at epoch %d (walked: %d)", minEpoch, walkCnt)
}
}
depth := int64(baseEpoch - hdr.Height)
retainState := retainStateP(depth)
// above height 0: messages are always walked, receipts only when state is retained
if hdr.Height > 0 {
if err := push(hdr.Messages); err != nil {
return err
}
if retainState {
if err := push(hdr.ParentMessageReceipts); err != nil {
return err
}
}
}
// the state root is walked when state is retained, and always at height 0
if retainState || hdr.Height == 0 {
if err := push(hdr.ParentStateRoot); err != nil {
return err
}
}
// parents are not followed below height 0
if hdr.Height > 0 {
toWalk = append(toWalk, hdr.Parents...)
}
return nil
}
// walk round by round: each round processes the current blocks, and walkBlock
// refills toWalk with their parents
for len(toWalk) > 0 {
// walking can take a while, so check this with every opportunity
if err := s.checkClosing(); err != nil {
return err
}
// non-blocking check for a worker failure
select {
case err := <-errch:
return err
default:
}
walking := toWalk
toWalk = nil
for _, c := range walking {
if err := walkBlock(c); err != nil {
return xerrors.Errorf("error walking block (cid: %s): %w", c, err)
}
}
}
// all blocks walked: signal the workers that no more work is coming and wait for
// them to drain the queue
once.Do(func() { close(workch) })
wg.Wait()
// pick up an error reported by a worker while draining
select {
case err := <-errch:
return err
default:
}
log.Infow("chain walk done", "walked", walkCnt)
return nil
}
// walkObjectLax visits c with f and then recurses depth-first into every link found in c.
//
// It is a lax variant of walkObject: an object that cannot be found when read is silently
// skipped rather than treated as an error, and there is no occurs check.
// If f returns errStopWalk the object is not descended into and nil is returned; any other
// error from f is returned as is. Only DagCBOR objects are scanned for links.
func (s *SplitStore) walkObjectLax(c cid.Cid, f func(cid.Cid) error) error {
	switch err := f(c); {
	case err == errStopWalk:
		// the callback asked us not to descend any further
		return nil
	case err != nil:
		return err
	}

	// nothing to follow in non-CBOR objects
	if c.Prefix().Codec != cid.DagCBOR {
		return nil
	}

	// check this before recursing
	if err := s.checkClosing(); err != nil {
		return err
	}

	// gather the outgoing links first, then recurse outside of the view callback
	var children []cid.Cid
	collect := func(link cid.Cid) {
		children = append(children, link)
	}
	scanErr := s.view(c, func(raw []byte) error {
		return cbg.ScanForLinks(bytes.NewReader(raw), collect)
	})
	switch {
	case scanErr == nil:
		// scanned fine, carry on with the children
	case ipld.IsNotFound(scanErr):
		// not a problem for deep walks
		return nil
	default:
		return xerrors.Errorf("error scanning linked block (cid: %s): %w", c, scanErr)
	}

	for _, child := range children {
		if err := s.walkObjectLax(child, f); err != nil {
			return xerrors.Errorf("error walking link (cid: %s): %w", child, err)
		}
	}

	return nil
}