mark from current epoch to boundary epoch when necessary

this is necessary to avoid pathological behavior (hot blocks being
misclassified as cold) when the node stays offline for an extended
period of time (more than 1 finality).

The full 2-finality walk is quite slow, so we try to avoid it unless
it is necessary.
A full walk is necessary when there is a sync gap (most likely because
the node was offline), during which the tracking of writes is
inaccurate because the HeadChange notification has not yet been
delivered.  In this case, blocks that are actually hot may be tracked
before the boundary and fail to be marked accordingly.  So when we
detect a sync gap, we do the full walk; if there is no sync gap, we
can just use the much faster boundary epoch walk.
vyzo 2021-03-14 12:32:05 +02:00
parent d7ceef104e
commit 3a9b7c592d
2 changed files with 70 additions and 20 deletions
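
In outline, the change has two parts: HeadChange records the epoch at which a
sync gap is detected (a new head whose timestamp is older than SyncGapTime),
and doCompact compares that epoch against the compaction boundary to choose
between the slow full walk and the fast boundary-epoch walk. The standalone Go
sketch below condenses that decision; the constant values, the epoch type, and
the helper names (detectSyncGap, chooseWalkStart) are illustrative stand-ins
rather than the actual code, which follows in the diff.

package main

import (
	"fmt"
	"time"
)

// Values mirroring the diff below; Finality stands in for build.Finality
// (900 epochs on mainnet).
const (
	Finality           = 900
	CompactionBoundary = 2 * Finality
	SyncGapTime        = 5 * time.Minute
)

type epoch int64

// detectSyncGap mirrors the HeadChange check: if the new head's minimum
// timestamp is older than SyncGapTime, the node fell out of sync, so the
// head's epoch is recorded as the sync gap epoch.
func detectSyncGap(headTimestamp time.Time, headEpoch epoch, syncGapEpoch *epoch) {
	if time.Since(headTimestamp) > SyncGapTime {
		*syncGapEpoch = headEpoch
	}
}

// chooseWalkStart mirrors the doCompact decision: a sync gap past the
// boundary forces the slow full walk from the current epoch; otherwise the
// single-tipset walk at the boundary epoch suffices.
func chooseWalkStart(currentEpoch, syncGapEpoch epoch) (startEpoch epoch, fullWalk bool) {
	boundaryEpoch := currentEpoch - CompactionBoundary
	if syncGapEpoch > boundaryEpoch {
		return currentEpoch, true
	}
	return boundaryEpoch, false
}

func main() {
	var syncGapEpoch epoch
	// A head whose timestamp is 10 minutes old signals a sync gap at epoch 100000.
	detectSyncGap(time.Now().Add(-10*time.Minute), 100000, &syncGapEpoch)
	// At compaction, epoch 100000 is past the boundary (101000-1800), so do a full walk.
	start, full := chooseWalkStart(101000, syncGapEpoch)
	fmt.Println(start, full) // 101000 true
}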


@@ -52,6 +52,10 @@ var (
// CompactionBoundary is the number of epochs from the current epoch at which
// we will walk the chain for live objects.
CompactionBoundary = 2 * build.Finality
+ // SyncGapTime is the time delay from a tipset's min timestamp before we decide
+ // there is a sync gap
+ SyncGapTime = 5 * time.Minute
)
var (
@@ -64,6 +68,11 @@ var (
// all active blocks into the hotstore.
warmupEpochKey = dstore.NewKey("/splitstore/warmupEpoch")
+ // syncGapEpochKey stores the last epoch where a sync gap was detected.
+ // If there is a sync gap after the boundary epoch, compaction will perform
+ // a slower full walk from the current epoch to the boundary epoch
+ syncGapEpochKey = dstore.NewKey("/splitstore/syncGapEpoch")
// markSetSizeKey stores the current estimate for the mark set size.
// this is first computed at warmup and updated in every compaction
markSetSizeKey = dstore.NewKey("/splitstore/markSetSize")
@@ -102,9 +111,10 @@ type SplitStore struct {
critsection int32 // compaction critical section
closing int32 // the split store is closing
- baseEpoch abi.ChainEpoch
- warmupEpoch abi.ChainEpoch
- warm bool
+ baseEpoch abi.ChainEpoch
+ syncGapEpoch abi.ChainEpoch
+ warmupEpoch abi.ChainEpoch
+ warm bool
coldPurgeSize int
@@ -348,6 +358,17 @@ func (s *SplitStore) Start(chain ChainAccessor) error {
return xerrors.Errorf("error loading warmup epoch: %w", err)
}
+ // load sync gap epoch from metadata ds
+ bs, err = s.ds.Get(syncGapEpochKey)
+ switch err {
+ case nil:
+ s.syncGapEpoch = bytesToEpoch(bs)
+ case dstore.ErrNotFound:
+ default:
+ return xerrors.Errorf("error loading sync gap epoch: %w", err)
+ }
// load markSetSize from metadata ds
// if none, the splitstore will compute it during warmup and update in every compaction
bs, err = s.ds.Get(markSetSizeKey)
@@ -393,6 +414,14 @@ func (s *SplitStore) HeadChange(_, apply []*types.TipSet) error {
s.curTs = curTs
s.mx.Unlock()
+ timestamp := time.Unix(int64(curTs.MinTimestamp()), 0)
+ if time.Since(timestamp) > SyncGapTime {
+ err := s.setSyncGapEpoch(epoch)
+ if err != nil {
+ log.Warnf("error saving sync gap epoch: %s", err)
+ }
+ }
if !atomic.CompareAndSwapInt32(&s.compacting, 0, 1) {
// we are currently compacting, do nothing and wait for the next head change
return nil
@@ -432,7 +461,7 @@ func (s *SplitStore) HeadChange(_, apply []*types.TipSet) error {
log.Info("compacting splitstore")
start := time.Now()
- s.compact(curTs)
+ s.compact(curTs, s.syncGapEpoch)
log.Infow("compaction done", "took", time.Since(start))
}()
@@ -513,7 +542,7 @@ func (s *SplitStore) warmup(curTs *types.TipSet) error {
}
}
log.Infow("warmup stats", "visited", count, "cold", xcount, "missing", missing)
log.Infow("warmup stats", "visited", count, "warm", xcount, "missing", missing)
if count > s.markSetSize {
s.markSetSize = count + count>>2 // overestimate a bit
@@ -536,7 +565,7 @@ func (s *SplitStore) warmup(curTs *types.TipSet) error {
}
// Compaction/GC Algorithm
- func (s *SplitStore) compact(curTs *types.TipSet) {
+ func (s *SplitStore) compact(curTs *types.TipSet, syncGapEpoch abi.ChainEpoch) {
var err error
if s.markSetSize == 0 {
start := time.Now()
@@ -552,7 +581,7 @@ func (s *SplitStore) compact(curTs *types.TipSet) {
}
start := time.Now()
- err = s.doCompact(curTs)
+ err = s.doCompact(curTs, syncGapEpoch)
took := time.Since(start).Milliseconds()
stats.Record(context.Background(), metrics.SplitstoreCompactionTimeSeconds.M(float64(took)/1e3))
@@ -579,33 +608,48 @@ func (s *SplitStore) estimateMarkSetSize(curTs *types.TipSet) error {
return nil
}
- func (s *SplitStore) doCompact(curTs *types.TipSet) error {
+ func (s *SplitStore) doCompact(curTs *types.TipSet, syncGapEpoch abi.ChainEpoch) error {
coldEpoch := s.baseEpoch + CompactionCold
currentEpoch := curTs.Height()
boundaryEpoch := currentEpoch - CompactionBoundary
log.Infow("running compaction", "currentEpoch", currentEpoch, "baseEpoch", s.baseEpoch, "coldEpoch", coldEpoch, "boundaryEpoch", boundaryEpoch)
- coldSet, err := s.env.Create("cold", s.markSetSize)
+ markSet, err := s.env.Create("live", s.markSetSize)
if err != nil {
return xerrors.Errorf("error creating mark set: %w", err)
}
- defer coldSet.Close() //nolint:errcheck
+ defer markSet.Close() //nolint:errcheck
- // 1. mark reachable cold objects by looking at the objects reachable only from the cold epoch
- log.Infow("marking reachable cold blocks", "boundaryEpoch", boundaryEpoch)
+ // 1. mark reachable objects by walking the chain from the current epoch to the boundary epoch
+ log.Infow("marking reachable blocks", "currentEpoch", currentEpoch, "boundaryEpoch", boundaryEpoch)
startMark := time.Now()
- boundaryTs, err := s.chain.GetTipsetByHeight(context.Background(), boundaryEpoch, curTs, true)
- if err != nil {
- return xerrors.Errorf("error getting tipset at boundary epoch: %w", err)
+ var markTs *types.TipSet
+ if syncGapEpoch > boundaryEpoch {
+ // There is a sync gap that may have caused writes that are logically after the boundary
+ // epoch to be written before the respective head change notification and hence be tracked
+ // at the wrong epoch.
+ // This can happen if the node is offline or falls out of sync for an extended period of time.
+ // In this case we perform a full walk to avoid pathologies with pushing actually hot
+ // objects into the coldstore.
+ markTs = curTs
+ log.Infof("sync gap detected at epoch %d; marking from current epoch to boundary epoch", syncGapEpoch)
+ } else {
+ // There is no pathological sync gap, so we can use the much faster single tipset walk at
+ // the boundary epoch.
+ boundaryTs, err := s.chain.GetTipsetByHeight(context.Background(), boundaryEpoch, curTs, true)
+ if err != nil {
+ return xerrors.Errorf("error getting tipset at boundary epoch: %w", err)
+ }
+ markTs = boundaryTs
}
var count int64
- err = s.walk(boundaryTs, boundaryEpoch,
+ err = s.walk(markTs, boundaryEpoch,
func(cid cid.Cid) error {
count++
- return coldSet.Mark(cid)
+ return markSet.Mark(cid)
})
if err != nil {
@@ -637,9 +681,9 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {
}
// check whether it is reachable in the cold boundary
- mark, err := coldSet.Has(cid)
+ mark, err := markSet.Has(cid)
if err != nil {
return xerrors.Errorf("error checkiing cold set for %s: %w", cid, err)
return xerrors.Errorf("error checkiing mark set for %s: %w", cid, err)
}
if mark {
@@ -909,10 +953,14 @@ func (s *SplitStore) gcHotstore() {
func (s *SplitStore) setBaseEpoch(epoch abi.ChainEpoch) error {
s.baseEpoch = epoch
// write to datastore
return s.ds.Put(baseEpochKey, epochToBytes(epoch))
}
+ func (s *SplitStore) setSyncGapEpoch(epoch abi.ChainEpoch) error {
+ s.syncGapEpoch = epoch
+ return s.ds.Put(syncGapEpochKey, epochToBytes(epoch))
+ }
func epochToBytes(epoch abi.ChainEpoch) []byte {
return uint64ToBytes(uint64(epoch))
}


@@ -46,6 +46,7 @@ func testSplitStore(t *testing.T, cfg *Config) {
genBlock.Messages = garbage.Cid()
genBlock.ParentMessageReceipts = garbage.Cid()
genBlock.ParentStateRoot = garbage.Cid()
+ genBlock.Timestamp = uint64(time.Now().Unix())
genTs := mock.TipSet(genBlock)
chain.push(genTs)
@@ -80,6 +81,7 @@ func testSplitStore(t *testing.T, cfg *Config) {
blk.Messages = garbage.Cid()
blk.ParentMessageReceipts = garbage.Cid()
blk.ParentStateRoot = garbage.Cid()
+ blk.Timestamp = uint64(time.Now().Unix())
sblk, err := blk.ToStorageBlock()
if err != nil {