correctness fixes for the autobatch blockstore

1. Simplify shutdown and make it idempotent by using a context.
2. Make sure `Flush` actually _fully_ flushes if the previous flush failed.
3. Don't clear the flush batch if flushing fails.
This commit is contained in:
Steven Allen 2022-01-12 14:37:29 -08:00 committed by Jennifer Wang
parent 4fd49cd541
commit ab3c1c507e

View File

@ -26,18 +26,17 @@ type AutobatchBlockstore struct {
addedCids map[cid.Cid]struct{} addedCids map[cid.Cid]struct{}
stateLock sync.Mutex stateLock sync.Mutex
doFlushLock sync.Mutex
bufferedBatch blockBatch bufferedBatch blockBatch
flushingBatch blockBatch flushingBatch blockBatch
flushErr error flushErr error
flushWorkerDone bool
flushCh chan struct{} flushCh chan struct{}
doFlushLock sync.Mutex
flushRetryDelay time.Duration flushRetryDelay time.Duration
flushCtx context.Context doneCh chan struct{}
shutdownCh chan struct{} shutdown context.CancelFunc
backingBs Blockstore backingBs Blockstore
@ -46,21 +45,21 @@ type AutobatchBlockstore struct {
} }
func NewAutobatch(ctx context.Context, backingBs Blockstore, bufferCapacity int) *AutobatchBlockstore { func NewAutobatch(ctx context.Context, backingBs Blockstore, bufferCapacity int) *AutobatchBlockstore {
ctx, cancel := context.WithCancel(ctx)
bs := &AutobatchBlockstore{ bs := &AutobatchBlockstore{
addedCids: make(map[cid.Cid]struct{}), addedCids: make(map[cid.Cid]struct{}),
backingBs: backingBs, backingBs: backingBs,
bufferCapacity: bufferCapacity, bufferCapacity: bufferCapacity,
flushCtx: ctx,
flushCh: make(chan struct{}, 1), flushCh: make(chan struct{}, 1),
shutdownCh: make(chan struct{}), doneCh: make(chan struct{}),
// could be made configable // could be made configable
flushRetryDelay: time.Millisecond * 100, flushRetryDelay: time.Millisecond * 100,
flushWorkerDone: false, shutdown: cancel,
} }
bs.bufferedBatch.blockMap = make(map[cid.Cid]block.Block) bs.bufferedBatch.blockMap = make(map[cid.Cid]block.Block)
go bs.flushWorker() go bs.flushWorker(ctx)
return bs return bs
} }
@ -88,66 +87,85 @@ func (bs *AutobatchBlockstore) Put(ctx context.Context, blk block.Block) error {
return nil return nil
} }
func (bs *AutobatchBlockstore) flushWorker() { func (bs *AutobatchBlockstore) flushWorker(ctx context.Context) {
defer func() { defer close(bs.doneCh)
bs.stateLock.Lock()
bs.flushWorkerDone = true
bs.stateLock.Unlock()
}()
for { for {
select { select {
case <-bs.flushCh: case <-bs.flushCh:
putErr := bs.doFlush(bs.flushCtx) // TODO: check if we _should_ actually flush. We could get a spurious wakeup
// here.
putErr := bs.doFlush(ctx, false)
for putErr != nil { for putErr != nil {
select { select {
case <-bs.shutdownCh: case <-ctx.Done():
return return
case <-time.After(bs.flushRetryDelay): case <-time.After(bs.flushRetryDelay):
autolog.Errorf("FLUSH ERRORED: %w, retrying after %v", putErr, bs.flushRetryDelay) autolog.Errorf("FLUSH ERRORED: %w, retrying after %v", putErr, bs.flushRetryDelay)
putErr = bs.doFlush(bs.flushCtx) putErr = bs.doFlush(ctx, true)
} }
} }
case <-bs.shutdownCh: case <-ctx.Done():
// Do one last flush.
_ = bs.doFlush(ctx, false)
return return
} }
} }
} }
// caller must NOT hold stateLock // caller must NOT hold stateLock
func (bs *AutobatchBlockstore) doFlush(ctx context.Context) error { // set retryOnly to true to only retry a failed flush and not flush anything new.
func (bs *AutobatchBlockstore) doFlush(ctx context.Context, retryOnly bool) error {
bs.doFlushLock.Lock() bs.doFlushLock.Lock()
defer bs.doFlushLock.Unlock() defer bs.doFlushLock.Unlock()
if bs.flushErr == nil {
// If we failed to flush last time, try flushing again.
if bs.flushErr != nil {
bs.flushErr = bs.backingBs.PutMany(ctx, bs.flushingBatch.blockList)
}
// If we failed, or we're _only_ retrying, bail.
if retryOnly || bs.flushErr != nil {
return bs.flushErr
}
// Then take the current batch...
bs.stateLock.Lock() bs.stateLock.Lock()
// We do NOT clear addedCids here, because its purpose is to expedite Puts // We do NOT clear addedCids here, because its purpose is to expedite Puts
bs.flushingBatch = bs.bufferedBatch bs.flushingBatch = bs.bufferedBatch
bs.bufferedBatch.blockList = make([]block.Block, 0, len(bs.flushingBatch.blockList)) bs.bufferedBatch.blockList = make([]block.Block, 0, len(bs.flushingBatch.blockList))
bs.bufferedBatch.blockMap = make(map[cid.Cid]block.Block, len(bs.flushingBatch.blockMap)) bs.bufferedBatch.blockMap = make(map[cid.Cid]block.Block, len(bs.flushingBatch.blockMap))
bs.stateLock.Unlock() bs.stateLock.Unlock()
}
// And try to flush it.
bs.flushErr = bs.backingBs.PutMany(ctx, bs.flushingBatch.blockList) bs.flushErr = bs.backingBs.PutMany(ctx, bs.flushingBatch.blockList)
// If we succeeded, reset the batch. Otherwise, we'll try again next time.
if bs.flushErr == nil {
bs.stateLock.Lock() bs.stateLock.Lock()
bs.flushingBatch = blockBatch{} bs.flushingBatch = blockBatch{}
bs.stateLock.Unlock() bs.stateLock.Unlock()
}
return bs.flushErr return bs.flushErr
} }
// caller must NOT hold stateLock // caller must NOT hold stateLock
func (bs *AutobatchBlockstore) Flush(ctx context.Context) error { func (bs *AutobatchBlockstore) Flush(ctx context.Context) error {
return bs.doFlush(ctx) return bs.doFlush(ctx, false)
} }
func (bs *AutobatchBlockstore) Shutdown(ctx context.Context) error { func (bs *AutobatchBlockstore) Shutdown(ctx context.Context) error {
bs.stateLock.Lock() // TODO: Prevent puts after we call this to avoid losing data.
flushDone := bs.flushWorkerDone bs.shutdown()
bs.stateLock.Unlock() select {
if !flushDone { case <-bs.doneCh:
// may racily block forever if Shutdown is called in parallel case <-ctx.Done():
bs.shutdownCh <- struct{}{} return ctx.Err()
} }
bs.doFlushLock.Lock()
defer bs.doFlushLock.Unlock()
return bs.flushErr return bs.flushErr
} }