correctness fixes for the autobatch blockstore
1. Simplify shutdown and make it idempotent by using a context. 2. Make sure `Flush` actually _fully_ flushes if the previous flush failed. 3. Don't clear the flush batch if flushing fails.
This commit is contained in:
parent
b161f56bd2
commit
2a862d497f
@ -26,18 +26,17 @@ type AutobatchBlockstore struct {
|
|||||||
addedCids map[cid.Cid]struct{}
|
addedCids map[cid.Cid]struct{}
|
||||||
|
|
||||||
stateLock sync.Mutex
|
stateLock sync.Mutex
|
||||||
doFlushLock sync.Mutex
|
|
||||||
bufferedBatch blockBatch
|
bufferedBatch blockBatch
|
||||||
|
|
||||||
flushingBatch blockBatch
|
flushingBatch blockBatch
|
||||||
flushErr error
|
flushErr error
|
||||||
flushWorkerDone bool
|
|
||||||
|
|
||||||
flushCh chan struct{}
|
flushCh chan struct{}
|
||||||
|
|
||||||
|
doFlushLock sync.Mutex
|
||||||
flushRetryDelay time.Duration
|
flushRetryDelay time.Duration
|
||||||
flushCtx context.Context
|
doneCh chan struct{}
|
||||||
shutdownCh chan struct{}
|
shutdown context.CancelFunc
|
||||||
|
|
||||||
backingBs Blockstore
|
backingBs Blockstore
|
||||||
|
|
||||||
@ -46,21 +45,21 @@ type AutobatchBlockstore struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func NewAutobatch(ctx context.Context, backingBs Blockstore, bufferCapacity int) *AutobatchBlockstore {
|
func NewAutobatch(ctx context.Context, backingBs Blockstore, bufferCapacity int) *AutobatchBlockstore {
|
||||||
|
ctx, cancel := context.WithCancel(ctx)
|
||||||
bs := &AutobatchBlockstore{
|
bs := &AutobatchBlockstore{
|
||||||
addedCids: make(map[cid.Cid]struct{}),
|
addedCids: make(map[cid.Cid]struct{}),
|
||||||
backingBs: backingBs,
|
backingBs: backingBs,
|
||||||
bufferCapacity: bufferCapacity,
|
bufferCapacity: bufferCapacity,
|
||||||
flushCtx: ctx,
|
|
||||||
flushCh: make(chan struct{}, 1),
|
flushCh: make(chan struct{}, 1),
|
||||||
shutdownCh: make(chan struct{}),
|
doneCh: make(chan struct{}),
|
||||||
// could be made configable
|
// could be made configable
|
||||||
flushRetryDelay: time.Millisecond * 100,
|
flushRetryDelay: time.Millisecond * 100,
|
||||||
flushWorkerDone: false,
|
shutdown: cancel,
|
||||||
}
|
}
|
||||||
|
|
||||||
bs.bufferedBatch.blockMap = make(map[cid.Cid]block.Block)
|
bs.bufferedBatch.blockMap = make(map[cid.Cid]block.Block)
|
||||||
|
|
||||||
go bs.flushWorker()
|
go bs.flushWorker(ctx)
|
||||||
|
|
||||||
return bs
|
return bs
|
||||||
}
|
}
|
||||||
@ -88,66 +87,85 @@ func (bs *AutobatchBlockstore) Put(ctx context.Context, blk block.Block) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (bs *AutobatchBlockstore) flushWorker() {
|
func (bs *AutobatchBlockstore) flushWorker(ctx context.Context) {
|
||||||
defer func() {
|
defer close(bs.doneCh)
|
||||||
bs.stateLock.Lock()
|
|
||||||
bs.flushWorkerDone = true
|
|
||||||
bs.stateLock.Unlock()
|
|
||||||
}()
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-bs.flushCh:
|
case <-bs.flushCh:
|
||||||
putErr := bs.doFlush(bs.flushCtx)
|
// TODO: check if we _should_ actually flush. We could get a spurious wakeup
|
||||||
|
// here.
|
||||||
|
putErr := bs.doFlush(ctx, false)
|
||||||
for putErr != nil {
|
for putErr != nil {
|
||||||
select {
|
select {
|
||||||
case <-bs.shutdownCh:
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
case <-time.After(bs.flushRetryDelay):
|
case <-time.After(bs.flushRetryDelay):
|
||||||
autolog.Errorf("FLUSH ERRORED: %w, retrying after %v", putErr, bs.flushRetryDelay)
|
autolog.Errorf("FLUSH ERRORED: %w, retrying after %v", putErr, bs.flushRetryDelay)
|
||||||
putErr = bs.doFlush(bs.flushCtx)
|
putErr = bs.doFlush(ctx, true)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case <-bs.shutdownCh:
|
case <-ctx.Done():
|
||||||
|
// Do one last flush.
|
||||||
|
_ = bs.doFlush(ctx, false)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// caller must NOT hold stateLock
|
// caller must NOT hold stateLock
|
||||||
func (bs *AutobatchBlockstore) doFlush(ctx context.Context) error {
|
// set retryOnly to true to only retry a failed flush and not flush anything new.
|
||||||
|
func (bs *AutobatchBlockstore) doFlush(ctx context.Context, retryOnly bool) error {
|
||||||
bs.doFlushLock.Lock()
|
bs.doFlushLock.Lock()
|
||||||
defer bs.doFlushLock.Unlock()
|
defer bs.doFlushLock.Unlock()
|
||||||
if bs.flushErr == nil {
|
|
||||||
bs.stateLock.Lock()
|
// If we failed to flush last time, try flushing again.
|
||||||
// We do NOT clear addedCids here, because its purpose is to expedite Puts
|
if bs.flushErr != nil {
|
||||||
bs.flushingBatch = bs.bufferedBatch
|
bs.flushErr = bs.backingBs.PutMany(ctx, bs.flushingBatch.blockList)
|
||||||
bs.bufferedBatch.blockList = make([]block.Block, 0, len(bs.flushingBatch.blockList))
|
|
||||||
bs.bufferedBatch.blockMap = make(map[cid.Cid]block.Block, len(bs.flushingBatch.blockMap))
|
|
||||||
bs.stateLock.Unlock()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bs.flushErr = bs.backingBs.PutMany(ctx, bs.flushingBatch.blockList)
|
// If we failed, or we're _only_ retrying, bail.
|
||||||
|
if retryOnly || bs.flushErr != nil {
|
||||||
|
return bs.flushErr
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then take the current batch...
|
||||||
bs.stateLock.Lock()
|
bs.stateLock.Lock()
|
||||||
bs.flushingBatch = blockBatch{}
|
// We do NOT clear addedCids here, because its purpose is to expedite Puts
|
||||||
|
bs.flushingBatch = bs.bufferedBatch
|
||||||
|
bs.bufferedBatch.blockList = make([]block.Block, 0, len(bs.flushingBatch.blockList))
|
||||||
|
bs.bufferedBatch.blockMap = make(map[cid.Cid]block.Block, len(bs.flushingBatch.blockMap))
|
||||||
bs.stateLock.Unlock()
|
bs.stateLock.Unlock()
|
||||||
|
|
||||||
|
// And try to flush it.
|
||||||
|
bs.flushErr = bs.backingBs.PutMany(ctx, bs.flushingBatch.blockList)
|
||||||
|
|
||||||
|
// If we succeeded, reset the batch. Otherwise, we'll try again next time.
|
||||||
|
if bs.flushErr == nil {
|
||||||
|
bs.stateLock.Lock()
|
||||||
|
bs.flushingBatch = blockBatch{}
|
||||||
|
bs.stateLock.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
return bs.flushErr
|
return bs.flushErr
|
||||||
}
|
}
|
||||||
|
|
||||||
// caller must NOT hold stateLock
|
// caller must NOT hold stateLock
|
||||||
func (bs *AutobatchBlockstore) Flush(ctx context.Context) error {
|
func (bs *AutobatchBlockstore) Flush(ctx context.Context) error {
|
||||||
return bs.doFlush(ctx)
|
return bs.doFlush(ctx, false)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (bs *AutobatchBlockstore) Shutdown(ctx context.Context) error {
|
func (bs *AutobatchBlockstore) Shutdown(ctx context.Context) error {
|
||||||
bs.stateLock.Lock()
|
// TODO: Prevent puts after we call this to avoid losing data.
|
||||||
flushDone := bs.flushWorkerDone
|
bs.shutdown()
|
||||||
bs.stateLock.Unlock()
|
select {
|
||||||
if !flushDone {
|
case <-bs.doneCh:
|
||||||
// may racily block forever if Shutdown is called in parallel
|
case <-ctx.Done():
|
||||||
bs.shutdownCh <- struct{}{}
|
return ctx.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bs.doFlushLock.Lock()
|
||||||
|
defer bs.doFlushLock.Unlock()
|
||||||
|
|
||||||
return bs.flushErr
|
return bs.flushErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user