eth/downloader: remove stale beacon headers as backfilling progresses (#24670)

* eth/downloader: remove stale beacon headers as backfilling progresses

* eth/downloader: remove leftover from a previous design

* eth/downloader: do partial beacon cleanups if chain is large

* eth/downloader: linter != heart
This commit is contained in:
Péter Szilágyi 2022-04-13 20:31:08 +03:00 committed by GitHub
parent ca298a2821
commit 86216189a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 147 additions and 16 deletions

View File

@ -36,6 +36,7 @@ type beaconBackfiller struct {
syncMode SyncMode // Sync mode to use for backfilling the skeleton chains syncMode SyncMode // Sync mode to use for backfilling the skeleton chains
success func() // Callback to run on successful sync cycle completion success func() // Callback to run on successful sync cycle completion
filling bool // Flag whether the downloader is backfilling or not filling bool // Flag whether the downloader is backfilling or not
filled *types.Header // Last header filled by the last terminated sync loop
started chan struct{} // Notification channel whether the downloader inited started chan struct{} // Notification channel whether the downloader inited
lock sync.Mutex // Mutex protecting the sync lock lock sync.Mutex // Mutex protecting the sync lock
} }
@ -48,16 +49,18 @@ func newBeaconBackfiller(dl *Downloader, success func()) backfiller {
} }
} }
// suspend cancels any background downloader threads. // suspend cancels any background downloader threads and returns the last header
func (b *beaconBackfiller) suspend() { // that has been successfully backfilled.
func (b *beaconBackfiller) suspend() *types.Header {
// If no filling is running, don't waste cycles // If no filling is running, don't waste cycles
b.lock.Lock() b.lock.Lock()
filling := b.filling filling := b.filling
filled := b.filled
started := b.started started := b.started
b.lock.Unlock() b.lock.Unlock()
if !filling { if !filling {
return return filled // Return the filled header on the previous sync completion
} }
// A previous filling should be running, though it may happen that it hasn't // A previous filling should be running, though it may happen that it hasn't
// yet started (being done on a new goroutine). Many concurrent beacon head // yet started (being done on a new goroutine). Many concurrent beacon head
@ -69,6 +72,10 @@ func (b *beaconBackfiller) suspend() {
// Now that we're sure the downloader successfully started up, we can cancel // Now that we're sure the downloader successfully started up, we can cancel
// it safely without running the risk of data races. // it safely without running the risk of data races.
b.downloader.Cancel() b.downloader.Cancel()
// Sync cycle was just terminated, retrieve and return the last filled header.
// Can't use `filled` as that contains a stale value from before cancellation.
return b.downloader.blockchain.CurrentFastBlock().Header()
} }
// resume starts the downloader threads for backfilling state and chain data. // resume starts the downloader threads for backfilling state and chain data.
@ -81,6 +88,7 @@ func (b *beaconBackfiller) resume() {
return return
} }
b.filling = true b.filling = true
b.filled = nil
b.started = make(chan struct{}) b.started = make(chan struct{})
mode := b.syncMode mode := b.syncMode
b.lock.Unlock() b.lock.Unlock()
@ -92,6 +100,7 @@ func (b *beaconBackfiller) resume() {
defer func() { defer func() {
b.lock.Lock() b.lock.Lock()
b.filling = false b.filling = false
b.filled = b.downloader.blockchain.CurrentFastBlock().Header()
b.lock.Unlock() b.lock.Unlock()
}() }()
// If the downloader fails, report an error as in beacon chain mode there // If the downloader fails, report an error as in beacon chain mode there

View File

@ -19,6 +19,7 @@ package downloader
import ( import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"math/rand" "math/rand"
"sort" "sort"
"time" "time"
@ -148,11 +149,15 @@ type backfiller interface {
// based on the skeleton chain as it might be invalid. The backfiller should // based on the skeleton chain as it might be invalid. The backfiller should
// gracefully handle multiple consecutive suspends without a resume, even // gracefully handle multiple consecutive suspends without a resume, even
// on initial sartup. // on initial sartup.
suspend() //
// The method should return the last block header that has been successfully
// backfilled, or nil if the backfiller was not resumed.
suspend() *types.Header
// resume requests the backfiller to start running fill or snap sync based on // resume requests the backfiller to start running fill or snap sync based on
// the skeleton chain as it has successfully been linked. Appending new heads // the skeleton chain as it has successfully been linked. Appending new heads
// to the end of the chain will not result in suspend/resume cycles. // to the end of the chain will not result in suspend/resume cycles.
// leaking too much sync logic out to the filler.
resume() resume()
} }
@ -358,8 +363,17 @@ func (s *skeleton) sync(head *types.Header) (*types.Header, error) {
if linked { if linked {
s.filler.resume() s.filler.resume()
} }
defer s.filler.suspend() defer func() {
if filled := s.filler.suspend(); filled != nil {
// If something was filled, try to delete stale sync helpers. If
// unsuccessful, warn the user, but not much else we can do (it's
// a programming error, just let users report an issue and don't
// choke in the meantime).
if err := s.cleanStales(filled); err != nil {
log.Error("Failed to clean stale beacon headers", "err", err)
}
}
}()
// Create a set of unique channels for this sync cycle. We need these to be // Create a set of unique channels for this sync cycle. We need these to be
// ephemeral so a data race doesn't accidentally deliver something stale on // ephemeral so a data race doesn't accidentally deliver something stale on
// a persistent channel across syncs (yup, this happened) // a persistent channel across syncs (yup, this happened)
@ -582,8 +596,16 @@ func (s *skeleton) processNewHead(head *types.Header, force bool) bool {
lastchain := s.progress.Subchains[0] lastchain := s.progress.Subchains[0]
if lastchain.Tail >= number { if lastchain.Tail >= number {
// If the chain is down to a single beacon header, and it is re-announced
// once more, ignore it instead of tearing down sync for a noop.
if lastchain.Head == lastchain.Tail {
if current := rawdb.ReadSkeletonHeader(s.db, number); current.Hash() == head.Hash() {
return false
}
}
// Not a noop / double head announce, abort with a reorg
if force { if force {
log.Warn("Beacon chain reorged", "tail", lastchain.Tail, "newHead", number) log.Warn("Beacon chain reorged", "tail", lastchain.Tail, "head", lastchain.Head, "newHead", number)
} }
return true return true
} }
@ -943,12 +965,44 @@ func (s *skeleton) processResponse(res *headerResponse) (linked bool, merged boo
// If the beacon chain was linked to the local chain, completely swap out // If the beacon chain was linked to the local chain, completely swap out
// all internal progress and abort header synchronization. // all internal progress and abort header synchronization.
if linked { if linked {
// Note, linking into the local chain should also mean that there are // Linking into the local chain should also mean that there are no
// no leftover subchains, but just in case there's some junk due to // leftover subchains, but in the case of importing the blocks via
// strange conditions or bugs, clean up all internal state. // the engine API, we will not push the subchains forward. This will
if len(s.progress.Subchains) > 1 { // lead to a gap between an old sync cycle and a future one.
log.Error("Cleaning up leftovers after beacon link") if subchains := len(s.progress.Subchains); subchains > 1 {
switch {
// If there are only 2 subchains - the current one and an older
// one - and the old one consists of a single block, then it's
// the expected new sync cycle after some propagated blocks. Log
// it for debugging purposes, explicitly clean and don't escalate.
case subchains == 2 && s.progress.Subchains[1].Head == s.progress.Subchains[1].Tail:
log.Debug("Cleaning previous beacon sync state", "head", s.progress.Subchains[1].Head)
rawdb.DeleteSkeletonHeader(batch, s.progress.Subchains[1].Head)
s.progress.Subchains = s.progress.Subchains[:1] s.progress.Subchains = s.progress.Subchains[:1]
// If we have more than one header or more than one leftover chain,
// the syncer's internal state is corrupted. Do try to fix it, but
// be very vocal about the fault.
default:
var context []interface{}
for i := range s.progress.Subchains[1:] {
context = append(context, fmt.Sprintf("stale_head_%d", i+1))
context = append(context, s.progress.Subchains[i+1].Head)
context = append(context, fmt.Sprintf("stale_tail_%d", i+1))
context = append(context, s.progress.Subchains[i+1].Tail)
context = append(context, fmt.Sprintf("stale_next_%d", i+1))
context = append(context, s.progress.Subchains[i+1].Next)
}
log.Error("Cleaning spurious beacon sync leftovers", context...)
s.progress.Subchains = s.progress.Subchains[:1]
// Note, here we didn't actually delete the headers at all,
// just the metadata. We could implement a cleanup mechanism,
// but further modifying corrupted state is kind of asking
// for it. Unless there's a good enough reason to risk it,
// better to live with the small database junk.
}
} }
break break
} }
@ -1023,6 +1077,74 @@ func (s *skeleton) processResponse(res *headerResponse) (linked bool, merged boo
return linked, merged return linked, merged
} }
// cleanStales removes previously synced beacon headers that have become stale
// due to the downloader backfilling past the tracked tail.
func (s *skeleton) cleanStales(filled *types.Header) error {
number := filled.Number.Uint64()
log.Trace("Cleaning stale beacon headers", "filled", number, "hash", filled.Hash())
// If the filled header is below the linked subchain, something's
// corrupted internally. Report and error and refuse to do anything.
if number < s.progress.Subchains[0].Tail {
return fmt.Errorf("filled header below beacon header tail: %d < %d", number, s.progress.Subchains[0].Tail)
}
// Subchain seems trimmable, push the tail forward up to the last
// filled header and delete everything before it - if available. In
// case we filled past the head, recreate the subchain with a new
// head to keep it consistent with the data on disk.
var (
start = s.progress.Subchains[0].Tail // start deleting from the first known header
end = number // delete until the requested threshold
)
s.progress.Subchains[0].Tail = number
s.progress.Subchains[0].Next = filled.ParentHash
if s.progress.Subchains[0].Head < number {
// If more headers were filled than available, push the entire
// subchain forward to keep tracking the node's block imports
end = s.progress.Subchains[0].Head + 1 // delete the entire original range, including the head
s.progress.Subchains[0].Head = number // assign a new head (tail is already assigned to this)
}
// Execute the trimming and the potential rewiring of the progress
batch := s.db.NewBatch()
if end != number {
// The entire original skeleton chain was deleted and a new one
// defined. Make sure the new single-header chain gets pushed to
// disk to keep internal state consistent.
rawdb.WriteSkeletonHeader(batch, filled)
}
s.saveSyncStatus(batch)
for n := start; n < end; n++ {
// If the batch grew too big, flush it and continue with a new batch.
// The catch is that the sync metadata needs to reflect the actually
// flushed state, so temporarily change the subchain progress and
// revert after the flush.
if batch.ValueSize() >= ethdb.IdealBatchSize {
tmpTail := s.progress.Subchains[0].Tail
tmpNext := s.progress.Subchains[0].Next
s.progress.Subchains[0].Tail = n
s.progress.Subchains[0].Next = rawdb.ReadSkeletonHeader(s.db, n).ParentHash
s.saveSyncStatus(batch)
if err := batch.Write(); err != nil {
log.Crit("Failed to write beacon trim data", "err", err)
}
batch.Reset()
s.progress.Subchains[0].Tail = tmpTail
s.progress.Subchains[0].Next = tmpNext
s.saveSyncStatus(batch)
}
rawdb.DeleteSkeletonHeader(batch, n)
}
if err := batch.Write(); err != nil {
log.Crit("Failed to write beacon trim data", "err", err)
}
return nil
}
// Bounds retrieves the current head and tail tracked by the skeleton syncer. // Bounds retrieves the current head and tail tracked by the skeleton syncer.
// This method is used by the backfiller, whose life cycle is controlled by the // This method is used by the backfiller, whose life cycle is controlled by the
// skeleton syncer. // skeleton syncer.

View File

@ -55,10 +55,11 @@ func newHookedBackfiller() backfiller {
// based on the skeleton chain as it might be invalid. The backfiller should // based on the skeleton chain as it might be invalid. The backfiller should
// gracefully handle multiple consecutive suspends without a resume, even // gracefully handle multiple consecutive suspends without a resume, even
// on initial sartup. // on initial sartup.
func (hf *hookedBackfiller) suspend() { func (hf *hookedBackfiller) suspend() *types.Header {
if hf.suspendHook != nil { if hf.suspendHook != nil {
hf.suspendHook() hf.suspendHook()
} }
return nil // we don't really care about header cleanups for now
} }
// resume requests the backfiller to start running fill or snap sync based on // resume requests the backfiller to start running fill or snap sync based on
@ -426,7 +427,6 @@ func TestSkeletonSyncExtend(t *testing.T) {
newstate: []*subchain{ newstate: []*subchain{
{Head: 49, Tail: 49}, {Head: 49, Tail: 49},
}, },
err: errReorgDenied,
}, },
// Initialize a sync and try to extend it with a sibling block. // Initialize a sync and try to extend it with a sibling block.
{ {
@ -489,7 +489,7 @@ func TestSkeletonSyncExtend(t *testing.T) {
<-wait <-wait
if err := skeleton.Sync(tt.extend, false); err != tt.err { if err := skeleton.Sync(tt.extend, false); err != tt.err {
t.Errorf("extension failure mismatch: have %v, want %v", err, tt.err) t.Errorf("test %d: extension failure mismatch: have %v, want %v", i, err, tt.err)
} }
skeleton.Terminate() skeleton.Terminate()