From aa45fa3ff7afe91776f4c9535e31880ea93973a9 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Wed, 9 Dec 2020 05:10:34 +0000 Subject: [PATCH] Revert fork choice if disk write fails (#2068) ## Issue Addressed Closes #2028 Replaces #2059 ## Proposed Changes If writing to the database fails while importing a block, revert fork choice to the last version stored on disk. This prevents fork choice from being ahead of the blocks on disk. Having fork choice ahead is particularly bad if it is later successfully written to disk, because it renders the database corrupt (see #2028). ## Additional Info * This mitigation might fail if the head+fork choice haven't been persisted yet, which can only happen at first startup (see #2067) * This relies on it being OK for the head tracker to be ahead of fork choice. I figure this is tolerable because blocks only get added to the head tracker after successfully being written on disk _and_ to fork choice, so even if fork choice reverts a little bit, when the pruning algorithm runs, those blocks will still be on disk and OK to prune. The pruning algorithm also doesn't rely on heads being unique, technically it's OK for multiple blocks from the same linear chain segment to be present in the head tracker. This begs the question of #1785 (i.e. things would be simpler with the head tracker out of the way). Alternatively, this PR could just revert the head tracker as well (I'll look into this tomorrow). --- beacon_node/beacon_chain/src/beacon_chain.rs | 60 ++++++++++++++++++-- beacon_node/beacon_chain/src/builder.rs | 23 ++------ beacon_node/beacon_chain/src/errors.rs | 3 + 3 files changed, 63 insertions(+), 23 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 51f0c2ea4..cdc1c251f 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -160,6 +160,15 @@ pub trait BeaconChainTypes: Send + Sync + 'static { type EthSpec: types::EthSpec; } +pub type BeaconForkChoice = ForkChoice< + BeaconForkChoiceStore< + ::EthSpec, + ::HotStore, + ::ColdStore, + >, + ::EthSpec, +>; + /// Represents the "Beacon Chain" component of Ethereum 2.0. Allows import of blocks and block /// operations and chooses a canonical head. pub struct BeaconChain { @@ -207,13 +216,9 @@ pub struct BeaconChain { pub genesis_state_root: Hash256, /// The root of the list of genesis validators, used during syncing. pub genesis_validators_root: Hash256, - - #[allow(clippy::type_complexity)] /// A state-machine that is updated with information from the network and chooses a canonical /// head block. - pub fork_choice: RwLock< - ForkChoice, T::EthSpec>, - >, + pub fork_choice: RwLock>, /// A handler for events generated by the beacon chain. This is only initialized when the /// HTTP server is enabled. pub event_handler: Option>, @@ -284,6 +289,25 @@ impl BeaconChain { persisted_fork_choice.as_kv_store_op(FORK_CHOICE_DB_KEY) } + /// Load fork choice from disk, returning `None` if it isn't found. + pub fn load_fork_choice( + store: Arc>, + ) -> Result>, Error> { + let persisted_fork_choice = + match store.get_item::(&FORK_CHOICE_DB_KEY)? { + Some(fc) => fc, + None => return Ok(None), + }; + + let fc_store = + BeaconForkChoiceStore::from_persisted(persisted_fork_choice.fork_choice_store, store)?; + + Ok(Some(ForkChoice::from_persisted( + persisted_fork_choice.fork_choice, + fc_store, + )?)) + } + /// Persists `self.op_pool` to disk. /// /// ## Notes @@ -1715,13 +1739,37 @@ impl BeaconChain { // Store the block and its state, and execute the confirmation batch for the intermediate // states, which will delete their temporary flags. + // If the write fails, revert fork choice to the version from disk, else we can + // end up with blocks in fork choice that are missing from disk. + // See https://github.com/sigp/lighthouse/issues/2028 ops.push(StoreOp::PutBlock( block_root, Box::new(signed_block.clone()), )); ops.push(StoreOp::PutState(block.state_root, &state)); let txn_lock = self.store.hot_db.begin_rw_transaction(); - self.store.do_atomically(ops)?; + + if let Err(e) = self.store.do_atomically(ops) { + error!( + self.log, + "Database write failed!"; + "msg" => "Restoring fork choice from disk", + "error" => ?e, + ); + match Self::load_fork_choice(self.store.clone())? { + Some(persisted_fork_choice) => { + *fork_choice = persisted_fork_choice; + } + None => { + crit!( + self.log, + "No stored fork choice found to restore from"; + "warning" => "The database is likely corrupt now, consider --purge-db" + ); + } + } + return Err(e.into()); + } drop(txn_lock); // The fork choice write-lock is dropped *after* the on-disk database has been updated. diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs index 5276d1152..a03192a52 100644 --- a/beacon_node/beacon_chain/src/builder.rs +++ b/beacon_node/beacon_chain/src/builder.rs @@ -1,11 +1,8 @@ -use crate::beacon_chain::{ - BEACON_CHAIN_DB_KEY, ETH1_CACHE_DB_KEY, FORK_CHOICE_DB_KEY, OP_POOL_DB_KEY, -}; +use crate::beacon_chain::{BEACON_CHAIN_DB_KEY, ETH1_CACHE_DB_KEY, OP_POOL_DB_KEY}; use crate::eth1_chain::{CachingEth1Backend, SszEth1}; use crate::head_tracker::HeadTracker; use crate::migrate::{BackgroundMigrator, MigratorConfig}; use crate::persisted_beacon_chain::PersistedBeaconChain; -use crate::persisted_fork_choice::PersistedForkChoice; use crate::shuffling_cache::ShufflingCache; use crate::snapshot_cache::{SnapshotCache, DEFAULT_SNAPSHOT_CACHE_SIZE}; use crate::timeout_rw_lock::TimeoutRwLock; @@ -248,20 +245,12 @@ where .to_string() })?; - let persisted_fork_choice = store - .get_item::(&FORK_CHOICE_DB_KEY) - .map_err(|e| format!("DB error when reading persisted fork choice: {:?}", e))? - .ok_or("No persisted fork choice present in database.")?; - - let fc_store = BeaconForkChoiceStore::from_persisted( - persisted_fork_choice.fork_choice_store, - store.clone(), - ) - .map_err(|e| format!("Unable to load ForkChoiceStore: {:?}", e))?; - let fork_choice = - ForkChoice::from_persisted(persisted_fork_choice.fork_choice, fc_store) - .map_err(|e| format!("Unable to parse persisted fork choice from disk: {:?}", e))?; + BeaconChain::>::load_fork_choice( + store.clone(), + ) + .map_err(|e| format!("Unable to load fork choice from disk: {:?}", e))? + .ok_or("Fork choice not found in store")?; let genesis_block = store .get_item::>(&chain.genesis_block_root) diff --git a/beacon_node/beacon_chain/src/errors.rs b/beacon_node/beacon_chain/src/errors.rs index 4153ac702..2b71636e9 100644 --- a/beacon_node/beacon_chain/src/errors.rs +++ b/beacon_node/beacon_chain/src/errors.rs @@ -1,4 +1,5 @@ use crate::beacon_chain::ForkChoiceError; +use crate::beacon_fork_choice_store::Error as ForkChoiceStoreError; use crate::eth1_chain::Error as Eth1ChainError; use crate::migrate::PruningError; use crate::naive_aggregation_pool::Error as NaiveAggregationError; @@ -46,6 +47,7 @@ pub enum BeaconChainError { DBInconsistent(String), DBError(store::Error), ForkChoiceError(ForkChoiceError), + ForkChoiceStoreError(ForkChoiceStoreError), MissingBeaconBlock(Hash256), MissingBeaconState(Hash256), SlotProcessingError(SlotProcessingError), @@ -106,6 +108,7 @@ easy_from_to!(ObservedBlockProducersError, BeaconChainError); easy_from_to!(BlockSignatureVerifierError, BeaconChainError); easy_from_to!(PruningError, BeaconChainError); easy_from_to!(ArithError, BeaconChainError); +easy_from_to!(ForkChoiceStoreError, BeaconChainError); #[derive(Debug)] pub enum BlockProductionError {