Reset payload statuses when resuming fork choice (#3498)

## Issue Addressed

NA

## Proposed Changes

This PR is motivated by a recent consensus failure in Geth where it returned `INVALID` for an `VALID` block. Without this PR, the only way to recover is by re-syncing Lighthouse. Whilst ELs "shouldn't have consensus failures", in reality it's something that we can expect from time to time due to the complex nature of Ethereum. Being able to recover easily will help the network recover and EL devs to troubleshoot.

The risk introduced with this PR is that genuinely INVALID payloads get a "second chance" at being imported. I believe the DoS risk here is negligible since LH needs to be restarted in order to re-process the payload. Furthermore, there's no reason to think that a well-performing EL will accept a truly invalid payload the second-time-around.

## Additional Info

This implementation has the following intricacies:

1. Instead of just resetting *invalid* payloads to optimistic, we'll also reset *valid* payloads. This is an artifact of our existing implementation.
1. We will only reset payload statuses when we detect an invalid payload present in `proto_array`
    - This helps save us from forgetting that all our blocks are valid in the "best case scenario" where there are no invalid blocks.
1. If we fail to revert the payload statuses we'll log a `CRIT` and just continue with a `proto_array` that *does not* have reverted payload statuses.
    - The code to revert statuses needs to deal with balances and proposer-boost, so it's a failure point. This is a defensive measure to avoid introducing new show-stopping bugs to LH.
This commit is contained in:
Paul Hauner 2022-08-29 14:34:41 +00:00
parent 2ce86a0830
commit 8609cced0e
13 changed files with 161 additions and 14 deletions

1
Cargo.lock generated
View File

@ -2235,6 +2235,7 @@ dependencies = [
"eth2_ssz",
"eth2_ssz_derive",
"proto_array",
"slog",
"state_processing",
"store",
"tokio",

View File

@ -58,7 +58,7 @@ use execution_layer::{
};
use fork_choice::{
AttestationFromBlock, ExecutionStatus, ForkChoice, ForkchoiceUpdateParameters,
InvalidationOperation, PayloadVerificationStatus,
InvalidationOperation, PayloadVerificationStatus, ResetPayloadStatuses,
};
use futures::channel::mpsc::Sender;
use itertools::process_results;
@ -432,7 +432,9 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
/// Load fork choice from disk, returning `None` if it isn't found.
pub fn load_fork_choice(
store: BeaconStore<T>,
reset_payload_statuses: ResetPayloadStatuses,
spec: &ChainSpec,
log: &Logger,
) -> Result<Option<BeaconForkChoice<T>>, Error> {
let persisted_fork_choice =
match store.get_item::<PersistedForkChoice>(&FORK_CHOICE_DB_KEY)? {
@ -445,8 +447,10 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
Ok(Some(ForkChoice::from_persisted(
persisted_fork_choice.fork_choice,
reset_payload_statuses,
fc_store,
spec,
log,
)?))
}
@ -2925,10 +2929,15 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
// Since the write failed, try to revert the canonical head back to what was stored
// in the database. This attempts to prevent inconsistency between the database and
// fork choice.
if let Err(e) =
self.canonical_head
.restore_from_store(fork_choice, &self.store, &self.spec)
{
if let Err(e) = self.canonical_head.restore_from_store(
fork_choice,
ResetPayloadStatuses::always_reset_conditionally(
self.config.always_reset_payload_statuses,
),
&self.store,
&self.spec,
&self.log,
) {
crit!(
self.log,
"No stored fork choice found to restore from";

View File

@ -17,7 +17,7 @@ use crate::{
};
use eth1::Config as Eth1Config;
use execution_layer::ExecutionLayer;
use fork_choice::ForkChoice;
use fork_choice::{ForkChoice, ResetPayloadStatuses};
use futures::channel::mpsc::Sender;
use operation_pool::{OperationPool, PersistedOperationPool};
use parking_lot::RwLock;
@ -245,7 +245,11 @@ where
let fork_choice =
BeaconChain::<Witness<TSlotClock, TEth1Backend, _, _, _>>::load_fork_choice(
store.clone(),
ResetPayloadStatuses::always_reset_conditionally(
self.chain_config.always_reset_payload_statuses,
),
&self.spec,
log,
)
.map_err(|e| format!("Unable to load fork choice from disk: {:?}", e))?
.ok_or("Fork choice not found in store")?;

View File

@ -43,7 +43,9 @@ use crate::{
BeaconChain, BeaconChainError as Error, BeaconChainTypes, BeaconSnapshot,
};
use eth2::types::{EventKind, SseChainReorg, SseFinalizedCheckpoint, SseHead, SseLateHead};
use fork_choice::{ExecutionStatus, ForkChoiceView, ForkchoiceUpdateParameters, ProtoBlock};
use fork_choice::{
ExecutionStatus, ForkChoiceView, ForkchoiceUpdateParameters, ProtoBlock, ResetPayloadStatuses,
};
use itertools::process_results;
use parking_lot::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard};
use slog::{crit, debug, error, warn, Logger};
@ -249,11 +251,14 @@ impl<T: BeaconChainTypes> CanonicalHead<T> {
// and it needs to be dropped to prevent a dead-lock. Requiring it to be passed here is
// defensive programming.
mut fork_choice_write_lock: RwLockWriteGuard<BeaconForkChoice<T>>,
reset_payload_statuses: ResetPayloadStatuses,
store: &BeaconStore<T>,
spec: &ChainSpec,
log: &Logger,
) -> Result<(), Error> {
let fork_choice = <BeaconChain<T>>::load_fork_choice(store.clone(), spec)?
.ok_or(Error::MissingPersistedForkChoice)?;
let fork_choice =
<BeaconChain<T>>::load_fork_choice(store.clone(), reset_payload_statuses, spec, log)?
.ok_or(Error::MissingPersistedForkChoice)?;
let fork_choice_view = fork_choice.cached_fork_choice_view();
let beacon_block_root = fork_choice_view.head_block_root;
let beacon_block = store

View File

@ -34,7 +34,12 @@ pub struct ChainConfig {
pub builder_fallback_epochs_since_finalization: usize,
/// Whether any chain health checks should be considered when deciding whether to use the builder API.
pub builder_fallback_disable_checks: bool,
/// When set to `true`, weigh the "unrealized" FFG progression when choosing a head in fork
/// choice.
pub count_unrealized: bool,
/// When set to `true`, forget any valid/invalid/optimistic statuses in fork choice during start
/// up.
pub always_reset_payload_statuses: bool,
/// Whether to apply paranoid checks to blocks proposed by this beacon node.
pub paranoid_block_proposal: bool,
}
@ -54,6 +59,7 @@ impl Default for ChainConfig {
builder_fallback_epochs_since_finalization: 3,
builder_fallback_disable_checks: false,
count_unrealized: true,
always_reset_payload_statuses: false,
paranoid_block_proposal: false,
}
}

View File

@ -786,4 +786,12 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
.takes_value(true)
.default_value("true")
)
.arg(
Arg::with_name("reset-payload-statuses")
.long("reset-payload-statuses")
.help("When present, Lighthouse will forget the payload statuses of any \
already-imported blocks. This can assist in the recovery from a consensus \
failure caused by the execution layer.")
.takes_value(false)
)
}

View File

@ -644,6 +644,9 @@ pub fn get_config<E: EthSpec>(
client_config.chain.count_unrealized =
clap_utils::parse_required(cli_args, "count-unrealized")?;
client_config.chain.always_reset_payload_statuses =
cli_args.is_present("reset-payload-statuses");
client_config.chain.paranoid_block_proposal = cli_args.is_present("paranoid-block-proposal");
/*

View File

@ -12,6 +12,7 @@ state_processing = { path = "../state_processing" }
proto_array = { path = "../proto_array" }
eth2_ssz = "0.4.1"
eth2_ssz_derive = "0.3.0"
slog = { version = "2.5.2", features = ["max_level_trace", "release_max_level_trace"] }
[dev-dependencies]
beacon_chain = { path = "../../beacon_node/beacon_chain" }

View File

@ -1,5 +1,6 @@
use crate::{ForkChoiceStore, InvalidationOperation};
use proto_array::{Block as ProtoBlock, ExecutionStatus, ProtoArrayForkChoice};
use slog::{crit, debug, warn, Logger};
use ssz_derive::{Decode, Encode};
use state_processing::{
per_block_processing::errors::AttesterSlashingValidationError, per_epoch_processing,
@ -79,6 +80,26 @@ impl<T> From<state_processing::EpochProcessingError> for Error<T> {
}
}
#[derive(Debug, Clone, Copy)]
/// Controls how fork choice should behave when restoring from a persisted fork choice.
pub enum ResetPayloadStatuses {
/// Reset all payload statuses back to "optimistic".
Always,
/// Only reset all payload statuses back to "optimistic" when an "invalid" block is present.
OnlyWithInvalidPayload,
}
impl ResetPayloadStatuses {
/// When `should_always_reset == True`, return `ResetPayloadStatuses::Always`.
pub fn always_reset_conditionally(should_always_reset: bool) -> Self {
if should_always_reset {
ResetPayloadStatuses::Always
} else {
ResetPayloadStatuses::OnlyWithInvalidPayload
}
}
}
#[derive(Debug)]
pub enum InvalidBlock {
UnknownParent(Hash256),
@ -1425,15 +1446,68 @@ where
.map_err(Into::into)
}
/// Instantiate `Self` from some `PersistedForkChoice` generated by a earlier call to
/// `Self::to_persisted`.
pub fn proto_array_from_persisted(
persisted: &PersistedForkChoice,
reset_payload_statuses: ResetPayloadStatuses,
spec: &ChainSpec,
log: &Logger,
) -> Result<ProtoArrayForkChoice, Error<T::Error>> {
let mut proto_array = ProtoArrayForkChoice::from_bytes(&persisted.proto_array_bytes)
.map_err(Error::InvalidProtoArrayBytes)?;
let contains_invalid_payloads = proto_array.contains_invalid_payloads();
debug!(
log,
"Restoring fork choice from persisted";
"reset_payload_statuses" => ?reset_payload_statuses,
"contains_invalid_payloads" => contains_invalid_payloads,
);
// Exit early if there are no "invalid" payloads, if requested.
if matches!(
reset_payload_statuses,
ResetPayloadStatuses::OnlyWithInvalidPayload
) && !contains_invalid_payloads
{
return Ok(proto_array);
}
// Reset all blocks back to being "optimistic". This helps recover from an EL consensus
// fault where an invalid payload becomes valid.
if let Err(e) = proto_array.set_all_blocks_to_optimistic::<E>(spec) {
// If there is an error resetting the optimistic status then log loudly and revert
// back to a proto-array which does not have the reset applied. This indicates a
// significant error in Lighthouse and warrants detailed investigation.
crit!(
log,
"Failed to reset payload statuses";
"error" => e,
"info" => "please report this error",
);
ProtoArrayForkChoice::from_bytes(&persisted.proto_array_bytes)
.map_err(Error::InvalidProtoArrayBytes)
} else {
debug!(
log,
"Successfully reset all payload statuses";
);
Ok(proto_array)
}
}
/// Instantiate `Self` from some `PersistedForkChoice` generated by a earlier call to
/// `Self::to_persisted`.
pub fn from_persisted(
persisted: PersistedForkChoice,
reset_payload_statuses: ResetPayloadStatuses,
fc_store: T,
spec: &ChainSpec,
log: &Logger,
) -> Result<Self, Error<T::Error>> {
let proto_array = ProtoArrayForkChoice::from_bytes(&persisted.proto_array_bytes)
.map_err(Error::InvalidProtoArrayBytes)?;
let proto_array =
Self::proto_array_from_persisted(&persisted, reset_payload_statuses, spec, log)?;
let current_slot = fc_store.get_current_slot();
@ -1456,7 +1530,16 @@ where
// If a call to `get_head` fails, the only known cause is because the only head with viable
// FFG properties is has an invalid payload. In this scenario, set all the payloads back to
// an optimistic status so that we can have a head to start from.
if fork_choice.get_head(current_slot, spec).is_err() {
if let Err(e) = fork_choice.get_head(current_slot, spec) {
warn!(
log,
"Could not find head on persisted FC";
"info" => "resetting all payload statuses and retrying",
"error" => ?e
);
// Although we may have already made this call whilst loading `proto_array`, try it
// again since we may have mutated the `proto_array` during `get_head` and therefore may
// get a different result.
fork_choice
.proto_array
.set_all_blocks_to_optimistic::<E>(spec)?;

View File

@ -1,4 +1,5 @@
use std::collections::BTreeSet;
use std::fmt::Debug;
use types::{BeaconBlockRef, BeaconState, Checkpoint, EthSpec, ExecPayload, Hash256, Slot};
/// Approximates the `Store` in "Ethereum 2.0 Phase 0 -- Beacon Chain Fork Choice":
@ -18,7 +19,7 @@ use types::{BeaconBlockRef, BeaconState, Checkpoint, EthSpec, ExecPayload, Hash2
/// concrete struct is to allow this crate to be free from "impure" on-disk database logic,
/// hopefully making auditing easier.
pub trait ForkChoiceStore<T: EthSpec>: Sized {
type Error;
type Error: Debug;
/// Returns the last value passed to `Self::set_current_slot`.
fn get_current_slot(&self) -> Slot;

View File

@ -4,7 +4,7 @@ mod fork_choice_store;
pub use crate::fork_choice::{
AttestationFromBlock, CountUnrealized, Error, ForkChoice, ForkChoiceView,
ForkchoiceUpdateParameters, InvalidAttestation, InvalidBlock, PayloadVerificationStatus,
PersistedForkChoice, QueuedAttestation,
PersistedForkChoice, QueuedAttestation, ResetPayloadStatuses,
};
pub use fork_choice_store::ForkChoiceStore;
pub use proto_array::{Block as ProtoBlock, ExecutionStatus, InvalidationOperation};

View File

@ -317,6 +317,17 @@ impl ProtoArrayForkChoice {
.map_err(|e| format!("find_head failed: {:?}", e))
}
/// Returns `true` if there are any blocks in `self` with an `INVALID` execution payload status.
///
/// This will operate on *all* blocks, even those that do not descend from the finalized
/// ancestor.
pub fn contains_invalid_payloads(&mut self) -> bool {
self.proto_array
.nodes
.iter()
.any(|node| node.execution_status.is_invalid())
}
/// For all nodes, regardless of their relationship to the finalized block, set their execution
/// status to be optimistic.
///

View File

@ -178,6 +178,21 @@ fn count_unrealized_true() {
.with_config(|config| assert!(config.chain.count_unrealized));
}
#[test]
fn reset_payload_statuses_default() {
CommandLineTest::new()
.run_with_zero_port()
.with_config(|config| assert!(!config.chain.always_reset_payload_statuses));
}
#[test]
fn reset_payload_statuses_present() {
CommandLineTest::new()
.flag("reset-payload-statuses", None)
.run_with_zero_port()
.with_config(|config| assert!(config.chain.always_reset_payload_statuses));
}
#[test]
fn freezer_dir_flag() {
let dir = TempDir::new().expect("Unable to create temporary directory");