Make re-org strat more cautious and add more config (#4151)

## Proposed Changes

This change attempts to prevent failed re-orgs by:

1. Lowering the re-org cutoff from 2s to 1s. This is informed by a failed re-org attempted by @yorickdowne's node. The failed block was requested in the 1.5-2s window due to a Vouch failure, and failed to propagate to the majority of the network before the attestation deadline at 4s.
2. Allow users to adjust their re-org cutoff depending on observed network conditions and their risk profile. The static 2 second cutoff was too rigid.
3. Add a `--proposer-reorg-disallowed-offsets` flag which can be used to prohibit reorgs at certain slots. This is intended to help workaround an issue whereby reorging blocks at slot 1 are currently taking ~1.6s to propagate on gossip rather than ~500ms. This is suspected to be due to a cache miss in current versions of Prysm, which should be fixed in their next release.

## Additional Info

I'm of two minds about removing the `shuffling_stable` check which checks for blocks at slot 0 in the epoch. If we removed it users would be able to configure Lighthouse to try reorging at slot 0, which likely wouldn't work very well due to interactions with the proposer index cache. I think we could leave it for now and revisit it later.
This commit is contained in:
Michael Sproul 2023-04-13 07:05:01 +00:00
parent 00cf5fc184
commit b90c0c3fb1
12 changed files with 218 additions and 18 deletions

View File

@ -106,7 +106,6 @@ use task_executor::{ShutdownReason, TaskExecutor};
use tokio_stream::Stream;
use tree_hash::TreeHash;
use types::beacon_state::CloneConfig;
use types::consts::merge::INTERVALS_PER_SLOT;
use types::*;
pub type ForkChoiceError = fork_choice::Error<crate::ForkChoiceStoreError>;
@ -128,12 +127,6 @@ pub const VALIDATOR_PUBKEY_CACHE_LOCK_TIMEOUT: Duration = Duration::from_secs(1)
/// The timeout for the eth1 finalization cache
pub const ETH1_FINALIZATION_CACHE_LOCK_TIMEOUT: Duration = Duration::from_millis(200);
/// The latest delay from the start of the slot at which to attempt a 1-slot re-org.
fn max_re_org_slot_delay(seconds_per_slot: u64) -> Duration {
// Allow at least half of the attestation deadline for the block to propagate.
Duration::from_secs(seconds_per_slot) / INTERVALS_PER_SLOT as u32 / 2
}
// These keys are all zero because they get stored in different columns, see `DBColumn` type.
pub const BEACON_CHAIN_DB_KEY: Hash256 = Hash256::zero();
pub const OP_POOL_DB_KEY: Hash256 = Hash256::zero();
@ -3761,7 +3754,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
// 1. It seems we have time to propagate and still receive the proposer boost.
// 2. The current head block was seen late.
// 3. The `get_proposer_head` conditions from fork choice pass.
let proposing_on_time = slot_delay < max_re_org_slot_delay(self.spec.seconds_per_slot);
let proposing_on_time = slot_delay < self.config.re_org_cutoff(self.spec.seconds_per_slot);
if !proposing_on_time {
debug!(
self.log,
@ -3791,6 +3784,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
slot,
canonical_head,
re_org_threshold,
&self.config.re_org_disallowed_offsets,
self.config.re_org_max_epochs_since_finalization,
)
.map_err(|e| match e {
@ -4069,6 +4063,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
.get_preliminary_proposer_head(
head_block_root,
re_org_threshold,
&self.config.re_org_disallowed_offsets,
self.config.re_org_max_epochs_since_finalization,
)
.map_err(|e| e.map_inner_error(Error::ProposerHeadForkChoiceError))?;
@ -4079,7 +4074,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let re_org_block_slot = head_slot + 1;
let fork_choice_slot = info.current_slot;
// If a re-orging proposal isn't made by the `max_re_org_slot_delay` then we give up
// If a re-orging proposal isn't made by the `re_org_cutoff` then we give up
// and allow the fork choice update for the canonical head through so that we may attest
// correctly.
let current_slot_ok = if head_slot == fork_choice_slot {
@ -4090,7 +4085,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
.and_then(|slot_start| {
let now = self.slot_clock.now_duration()?;
let slot_delay = now.saturating_sub(slot_start);
Some(slot_delay <= max_re_org_slot_delay(self.spec.seconds_per_slot))
Some(slot_delay <= self.config.re_org_cutoff(self.spec.seconds_per_slot))
})
.unwrap_or(false)
} else {

View File

@ -22,7 +22,7 @@ use fork_choice::{CountUnrealized, ForkChoice, ResetPayloadStatuses};
use futures::channel::mpsc::Sender;
use operation_pool::{OperationPool, PersistedOperationPool};
use parking_lot::RwLock;
use proto_array::ReOrgThreshold;
use proto_array::{DisallowedReOrgOffsets, ReOrgThreshold};
use slasher::Slasher;
use slog::{crit, error, info, Logger};
use slot_clock::{SlotClock, TestingSlotClock};
@ -175,6 +175,15 @@ where
self
}
/// Sets the proposer re-org disallowed offsets list.
pub fn proposer_re_org_disallowed_offsets(
mut self,
disallowed_offsets: DisallowedReOrgOffsets,
) -> Self {
self.chain_config.re_org_disallowed_offsets = disallowed_offsets;
self
}
/// Sets the store (database).
///
/// Should generally be called early in the build chain.

View File

@ -1,10 +1,12 @@
pub use proto_array::ReOrgThreshold;
pub use proto_array::{DisallowedReOrgOffsets, ReOrgThreshold};
use serde_derive::{Deserialize, Serialize};
use std::time::Duration;
use types::{Checkpoint, Epoch};
pub const DEFAULT_RE_ORG_THRESHOLD: ReOrgThreshold = ReOrgThreshold(20);
pub const DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION: Epoch = Epoch::new(2);
/// Default to 1/12th of the slot, which is 1 second on mainnet.
pub const DEFAULT_RE_ORG_CUTOFF_DENOMINATOR: u32 = 12;
pub const DEFAULT_FORK_CHOICE_BEFORE_PROPOSAL_TIMEOUT: u64 = 250;
/// Default fraction of a slot lookahead for payload preparation (12/3 = 4 seconds on mainnet).
@ -34,6 +36,13 @@ pub struct ChainConfig {
pub re_org_threshold: Option<ReOrgThreshold>,
/// Maximum number of epochs since finalization for attempting a proposer re-org.
pub re_org_max_epochs_since_finalization: Epoch,
/// Maximum delay after the start of the slot at which to propose a reorging block.
pub re_org_cutoff_millis: Option<u64>,
/// Additional epoch offsets at which re-orging block proposals are not permitted.
///
/// By default this list is empty, but it can be useful for reacting to network conditions, e.g.
/// slow gossip of re-org blocks at slot 1 in the epoch.
pub re_org_disallowed_offsets: DisallowedReOrgOffsets,
/// Number of milliseconds to wait for fork choice before proposing a block.
///
/// If set to 0 then block proposal will not wait for fork choice at all.
@ -82,6 +91,8 @@ impl Default for ChainConfig {
max_network_size: 10 * 1_048_576, // 10M
re_org_threshold: Some(DEFAULT_RE_ORG_THRESHOLD),
re_org_max_epochs_since_finalization: DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION,
re_org_cutoff_millis: None,
re_org_disallowed_offsets: DisallowedReOrgOffsets::default(),
fork_choice_before_proposal_timeout_ms: DEFAULT_FORK_CHOICE_BEFORE_PROPOSAL_TIMEOUT,
// Builder fallback configs that are set in `clap` will override these.
builder_fallback_skips: 3,
@ -100,3 +111,14 @@ impl Default for ChainConfig {
}
}
}
impl ChainConfig {
/// The latest delay from the start of the slot at which to attempt a 1-slot re-org.
pub fn re_org_cutoff(&self, seconds_per_slot: u64) -> Duration {
self.re_org_cutoff_millis
.map(Duration::from_millis)
.unwrap_or_else(|| {
Duration::from_secs(seconds_per_slot) / DEFAULT_RE_ORG_CUTOFF_DENOMINATOR
})
}
}

View File

@ -1,6 +1,6 @@
//! Generic tests that make use of the (newer) `InteractiveApiTester`
use beacon_chain::{
chain_config::ReOrgThreshold,
chain_config::{DisallowedReOrgOffsets, ReOrgThreshold},
test_utils::{AttestationStrategy, BlockStrategy, SyncCommitteeStrategy},
};
use eth2::types::DepositContractData;
@ -110,6 +110,8 @@ pub struct ReOrgTest {
misprediction: bool,
/// Whether to expect withdrawals to change on epoch boundaries.
expect_withdrawals_change_on_epoch: bool,
/// Epoch offsets to avoid proposing reorg blocks at.
disallowed_offsets: Vec<u64>,
}
impl Default for ReOrgTest {
@ -127,6 +129,7 @@ impl Default for ReOrgTest {
should_re_org: true,
misprediction: false,
expect_withdrawals_change_on_epoch: false,
disallowed_offsets: vec![],
}
}
}
@ -238,6 +241,32 @@ pub async fn proposer_boost_re_org_head_distance() {
.await;
}
// Check that a re-org at a disallowed offset fails.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
pub async fn proposer_boost_re_org_disallowed_offset() {
let offset = 4;
proposer_boost_re_org_test(ReOrgTest {
head_slot: Slot::new(E::slots_per_epoch() + offset - 1),
disallowed_offsets: vec![offset],
should_re_org: false,
..Default::default()
})
.await;
}
// Check that a re-org at the *only* allowed offset succeeds.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
pub async fn proposer_boost_re_org_disallowed_offset_exact() {
let offset = 4;
let disallowed_offsets = (0..E::slots_per_epoch()).filter(|o| *o != offset).collect();
proposer_boost_re_org_test(ReOrgTest {
head_slot: Slot::new(E::slots_per_epoch() + offset - 1),
disallowed_offsets,
..Default::default()
})
.await;
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
pub async fn proposer_boost_re_org_very_unhealthy() {
proposer_boost_re_org_test(ReOrgTest {
@ -286,6 +315,7 @@ pub async fn proposer_boost_re_org_test(
should_re_org,
misprediction,
expect_withdrawals_change_on_epoch,
disallowed_offsets,
}: ReOrgTest,
) {
assert!(head_slot > 0);
@ -320,6 +350,9 @@ pub async fn proposer_boost_re_org_test(
.proposer_re_org_max_epochs_since_finalization(Epoch::new(
max_epochs_since_finalization,
))
.proposer_re_org_disallowed_offsets(
DisallowedReOrgOffsets::new::<E>(disallowed_offsets).unwrap(),
)
})),
)
.await;

View File

@ -885,6 +885,28 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
allowed. Default: 2")
.conflicts_with("disable-proposer-reorgs")
)
.arg(
Arg::with_name("proposer-reorg-cutoff")
.long("proposer-reorg-cutoff")
.value_name("MILLISECONDS")
.help("Maximum delay after the start of the slot at which to propose a reorging \
block. Lower values can prevent failed reorgs by ensuring the block has \
ample time to propagate and be processed by the network. The default is \
1/12th of a slot (1 second on mainnet)")
.conflicts_with("disable-proposer-reorgs")
)
.arg(
Arg::with_name("proposer-reorg-disallowed-offsets")
.long("proposer-reorg-disallowed-offsets")
.value_name("N1,N2,...")
.help("Comma-separated list of integer offsets which can be used to avoid \
proposing reorging blocks at certain slots. An offset of N means that \
reorging proposals will not be attempted at any slot such that \
`slot % SLOTS_PER_EPOCH == N`. By default only re-orgs at offset 0 will be \
avoided. Any offsets supplied with this flag will impose additional \
restrictions.")
.conflicts_with("disable-proposer-reorgs")
)
.arg(
Arg::with_name("prepare-payload-lookahead")
.long("prepare-payload-lookahead")

View File

@ -1,5 +1,5 @@
use beacon_chain::chain_config::{
ReOrgThreshold, DEFAULT_PREPARE_PAYLOAD_LOOKAHEAD_FACTOR,
DisallowedReOrgOffsets, ReOrgThreshold, DEFAULT_PREPARE_PAYLOAD_LOOKAHEAD_FACTOR,
DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION, DEFAULT_RE_ORG_THRESHOLD,
};
use clap::ArgMatches;
@ -686,6 +686,23 @@ pub fn get_config<E: EthSpec>(
client_config.chain.re_org_max_epochs_since_finalization =
clap_utils::parse_optional(cli_args, "proposer-reorg-epochs-since-finalization")?
.unwrap_or(DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION);
client_config.chain.re_org_cutoff_millis =
clap_utils::parse_optional(cli_args, "proposer-reorg-cutoff")?;
if let Some(disallowed_offsets_str) =
clap_utils::parse_optional::<String>(cli_args, "proposer-reorg-disallowed-offsets")?
{
let disallowed_offsets = disallowed_offsets_str
.split(',')
.map(|s| {
s.parse()
.map_err(|e| format!("invalid disallowed-offsets: {e:?}"))
})
.collect::<Result<Vec<u64>, _>>()?;
client_config.chain.re_org_disallowed_offsets =
DisallowedReOrgOffsets::new::<E>(disallowed_offsets)
.map_err(|e| format!("invalid disallowed-offsets: {e:?}"))?;
}
}
// Note: This overrides any previous flags that enable this option.

View File

@ -14,6 +14,15 @@ There are three flags which control the re-orging behaviour:
* `--proposer-reorg-threshold N`: attempt to orphan blocks with less than N% of the committee vote. If this parameter isn't set then N defaults to 20% when the feature is enabled.
* `--proposer-reorg-epochs-since-finalization N`: only attempt to re-org late blocks when the number of epochs since finalization is less than or equal to N. The default is 2 epochs,
meaning re-orgs will only be attempted when the chain is finalizing optimally.
* `--proposer-reorg-cutoff T`: only attempt to re-org late blocks when the proposal is being made
before T milliseconds into the slot. Delays between the validator client and the beacon node can
cause some blocks to be requested later than the start of the slot, which makes them more likely
to fail. The default cutoff is 1000ms on mainnet, which gives blocks 3000ms to be signed and
propagated before the attestation deadline at 4000ms.
* `--proposer-reorg-disallowed-offsets N1,N2,N3...`: Prohibit Lighthouse from attempting to reorg at
specific offsets in each epoch. A disallowed offset `N` prevents reorging blocks from being
proposed at any `slot` such that `slot % SLOTS_PER_EPOCH == N`. The value to this flag is a
comma-separated list of integer offsets.
All flags should be applied to `lighthouse bn`. The default configuration is recommended as it
balances the chance of the re-org succeeding against the chance of failure due to attestations

View File

@ -1,7 +1,7 @@
use crate::{ForkChoiceStore, InvalidationOperation};
use proto_array::{
Block as ProtoBlock, ExecutionStatus, ProposerHeadError, ProposerHeadInfo,
ProtoArrayForkChoice, ReOrgThreshold,
Block as ProtoBlock, DisallowedReOrgOffsets, ExecutionStatus, ProposerHeadError,
ProposerHeadInfo, ProtoArrayForkChoice, ReOrgThreshold,
};
use slog::{crit, debug, warn, Logger};
use ssz_derive::{Decode, Encode};
@ -533,6 +533,7 @@ where
current_slot: Slot,
canonical_head: Hash256,
re_org_threshold: ReOrgThreshold,
disallowed_offsets: &DisallowedReOrgOffsets,
max_epochs_since_finalization: Epoch,
) -> Result<ProposerHeadInfo, ProposerHeadError<Error<proto_array::Error>>> {
// Ensure that fork choice has already been updated for the current slot. This prevents
@ -564,6 +565,7 @@ where
canonical_head,
self.fc_store.justified_balances(),
re_org_threshold,
disallowed_offsets,
max_epochs_since_finalization,
)
.map_err(ProposerHeadError::convert_inner_error)
@ -573,6 +575,7 @@ where
&self,
canonical_head: Hash256,
re_org_threshold: ReOrgThreshold,
disallowed_offsets: &DisallowedReOrgOffsets,
max_epochs_since_finalization: Epoch,
) -> Result<ProposerHeadInfo, ProposerHeadError<Error<proto_array::Error>>> {
let current_slot = self.fc_store.get_current_slot();
@ -582,6 +585,7 @@ where
canonical_head,
self.fc_store.justified_balances(),
re_org_threshold,
disallowed_offsets,
max_epochs_since_finalization,
)
.map_err(ProposerHeadError::convert_inner_error)

View File

@ -50,6 +50,7 @@ pub enum Error {
block_root: Hash256,
parent_root: Hash256,
},
InvalidEpochOffset(u64),
Arith(ArithError),
}

View File

@ -8,8 +8,8 @@ mod ssz_container;
pub use crate::justified_balances::JustifiedBalances;
pub use crate::proto_array::{calculate_committee_fraction, InvalidationOperation};
pub use crate::proto_array_fork_choice::{
Block, DoNotReOrg, ExecutionStatus, ProposerHeadError, ProposerHeadInfo, ProtoArrayForkChoice,
ReOrgThreshold,
Block, DisallowedReOrgOffsets, DoNotReOrg, ExecutionStatus, ProposerHeadError,
ProposerHeadInfo, ProtoArrayForkChoice, ReOrgThreshold,
};
pub use error::Error;

View File

@ -250,6 +250,9 @@ pub enum DoNotReOrg {
ParentDistance,
HeadDistance,
ShufflingUnstable,
DisallowedOffset {
offset: u64,
},
JustificationAndFinalizationNotCompetitive,
ChainNotFinalizing {
epochs_since_finalization: u64,
@ -271,6 +274,9 @@ impl std::fmt::Display for DoNotReOrg {
Self::ParentDistance => write!(f, "parent too far from head"),
Self::HeadDistance => write!(f, "head too far from current slot"),
Self::ShufflingUnstable => write!(f, "shuffling unstable at epoch boundary"),
Self::DisallowedOffset { offset } => {
write!(f, "re-orgs disabled at offset {offset}")
}
Self::JustificationAndFinalizationNotCompetitive => {
write!(f, "justification or finalization not competitive")
}
@ -304,6 +310,31 @@ impl std::fmt::Display for DoNotReOrg {
#[serde(transparent)]
pub struct ReOrgThreshold(pub u64);
/// New-type for disallowed re-org slots.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(transparent)]
pub struct DisallowedReOrgOffsets {
// Vecs are faster than hashmaps for small numbers of items.
offsets: Vec<u64>,
}
impl Default for DisallowedReOrgOffsets {
fn default() -> Self {
DisallowedReOrgOffsets { offsets: vec![0] }
}
}
impl DisallowedReOrgOffsets {
pub fn new<E: EthSpec>(offsets: Vec<u64>) -> Result<Self, Error> {
for &offset in &offsets {
if offset >= E::slots_per_epoch() {
return Err(Error::InvalidEpochOffset(offset));
}
}
Ok(Self { offsets })
}
}
#[derive(PartialEq)]
pub struct ProtoArrayForkChoice {
pub(crate) proto_array: ProtoArray,
@ -460,6 +491,7 @@ impl ProtoArrayForkChoice {
canonical_head: Hash256,
justified_balances: &JustifiedBalances,
re_org_threshold: ReOrgThreshold,
disallowed_offsets: &DisallowedReOrgOffsets,
max_epochs_since_finalization: Epoch,
) -> Result<ProposerHeadInfo, ProposerHeadError<Error>> {
let info = self.get_proposer_head_info::<E>(
@ -467,6 +499,7 @@ impl ProtoArrayForkChoice {
canonical_head,
justified_balances,
re_org_threshold,
disallowed_offsets,
max_epochs_since_finalization,
)?;
@ -501,6 +534,7 @@ impl ProtoArrayForkChoice {
canonical_head: Hash256,
justified_balances: &JustifiedBalances,
re_org_threshold: ReOrgThreshold,
disallowed_offsets: &DisallowedReOrgOffsets,
max_epochs_since_finalization: Epoch,
) -> Result<ProposerHeadInfo, ProposerHeadError<Error>> {
let mut nodes = self
@ -545,6 +579,12 @@ impl ProtoArrayForkChoice {
return Err(DoNotReOrg::ShufflingUnstable.into());
}
// Check allowed slot offsets.
let offset = (re_org_block_slot % E::slots_per_epoch()).as_u64();
if disallowed_offsets.offsets.contains(&offset) {
return Err(DoNotReOrg::DisallowedOffset { offset }.into());
}
// Check FFG.
let ffg_competitive = parent_node.unrealized_justified_checkpoint
== head_node.unrealized_justified_checkpoint

View File

@ -2,6 +2,7 @@ use beacon_node::ClientConfig as Config;
use crate::exec::{CommandLineTestExec, CompletedTest};
use beacon_node::beacon_chain::chain_config::{
DisallowedReOrgOffsets, DEFAULT_RE_ORG_CUTOFF_DENOMINATOR,
DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION, DEFAULT_RE_ORG_THRESHOLD,
};
use eth1::Eth1Endpoint;
@ -1888,6 +1889,10 @@ fn enable_proposer_re_orgs_default() {
config.chain.re_org_max_epochs_since_finalization,
DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION,
);
assert_eq!(
config.chain.re_org_cutoff(12),
Duration::from_secs(12) / DEFAULT_RE_ORG_CUTOFF_DENOMINATOR
);
});
}
@ -1920,6 +1925,49 @@ fn proposer_re_org_max_epochs_since_finalization() {
});
}
#[test]
fn proposer_re_org_cutoff() {
CommandLineTest::new()
.flag("proposer-reorg-cutoff", Some("500"))
.run_with_zero_port()
.with_config(|config| {
assert_eq!(config.chain.re_org_cutoff(12), Duration::from_millis(500))
});
}
#[test]
fn proposer_re_org_disallowed_offsets_default() {
CommandLineTest::new()
.run_with_zero_port()
.with_config(|config| {
assert_eq!(
config.chain.re_org_disallowed_offsets,
DisallowedReOrgOffsets::new::<MainnetEthSpec>(vec![0]).unwrap()
)
});
}
#[test]
fn proposer_re_org_disallowed_offsets_override() {
CommandLineTest::new()
.flag("--proposer-reorg-disallowed-offsets", Some("1,2,3"))
.run_with_zero_port()
.with_config(|config| {
assert_eq!(
config.chain.re_org_disallowed_offsets,
DisallowedReOrgOffsets::new::<MainnetEthSpec>(vec![1, 2, 3]).unwrap()
)
});
}
#[test]
#[should_panic]
fn proposer_re_org_disallowed_offsets_invalid() {
CommandLineTest::new()
.flag("--proposer-reorg-disallowed-offsets", Some("32,33,34"))
.run_with_zero_port();
}
#[test]
fn monitoring_endpoint() {
CommandLineTest::new()