From b06559ae97853e6b935348800a479883dcd858a4 Mon Sep 17 00:00:00 2001
From: Paul Hauner
Date: Wed, 20 Jan 2021 06:52:37 +0000
Subject: [PATCH] Disallow attestation production earlier than head (#2130)

## Issue Addressed

The non-finality period on Pyrmont between epochs [`9114`](https://pyrmont.beaconcha.in/epoch/9114) and [`9182`](https://pyrmont.beaconcha.in/epoch/9182) was partly caused by all of the `lighthouse_team` validators going down. The nodes saw excessive CPU and RAM usage, causing the system to kill the `lighthouse bn` process. The `Restart=on-failure` directive for `systemd` caused the process to bounce at ~10-30m intervals.

Diagnosis with `heaptrack` showed that the `BeaconChain::produce_unaggregated_attestation` function was calling `store::beacon_state::get_full_state` and sometimes triggering a tree hash cache allocation. These allocations were approximately the size of the host's physical memory and were still allocated when `lighthouse bn` was killed by the OS. There was no CPU analysis (e.g., `perf`), but the `BeaconChain::produce_unaggregated_attestation` function is very CPU-heavy, so it is reasonable to assume it was also the cause of the excessive CPU usage.

## Proposed Changes

`BeaconChain::produce_unaggregated_attestation` has two paths:

1. Fast path: attesting to the head slot or later.
2. Slow path: attesting to a slot earlier than the head block.

Path (2) is the only path that calls `store::beacon_state::get_full_state`, so it is the path causing this excessive CPU/RAM usage. This PR removes the current functionality of path (2) and replaces it with a static error (`BeaconChainError::AttestingPriorToHead`).

This change reduces the generality of `BeaconChain::produce_unaggregated_attestation` (and therefore [`/eth/v1/validator/attestation_data`](https://ethereum.github.io/eth2.0-APIs/#/Validator/produceAttestationData)), but I argue that this functionality is an edge case and arguably a violation of the [Honest Validator spec](https://github.com/ethereum/eth2.0-specs/blob/dev/specs/phase0/validator.md).

It's possible that a validator goes back to a prior slot to "catch up" and submit some missed attestations. This change would prevent such behaviour, returning an error instead. My concerns with this catch-up behaviour are that it:

- Is not specified as "honest validator" attesting behaviour.
- Is risky with respect to slashing (although all validator clients *should* have slashing protection and will eventually fail if they do not).
- Disguises clock-sync issues between a BN and VC.

## Additional Info

It's likely feasible to support path (2) if we add some sort of caching mechanism. However, that would be a multi-week task, and this PR patches the issue in the short term. I haven't created an issue to add path (2); instead, I think we should implement it if we see user demand.
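To make the change easier to review at a glance, here is a minimal, runnable sketch of the new control flow. `Slot`, the error enum, and `check_attestation_request` below are simplified stand-ins for illustration only; the real change is in the diff that follows.

```rust
/// Simplified stand-in for Lighthouse's `Slot` type.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct Slot(u64);

/// Simplified stand-in for the new `BeaconChainError` variant.
#[derive(Debug, PartialEq)]
pub enum BeaconChainError {
    AttestingPriorToHead { head_slot: Slot, request_slot: Slot },
}

/// Mirrors the branch in `produce_unaggregated_attestation`: the fast path
/// (head slot or later) proceeds; the old slow path now returns a static
/// error instead of loading a `BeaconState` from disk.
fn check_attestation_request(
    head_slot: Slot,
    request_slot: Slot,
) -> Result<(), BeaconChainError> {
    if request_slot >= head_slot {
        // Fast path: attest using the in-memory head state.
        Ok(())
    } else {
        // Slow path removed: fail fast, touching no on-disk state.
        Err(BeaconChainError::AttestingPriorToHead {
            head_slot,
            request_slot,
        })
    }
}

fn main() {
    // Attesting to the head slot (or later) is still allowed.
    assert!(check_attestation_request(Slot(100), Slot(100)).is_ok());
    // Attesting earlier than the head now returns the static error.
    assert!(matches!(
        check_attestation_request(Slot(100), Slot(99)),
        Err(BeaconChainError::AttestingPriorToHead { .. })
    ));
}
```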
---
 beacon_node/beacon_chain/src/beacon_chain.rs  | 43 ++++++-------------
 beacon_node/beacon_chain/src/errors.rs        |  4 ++
 .../tests/attestation_production.rs           | 39 ++++++++---------
 3 files changed, 35 insertions(+), 51 deletions(-)

diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs
index 38aa66737..1f195611c 100644
--- a/beacon_node/beacon_chain/src/beacon_chain.rs
+++ b/beacon_node/beacon_chain/src/beacon_chain.rs
@@ -853,37 +853,22 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                 Cow::Borrowed(&head.beacon_state),
             )
         } else {
-            // Note: this method will fail if `slot` is more than `state.block_roots.len()` slots
-            // prior to the head.
+            // We disallow producing attestations *prior* to the current head since such an
+            // attestation would require loading a `BeaconState` from disk. Loading `BeaconState`
+            // from disk is very resource intensive and poses a DoS risk from validator clients.
             //
-            // This seems reasonable, producing an attestation at a slot so far
-            // in the past seems useless, definitely in mainnet spec. In minimal spec, when the
-            // block roots only contain two epochs of history, it's possible that you will fail to
-            // produce an attestation that would be valid to be included in a block. Given that
-            // minimal is only for testing, I think this is fine.
+            // Although we generally allow validator clients to do things that might harm us (i.e.,
+            // we trust them), sometimes we need to protect the BN from accidental errors which
+            // could cause it significant harm.
             //
-            // It is important to note that what's _not_ allowed here is attesting to a slot in the
-            // past. You can still attest to a block an arbitrary distance in the past, just not as
-            // if you are in a slot in the past.
-            let beacon_block_root = *head.beacon_state.get_block_root(slot)?;
-            let state_root = *head.beacon_state.get_state_root(slot)?;
-
-            // Avoid holding a lock on the head whilst doing database reads. Good boi functions
-            // don't hog locks.
-            drop(head);
-
-            let mut state = self
-                .get_state(&state_root, Some(slot))?
-                .ok_or(Error::MissingBeaconState(state_root))?;
-
-            state.build_committee_cache(RelativeEpoch::Current, &self.spec)?;
-
-            self.produce_unaggregated_attestation_for_block(
-                slot,
-                index,
-                beacon_block_root,
-                Cow::Owned(state),
-            )
+            // This case is particularly harmful since the HTTP API can effectively call this
+            // function an unlimited number of times. If `n` validators all happen to call it at
+            // the same time, we're going to load `n` states (and tree hash caches) into memory all
+            // at once. With `n >= 10` we're looking at hundreds of MB or GBs of RAM.
+            Err(Error::AttestingPriorToHead {
+                head_slot: head.beacon_block.slot(),
+                request_slot: slot,
+            })
         }
     }
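As a back-of-the-envelope illustration of the `n` states comment in the hunk above, the sketch below multiplies a hypothetical per-state cost by the number of concurrent requests; the 300 MB figure is an assumption for illustration, not a measured Lighthouse value.

```rust
fn main() {
    // Assumed cost of one `BeaconState` plus its tree hash cache, in MB.
    // This figure is illustrative, not measured.
    let assumed_mb_per_state: u64 = 300;

    // `n` validator clients hitting `/eth/v1/validator/attestation_data` for a
    // pre-head slot at the same time would each force a full state load.
    for n in [1u64, 10, 64] {
        println!(
            "{n:>3} concurrent requests -> ~{} MB resident",
            n * assumed_mb_per_state
        );
    }
}
```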
diff --git a/beacon_node/beacon_chain/src/errors.rs b/beacon_node/beacon_chain/src/errors.rs
index e9568bbe2..9af9167f7 100644
--- a/beacon_node/beacon_chain/src/errors.rs
+++ b/beacon_node/beacon_chain/src/errors.rs
@@ -92,6 +92,10 @@ pub enum BeaconChainError {
     },
     WeakSubjectivtyVerificationFailure,
     WeakSubjectivtyShutdownError(TrySendError<&'static str>),
+    AttestingPriorToHead {
+        head_slot: Slot,
+        request_slot: Slot,
+    },
 }
 
 easy_from_to!(SlotProcessingError, BeaconChainError);

diff --git a/beacon_node/beacon_chain/tests/attestation_production.rs b/beacon_node/beacon_chain/tests/attestation_production.rs
index cb7893180..4e4e062d4 100644
--- a/beacon_node/beacon_chain/tests/attestation_production.rs
+++ b/beacon_node/beacon_chain/tests/attestation_production.rs
@@ -25,6 +25,7 @@ lazy_static! {
 #[test]
 fn produces_attestations() {
     let num_blocks_produced = MainnetEthSpec::slots_per_epoch() * 4;
+    let additional_slots_tested = MainnetEthSpec::slots_per_epoch() * 3;
 
     let harness = BeaconChainHarness::new_with_store_config(
         MainnetEthSpec,
@@ -32,38 +33,32 @@
         StoreConfig::default(),
     );
 
-    // Skip past the genesis slot.
-    harness.advance_slot();
-
-    harness.extend_chain(
-        num_blocks_produced as usize,
-        BlockStrategy::OnCanonicalHead,
-        AttestationStrategy::AllValidators,
-    );
-
     let chain = &harness.chain;
 
-    let state = &harness.chain.head().expect("should get head").beacon_state;
-    assert_eq!(state.slot, num_blocks_produced, "head should have updated");
-    assert_ne!(
-        state.finalized_checkpoint.epoch, 0,
-        "head should have updated"
-    );
-
-    let current_slot = chain.slot().expect("should get slot");
-
     // Test all valid committee indices for all slots in the chain.
-    for slot in 0..=current_slot.as_u64() + MainnetEthSpec::slots_per_epoch() * 3 {
+    // for slot in 0..=current_slot.as_u64() + MainnetEthSpec::slots_per_epoch() * 3 {
+    for slot in 0..=num_blocks_produced + additional_slots_tested {
+        if slot > 0 && slot <= num_blocks_produced {
+            harness.advance_slot();
+
+            harness.extend_chain(
+                1,
+                BlockStrategy::OnCanonicalHead,
+                AttestationStrategy::AllValidators,
+            );
+        }
+
+        let slot = Slot::from(slot);
         let mut state = chain
             .state_at_slot(slot, StateSkipConfig::WithStateRoots)
             .expect("should get state");
 
-        let block_slot = if slot > current_slot {
-            current_slot
-        } else {
+        let block_slot = if slot <= num_blocks_produced {
             slot
+        } else {
+            Slot::from(num_blocks_produced)
         };
+
         let block = chain
             .block_at_slot(block_slot)
             .expect("should get block")