Fix off-by-one bug in missed block detection (#5012)

* Regression test * Fix the bug
2023-12-15 14:23:54 +11:00 · 2023-12-15 14:23:54 +11:00 · f1113540d8
commit f1113540d8
parent 366f0d7ac2
2 changed files with 102 additions and 16 deletions
--- a/beacon_node/beacon_chain/src/validator_monitor.rs
+++ b/beacon_node/beacon_chain/src/validator_monitor.rs
@ -442,7 +442,7 @@ impl<T: EthSpec> ValidatorMonitor<T> {
    }

    /// Add some validators to `self` for additional monitoring.
-    fn add_validator_pubkey(&mut self, pubkey: PublicKeyBytes) {
+    pub fn add_validator_pubkey(&mut self, pubkey: PublicKeyBytes) {
        let index_opt = self
            .indices
            .iter()
@ -602,8 +602,10 @@ impl<T: EthSpec> ValidatorMonitor<T> {

        let end_slot = current_slot.saturating_sub(MISSED_BLOCK_LAG_SLOTS).as_u64();

-        // List of proposers per epoch from the beacon_proposer_cache
-        let mut proposers_per_epoch: Option<SmallVec<[usize; TYPICAL_SLOTS_PER_EPOCH]>> = None;
+        // List of proposers per epoch from the beacon_proposer_cache, and the epoch at which the
+        // cache is valid.
+        let mut proposers_per_epoch: Option<(SmallVec<[usize; TYPICAL_SLOTS_PER_EPOCH]>, Epoch)> =
+            None;

        for (prev_slot, slot) in (start_slot.as_u64()..=end_slot)
            .map(Slot::new)
@ -617,25 +619,30 @@ impl<T: EthSpec> ValidatorMonitor<T> {
                // Found missed block
                if block_root == prev_block_root {
                    let slot_epoch = slot.epoch(T::slots_per_epoch());
-                    let prev_slot_epoch = prev_slot.epoch(T::slots_per_epoch());

                    if let Ok(shuffling_decision_block) =
                        state.proposer_shuffling_decision_root_at_epoch(slot_epoch, *block_root)
                    {
-                        // Only update the cache if it needs to be initialised or because
-                        // slot is at epoch + 1
-                        if proposers_per_epoch.is_none() || slot_epoch != prev_slot_epoch {
-                            proposers_per_epoch = self.get_proposers_by_epoch_from_cache(
-                                slot_epoch,
-                                shuffling_decision_block,
-                            );
+                        // Update the cache if it has not yet been initialised, or if it is
+                        // initialised for a prior epoch. This is an optimisation to avoid bouncing
+                        // the proposer shuffling cache lock when there are lots of missed blocks.
+                        if proposers_per_epoch
+                            .as_ref()
+                            .map_or(true, |(_, cached_epoch)| *cached_epoch != slot_epoch)
+                        {
+                            proposers_per_epoch = self
+                                .get_proposers_by_epoch_from_cache(
+                                    slot_epoch,
+                                    shuffling_decision_block,
+                                )
+                                .map(|cache| (cache, slot_epoch));
                        }

                        // Only add missed blocks for the proposer if it's in the list of monitored validators
                        let slot_in_epoch = slot % T::slots_per_epoch();
                        if let Some(proposer_index) = proposers_per_epoch
-                            .as_deref()
-                            .and_then(|proposers| proposers.get(slot_in_epoch.as_usize()))
+                            .as_ref()
+                            .and_then(|(proposers, _)| proposers.get(slot_in_epoch.as_usize()))
                        {
                            let i = *proposer_index as u64;
                            if let Some(pub_key) = self.indices.get(&i) {
@ -674,7 +681,8 @@ impl<T: EthSpec> ValidatorMonitor<T> {
                            debug!(
                                self.log,
                                "Could not get proposers from cache";
-                                "epoch" => ?slot_epoch
+                                "epoch" => ?slot_epoch,
+                                "decision_root" => ?shuffling_decision_block,
                            );
                        }
                    }
--- a/beacon_node/beacon_chain/tests/validator_monitor.rs
+++ b/beacon_node/beacon_chain/tests/validator_monitor.rs
@ -1,9 +1,9 @@
-use lazy_static::lazy_static;
-
 use beacon_chain::test_utils::{
    AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType,
 };
 use beacon_chain::validator_monitor::{ValidatorMonitorConfig, MISSED_BLOCK_LAG_SLOTS};
+use lazy_static::lazy_static;
+use logging::test_logger;
 use types::{Epoch, EthSpec, Keypair, MainnetEthSpec, PublicKeyBytes, Slot};

 // Should ideally be divisible by 3.
@ -23,6 +23,7 @@ fn get_harness(
    let harness = BeaconChainHarness::builder(MainnetEthSpec)
        .default_spec()
        .keypairs(KEYPAIRS[0..validator_count].to_vec())
+        .logger(test_logger())
        .fresh_ephemeral_store()
        .mock_execution_layer()
        .validator_monitor_config(ValidatorMonitorConfig {
@ -39,6 +40,83 @@ fn get_harness(
    harness
 }

+// Regression test for off-by-one caching issue in missed block detection.
+#[tokio::test]
+async fn missed_blocks_across_epochs() {
+    let slots_per_epoch = E::slots_per_epoch();
+    let all_validators = (0..VALIDATOR_COUNT).collect::<Vec<_>>();
+
+    let harness = get_harness(VALIDATOR_COUNT, vec![]);
+    let validator_monitor = &harness.chain.validator_monitor;
+    let mut genesis_state = harness.get_current_state();
+    let genesis_state_root = genesis_state.update_tree_hash_cache().unwrap();
+    let genesis_block_root = harness.head_block_root();
+
+    // Skip a slot in the first epoch (to prime the cache inside the missed block function) and then
+    // at a different offset in the 2nd epoch. The missed block in the 2nd epoch MUST NOT reuse
+    // the cache from the first epoch.
+    let first_skip_offset = 3;
+    let second_skip_offset = slots_per_epoch / 2;
+    assert_ne!(first_skip_offset, second_skip_offset);
+    let first_skip_slot = Slot::new(first_skip_offset);
+    let second_skip_slot = Slot::new(slots_per_epoch + second_skip_offset);
+    let slots = (1..2 * slots_per_epoch)
+        .map(Slot::new)
+        .filter(|slot| *slot != first_skip_slot && *slot != second_skip_slot)
+        .collect::<Vec<_>>();
+
+    let (block_roots_by_slot, state_roots_by_slot, _, head_state) = harness
+        .add_attested_blocks_at_slots(genesis_state, genesis_state_root, &slots, &all_validators)
+        .await;
+
+    // Prime the proposer shuffling cache.
+    let mut proposer_shuffling_cache = harness.chain.beacon_proposer_cache.lock();
+    for epoch in [0, 1].into_iter().map(Epoch::new) {
+        let start_slot = epoch.start_slot(slots_per_epoch) + 1;
+        let state = harness
+            .get_hot_state(state_roots_by_slot[&start_slot])
+            .unwrap();
+        let decision_root = state
+            .proposer_shuffling_decision_root(genesis_block_root)
+            .unwrap();
+        proposer_shuffling_cache
+            .insert(
+                epoch,
+                decision_root,
+                state
+                    .get_beacon_proposer_indices(&harness.chain.spec)
+                    .unwrap(),
+                state.fork(),
+            )
+            .unwrap();
+    }
+    drop(proposer_shuffling_cache);
+
+    // Monitor the validator that proposed the block at the same offset in the 0th epoch as the skip
+    // in the 1st epoch.
+    let innocent_proposer_slot = Slot::new(second_skip_offset);
+    let innocent_proposer = harness
+        .get_block(block_roots_by_slot[&innocent_proposer_slot])
+        .unwrap()
+        .message()
+        .proposer_index();
+
+    let mut vm_write = validator_monitor.write();
+
+    // Call `process_` once to update validator indices.
+    vm_write.process_valid_state(head_state.current_epoch(), &head_state, &harness.chain.spec);
+    // Start monitoring the innocent validator.
+    vm_write.add_validator_pubkey(KEYPAIRS[innocent_proposer as usize].pk.compress());
+    // Check for missed blocks.
+    vm_write.process_valid_state(head_state.current_epoch(), &head_state, &harness.chain.spec);
+
+    // My client is innocent, your honour!
+    assert_eq!(
+        vm_write.get_monitored_validator_missed_block_count(innocent_proposer),
+        0
+    );
+}
+
 #[tokio::test]
 async fn produces_missed_blocks() {
    let validator_count = 16;