Remove grandparents from snapshot cache (#2917)

## Issue Addressed

NA

## Proposed Changes

In https://github.com/sigp/lighthouse/pull/2832 we made some changes to the `SnapshotCache` to help deal with the one-block reorgs seen on mainnet (and testnets).

I believe the change in #2832 is good and we should keep it, but I think that in its present form it is causing the `SnapshotCache` to hold onto states that it doesn't need anymore. For example, a skip slot will result in one more `BeaconSnapshot` being stored in the cache.

This PR adds a new type of pruning that happens after a block is inserted into the cache. We will remove any snapshot from the cache that is a *grandparent* of the block being imported. Since we know the grandparent has two valid blocks built atop it, it is not at risk from a one-block re-org.
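
The eviction rule is easiest to see in isolation. Below is a minimal sketch of the idea using simplified, hypothetical types (`Root`, `Snapshot`, and a stripped-down cache) rather than Lighthouse's real ones; the actual change lives in `SnapshotCache::insert` (see the `snapshot_cache.rs` diff below), which finds the grandparent by walking the newly-inserted state's block roots and never evicts the current head snapshot.

```rust
/// A stand-in for a block root hash (hypothetical type for this sketch).
#[derive(Clone, Copy, PartialEq, Eq)]
struct Root(u64);

/// A stand-in for a cached `BeaconSnapshot` (hypothetical type for this sketch).
#[allow(dead_code)]
struct Snapshot {
    block_root: Root,
    parent_root: Root,
    grandparent_root: Root,
}

struct SnapshotCache {
    head_block_root: Root,
    snapshots: Vec<Snapshot>,
    max_len: usize,
}

impl SnapshotCache {
    /// Insert a snapshot, first evicting the *grandparent* of the block being inserted.
    ///
    /// The grandparent already has two valid descendants (the parent and this block), so it is
    /// no longer at risk from a one-block re-org and can be dropped to keep the cache small.
    fn insert(&mut self, snapshot: Snapshot) {
        let grandparent = snapshot.grandparent_root;
        let head = self.head_block_root;

        // Never evict the head snapshot, even if it happens to be the grandparent.
        self.snapshots
            .retain(|s| s.block_root == head || s.block_root != grandparent);

        if self.snapshots.len() < self.max_len {
            self.snapshots.push(snapshot);
        }
        // (The real cache also handles the at-capacity replacement case.)
    }
}

fn main() {
    let mut cache = SnapshotCache {
        head_block_root: Root(2),
        max_len: 4,
        snapshots: vec![
            Snapshot { block_root: Root(1), parent_root: Root(0), grandparent_root: Root(0) },
            Snapshot { block_root: Root(2), parent_root: Root(1), grandparent_root: Root(0) },
        ],
    };

    // Block 3 builds on 2, which builds on 1: inserting 3 evicts 1 (its grandparent).
    cache.insert(Snapshot { block_root: Root(3), parent_root: Root(2), grandparent_root: Root(1) });
    assert!(cache.snapshots.iter().all(|s| s.block_root != Root(1)));
    assert_eq!(cache.snapshots.len(), 2);
}
```

Keeping the head snapshot exempt mirrors the real implementation, where the head is always retained even if it happens to be the grandparent of the block being inserted.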

## Additional Info

NA

Commit: c11253a82f (parent: ceeab02e3a)
Author: Paul Hauner
Date: 2022-01-14 07:20:55 +00:00
3 changed files with 65 additions and 4 deletions

`beacon_node/beacon_chain/src/beacon_chain.rs`

```diff
@@ -2796,6 +2796,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                     beacon_block_root: block_root,
                 },
                 None,
+                &self.spec,
             )
         })
         .unwrap_or_else(|e| {
@@ -3740,6 +3741,12 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
             .try_write_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
             .map(|mut snapshot_cache| {
                 snapshot_cache.prune(new_finalized_checkpoint.epoch);
+                debug!(
+                    self.log,
+                    "Snapshot cache pruned";
+                    "new_len" => snapshot_cache.len(),
+                    "remaining_roots" => ?snapshot_cache.beacon_block_roots(),
+                );
             })
             .unwrap_or_else(|| {
                 error!(
```

`beacon_node/beacon_chain/src/metrics.rs`

```diff
@@ -4,8 +4,12 @@ use crate::{BeaconChain, BeaconChainError, BeaconChainTypes};
 use lazy_static::lazy_static;
 pub use lighthouse_metrics::*;
 use slot_clock::SlotClock;
+use std::time::Duration;
 use types::{BeaconState, Epoch, EthSpec, Hash256, Slot};
 
+/// The maximum time to wait for the snapshot cache lock during a metrics scrape.
+const SNAPSHOT_CACHE_TIMEOUT: Duration = Duration::from_millis(100);
+
 lazy_static! {
     /*
      * Block Processing
@@ -18,6 +22,10 @@ lazy_static! {
         "beacon_block_processing_successes_total",
         "Count of blocks processed without error"
     );
+    pub static ref BLOCK_PROCESSING_SNAPSHOT_CACHE_SIZE: Result<IntGauge> = try_create_int_gauge(
+        "beacon_block_processing_snapshot_cache_size",
+        "Count snapshots in the snapshot cache"
+    );
     pub static ref BLOCK_PROCESSING_SNAPSHOT_CACHE_MISSES: Result<IntCounter> = try_create_int_counter(
         "beacon_block_processing_snapshot_cache_misses",
         "Count of snapshot cache misses"
@@ -913,6 +921,16 @@ pub fn scrape_for_metrics<T: BeaconChainTypes>(beacon_chain: &BeaconChain<T>) {
 
     let attestation_stats = beacon_chain.op_pool.attestation_stats();
 
+    if let Some(snapshot_cache) = beacon_chain
+        .snapshot_cache
+        .try_write_for(SNAPSHOT_CACHE_TIMEOUT)
+    {
+        set_gauge(
+            &BLOCK_PROCESSING_SNAPSHOT_CACHE_SIZE,
+            snapshot_cache.len() as i64,
+        )
+    }
+
     set_gauge_by_usize(
         &OP_POOL_NUM_ATTESTATIONS,
         attestation_stats.num_attestations,
```

`beacon_node/beacon_chain/src/snapshot_cache.rs`

```diff
@@ -1,4 +1,5 @@
 use crate::BeaconSnapshot;
+use itertools::process_results;
 use std::cmp;
 use std::time::Duration;
 use types::{
@@ -164,9 +165,25 @@ impl<T: EthSpec> SnapshotCache<T> {
         }
     }
 
+    /// The block roots of all snapshots contained in `self`.
+    pub fn beacon_block_roots(&self) -> Vec<Hash256> {
+        self.snapshots.iter().map(|s| s.beacon_block_root).collect()
+    }
+
+    /// The number of snapshots contained in `self`.
+    pub fn len(&self) -> usize {
+        self.snapshots.len()
+    }
+
     /// Insert a snapshot, potentially removing an existing snapshot if `self` is at capacity (see
     /// struct-level documentation for more info).
-    pub fn insert(&mut self, snapshot: BeaconSnapshot<T>, pre_state: Option<BeaconState<T>>) {
+    pub fn insert(
+        &mut self,
+        snapshot: BeaconSnapshot<T>,
+        pre_state: Option<BeaconState<T>>,
+        spec: &ChainSpec,
+    ) {
+        let parent_root = snapshot.beacon_block.message().parent_root();
         let item = CacheItem {
             beacon_block: snapshot.beacon_block,
             beacon_block_root: snapshot.beacon_block_root,
@@ -174,6 +191,25 @@ impl<T: EthSpec> SnapshotCache<T> {
             pre_state,
         };
 
+        // Remove the grandparent of the block that was just inserted.
+        //
+        // Assuming it's unlikely to see re-orgs deeper than one block, this method helps keep the
+        // cache small by removing any states that already have more than one descendant.
+        //
+        // Remove the grandparent first to free up room in the cache.
+        let grandparent_result =
+            process_results(item.beacon_state.rev_iter_block_roots(spec), |iter| {
+                iter.map(|(_slot, root)| root)
+                    .find(|root| *root != item.beacon_block_root && *root != parent_root)
+            });
+        if let Ok(Some(grandparent_root)) = grandparent_result {
+            let head_block_root = self.head_block_root;
+            self.snapshots.retain(|snapshot| {
+                let root = snapshot.beacon_block_root;
+                root == head_block_root || root != grandparent_root
+            });
+        }
+
         if self.snapshots.len() < self.max_len {
             self.snapshots.push(item);
         } else {
@@ -384,7 +420,7 @@ mod test {
 
             *snapshot.beacon_state.slot_mut() =
                 Slot::from(i * MainnetEthSpec::slots_per_epoch() + 1);
-            cache.insert(snapshot, None);
+            cache.insert(snapshot, None, &spec);
 
             assert_eq!(
                 cache.snapshots.len(),
@@ -402,7 +438,7 @@ mod test {
         //      2              2
         //      3              3
         assert_eq!(cache.snapshots.len(), CACHE_SIZE);
-        cache.insert(get_snapshot(42), None);
+        cache.insert(get_snapshot(42), None, &spec);
 
         assert_eq!(cache.snapshots.len(), CACHE_SIZE);
         assert!(
@@ -462,7 +498,7 @@ mod test {
 
         // Over-fill the cache so it needs to eject some old values on insert.
        for i in 0..CACHE_SIZE as u64 {
-            cache.insert(get_snapshot(u64::max_value() - i), None);
+            cache.insert(get_snapshot(u64::max_value() - i), None, &spec);
        }
 
        // Ensure that the new head value was not removed from the cache.
```