Clone state ahead of block production (#4925)

* Clone state ahead of block production
* Add pruning and fix logging
* Don't hold 2 states in mem
2023-11-30 13:49:35 +11:00 · 2023-11-30 13:49:35 +11:00 · 547ed1de63
commit 547ed1de63
parent 43d98153d6
3 changed files with 117 additions and 21 deletions
--- a/beacon_node/beacon_chain/src/beacon_chain.rs
+++ b/beacon_node/beacon_chain/src/beacon_chain.rs
@ -482,6 +482,11 @@ pub struct BeaconChain<T: BeaconChainTypes> {
    pub data_availability_checker: Arc<DataAvailabilityChecker<T>>,
    /// The KZG trusted setup used by this chain.
    pub kzg: Option<Arc<Kzg>>,
    /// State with complete tree hash cache, ready for block production.
    ///
    /// NB: We can delete this once we have tree-states.
    #[allow(clippy::type_complexity)]
    pub block_production_state: Arc<Mutex<Option<(Hash256, BlockProductionPreState<T::EthSpec>)>>>,
 }
 pub enum BeaconBlockResponseType<T: EthSpec> {
@ -4030,7 +4035,16 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                );
                (re_org_state.pre_state, re_org_state.state_root)
            }
-            // Normal case: proposing a block atop the current head. Use the snapshot cache.
+            // Normal case: proposing a block atop the current head using the cache.
            else if let Some((_, cached_state)) = self
                .block_production_state
                .lock()
                .take()
                .filter(|(cached_block_root, _)| *cached_block_root == head_block_root)
            {
                (cached_state.pre_state, cached_state.state_root)
            }
            // Fall back to a direct read of the snapshot cache.
            else if let Some(pre_state) = self
                .snapshot_cache
                .try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
@ -4038,6 +4052,12 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                    snapshot_cache.get_state_for_block_production(head_block_root)
                })
            {
                warn!(
                    self.log,
                    "Block production cache miss";
                    "message" => "falling back to snapshot cache clone",
                    "slot" => slot
                );
                (pre_state.pre_state, pre_state.state_root)
            } else {
                warn!(
@ -4161,12 +4181,27 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
        drop(proposer_head_timer);
        let re_org_parent_block = proposer_head.parent_node.root;
-        // Only attempt a re-org if we hit the snapshot cache.
+        // Only attempt a re-org if we hit the block production cache or snapshot cache.
        let pre_state = self
-            .snapshot_cache
+            .block_production_state
-            .try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
+            .lock()
-            .and_then(|snapshot_cache| {
+            .take()
-                snapshot_cache.get_state_for_block_production(re_org_parent_block)
+            .and_then(|(cached_block_root, state)| {
                (cached_block_root == re_org_parent_block).then_some(state)
            })
            .or_else(|| {
                warn!(
                    self.log,
                    "Block production cache miss";
                    "message" => "falling back to snapshot cache during re-org",
                    "slot" => slot,
                    "block_root" => ?re_org_parent_block
                );
                self.snapshot_cache
                    .try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
                    .and_then(|snapshot_cache| {
                        snapshot_cache.get_state_for_block_production(re_org_parent_block)
                    })
            })
            .or_else(|| {
                debug!(
@ -5326,15 +5361,18 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
    ///
    /// This function will result in a call to `forkchoiceUpdated` on the EL if we're in the
    /// tail-end of the slot (as defined by `self.config.prepare_payload_lookahead`).
    ///
    /// Return `Ok(Some(head_block_root))` if this node prepared to propose at the next slot on
    /// top of `head_block_root`.
    pub async fn prepare_beacon_proposer(
        self: &Arc<Self>,
        current_slot: Slot,
-    ) -> Result<(), Error> {
+    ) -> Result<Option<Hash256>, Error> {
        let prepare_slot = current_slot + 1;
        // There's no need to run the proposer preparation routine before the bellatrix fork.
        if self.slot_is_prior_to_bellatrix(prepare_slot) {
-            return Ok(());
+            return Ok(None);
        }
        let execution_layer = self
@ -5347,7 +5385,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
        if !self.config.always_prepare_payload
            && !execution_layer.has_any_proposer_preparation_data().await
        {
-            return Ok(());
+            return Ok(None);
        }
        // Load the cached head and its forkchoice update parameters.
@ -5394,7 +5432,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
        let Some((forkchoice_update_params, Some(pre_payload_attributes))) = maybe_prep_data else {
            // Appropriate log messages have already been logged above and in
            // `get_pre_payload_attributes`.
-            return Ok(());
+            return Ok(None);
        };
        // If the execution layer doesn't have any proposer data for this validator then we assume
@ -5405,7 +5443,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                .has_proposer_preparation_data(proposer)
                .await
        {
-            return Ok(());
+            return Ok(None);
        }
        // Fetch payload attributes from the execution layer's cache, or compute them from scratch
@ -5500,7 +5538,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                "prepare_slot" => prepare_slot,
                "validator" => proposer,
            );
-            return Ok(());
+            return Ok(None);
        };
        // If we are close enough to the proposal slot, send an fcU, which will have payload
@ -5523,7 +5561,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
            .await?;
        }
-        Ok(())
+        Ok(Some(head_root))
    }
    pub async fn update_execution_engine_forkchoice(
--- a/beacon_node/beacon_chain/src/builder.rs
+++ b/beacon_node/beacon_chain/src/builder.rs
@ -925,6 +925,7 @@ where
                    .map_err(|e| format!("Error initializing DataAvailabiltyChecker: {:?}", e))?,
            ),
            kzg,
            block_production_state: Arc::new(Mutex::new(None)),
        };
        let head = beacon_chain.head_snapshot();
--- a/beacon_node/beacon_chain/src/state_advance_timer.rs
+++ b/beacon_node/beacon_chain/src/state_advance_timer.rs
@ -45,6 +45,9 @@ const MAX_ADVANCE_DISTANCE: u64 = 4;
 /// impact whilst having 8 epochs without a block is a comfortable grace period.
 const MAX_FORK_CHOICE_DISTANCE: u64 = 256;
 /// Drop any unused block production state cache after this many slots.
 const MAX_BLOCK_PRODUCTION_CACHE_DISTANCE: u64 = 4;
 #[derive(Debug)]
 enum Error {
    BeaconChain(BeaconChainError),
@ -227,19 +230,73 @@ async fn state_advance_timer<T: BeaconChainTypes>(
                // Prepare proposers so that the node can send payload attributes in the case where
                // it decides to abandon a proposer boost re-org.
-                if let Err(e) = beacon_chain.prepare_beacon_proposer(current_slot).await {
+                let proposer_head = beacon_chain
-                    warn!(
+                    .prepare_beacon_proposer(current_slot)
-                        log,
+                    .await
-                        "Unable to prepare proposer with lookahead";
+                    .unwrap_or_else(|e| {
-                        "error" => ?e,
+                        warn!(
-                        "slot" => next_slot,
+                            log,
-                    );
+                            "Unable to prepare proposer with lookahead";
-                }
+                            "error" => ?e,
                            "slot" => next_slot,
                        );
                        None
                    });
                // Use a blocking task to avoid blocking the core executor whilst waiting for locks
                // in `ForkChoiceSignalTx`.
                beacon_chain.task_executor.clone().spawn_blocking(
                    move || {
                        // If we're proposing, clone the head state preemptively so that it isn't on
                        // the hot path of proposing. We can delete this once we have tree-states.
                        if let Some(proposer_head) = proposer_head {
                            let mut cache = beacon_chain.block_production_state.lock();
                            // Avoid holding two states in memory. It's OK to hold the lock because
                            // we always lock the block production cache before the snapshot cache
                            // and we prefer for block production to wait for the block production
                            // cache if a clone is in-progress.
                            if cache
                                .as_ref()
                                .map_or(false, |(cached_head, _)| *cached_head != proposer_head)
                            {
                                drop(cache.take());
                            }
                            if let Some(proposer_state) = beacon_chain
                                .snapshot_cache
                                .try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
                                .and_then(|snapshot_cache| {
                                    snapshot_cache.get_state_for_block_production(proposer_head)
                                })
                            {
                                *cache = Some((proposer_head, proposer_state));
                                debug!(
                                    log,
                                    "Cloned state ready for block production";
                                    "head_block_root" => ?proposer_head,
                                    "slot" => next_slot
                                );
                            } else {
                                warn!(
                                    log,
                                    "Block production state missing from snapshot cache";
                                    "head_block_root" => ?proposer_head,
                                    "slot" => next_slot
                                );
                            }
                        } else {
                            // If we aren't proposing, drop any old block production cache to save
                            // memory.
                            let mut cache = beacon_chain.block_production_state.lock();
                            if let Some((_, state)) = &*cache {
                                if state.pre_state.slot() + MAX_BLOCK_PRODUCTION_CACHE_DISTANCE
                                    <= current_slot
                                {
                                    drop(cache.take());
                                }
                            }
                        }
                        // Signal block proposal for the next slot (if it happens to be waiting).
                        if let Some(tx) = &beacon_chain.fork_choice_signal_tx {
                            if let Err(e) = tx.notify_fork_choice_complete(next_slot) {