Clone state ahead of block production (#4925)

* Clone state ahead of block production
* Add pruning and fix logging
* Don't hold 2 states in mem
2023-11-30 13:49:35 +11:00 · 2023-11-30 13:49:35 +11:00 · 547ed1de63
commit 547ed1de63
parent 43d98153d6
3 changed files with 117 additions and 21 deletions
--- a/beacon_node/beacon_chain/src/beacon_chain.rs
+++ b/beacon_node/beacon_chain/src/beacon_chain.rs
@ -482,6 +482,11 @@ pub struct BeaconChain<T: BeaconChainTypes> {
    pub data_availability_checker: Arc<DataAvailabilityChecker<T>>,
    /// The KZG trusted setup used by this chain.
    pub kzg: Option<Arc<Kzg>>,
+    /// State with complete tree hash cache, ready for block production.
+    ///
+    /// NB: We can delete this once we have tree-states.
+    #[allow(clippy::type_complexity)]
+    pub block_production_state: Arc<Mutex<Option<(Hash256, BlockProductionPreState<T::EthSpec>)>>>,
 }

 pub enum BeaconBlockResponseType<T: EthSpec> {
@ -4030,7 +4035,16 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                );
                (re_org_state.pre_state, re_org_state.state_root)
            }
-            // Normal case: proposing a block atop the current head. Use the snapshot cache.
+            // Normal case: proposing a block atop the current head using the cache.
+            else if let Some((_, cached_state)) = self
+                .block_production_state
+                .lock()
+                .take()
+                .filter(|(cached_block_root, _)| *cached_block_root == head_block_root)
+            {
+                (cached_state.pre_state, cached_state.state_root)
+            }
+            // Fall back to a direct read of the snapshot cache.
            else if let Some(pre_state) = self
                .snapshot_cache
                .try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
@ -4038,6 +4052,12 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                    snapshot_cache.get_state_for_block_production(head_block_root)
                })
            {
+                warn!(
+                    self.log,
+                    "Block production cache miss";
+                    "message" => "falling back to snapshot cache clone",
+                    "slot" => slot
+                );
                (pre_state.pre_state, pre_state.state_root)
            } else {
                warn!(
@ -4161,12 +4181,27 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
        drop(proposer_head_timer);
        let re_org_parent_block = proposer_head.parent_node.root;

-        // Only attempt a re-org if we hit the snapshot cache.
+        // Only attempt a re-org if we hit the block production cache or snapshot cache.
        let pre_state = self
-            .snapshot_cache
-            .try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
-            .and_then(|snapshot_cache| {
-                snapshot_cache.get_state_for_block_production(re_org_parent_block)
+            .block_production_state
+            .lock()
+            .take()
+            .and_then(|(cached_block_root, state)| {
+                (cached_block_root == re_org_parent_block).then_some(state)
+            })
+            .or_else(|| {
+                warn!(
+                    self.log,
+                    "Block production cache miss";
+                    "message" => "falling back to snapshot cache during re-org",
+                    "slot" => slot,
+                    "block_root" => ?re_org_parent_block
+                );
+                self.snapshot_cache
+                    .try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
+                    .and_then(|snapshot_cache| {
+                        snapshot_cache.get_state_for_block_production(re_org_parent_block)
+                    })
            })
            .or_else(|| {
                debug!(
@ -5326,15 +5361,18 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
    ///
    /// This function will result in a call to `forkchoiceUpdated` on the EL if we're in the
    /// tail-end of the slot (as defined by `self.config.prepare_payload_lookahead`).
+    ///
+    /// Returns `Ok(Some(head_block_root))` if this node prepared to propose at the next slot on
+    /// top of `head_block_root`.
    pub async fn prepare_beacon_proposer(
        self: &Arc<Self>,
        current_slot: Slot,
-    ) -> Result<(), Error> {
+    ) -> Result<Option<Hash256>, Error> {
        let prepare_slot = current_slot + 1;

        // There's no need to run the proposer preparation routine before the bellatrix fork.
        if self.slot_is_prior_to_bellatrix(prepare_slot) {
-            return Ok(());
+            return Ok(None);
        }

        let execution_layer = self
@ -5347,7 +5385,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
        if !self.config.always_prepare_payload
            && !execution_layer.has_any_proposer_preparation_data().await
        {
-            return Ok(());
+            return Ok(None);
        }

        // Load the cached head and its forkchoice update parameters.
@ -5394,7 +5432,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
        let Some((forkchoice_update_params, Some(pre_payload_attributes))) = maybe_prep_data else {
            // Appropriate log messages have already been logged above and in
            // `get_pre_payload_attributes`.
-            return Ok(());
+            return Ok(None);
        };

        // If the execution layer doesn't have any proposer data for this validator then we assume
@ -5405,7 +5443,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                .has_proposer_preparation_data(proposer)
                .await
        {
-            return Ok(());
+            return Ok(None);
        }

        // Fetch payload attributes from the execution layer's cache, or compute them from scratch
@ -5500,7 +5538,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                "prepare_slot" => prepare_slot,
                "validator" => proposer,
            );
-            return Ok(());
+            return Ok(None);
        };

        // If we are close enough to the proposal slot, send an fcU, which will have payload
@ -5523,7 +5561,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
            .await?;
        }

-        Ok(())
+        Ok(Some(head_root))
    }

    pub async fn update_execution_engine_forkchoice(
--- a/beacon_node/beacon_chain/src/builder.rs
+++ b/beacon_node/beacon_chain/src/builder.rs
@ -925,6 +925,7 @@ where
                    .map_err(|e| format!("Error initializing DataAvailabiltyChecker: {:?}", e))?,
            ),
            kzg,
+            block_production_state: Arc::new(Mutex::new(None)),
        };

        let head = beacon_chain.head_snapshot();
--- a/beacon_node/beacon_chain/src/state_advance_timer.rs
+++ b/beacon_node/beacon_chain/src/state_advance_timer.rs
@ -45,6 +45,9 @@ const MAX_ADVANCE_DISTANCE: u64 = 4;
 /// impact whilst having 8 epochs without a block is a comfortable grace period.
 const MAX_FORK_CHOICE_DISTANCE: u64 = 256;

+/// Drop any unused block production state cache after this many slots.
+const MAX_BLOCK_PRODUCTION_CACHE_DISTANCE: u64 = 4;
+
 #[derive(Debug)]
 enum Error {
    BeaconChain(BeaconChainError),
@ -227,19 +230,73 @@ async fn state_advance_timer<T: BeaconChainTypes>(

                // Prepare proposers so that the node can send payload attributes in the case where
                // it decides to abandon a proposer boost re-org.
-                if let Err(e) = beacon_chain.prepare_beacon_proposer(current_slot).await {
-                    warn!(
-                        log,
-                        "Unable to prepare proposer with lookahead";
-                        "error" => ?e,
-                        "slot" => next_slot,
-                    );
-                }
+                let proposer_head = beacon_chain
+                    .prepare_beacon_proposer(current_slot)
+                    .await
+                    .unwrap_or_else(|e| {
+                        warn!(
+                            log,
+                            "Unable to prepare proposer with lookahead";
+                            "error" => ?e,
+                            "slot" => next_slot,
+                        );
+                        None
+                    });

                // Use a blocking task to avoid blocking the core executor whilst waiting for locks
                // in `ForkChoiceSignalTx`.
                beacon_chain.task_executor.clone().spawn_blocking(
                    move || {
+                        // If we're proposing, clone the head state preemptively so that it isn't on
+                        // the hot path of proposing. We can delete this once we have tree-states.
+                        if let Some(proposer_head) = proposer_head {
+                            let mut cache = beacon_chain.block_production_state.lock();
+
+                            // Avoid holding two states in memory. It's OK to hold the lock because
+                            // we always lock the block production cache before the snapshot cache
+                            // and we prefer for block production to wait for the block production
+                            // cache if a clone is in-progress.
+                            if cache
+                                .as_ref()
+                                .map_or(false, |(cached_head, _)| *cached_head != proposer_head)
+                            {
+                                drop(cache.take());
+                            }
+                            if let Some(proposer_state) = beacon_chain
+                                .snapshot_cache
+                                .try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
+                                .and_then(|snapshot_cache| {
+                                    snapshot_cache.get_state_for_block_production(proposer_head)
+                                })
+                            {
+                                *cache = Some((proposer_head, proposer_state));
+                                debug!(
+                                    log,
+                                    "Cloned state ready for block production";
+                                    "head_block_root" => ?proposer_head,
+                                    "slot" => next_slot
+                                );
+                            } else {
+                                warn!(
+                                    log,
+                                    "Block production state missing from snapshot cache";
+                                    "head_block_root" => ?proposer_head,
+                                    "slot" => next_slot
+                                );
+                            }
+                        } else {
+                            // If we aren't proposing, drop any old block production cache to save
+                            // memory.
+                            let mut cache = beacon_chain.block_production_state.lock();
+                            if let Some((_, state)) = &*cache {
+                                if state.pre_state.slot() + MAX_BLOCK_PRODUCTION_CACHE_DISTANCE
+                                    <= current_slot
+                                {
+                                    drop(cache.take());
+                                }
+                            }
+                        }
+
                        // Signal block proposal for the next slot (if it happens to be waiting).
                        if let Some(tx) = &beacon_chain.fork_choice_signal_tx {
                            if let Err(e) = tx.notify_fork_choice_complete(next_slot) {