Reduce lock contention in backfill sync (#2716)

## Proposed Changes

Clone the proposer pubkeys during backfill signature verification to reduce the time that the pubkey cache lock is held for. Cloning such a small number of pubkeys has negligible impact on the total running time, but greatly reduces lock contention.

On a Ryzen 5950X, the setup step seems to take around 180us regardless of whether or not the keys are cloned, while the verification takes 7ms. When Lighthouse is limited to 10% of one core using `sudo cpulimit --pid <pid> --limit 10`, the total time jumps up to 800ms, but the setup step remains only 250us. This means that under heavy load this PR could cut the time the lock is held for from 800ms to 250us — a saving of 99.97%!
This commit is contained in:
Michael Sproul 2021-10-15 03:28:03 +00:00
parent 9c5a8ab7f2
commit 5cde3fc4da
2 changed files with 26 additions and 3 deletions

View File

@ -1,4 +1,4 @@
use crate::{errors::BeaconChainError as Error, BeaconChain, BeaconChainTypes};
use crate::{errors::BeaconChainError as Error, metrics, BeaconChain, BeaconChainTypes};
use itertools::Itertools;
use slog::debug;
use state_processing::{
@ -136,6 +136,8 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
// Verify signatures in one batch, holding the pubkey cache lock for the shortest duration
// possible. For each block fetch the parent root from its successor. Slicing from index 1
// is safe because we've already checked that `blocks_to_import` is non-empty.
let sig_timer = metrics::start_timer(&metrics::BACKFILL_SIGNATURE_TOTAL_TIMES);
let setup_timer = metrics::start_timer(&metrics::BACKFILL_SIGNATURE_SETUP_TIMES);
let pubkey_cache = self
.validator_pubkey_cache
.try_read_for(PUBKEY_CACHE_LOCK_TIMEOUT)
@ -156,17 +158,22 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
block.message().proposer_index(),
&self.spec.fork_at_epoch(block.message().epoch()),
self.genesis_validators_root,
|validator_index| pubkey_cache.get(validator_index).map(Cow::Borrowed),
|validator_index| pubkey_cache.get(validator_index).cloned().map(Cow::Owned),
&self.spec,
)
})
.collect::<Result<Vec<_>, _>>()
.map_err(HistoricalBlockError::SignatureSet)
.map(ParallelSignatureSets::from)?;
drop(pubkey_cache);
drop(setup_timer);
let verify_timer = metrics::start_timer(&metrics::BACKFILL_SIGNATURE_VERIFY_TIMES);
if !signature_set.verify() {
return Err(HistoricalBlockError::InvalidSignature.into());
}
drop(pubkey_cache);
drop(verify_timer);
drop(sig_timer);
// Write the I/O batches to disk, writing the blocks themselves first, as it's better
// for the hot DB to contain extra blocks than for the cold DB to point to blocks that

View File

@ -855,6 +855,22 @@ lazy_static! {
"beacon_sync_committee_message_processing_signature_seconds",
"Time spent on the signature verification of sync message processing"
);
/*
* Checkpoint sync & backfill
*/
pub static ref BACKFILL_SIGNATURE_SETUP_TIMES: Result<Histogram> = try_create_histogram(
"beacon_backfill_signature_setup_seconds",
"Time spent constructing the signature set during backfill sync"
);
pub static ref BACKFILL_SIGNATURE_VERIFY_TIMES: Result<Histogram> = try_create_histogram(
"beacon_backfill_signature_verify_seconds",
"Time spent verifying the signature set during backfill sync"
);
pub static ref BACKFILL_SIGNATURE_TOTAL_TIMES: Result<Histogram> = try_create_histogram(
"beacon_backfill_signature_total_seconds",
"Time spent verifying the signature set during backfill sync, including setup"
);
}
/// Scrape the `beacon_chain` for metrics that are not constantly updated (e.g., the present slot,