Add attestation duty slot metric (#2704)

## Issue Addressed Resolves #2521 ## Proposed Changes Add a metric that indicates the next attestation duty slot for all managed validators in the validator client.
2023-02-09 23:51:17 +00:00 · 2023-02-09 23:51:17 +00:00 · 2b735a9e8b
commit 2b735a9e8b
parent aa5b7ef783
5 changed files with 78 additions and 3 deletions
--- a/validator_client/src/cli.rs
+++ b/validator_client/src/cli.rs
@ -231,6 +231,15 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
                    address of this server (e.g., http://localhost:5064).")
                .takes_value(true),
        )
+        .arg(
+            Arg::with_name("enable-high-validator-count-metrics")
+                .long("enable-high-validator-count-metrics")
+                .help("Enable per validator metrics for > 64 validators. \
+                    Note: This flag is automatically enabled for <= 64 validators. \
+                    Enabling this metric for higher validator counts will lead to higher volume \
+                    of prometheus metrics being collected.")
+                .takes_value(false),
+        )
        /*
         * Explorer metrics
         */
--- a/validator_client/src/config.rs
+++ b/validator_client/src/config.rs
@ -53,6 +53,11 @@ pub struct Config {
    /// If true, enable functionality that monitors the network for attestations or proposals from
    /// any of the validators managed by this client before starting up.
    pub enable_doppelganger_protection: bool,
+    /// If true, then we publish validator specific metrics (e.g next attestation duty slot)
+    /// for all our managed validators.
+    /// Note: We publish validator specific metrics for low validator counts without this flag
+    /// (<= 64 validators)
+    pub enable_high_validator_count_metrics: bool,
    /// Enable use of the blinded block endpoints during proposals.
    pub builder_proposals: bool,
    /// Overrides the timestamp field in builder api ValidatorRegistrationV1
@ -99,6 +104,7 @@ impl Default for Config {
            http_metrics: <_>::default(),
            monitoring_api: None,
            enable_doppelganger_protection: false,
+            enable_high_validator_count_metrics: false,
            beacon_nodes_tls_certs: None,
            block_delay: None,
            builder_proposals: false,
@ -273,6 +279,10 @@ impl Config {
            config.http_metrics.enabled = true;
        }

+        if cli_args.is_present("enable-high-validator-count-metrics") {
+            config.enable_high_validator_count_metrics = true;
+        }
+
        if let Some(address) = cli_args.value_of("metrics-address") {
            config.http_metrics.listen_addr = address
                .parse::<IpAddr>()
--- a/validator_client/src/duties_service.rs
+++ b/validator_client/src/duties_service.rs
@ -9,6 +9,7 @@
 mod sync;

 use crate::beacon_node_fallback::{BeaconNodeFallback, OfflineOnFailure, RequireSynced};
+use crate::http_metrics::metrics::{get_int_gauge, set_int_gauge, ATTESTATION_DUTY};
 use crate::{
    block_service::BlockServiceNotification,
    http_metrics::metrics,
@ -39,6 +40,11 @@ const SUBSCRIPTION_BUFFER_SLOTS: u64 = 2;
 /// Only retain `HISTORICAL_DUTIES_EPOCHS` duties prior to the current epoch.
 const HISTORICAL_DUTIES_EPOCHS: u64 = 2;

+/// Minimum number of validators for which we auto-enable per-validator metrics.
+/// For validators greater than this value, we need to manually set the `enable-per-validator-metrics`
+/// flag in the cli to enable collection of per validator metrics.
+const VALIDATOR_METRICS_MIN_COUNT: usize = 64;
+
 #[derive(Debug)]
 pub enum Error {
    UnableToReadSlotClock,
@ -121,6 +127,7 @@ pub struct DutiesService<T, E: EthSpec> {
    /// This functionality is a little redundant since most BNs will likely reject duties when they
    /// aren't synced, but we keep it around for an emergency.
    pub require_synced: RequireSynced,
+    pub enable_high_validator_count_metrics: bool,
    pub context: RuntimeContext<E>,
    pub spec: ChainSpec,
 }
@ -220,6 +227,12 @@ impl<T: SlotClock + 'static, E: EthSpec> DutiesService<T, E> {
            .cloned()
            .collect()
    }
+
+    /// Returns `true` if we should collect per validator metrics and `false` otherwise.
+    pub fn per_validator_metrics(&self) -> bool {
+        self.enable_high_validator_count_metrics
+            || self.total_validator_count() <= VALIDATOR_METRICS_MIN_COUNT
+    }
 }

 /// Start the service that periodically polls the beacon node for validator duties. This will start
@ -501,6 +514,7 @@ async fn poll_beacon_attesters<T: SlotClock + 'static, E: EthSpec>(
        current_epoch,
        &local_indices,
        &local_pubkeys,
+        current_slot,
    )
    .await
    {
@ -520,9 +534,14 @@ async fn poll_beacon_attesters<T: SlotClock + 'static, E: EthSpec>(
    );

    // Download the duties and update the duties for the next epoch.
-    if let Err(e) =
-        poll_beacon_attesters_for_epoch(duties_service, next_epoch, &local_indices, &local_pubkeys)
-            .await
+    if let Err(e) = poll_beacon_attesters_for_epoch(
+        duties_service,
+        next_epoch,
+        &local_indices,
+        &local_pubkeys,
+        current_slot,
+    )
+    .await
    {
        error!(
            log,
@ -619,6 +638,7 @@ async fn poll_beacon_attesters_for_epoch<T: SlotClock + 'static, E: EthSpec>(
    epoch: Epoch,
    local_indices: &[u64],
    local_pubkeys: &HashSet<PublicKeyBytes>,
+    current_slot: Slot,
 ) -> Result<(), Error> {
    let log = duties_service.context.log();

@ -671,6 +691,35 @@ async fn poll_beacon_attesters_for_epoch<T: SlotClock + 'static, E: EthSpec>(
            .data
            .into_iter()
            .filter(|duty| {
+                if duties_service.per_validator_metrics() {
+                    let validator_index = duty.validator_index;
+                    let duty_slot = duty.slot;
+                    if let Some(existing_slot_gauge) =
+                        get_int_gauge(&ATTESTATION_DUTY, &[&validator_index.to_string()])
+                    {
+                        let existing_slot = Slot::new(existing_slot_gauge.get() as u64);
+                        let existing_epoch = existing_slot.epoch(E::slots_per_epoch());
+
+                        // First condition ensures that we switch to the next epoch duty slot
+                        // once the current epoch duty slot passes.
+                        // Second condition is to ensure that next epoch duties don't override
+                        // current epoch duties.
+                        if existing_slot < current_slot
+                            || (duty_slot.epoch(E::slots_per_epoch()) <= existing_epoch
+                                && duty_slot > current_slot
+                                && duty_slot != existing_slot)
+                        {
+                            existing_slot_gauge.set(duty_slot.as_u64() as i64);
+                        }
+                    } else {
+                        set_int_gauge(
+                            &ATTESTATION_DUTY,
+                            &[&validator_index.to_string()],
+                            duty_slot.as_u64() as i64,
+                        );
+                    }
+                }
+
                local_pubkeys.contains(&duty.pubkey) && {
                    // Only update the duties if either is true:
                    //
--- a/validator_client/src/http_metrics/metrics.rs
+++ b/validator_client/src/http_metrics/metrics.rs
@ -172,6 +172,12 @@ lazy_static::lazy_static! {
        "Duration to obtain a signature",
        &["type"]
    );
+
+    pub static ref ATTESTATION_DUTY: Result<IntGaugeVec> = try_create_int_gauge_vec(
+        "vc_attestation_duty_slot",
+        "Attestation duty slot for all managed validators",
+        &["validator"]
+    );
 }

 pub fn gather_prometheus_metrics<T: EthSpec>(
--- a/validator_client/src/lib.rs
+++ b/validator_client/src/lib.rs
@ -422,6 +422,7 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
            },
            spec: context.eth2_config.spec.clone(),
            context: duties_context,
+            enable_high_validator_count_metrics: config.enable_high_validator_count_metrics,
        });

        // Update the metrics server.