//! Allows for a list of `BeaconNodeHttpClient` to appear as a single entity which exhibits
//! "fallback" behaviour; it will try a request on each of the nodes in turn until one succeeds or
//! all of them fail.
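//!
//! A minimal usage sketch. The clients, spec and logger are assumed to have been constructed
//! elsewhere; the calls shown are illustrative rather than prescriptive:
//!
//! ```ignore
//! // Wrap each `BeaconNodeHttpClient` in a candidate, then build the fallback list.
//! let candidates: Vec<_> = clients.into_iter().map(CandidateBeaconNode::new).collect();
//! let beacon_nodes = BeaconNodeFallback::new(candidates, false, spec, log);
//!
//! // Try each candidate in turn until one of them answers successfully.
//! let version = beacon_nodes
//!     .first_success(RequireSynced::No, OfflineOnFailure::Yes, |node| async move {
//!         node.get_node_version().await
//!     })
//!     .await?;
//! ```
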
use crate::check_synced::check_synced;
use crate::http_metrics::metrics::{inc_counter_vec, ENDPOINT_ERRORS, ENDPOINT_REQUESTS};
use environment::RuntimeContext;
use eth2::BeaconNodeHttpClient;
use futures::future;
use slog::{debug, error, info, warn, Logger};
use slot_clock::SlotClock;
use std::fmt;
use std::fmt::Debug;
use std::future::Future;
use std::marker::PhantomData;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::{sync::RwLock, time::sleep};
use types::{ChainSpec, Config, EthSpec};

/// Message emitted when the VC detects the BN is using a different spec.
const UPDATE_REQUIRED_LOG_HINT: &str = "this VC or the remote BN may need updating";

/// The number of seconds *prior* to slot start that we will try and update the state of fallback
/// nodes.
///
/// Ideally this should be somewhere between 2/3rds through the slot and the end of it. If we set it
/// too early, we risk switching nodes between the time of publishing an attestation and publishing
/// an aggregate; this may result in a missed aggregation. If we set this time too late, we risk not
/// having the correct nodes up and running prior to the start of the slot.
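///
/// As a rough worked example (assuming mainnet's 12-second slots): with the two-second lookahead
/// used below, the updater wakes ten seconds into each slot, i.e. two seconds before the next
/// slot starts.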
const SLOT_LOOKAHEAD: Duration = Duration::from_secs(2);

/// Indicates a measurement of latency between the VC and a BN.
pub struct LatencyMeasurement {
    /// An identifier for the beacon node (e.g. the URL).
    pub beacon_node_id: String,
    /// The round-trip latency, if the BN responded successfully.
    pub latency: Option<Duration>,
}

/// Starts a service that will routinely try and update the status of the provided `beacon_nodes`.
///
/// See `SLOT_LOOKAHEAD` for information about when this should run.
pub fn start_fallback_updater_service<T: SlotClock + 'static, E: EthSpec>(
    context: RuntimeContext<E>,
    beacon_nodes: Arc<BeaconNodeFallback<T, E>>,
) -> Result<(), &'static str> {
    let executor = context.executor;
    if beacon_nodes.slot_clock.is_none() {
        return Err("Cannot start fallback updater without slot clock");
    }

    let future = async move {
        loop {
            beacon_nodes.update_all_candidates().await;

            let sleep_time = beacon_nodes
                .slot_clock
                .as_ref()
                .and_then(|slot_clock| {
                    let slot = slot_clock.now()?;
                    let till_next_slot = slot_clock.duration_to_slot(slot + 1)?;

                    till_next_slot.checked_sub(SLOT_LOOKAHEAD)
                })
                .unwrap_or_else(|| Duration::from_secs(1));

            sleep(sleep_time).await
        }
    };

    executor.spawn(future, "fallback");

    Ok(())
}

/// Indicates if a beacon node must be synced before some action is performed on it.
#[derive(PartialEq, Clone, Copy)]
pub enum RequireSynced {
    Yes,
    No,
}

/// Indicates if a beacon node should be set to `Offline` if a request fails.
#[derive(PartialEq, Clone, Copy)]
pub enum OfflineOnFailure {
    Yes,
    No,
}
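
// Allow `RequireSynced` to be compared directly against a `bool` (e.g. `require_synced == false`),
// which is how the passes in `first_success` and `run_on_all` decide whether unsynced nodes may
// be used.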
impl PartialEq<bool> for RequireSynced {
    fn eq(&self, other: &bool) -> bool {
        if *other {
            *self == RequireSynced::Yes
        } else {
            *self == RequireSynced::No
        }
    }
}
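
/// An error for a single candidate beacon node: either the node was unavailable or the request
/// to it failed.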
#[derive(Debug)]
pub enum Error<E> {
    /// The node was unavailable and we didn't attempt to contact it.
    Unavailable(CandidateError),
    /// We attempted to contact the node but it failed.
    RequestFailed(E),
}

impl<E> Error<E> {
    pub fn request_failure(&self) -> Option<&E> {
        match self {
            Error::RequestFailed(e) => Some(e),
            _ => None,
        }
    }
}

/// The list of errors encountered whilst attempting to perform a query.
pub struct Errors<E>(pub Vec<(String, Error<E>)>);
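
// Formats the errors as a single line, roughly:
// `Some endpoints failed, num_failed: 2 http://a => RequestFailed(..), http://b => Unavailable(Offline)`.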
impl<E: Debug> fmt::Display for Errors<E> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if !self.0.is_empty() {
            write!(f, "Some endpoints failed, num_failed: {}", self.0.len())?;
        }
        for (i, (id, error)) in self.0.iter().enumerate() {
            let comma = if i + 1 < self.0.len() { "," } else { "" };

            write!(f, " {} => {:?}{}", id, error, comma)?;
        }
        Ok(())
    }
}

/// Reasons why a candidate might not be ready.
#[derive(Debug, Clone, Copy)]
pub enum CandidateError {
    Uninitialized,
    Offline,
    Incompatible,
    NotSynced,
}

/// Represents a `BeaconNodeHttpClient` inside a `BeaconNodeFallback` that may or may not be used
/// for a query.
pub struct CandidateBeaconNode<E> {
    beacon_node: BeaconNodeHttpClient,
    status: RwLock<Result<(), CandidateError>>,
    _phantom: PhantomData<E>,
}

impl<E: EthSpec> CandidateBeaconNode<E> {
    /// Instantiate a new node.
    pub fn new(beacon_node: BeaconNodeHttpClient) -> Self {
        Self {
            beacon_node,
            status: RwLock::new(Err(CandidateError::Uninitialized)),
            _phantom: PhantomData,
        }
    }

    /// Returns the status of `self`.
    ///
    /// If `RequireSynced::No`, any `NotSynced` node will be ignored and mapped to `Ok(())`.
    pub async fn status(&self, synced: RequireSynced) -> Result<(), CandidateError> {
        match *self.status.read().await {
            Err(CandidateError::NotSynced) if synced == false => Ok(()),
            other => other,
        }
    }

    /// Indicate that `self` is offline.
    pub async fn set_offline(&self) {
        *self.status.write().await = Err(CandidateError::Offline)
    }

    /// Perform some queries against the node to determine if it is a good candidate, updating
    /// `self.status` and returning that result.
    pub async fn refresh_status<T: SlotClock>(
        &self,
        slot_clock: Option<&T>,
        spec: &ChainSpec,
        log: &Logger,
    ) -> Result<(), CandidateError> {
        let previous_status = self.status(RequireSynced::Yes).await;
        let was_offline = matches!(previous_status, Err(CandidateError::Offline));

        let new_status = if let Err(e) = self.is_online(was_offline, log).await {
            Err(e)
        } else if let Err(e) = self.is_compatible(spec, log).await {
            Err(e)
        } else if let Err(e) = self.is_synced(slot_clock, log).await {
            Err(e)
        } else {
            Ok(())
        };

        // In case of concurrent use, the latest value will always be used. It's possible that a
        // long timeout might override a recent successful response, leading to a falsely-offline
        // status. I deem this edge-case acceptable in return for the concurrency benefits of not
        // holding a write-lock whilst we check the online status of the node.
        *self.status.write().await = new_status;

        new_status
    }

    /// Checks if the node is reachable.
    async fn is_online(&self, was_offline: bool, log: &Logger) -> Result<(), CandidateError> {
        let result = self
            .beacon_node
            .get_node_version()
            .await
            .map(|body| body.data.version);

        match result {
            Ok(version) => {
                if was_offline {
                    info!(
                        log,
                        "Connected to beacon node";
                        "version" => version,
                        "endpoint" => %self.beacon_node,
                    );
                }
                Ok(())
            }
            Err(e) => {
                warn!(
                    log,
                    "Offline beacon node";
                    "error" => %e,
                    "endpoint" => %self.beacon_node,
                );
                Err(CandidateError::Offline)
            }
        }
    }

    /// Checks if the node has the correct specification.
    async fn is_compatible(&self, spec: &ChainSpec, log: &Logger) -> Result<(), CandidateError> {
        let config = self
            .beacon_node
            .get_config_spec::<Config>()
            .await
            .map_err(|e| {
                error!(
                    log,
                    "Unable to read spec from beacon node";
                    "error" => %e,
                    "endpoint" => %self.beacon_node,
                );
                CandidateError::Offline
            })?
            .data;

        let beacon_node_spec = ChainSpec::from_config::<E>(&config).ok_or_else(|| {
            error!(
                log,
                "The minimal/mainnet spec type of the beacon node does not match the validator \
                 client. See the --network command.";
                "endpoint" => %self.beacon_node,
            );
            CandidateError::Incompatible
        })?;

        if beacon_node_spec.genesis_fork_version != spec.genesis_fork_version {
            error!(
                log,
                "Beacon node is configured for a different network";
                "endpoint" => %self.beacon_node,
                "bn_genesis_fork" => ?beacon_node_spec.genesis_fork_version,
                "our_genesis_fork" => ?spec.genesis_fork_version,
            );
            return Err(CandidateError::Incompatible);
        } else if beacon_node_spec.altair_fork_epoch != spec.altair_fork_epoch {
            warn!(
                log,
                "Beacon node has mismatched Altair fork epoch";
                "endpoint" => %self.beacon_node,
                "endpoint_altair_fork_epoch" => ?beacon_node_spec.altair_fork_epoch,
                "hint" => UPDATE_REQUIRED_LOG_HINT,
            );
        } else if beacon_node_spec.bellatrix_fork_epoch != spec.bellatrix_fork_epoch {
            warn!(
                log,
                "Beacon node has mismatched Bellatrix fork epoch";
                "endpoint" => %self.beacon_node,
                "endpoint_bellatrix_fork_epoch" => ?beacon_node_spec.bellatrix_fork_epoch,
                "hint" => UPDATE_REQUIRED_LOG_HINT,
            );
        } else if beacon_node_spec.capella_fork_epoch != spec.capella_fork_epoch {
            warn!(
                log,
                "Beacon node has mismatched Capella fork epoch";
                "endpoint" => %self.beacon_node,
                "endpoint_capella_fork_epoch" => ?beacon_node_spec.capella_fork_epoch,
                "hint" => UPDATE_REQUIRED_LOG_HINT,
            );
        }

        Ok(())
    }

    /// Checks if the beacon node is synced.
    async fn is_synced<T: SlotClock>(
        &self,
        slot_clock: Option<&T>,
        log: &Logger,
    ) -> Result<(), CandidateError> {
        if let Some(slot_clock) = slot_clock {
            check_synced(&self.beacon_node, slot_clock, Some(log)).await
        } else {
            // Skip this check if we don't supply a slot clock.
            Ok(())
        }
    }
}

/// A collection of `CandidateBeaconNode` that can be used to perform requests with "fallback"
/// behaviour, where the failure of one candidate results in the next candidate receiving an
/// identical query.
pub struct BeaconNodeFallback<T, E> {
    candidates: Vec<CandidateBeaconNode<E>>,
    slot_clock: Option<T>,
    disable_run_on_all: bool,
    spec: ChainSpec,
    log: Logger,
}

impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
    pub fn new(
        candidates: Vec<CandidateBeaconNode<E>>,
        disable_run_on_all: bool,
        spec: ChainSpec,
        log: Logger,
    ) -> Self {
        Self {
            candidates,
            slot_clock: None,
            disable_run_on_all,
            spec,
            log,
        }
    }

    /// Used to update the slot clock post-instantiation.
    ///
    /// This is the result of a chicken-and-egg issue where `Self` needs a slot clock for some
    /// operations, but `Self` is required to obtain the slot clock since we need the genesis time
    /// from a beacon node.
    pub fn set_slot_clock(&mut self, slot_clock: T) {
        self.slot_clock = Some(slot_clock);
    }

    /// The count of candidates, regardless of their state.
    pub fn num_total(&self) -> usize {
        self.candidates.len()
    }

    /// The count of synced and ready candidates.
    pub async fn num_synced(&self) -> usize {
        let mut n = 0;
        for candidate in &self.candidates {
            if candidate.status(RequireSynced::Yes).await.is_ok() {
                n += 1
            }
        }
        n
    }

    /// The count of synced and ready fallbacks excluding the primary beacon node candidate.
    pub async fn num_synced_fallback(&self) -> usize {
        let mut n = 0;
        for candidate in self.candidates.iter().skip(1) {
            if candidate.status(RequireSynced::Yes).await.is_ok() {
                n += 1
            }
        }
        n
    }

    /// The count of candidates that are online and compatible, but not necessarily synced.
    pub async fn num_available(&self) -> usize {
        let mut n = 0;
        for candidate in &self.candidates {
            if candidate.status(RequireSynced::No).await.is_ok() {
                n += 1
            }
        }
        n
    }

    /// Loop through ALL candidates in `self.candidates` and update their sync status.
    ///
    /// It is possible for a node to return an unsynced status while continuing to serve
    /// low quality responses. To route around this it's best to poll all connected beacon nodes.
    /// A previous implementation of this function polled only the unavailable BNs.
    pub async fn update_all_candidates(&self) {
        let futures = self
            .candidates
            .iter()
            .map(|candidate| {
                candidate.refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
            })
            .collect::<Vec<_>>();

        // Run all updates concurrently and ignore errors.
        let _ = future::join_all(futures).await;
    }

    /// Concurrently send a request to all candidates (regardless of their offline/online status)
    /// and attempt to collect a rough reading on the latency between the VC and each candidate.
    pub async fn measure_latency(&self) -> Vec<LatencyMeasurement> {
        let futures: Vec<_> = self
            .candidates
            .iter()
            .map(|candidate| async {
                let beacon_node_id = candidate.beacon_node.to_string();
                // The `node/version` endpoint is used since I imagine it would
                // require the least processing in the BN and therefore measure
                // the connection more so than the BN's processing speed.
                //
                // I imagine all clients have the version string available as a
                // pre-computed string.
                let response_instant = candidate
                    .beacon_node
                    .get_node_version()
                    .await
                    .ok()
                    .map(|_| Instant::now());
                (beacon_node_id, response_instant)
            })
            .collect();

        let request_instant = Instant::now();

        // Send the request to all BNs at the same time. This might involve some
        // queueing on the sending host, however I hope it will avoid bias
        // caused by sending requests at different times.
        future::join_all(futures)
            .await
            .into_iter()
            .map(|(beacon_node_id, response_instant)| LatencyMeasurement {
                beacon_node_id,
                latency: response_instant
                    .and_then(|response| response.checked_duration_since(request_instant)),
            })
            .collect()
    }

    /// Run `func` against each candidate in `self`, returning immediately if a result is found.
    /// Otherwise, return all the errors encountered along the way.
    ///
    /// First this function will try all nodes with a suitable status. If no candidates are
    /// suitable or all the requests fail, it will try updating the status of all unsuitable nodes
    /// and re-running `func`.
    pub async fn first_success<'a, F, O, Err, R>(
        &'a self,
        require_synced: RequireSynced,
        offline_on_failure: OfflineOnFailure,
        func: F,
    ) -> Result<O, Errors<Err>>
    where
        F: Fn(&'a BeaconNodeHttpClient) -> R,
        R: Future<Output = Result<O, Err>>,
        Err: Debug,
    {
        let mut errors = vec![];
        let mut to_retry = vec![];
        let mut retry_unsynced = vec![];
        let log = &self.log.clone();

        // Run `func` using a `candidate`, returning the value or capturing errors.
        //
        // We use a macro instead of a closure here since it is not trivial to move `func` into a
        // closure.
        macro_rules! try_func {
            ($candidate: ident) => {{
                inc_counter_vec(&ENDPOINT_REQUESTS, &[$candidate.beacon_node.as_ref()]);

                // There exists a race condition where `func` may be called when the candidate is
                // actually not ready. We deem this an acceptable inefficiency.
                match func(&$candidate.beacon_node).await {
                    Ok(val) => return Ok(val),
                    Err(e) => {
                        debug!(
                            log,
                            "Request to beacon node failed";
                            "node" => $candidate.beacon_node.to_string(),
                            "error" => ?e,
                        );
                        // If we have an error on this function, mark the client as not-ready.
                        //
                        // There exists a race condition where the candidate may have been marked
                        // as ready between the `func` call and now. We deem this an acceptable
                        // inefficiency.
                        if matches!(offline_on_failure, OfflineOnFailure::Yes) {
                            $candidate.set_offline().await;
                        }
                        errors.push(($candidate.beacon_node.to_string(), Error::RequestFailed(e)));
                        inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
                    }
                }
            }};
        }

        // First pass: try `func` on all synced and ready candidates.
        //
        // This ensures that we always choose a synced node if it is available.
        for candidate in &self.candidates {
            match candidate.status(RequireSynced::Yes).await {
                Err(e @ CandidateError::NotSynced) if require_synced == false => {
                    // This client is unsynced; we will try it after trying all synced clients.
                    retry_unsynced.push(candidate);
                    errors.push((candidate.beacon_node.to_string(), Error::Unavailable(e)));
                }
                Err(e) => {
                    // This client was not ready on the first pass, we might try it again later.
                    to_retry.push(candidate);
                    errors.push((candidate.beacon_node.to_string(), Error::Unavailable(e)));
                }
                _ => try_func!(candidate),
            }
        }

        // Second pass: try `func` on ready unsynced candidates. This only runs if we permit
        // unsynced candidates.
        //
        // Due to async race-conditions, it is possible that we will send a request to a candidate
        // that has been set to an offline/unready status. This is acceptable.
        if require_synced == false {
            for candidate in retry_unsynced {
                try_func!(candidate);
            }
        }

        // Third pass: try again, attempting to make non-ready clients become ready.
        for candidate in to_retry {
            // If the candidate hasn't happened to transition into the correct state in the
            // meantime, force an update of the state.
            let new_status = match candidate.status(require_synced).await {
                Ok(()) => Ok(()),
                Err(_) => {
                    candidate
                        .refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
                        .await
                }
            };

            match new_status {
                Ok(()) => try_func!(candidate),
                Err(CandidateError::NotSynced) if require_synced == false => try_func!(candidate),
                Err(e) => {
                    errors.push((candidate.beacon_node.to_string(), Error::Unavailable(e)));
                }
            }
        }

        // There were no candidates already ready and we were unable to make any of them ready.
        Err(Errors(errors))
    }

    /// Run `func` against all candidates in `self`, collecting the result of `func` against each
    /// candidate.
    ///
    /// First this function will try all nodes with a suitable status. If no candidates are
    /// suitable, it will try updating the status of all unsuitable nodes and re-running `func`.
    ///
    /// Note: This function returns `Ok(())` if `func` returned successfully on all beacon nodes.
    /// It returns a list of errors along with the beacon node id that failed for `func`.
    /// Since this ignores the actual result of `func`, this function should only be used for
    /// beacon node calls whose results we do not care about, only that they completed
    /// successfully.
    pub async fn run_on_all<'a, F, O, Err, R>(
        &'a self,
        require_synced: RequireSynced,
        offline_on_failure: OfflineOnFailure,
        func: F,
    ) -> Result<(), Errors<Err>>
    where
        F: Fn(&'a BeaconNodeHttpClient) -> R,
        R: Future<Output = Result<O, Err>>,
    {
        let mut results = vec![];
        let mut to_retry = vec![];
        let mut retry_unsynced = vec![];

        // Run `func` using a `candidate`, returning the value or capturing errors.
        //
        // We use a macro instead of a closure here since it is not trivial to move `func` into a
        // closure.
        macro_rules! try_func {
            ($candidate: ident) => {{
                inc_counter_vec(&ENDPOINT_REQUESTS, &[$candidate.beacon_node.as_ref()]);

                // There exists a race condition where `func` may be called when the candidate is
                // actually not ready. We deem this an acceptable inefficiency.
                match func(&$candidate.beacon_node).await {
                    Ok(val) => results.push(Ok(val)),
                    Err(e) => {
                        // If we have an error on this function, mark the client as not-ready.
                        //
                        // There exists a race condition where the candidate may have been marked
                        // as ready between the `func` call and now. We deem this an acceptable
                        // inefficiency.
                        if matches!(offline_on_failure, OfflineOnFailure::Yes) {
                            $candidate.set_offline().await;
                        }
                        results.push(Err((
                            $candidate.beacon_node.to_string(),
                            Error::RequestFailed(e),
                        )));
                        inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
                    }
                }
            }};
        }

        // First pass: try `func` on all synced and ready candidates.
        //
        // This ensures that we always choose a synced node if it is available.
        for candidate in &self.candidates {
            match candidate.status(RequireSynced::Yes).await {
                Err(CandidateError::NotSynced) if require_synced == false => {
                    // This client is unsynced; we will try it after trying all synced clients.
                    retry_unsynced.push(candidate);
                }
                Err(_) => {
                    // This client was not ready on the first pass, we might try it again later.
                    to_retry.push(candidate);
                }
                Ok(_) => try_func!(candidate),
            }
        }

        // Second pass: try `func` on ready unsynced candidates. This only runs if we permit
        // unsynced candidates.
        //
        // Due to async race-conditions, it is possible that we will send a request to a candidate
        // that has been set to an offline/unready status. This is acceptable.
        if require_synced == false {
            for candidate in retry_unsynced {
                try_func!(candidate);
            }
        }

        // Third pass: try again, attempting to make non-ready clients become ready.
        for candidate in to_retry {
            // If the candidate hasn't happened to transition into the correct state in the
            // meantime, force an update of the state.
            let new_status = match candidate.status(require_synced).await {
                Ok(()) => Ok(()),
                Err(_) => {
                    candidate
                        .refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
                        .await
                }
            };

            match new_status {
                Ok(()) => try_func!(candidate),
                Err(CandidateError::NotSynced) if require_synced == false => try_func!(candidate),
                Err(e) => {
                    results.push(Err((
                        candidate.beacon_node.to_string(),
                        Error::Unavailable(e),
                    )));
                }
            }
        }

        let errors: Vec<_> = results.into_iter().filter_map(|res| res.err()).collect();

        if !errors.is_empty() {
            Err(Errors(errors))
        } else {
            Ok(())
        }
    }

    /// Call `func` on the first beacon node that returns success, or on all beacon nodes,
    /// depending on the value of `disable_run_on_all`.
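    ///
    /// A hypothetical usage sketch (the request shown is purely illustrative; `func` can be any
    /// call against the `BeaconNodeHttpClient` that returns `Result<(), _>`):
    ///
    /// ```ignore
    /// beacon_nodes
    ///     .run(RequireSynced::No, OfflineOnFailure::Yes, |node| async move {
    ///         node.get_node_version().await.map(|_| ())
    ///     })
    ///     .await?;
    /// ```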
    pub async fn run<'a, F, Err, R>(
        &'a self,
        require_synced: RequireSynced,
        offline_on_failure: OfflineOnFailure,
        func: F,
    ) -> Result<(), Errors<Err>>
    where
        F: Fn(&'a BeaconNodeHttpClient) -> R,
        R: Future<Output = Result<(), Err>>,
        Err: Debug,
    {
        if self.disable_run_on_all {
            self.first_success(require_synced, offline_on_failure, func)
                .await?;
            Ok(())
        } else {
            self.run_on_all(require_synced, offline_on_failure, func)
                .await
        }
    }
}