//! Allows for a list of `BeaconNodeHttpClient` to appear as a single entity which exhibits
//! "fallback" behaviour; it will try a request on each of the nodes in turn until one succeeds or
//! all of them fail.
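//!
//! A minimal usage sketch. The clients, spec and logger are assumed to have been constructed
//! elsewhere; the calls shown are illustrative rather than prescriptive:
//!
//! ```ignore
//! // Wrap each `BeaconNodeHttpClient` in a candidate, then build the fallback list.
//! let candidates: Vec<_> = clients.into_iter().map(CandidateBeaconNode::new).collect();
//! let beacon_nodes = BeaconNodeFallback::new(candidates, false, spec, log);
//!
//! // Try each candidate in turn until one of them answers successfully.
//! let version = beacon_nodes
//!     .first_success(RequireSynced::No, OfflineOnFailure::Yes, |node| async move {
//!         node.get_node_version().await
//!     })
//!     .await?;
//! ```
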
use crate::check_synced::check_synced;
use crate::http_metrics::metrics::{inc_counter_vec, ENDPOINT_ERRORS, ENDPOINT_REQUESTS};
use environment::RuntimeContext;
use eth2::BeaconNodeHttpClient;
use futures::future;
use slog::{debug, error, info, warn, Logger};
use slot_clock::SlotClock;
use std::fmt;
use std::fmt::Debug;
use std::future::Future;
use std::marker::PhantomData;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::{sync::RwLock, time::sleep};
use types::{ChainSpec, Config, EthSpec};

/// Message emitted when the VC detects the BN is using a different spec.
const UPDATE_REQUIRED_LOG_HINT: &str = "this VC or the remote BN may need updating";

/// The number of seconds *prior* to slot start that we will try and update the state of fallback
/// nodes.
///
/// Ideally this should be somewhere between 2/3rds through the slot and the end of it. If we set it
/// too early, we risk switching nodes between the time of publishing an attestation and publishing
/// an aggregate; this may result in a missed aggregation. If we set this time too late, we risk not
/// having the correct nodes up and running prior to the start of the slot.
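///
/// As a rough worked example (assuming mainnet's 12-second slots): with the two-second lookahead
/// used below, the updater wakes ten seconds into each slot, i.e. two seconds before the next
/// slot starts.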
const SLOT_LOOKAHEAD: Duration = Duration::from_secs(2);

/// Indicates a measurement of latency between the VC and a BN.
pub struct LatencyMeasurement {
    /// An identifier for the beacon node (e.g. the URL).
    pub beacon_node_id: String,
    /// The round-trip latency, if the BN responded successfully.
    pub latency: Option<Duration>,
}

/// Starts a service that will routinely try and update the status of the provided `beacon_nodes`.
///
/// See `SLOT_LOOKAHEAD` for information about when this should run.
pub fn start_fallback_updater_service<T: SlotClock + 'static, E: EthSpec>(
    context: RuntimeContext<E>,
    beacon_nodes: Arc<BeaconNodeFallback<T, E>>,
) -> Result<(), &'static str> {
    let executor = context.executor;
    if beacon_nodes.slot_clock.is_none() {
        return Err("Cannot start fallback updater without slot clock");
    }

    let future = async move {
        loop {
            beacon_nodes.update_all_candidates().await;

            let sleep_time = beacon_nodes
                .slot_clock
                .as_ref()
                .and_then(|slot_clock| {
                    let slot = slot_clock.now()?;
                    let till_next_slot = slot_clock.duration_to_slot(slot + 1)?;

                    till_next_slot.checked_sub(SLOT_LOOKAHEAD)
                })
                .unwrap_or_else(|| Duration::from_secs(1));

            sleep(sleep_time).await
        }
    };

    executor.spawn(future, "fallback");

    Ok(())
}

/// Indicates if a beacon node must be synced before some action is performed on it.
#[derive(PartialEq, Clone, Copy)]
pub enum RequireSynced {
    Yes,
    No,
}

/// Indicates if a beacon node should be set to `Offline` if a request fails.
#[derive(PartialEq, Clone, Copy)]
pub enum OfflineOnFailure {
    Yes,
    No,
}
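
// Allow `RequireSynced` to be compared directly against a `bool` (e.g. `require_synced == false`),
// which is how the passes in `first_success` and `run_on_all` decide whether unsynced nodes may
// be used.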
impl PartialEq<bool> for RequireSynced {
    fn eq(&self, other: &bool) -> bool {
        if *other {
            *self == RequireSynced::Yes
        } else {
            *self == RequireSynced::No
        }
    }
}
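
/// An error for a single candidate beacon node: either the node was unavailable or the request
/// to it failed.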
#[derive(Debug)]
pub enum Error<E> {
    /// The node was unavailable and we didn't attempt to contact it.
    Unavailable(CandidateError),
    /// We attempted to contact the node but it failed.
    RequestFailed(E),
}

impl<E> Error<E> {
    pub fn request_failure(&self) -> Option<&E> {
        match self {
            Error::RequestFailed(e) => Some(e),
            _ => None,
        }
    }
}

/// The list of errors encountered whilst attempting to perform a query.
pub struct Errors<E>(pub Vec<(String, Error<E>)>);
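
// Formats the errors as a single line, roughly:
// `Some endpoints failed, num_failed: 2 http://a => RequestFailed(..), http://b => Unavailable(Offline)`.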
impl<E: Debug> fmt::Display for Errors<E> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if !self.0.is_empty() {
            write!(f, "Some endpoints failed, num_failed: {}", self.0.len())?;
        }
        for (i, (id, error)) in self.0.iter().enumerate() {
            let comma = if i + 1 < self.0.len() { "," } else { "" };

            write!(f, " {} => {:?}{}", id, error, comma)?;
        }
        Ok(())
    }
}

/// Reasons why a candidate might not be ready.
#[derive(Debug, Clone, Copy)]
pub enum CandidateError {
    Uninitialized,
    Offline,
    Incompatible,
    NotSynced,
}

/// Represents a `BeaconNodeHttpClient` inside a `BeaconNodeFallback` that may or may not be used
/// for a query.
pub struct CandidateBeaconNode<E> {
    beacon_node: BeaconNodeHttpClient,
    status: RwLock<Result<(), CandidateError>>,
    _phantom: PhantomData<E>,
}

impl<E: EthSpec> CandidateBeaconNode<E> {
    /// Instantiate a new node.
    pub fn new(beacon_node: BeaconNodeHttpClient) -> Self {
        Self {
            beacon_node,
            status: RwLock::new(Err(CandidateError::Uninitialized)),
            _phantom: PhantomData,
        }
    }

    /// Returns the status of `self`.
    ///
    /// If `RequireSynced::No`, any `NotSynced` node will be ignored and mapped to `Ok(())`.
    pub async fn status(&self, synced: RequireSynced) -> Result<(), CandidateError> {
        match *self.status.read().await {
            Err(CandidateError::NotSynced) if synced == false => Ok(()),
            other => other,
        }
    }

    /// Indicate that `self` is offline.
    pub async fn set_offline(&self) {
        *self.status.write().await = Err(CandidateError::Offline)
    }

    /// Perform some queries against the node to determine if it is a good candidate, updating
    /// `self.status` and returning that result.
    pub async fn refresh_status<T: SlotClock>(
        &self,
        slot_clock: Option<&T>,
        spec: &ChainSpec,
        log: &Logger,
    ) -> Result<(), CandidateError> {
        let previous_status = self.status(RequireSynced::Yes).await;
        let was_offline = matches!(previous_status, Err(CandidateError::Offline));

        let new_status = if let Err(e) = self.is_online(was_offline, log).await {
            Err(e)
        } else if let Err(e) = self.is_compatible(spec, log).await {
            Err(e)
        } else if let Err(e) = self.is_synced(slot_clock, log).await {
            Err(e)
        } else {
            Ok(())
        };

        // In case of concurrent use, the latest value will always be used. It's possible that a
        // long timeout might override a recent successful response, leading to a falsely-offline
        // status. I deem this edge-case acceptable in return for the concurrency benefits of not
        // holding a write-lock whilst we check the online status of the node.
        *self.status.write().await = new_status;

        new_status
    }

    /// Checks if the node is reachable.
    async fn is_online(&self, was_offline: bool, log: &Logger) -> Result<(), CandidateError> {
        let result = self
            .beacon_node
            .get_node_version()
            .await
            .map(|body| body.data.version);

        match result {
            Ok(version) => {
                if was_offline {
                    info!(
                        log,
                        "Connected to beacon node";
                        "version" => version,
                        "endpoint" => %self.beacon_node,
                    );
                }
                Ok(())
            }
            Err(e) => {
                warn!(
                    log,
                    "Offline beacon node";
                    "error" => %e,
                    "endpoint" => %self.beacon_node,
                );
                Err(CandidateError::Offline)
            }
        }
    }

    /// Checks if the node has the correct specification.
    async fn is_compatible(&self, spec: &ChainSpec, log: &Logger) -> Result<(), CandidateError> {
        let config = self
            .beacon_node
            .get_config_spec::<Config>()
            .await
            .map_err(|e| {
                error!(
                    log,
                    "Unable to read spec from beacon node";
                    "error" => %e,
                    "endpoint" => %self.beacon_node,
                );
                CandidateError::Offline
            })?
            .data;

        let beacon_node_spec = ChainSpec::from_config::<E>(&config).ok_or_else(|| {
            error!(
                log,
                "The minimal/mainnet spec type of the beacon node does not match the validator \
                 client. See the --network command.";
                "endpoint" => %self.beacon_node,
            );
            CandidateError::Incompatible
        })?;

        if beacon_node_spec.genesis_fork_version != spec.genesis_fork_version {
            error!(
                log,
                "Beacon node is configured for a different network";
                "endpoint" => %self.beacon_node,
                "bn_genesis_fork" => ?beacon_node_spec.genesis_fork_version,
                "our_genesis_fork" => ?spec.genesis_fork_version,
            );
            return Err(CandidateError::Incompatible);
        } else if beacon_node_spec.altair_fork_epoch != spec.altair_fork_epoch {
            warn!(
                log,
                "Beacon node has mismatched Altair fork epoch";
                "endpoint" => %self.beacon_node,
                "endpoint_altair_fork_epoch" => ?beacon_node_spec.altair_fork_epoch,
                "hint" => UPDATE_REQUIRED_LOG_HINT,
            );
        } else if beacon_node_spec.bellatrix_fork_epoch != spec.bellatrix_fork_epoch {
            warn!(
                log,
                "Beacon node has mismatched Bellatrix fork epoch";
                "endpoint" => %self.beacon_node,
                "endpoint_bellatrix_fork_epoch" => ?beacon_node_spec.bellatrix_fork_epoch,
                "hint" => UPDATE_REQUIRED_LOG_HINT,
            );
        } else if beacon_node_spec.capella_fork_epoch != spec.capella_fork_epoch {
            warn!(
                log,
                "Beacon node has mismatched Capella fork epoch";
                "endpoint" => %self.beacon_node,
                "endpoint_capella_fork_epoch" => ?beacon_node_spec.capella_fork_epoch,
                "hint" => UPDATE_REQUIRED_LOG_HINT,
            );
        }

        Ok(())
    }

    /// Checks if the beacon node is synced.
    async fn is_synced<T: SlotClock>(
        &self,
        slot_clock: Option<&T>,
        log: &Logger,
    ) -> Result<(), CandidateError> {
        if let Some(slot_clock) = slot_clock {
            check_synced(&self.beacon_node, slot_clock, Some(log)).await
        } else {
            // Skip this check if we don't supply a slot clock.
            Ok(())
        }
    }
}

/// A collection of `CandidateBeaconNode` that can be used to perform requests with "fallback"
/// behaviour, where the failure of one candidate results in the next candidate receiving an
/// identical query.
pub struct BeaconNodeFallback<T, E> {
    candidates: Vec<CandidateBeaconNode<E>>,
    slot_clock: Option<T>,
    disable_run_on_all: bool,
    spec: ChainSpec,
    log: Logger,
}

impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
    pub fn new(
        candidates: Vec<CandidateBeaconNode<E>>,
        disable_run_on_all: bool,
        spec: ChainSpec,
        log: Logger,
    ) -> Self {
        Self {
            candidates,
            slot_clock: None,
            disable_run_on_all,
            spec,
            log,
        }
    }

    /// Used to update the slot clock post-instantiation.
    ///
    /// This is the result of a chicken-and-egg issue where `Self` needs a slot clock for some
    /// operations, but `Self` is required to obtain the slot clock since we need the genesis time
    /// from a beacon node.
    pub fn set_slot_clock(&mut self, slot_clock: T) {
        self.slot_clock = Some(slot_clock);
    }

    /// The count of candidates, regardless of their state.
    pub fn num_total(&self) -> usize {
        self.candidates.len()
    }

    /// The count of synced and ready candidates.
    pub async fn num_synced(&self) -> usize {
        let mut n = 0;
        for candidate in &self.candidates {
            if candidate.status(RequireSynced::Yes).await.is_ok() {
                n += 1
            }
        }
        n
    }

    /// The count of synced and ready fallbacks excluding the primary beacon node candidate.
    pub async fn num_synced_fallback(&self) -> usize {
        let mut n = 0;
        for candidate in self.candidates.iter().skip(1) {
            if candidate.status(RequireSynced::Yes).await.is_ok() {
                n += 1
            }
        }
        n
    }

    /// The count of candidates that are online and compatible, but not necessarily synced.
    pub async fn num_available(&self) -> usize {
        let mut n = 0;
        for candidate in &self.candidates {
            if candidate.status(RequireSynced::No).await.is_ok() {
                n += 1
            }
        }
        n
    }

    /// Loop through ALL candidates in `self.candidates` and update their sync status.
    ///
    /// It is possible for a node to return an unsynced status while continuing to serve
    /// low quality responses. To route around this it's best to poll all connected beacon nodes.
    /// A previous implementation of this function polled only the unavailable BNs.
    pub async fn update_all_candidates(&self) {
        let futures = self
            .candidates
            .iter()
            .map(|candidate| {
                candidate.refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
            })
            .collect::<Vec<_>>();

        // Run all updates concurrently and ignore errors.
        let _ = future::join_all(futures).await;
    }

    /// Concurrently send a request to all candidates (regardless of their offline/online status)
    /// and attempt to collect a rough reading on the latency between the VC and each candidate.
    pub async fn measure_latency(&self) -> Vec<LatencyMeasurement> {
        let futures: Vec<_> = self
            .candidates
            .iter()
            .map(|candidate| async {
                let beacon_node_id = candidate.beacon_node.to_string();
                // The `node/version` endpoint is used since I imagine it would
                // require the least processing in the BN and therefore measure
                // the connection more so than the BN's processing speed.
                //
                // I imagine all clients have the version string available as a
                // pre-computed string.
                let response_instant = candidate
                    .beacon_node
                    .get_node_version()
                    .await
                    .ok()
                    .map(|_| Instant::now());
                (beacon_node_id, response_instant)
            })
            .collect();

        let request_instant = Instant::now();

        // Send the request to all BNs at the same time. This might involve some
        // queueing on the sending host, however I hope it will avoid bias
        // caused by sending requests at different times.
        future::join_all(futures)
            .await
            .into_iter()
            .map(|(beacon_node_id, response_instant)| LatencyMeasurement {
                beacon_node_id,
                latency: response_instant
                    .and_then(|response| response.checked_duration_since(request_instant)),
            })
            .collect()
    }

    /// Run `func` against each candidate in `self`, returning immediately if a result is found.
    /// Otherwise, return all the errors encountered along the way.
    ///
    /// First this function will try all nodes with a suitable status. If no candidates are
    /// suitable or all the requests fail, it will try updating the status of all unsuitable nodes
    /// and re-running `func`.
    pub async fn first_success<'a, F, O, Err, R>(
        &'a self,
        require_synced: RequireSynced,
        offline_on_failure: OfflineOnFailure,
        func: F,
    ) -> Result<O, Errors<Err>>
    where
        F: Fn(&'a BeaconNodeHttpClient) -> R,
        R: Future<Output = Result<O, Err>>,
        Err: Debug,
    {
        let mut errors = vec![];
        let mut to_retry = vec![];
        let mut retry_unsynced = vec![];
        let log = &self.log.clone();

        // Run `func` using a `candidate`, returning the value or capturing errors.
        //
        // We use a macro instead of a closure here since it is not trivial to move `func` into a
        // closure.
        macro_rules! try_func {
            ($candidate: ident) => {{
                inc_counter_vec(&ENDPOINT_REQUESTS, &[$candidate.beacon_node.as_ref()]);

                // There exists a race condition where `func` may be called when the candidate is
                // actually not ready. We deem this an acceptable inefficiency.
                match func(&$candidate.beacon_node).await {
                    Ok(val) => return Ok(val),
                    Err(e) => {
                        debug!(
                            log,
                            "Request to beacon node failed";
                            "node" => $candidate.beacon_node.to_string(),
                            "error" => ?e,
                        );
                        // If we have an error on this function, mark the client as not-ready.
                        //
                        // There exists a race condition where the candidate may have been marked
                        // as ready between the `func` call and now. We deem this an acceptable
                        // inefficiency.
                        if matches!(offline_on_failure, OfflineOnFailure::Yes) {
                            $candidate.set_offline().await;
                        }
                        errors.push(($candidate.beacon_node.to_string(), Error::RequestFailed(e)));
                        inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
                    }
                }
            }};
        }

        // First pass: try `func` on all synced and ready candidates.
        //
        // This ensures that we always choose a synced node if it is available.
        for candidate in &self.candidates {
            match candidate.status(RequireSynced::Yes).await {
                Err(e @ CandidateError::NotSynced) if require_synced == false => {
                    // This client is unsynced; we will try it after trying all synced clients.
                    retry_unsynced.push(candidate);
                    errors.push((candidate.beacon_node.to_string(), Error::Unavailable(e)));
                }
                Err(e) => {
                    // This client was not ready on the first pass, we might try it again later.
                    to_retry.push(candidate);
                    errors.push((candidate.beacon_node.to_string(), Error::Unavailable(e)));
                }
                _ => try_func!(candidate),
            }
        }

        // Second pass: try `func` on ready unsynced candidates. This only runs if we permit
        // unsynced candidates.
        //
        // Due to async race-conditions, it is possible that we will send a request to a candidate
        // that has been set to an offline/unready status. This is acceptable.
        if require_synced == false {
            for candidate in retry_unsynced {
                try_func!(candidate);
            }
        }

        // Third pass: try again, attempting to make non-ready clients become ready.
        for candidate in to_retry {
            // If the candidate hasn't happened to transition into the correct state in the
            // meantime, force an update of the state.
            let new_status = match candidate.status(require_synced).await {
                Ok(()) => Ok(()),
                Err(_) => {
                    candidate
                        .refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
                        .await
                }
            };

            match new_status {
                Ok(()) => try_func!(candidate),
                Err(CandidateError::NotSynced) if require_synced == false => try_func!(candidate),
                Err(e) => {
                    errors.push((candidate.beacon_node.to_string(), Error::Unavailable(e)));
                }
            }
        }

        // There were no candidates already ready and we were unable to make any of them ready.
        Err(Errors(errors))
    }

    /// Run `func` against all candidates in `self`, collecting the result of `func` against each
    /// candidate.
    ///
    /// First this function will try all nodes with a suitable status. If no candidates are
    /// suitable, it will try updating the status of all unsuitable nodes and re-running `func`.
    ///
    /// Note: This function returns `Ok(())` if `func` returned successfully on all beacon nodes.
    /// It returns a list of errors along with the beacon node id that failed for `func`.
    /// Since this ignores the actual result of `func`, this function should only be used for
    /// beacon node calls whose results we do not care about, only that they completed
    /// successfully.
    pub async fn run_on_all<'a, F, O, Err, R>(
        &'a self,
        require_synced: RequireSynced,
        offline_on_failure: OfflineOnFailure,
        func: F,
    ) -> Result<(), Errors<Err>>
    where
        F: Fn(&'a BeaconNodeHttpClient) -> R,
        R: Future<Output = Result<O, Err>>,
    {
        let mut results = vec![];
        let mut to_retry = vec![];
        let mut retry_unsynced = vec![];

        // Run `func` using a `candidate`, returning the value or capturing errors.
        //
        // We use a macro instead of a closure here since it is not trivial to move `func` into a
        // closure.
        macro_rules! try_func {
            ($candidate: ident) => {{
                inc_counter_vec(&ENDPOINT_REQUESTS, &[$candidate.beacon_node.as_ref()]);

                // There exists a race condition where `func` may be called when the candidate is
                // actually not ready. We deem this an acceptable inefficiency.
                match func(&$candidate.beacon_node).await {
                    Ok(val) => results.push(Ok(val)),
                    Err(e) => {
                        // If we have an error on this function, mark the client as not-ready.
                        //
                        // There exists a race condition where the candidate may have been marked
                        // as ready between the `func` call and now. We deem this an acceptable
                        // inefficiency.
                        if matches!(offline_on_failure, OfflineOnFailure::Yes) {
                            $candidate.set_offline().await;
                        }
                        results.push(Err((
                            $candidate.beacon_node.to_string(),
                            Error::RequestFailed(e),
                        )));
                        inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
                    }
                }
            }};
        }

        // First pass: try `func` on all synced and ready candidates.
        //
        // This ensures that we always choose a synced node if it is available.
        for candidate in &self.candidates {
            match candidate.status(RequireSynced::Yes).await {
                Err(CandidateError::NotSynced) if require_synced == false => {
                    // This client is unsynced; we will try it after trying all synced clients.
                    retry_unsynced.push(candidate);
                }
                Err(_) => {
                    // This client was not ready on the first pass, we might try it again later.
                    to_retry.push(candidate);
                }
                Ok(_) => try_func!(candidate),
            }
        }

        // Second pass: try `func` on ready unsynced candidates. This only runs if we permit
        // unsynced candidates.
        //
        // Due to async race-conditions, it is possible that we will send a request to a candidate
        // that has been set to an offline/unready status. This is acceptable.
        if require_synced == false {
            for candidate in retry_unsynced {
                try_func!(candidate);
            }
        }

        // Third pass: try again, attempting to make non-ready clients become ready.
        for candidate in to_retry {
            // If the candidate hasn't happened to transition into the correct state in the
            // meantime, force an update of the state.
            let new_status = match candidate.status(require_synced).await {
                Ok(()) => Ok(()),
                Err(_) => {
                    candidate
                        .refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
                        .await
                }
            };

            match new_status {
                Ok(()) => try_func!(candidate),
                Err(CandidateError::NotSynced) if require_synced == false => try_func!(candidate),
                Err(e) => {
                    results.push(Err((
                        candidate.beacon_node.to_string(),
                        Error::Unavailable(e),
                    )));
                }
            }
        }

        let errors: Vec<_> = results.into_iter().filter_map(|res| res.err()).collect();

        if !errors.is_empty() {
            Err(Errors(errors))
        } else {
            Ok(())
        }
    }

    /// Call `func` on the first beacon node that returns success, or on all beacon nodes,
    /// depending on the value of `disable_run_on_all`.
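    ///
    /// A hypothetical usage sketch (the request shown is purely illustrative; `func` can be any
    /// call against the `BeaconNodeHttpClient` that returns `Result<(), _>`):
    ///
    /// ```ignore
    /// beacon_nodes
    ///     .run(RequireSynced::No, OfflineOnFailure::Yes, |node| async move {
    ///         node.get_node_version().await.map(|_| ())
    ///     })
    ///     .await?;
    /// ```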
    pub async fn run<'a, F, Err, R>(
        &'a self,
        require_synced: RequireSynced,
        offline_on_failure: OfflineOnFailure,
        func: F,
    ) -> Result<(), Errors<Err>>
    where
        F: Fn(&'a BeaconNodeHttpClient) -> R,
        R: Future<Output = Result<(), Err>>,
        Err: Debug,
    {
        if self.disable_run_on_all {
            self.first_success(require_synced, offline_on_failure, func)
                .await?;
            Ok(())
        } else {
            self.run_on_all(require_synced, offline_on_failure, func)
                .await
        }
    }
}