Publish subscriptions to all beacon nodes (#3529)
## Issue Addressed

Resolves #3516

## Proposed Changes

Adds a beacon fallback function for running a beacon node HTTP query on all available fallbacks instead of returning after the first successful result. Uses the new `run_on_all` method for attestation and sync committee subscriptions.
Commit 6779912fe4 (parent abcebf276f)
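For context, a minimal self-contained sketch of the dispatch behaviour this commit introduces. All names here (`Node`, `Fallback`, string errors) are invented for illustration; the real code is async and operates on `BeaconNodeHttpClient`:

```rust
// Simplified stand-in for a configured beacon node endpoint.
struct Node {
    id: String,
    healthy: bool,
}

// Aggregated per-node failures, mirroring the renamed `Errors<E>` type.
struct Errors(Vec<(String, String)>);

struct Fallback {
    nodes: Vec<Node>,
    disable_run_on_all: bool,
}

impl Fallback {
    /// Return after the first node on which `func` succeeds (old behaviour).
    fn first_success<F: Fn(&Node) -> Result<(), String>>(&self, func: F) -> Result<(), Errors> {
        let mut errors = vec![];
        for node in &self.nodes {
            match func(node) {
                Ok(()) => return Ok(()),
                Err(e) => errors.push((node.id.clone(), e)),
            }
        }
        Err(Errors(errors))
    }

    /// Run `func` on every node, succeeding only if all calls succeed (new behaviour).
    fn run_on_all<F: Fn(&Node) -> Result<(), String>>(&self, func: F) -> Result<(), Errors> {
        let errors: Vec<_> = self
            .nodes
            .iter()
            .filter_map(|node| func(node).err().map(|e| (node.id.clone(), e)))
            .collect();
        if errors.is_empty() {
            Ok(())
        } else {
            Err(Errors(errors))
        }
    }

    /// The new `run` entry point: broadcast by default, fall back to
    /// first-success when `--disable-run-on-all` is set.
    fn run<F: Fn(&Node) -> Result<(), String>>(&self, func: F) -> Result<(), Errors> {
        if self.disable_run_on_all {
            self.first_success(func)
        } else {
            self.run_on_all(func)
        }
    }
}

fn main() {
    let fallback = Fallback {
        nodes: vec![
            Node { id: "http://localhost:5052".into(), healthy: true },
            Node { id: "http://backup:5052".into(), healthy: false },
        ],
        disable_run_on_all: false,
    };
    // Publishing a subscription now hits every node; one unhealthy node means Err.
    let result = fallback.run(|node| {
        if node.healthy { Ok(()) } else { Err("connection refused".into()) }
    });
    match result {
        Ok(()) => println!("published to every node"),
        Err(Errors(errors)) => println!("{} node(s) failed", errors.len()),
    }
}
```

The diff below wires this same decision into `BeaconNodeFallback::run`, with `first_success` retaining the old semantics.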
@@ -442,3 +442,19 @@ fn monitoring_endpoint() {
         assert_eq!(api_conf.update_period_secs, Some(30));
     });
 }
+#[test]
+fn disable_run_on_all_default() {
+    CommandLineTest::new().run().with_config(|config| {
+        assert!(!config.disable_run_on_all);
+    });
+}
+
+#[test]
+fn disable_run_on_all() {
+    CommandLineTest::new()
+        .flag("disable-run-on-all", None)
+        .run()
+        .with_config(|config| {
+            assert!(config.disable_run_on_all);
+        });
+}
@@ -105,11 +105,13 @@ impl<E> Error<E> {
 }
 
 /// The list of errors encountered whilst attempting to perform a query.
-pub struct AllErrored<E>(pub Vec<(String, Error<E>)>);
+pub struct Errors<E>(pub Vec<(String, Error<E>)>);
 
-impl<E: Debug> fmt::Display for AllErrored<E> {
+impl<E: Debug> fmt::Display for Errors<E> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "All endpoints failed")?;
+        if !self.0.is_empty() {
+            write!(f, "Some endpoints failed, num_failed: {}", self.0.len())?;
+        }
         for (i, (id, error)) in self.0.iter().enumerate() {
             let comma = if i + 1 < self.0.len() { "," } else { "" };
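Why the message change: with `run_on_all`, the error list can be non-empty even when only some nodes failed, so an unconditional "All endpoints failed" prefix would be misleading. A sketch of the new formatting, using string errors for simplicity; the hunk above truncates after `let comma`, so the `write!` inside the loop is an assumed reconstruction:

```rust
use std::fmt;

// Simplified stand-in for the module's `Errors<E>` wrapper, to show the new
// Display behaviour: silence when empty, a failure count otherwise.
struct Errors(Vec<(String, String)>);

impl fmt::Display for Errors {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if !self.0.is_empty() {
            write!(f, "Some endpoints failed, num_failed: {}", self.0.len())?;
        }
        for (i, (id, error)) in self.0.iter().enumerate() {
            let comma = if i + 1 < self.0.len() { "," } else { "" };
            // Assumed loop body: print each endpoint id with its error.
            write!(f, " {} => {}{}", id, error, comma)?;
        }
        Ok(())
    }
}

fn main() {
    let errors = Errors(vec![
        ("http://localhost:5052".into(), "timeout".into()),
        ("http://backup:5052".into(), "connection refused".into()),
    ]);
    // Prints: Some endpoints failed, num_failed: 2 http://localhost:5052 => timeout, ...
    println!("{}", errors);
}
```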
@@ -294,15 +296,22 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
 pub struct BeaconNodeFallback<T, E> {
     candidates: Vec<CandidateBeaconNode<E>>,
     slot_clock: Option<T>,
+    disable_run_on_all: bool,
     spec: ChainSpec,
     log: Logger,
 }
 
 impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
-    pub fn new(candidates: Vec<CandidateBeaconNode<E>>, spec: ChainSpec, log: Logger) -> Self {
+    pub fn new(
+        candidates: Vec<CandidateBeaconNode<E>>,
+        disable_run_on_all: bool,
+        spec: ChainSpec,
+        log: Logger,
+    ) -> Self {
         Self {
             candidates,
             slot_clock: None,
+            disable_run_on_all,
             spec,
             log,
         }
@@ -396,7 +405,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
         require_synced: RequireSynced,
         offline_on_failure: OfflineOnFailure,
         func: F,
-    ) -> Result<O, AllErrored<Err>>
+    ) -> Result<O, Errors<Err>>
     where
         F: Fn(&'a BeaconNodeHttpClient) -> R,
         R: Future<Output = Result<O, Err>>,
@@ -486,6 +495,145 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
         }
 
         // There were no candidates already ready and we were unable to make any of them ready.
-        Err(AllErrored(errors))
+        Err(Errors(errors))
     }
+
+    /// Run `func` against all candidates in `self`, collecting the result of `func` against each
+    /// candidate.
+    ///
+    /// First this function will try all nodes with a suitable status. If no candidates are
+    /// suitable, it will try updating the status of all unsuitable nodes and re-running `func`.
+    ///
+    /// Note: This function returns `Ok(())` if `func` returned successfully on all beacon nodes.
+    /// Otherwise it returns a list of errors, each paired with the id of the beacon node on which
+    /// `func` failed.
+    /// Since this function ignores the actual result of `func`, it should only be used for beacon
+    /// node calls whose results we do not care about, only that they completed successfully.
+    pub async fn run_on_all<'a, F, O, Err, R>(
+        &'a self,
+        require_synced: RequireSynced,
+        offline_on_failure: OfflineOnFailure,
+        func: F,
+    ) -> Result<(), Errors<Err>>
+    where
+        F: Fn(&'a BeaconNodeHttpClient) -> R,
+        R: Future<Output = Result<O, Err>>,
+    {
+        let mut results = vec![];
+        let mut to_retry = vec![];
+        let mut retry_unsynced = vec![];
+
+        // Run `func` using a `candidate`, returning the value or capturing errors.
+        //
+        // We use a macro instead of a closure here since it is not trivial to move `func` into a
+        // closure.
+        macro_rules! try_func {
+            ($candidate: ident) => {{
+                inc_counter_vec(&ENDPOINT_REQUESTS, &[$candidate.beacon_node.as_ref()]);
+
+                // There exists a race condition where `func` may be called when the candidate is
+                // actually not ready. We deem this an acceptable inefficiency.
+                match func(&$candidate.beacon_node).await {
+                    Ok(val) => results.push(Ok(val)),
+                    Err(e) => {
+                        // If we have an error on this function, mark the client as not-ready.
+                        //
+                        // There exists a race condition where the candidate may have been marked
+                        // as ready between the `func` call and now. We deem this an acceptable
+                        // inefficiency.
+                        if matches!(offline_on_failure, OfflineOnFailure::Yes) {
+                            $candidate.set_offline().await;
+                        }
+                        results.push(Err((
+                            $candidate.beacon_node.to_string(),
+                            Error::RequestFailed(e),
+                        )));
+                        inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
+                    }
+                }
+            }};
+        }
+
+        // First pass: try `func` on all synced and ready candidates.
+        //
+        // This ensures that we always choose a synced node if it is available.
+        for candidate in &self.candidates {
+            match candidate.status(RequireSynced::Yes).await {
+                Err(CandidateError::NotSynced) if require_synced == false => {
+                    // This client is unsynced; we will try it after trying all synced clients.
+                    retry_unsynced.push(candidate);
+                }
+                Err(_) => {
+                    // This client was not ready on the first pass; we might try it again later.
+                    to_retry.push(candidate);
+                }
+                Ok(_) => try_func!(candidate),
+            }
+        }
+
+        // Second pass: try `func` on ready unsynced candidates. This only runs if we permit
+        // unsynced candidates.
+        //
+        // Due to async race conditions, it is possible that we will send a request to a candidate
+        // that has been set to an offline/unready status. This is acceptable.
+        if require_synced == false {
+            for candidate in retry_unsynced {
+                try_func!(candidate);
+            }
+        }
+
+        // Third pass: try again, attempting to make non-ready clients become ready.
+        for candidate in to_retry {
+            // If the candidate hasn't already transitioned into the correct state in the
+            // meantime, force an update of the state.
+            let new_status = match candidate.status(require_synced).await {
+                Ok(()) => Ok(()),
+                Err(_) => {
+                    candidate
+                        .refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
+                        .await
+                }
+            };
+
+            match new_status {
+                Ok(()) => try_func!(candidate),
+                Err(CandidateError::NotSynced) if require_synced == false => try_func!(candidate),
+                Err(e) => {
+                    results.push(Err((
+                        candidate.beacon_node.to_string(),
+                        Error::Unavailable(e),
+                    )));
+                }
+            }
+        }
+
+        let errors: Vec<_> = results.into_iter().filter_map(|res| res.err()).collect();
+
+        if !errors.is_empty() {
+            Err(Errors(errors))
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Call `func` on the first beacon node that returns success, or on all beacon nodes,
+    /// depending on the value of `disable_run_on_all`.
+    pub async fn run<'a, F, Err, R>(
+        &'a self,
+        require_synced: RequireSynced,
+        offline_on_failure: OfflineOnFailure,
+        func: F,
+    ) -> Result<(), Errors<Err>>
+    where
+        F: Fn(&'a BeaconNodeHttpClient) -> R,
+        R: Future<Output = Result<(), Err>>,
+    {
+        if self.disable_run_on_all {
+            self.first_success(require_synced, offline_on_failure, func)
+                .await?;
+            Ok(())
+        } else {
+            self.run_on_all(require_synced, offline_on_failure, func)
+                .await
+        }
+    }
 }
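To make the retry ordering concrete, here is a simplified synchronous model of the three passes `run_on_all` performs. The real method is async, refreshes node status on the third pass, and records `Error::Unavailable` for nodes that still are not ready; all names below are illustrative:

```rust
#[derive(Clone, Copy)]
enum Status {
    Synced,
    Unsynced,
    NotReady,
}

struct Candidate {
    id: &'static str,
    status: Status,
}

/// Order candidates the way `run_on_all` visits them: synced nodes first,
/// then ready-but-unsynced nodes (only when `require_synced` is false), then
/// a retry pass over not-ready nodes.
fn visit_order(candidates: &[Candidate], require_synced: bool) -> Vec<&'static str> {
    let mut order = vec![];
    let mut retry_unsynced = vec![];
    let mut to_retry = vec![];

    // First pass: synced and ready candidates.
    for c in candidates {
        match c.status {
            Status::Synced => order.push(c.id),
            Status::Unsynced if !require_synced => retry_unsynced.push(c.id),
            _ => to_retry.push(c.id),
        }
    }
    // Second pass: unsynced candidates, if permitted.
    order.extend(retry_unsynced);
    // Third pass: candidates that were not ready on the first pass (the real
    // code attempts a status refresh here before calling `func`).
    order.extend(to_retry);
    order
}

fn main() {
    let candidates = [
        Candidate { id: "c", status: Status::NotReady },
        Candidate { id: "a", status: Status::Synced },
        Candidate { id: "b", status: Status::Unsynced },
    ];
    assert_eq!(visit_order(&candidates, false), ["a", "b", "c"]);
    println!("visit order verified");
}
```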
@@ -1,4 +1,4 @@
-use crate::beacon_node_fallback::{AllErrored, Error as FallbackError};
+use crate::beacon_node_fallback::{Error as FallbackError, Errors};
 use crate::{
     beacon_node_fallback::{BeaconNodeFallback, RequireSynced},
     graffiti_file::GraffitiFile,
@@ -20,8 +20,8 @@ pub enum BlockError {
     Irrecoverable(String),
 }
 
-impl From<AllErrored<BlockError>> for BlockError {
-    fn from(e: AllErrored<BlockError>) -> Self {
+impl From<Errors<BlockError>> for BlockError {
+    fn from(e: Errors<BlockError>) -> Self {
         if e.0.iter().any(|(_, error)| {
             matches!(
                 error,
@@ -26,6 +26,16 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
                 )
                 .takes_value(true),
         )
+        .arg(
+            Arg::with_name("disable-run-on-all")
+                .long("disable-run-on-all")
+                .value_name("DISABLE_RUN_ON_ALL")
+                .help("By default, Lighthouse publishes attestation, sync committee subscriptions \
+                    and proposer preparation messages to all beacon nodes provided in the \
+                    `--beacon-nodes` flag. This option changes that behaviour such that these \
+                    API calls only go out to the first available and synced beacon node.")
+                .takes_value(false)
+        )
         // This argument is deprecated, use `--beacon-nodes` instead.
         .arg(
             Arg::with_name("server")
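A hedged sketch of how the new flag is consumed, assuming clap 2.x (the `App`/`Arg::with_name` API used in the hunk above); the binary name and help text here are hypothetical:

```rust
use clap::{App, Arg};

fn main() {
    let matches = App::new("vc-sketch")
        .arg(
            Arg::with_name("disable-run-on-all")
                .long("disable-run-on-all")
                .help("Only send subscription-style API calls to the first available beacon node")
                .takes_value(false),
        )
        .get_matches();

    // A presence check yields the boolean, mirroring `Config::from_cli` below.
    let disable_run_on_all = matches.is_present("disable-run-on-all");
    println!("disable_run_on_all = {}", disable_run_on_all);
}
```

Passing the flag (e.g. `lighthouse vc --disable-run-on-all`) restores the pre-PR first-success behaviour for the affected calls.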
@@ -61,6 +61,8 @@ pub struct Config {
     /// A list of custom certificates that the validator client will additionally use when
     /// connecting to a beacon node over SSL/TLS.
     pub beacon_nodes_tls_certs: Option<Vec<PathBuf>>,
+    /// Disables publishing HTTP API requests to all beacon nodes for select API calls.
+    pub disable_run_on_all: bool,
 }
 
 impl Default for Config {
@@ -96,6 +98,7 @@ impl Default for Config {
             builder_proposals: false,
             builder_registration_timestamp_override: None,
             gas_limit: None,
+            disable_run_on_all: false,
         }
     }
 }
@@ -177,6 +180,7 @@ impl Config {
         }
 
         config.allow_unsynced_beacon_node = cli_args.is_present("allow-unsynced");
+        config.disable_run_on_all = cli_args.is_present("disable-run-on-all");
         config.disable_auto_discover = cli_args.is_present("disable-auto-discover");
         config.init_slashing_protection = cli_args.is_present("init-slashing-protection");
         config.use_long_timeouts = cli_args.is_present("use-long-timeouts");
@@ -570,12 +570,12 @@ async fn poll_beacon_attesters<T: SlotClock + 'static, E: EthSpec>(
         });
     }
 
-    // If there are any subscriptions, push them out to the beacon node.
+    // If there are any subscriptions, push them out to beacon nodes.
     if !subscriptions.is_empty() {
         let subscriptions_ref = &subscriptions;
         if let Err(e) = duties_service
             .beacon_nodes
-            .first_success(
+            .run(
                 duties_service.require_synced,
                 OfflineOnFailure::Yes,
                 |beacon_node| async move {
@@ -327,8 +327,12 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
         // Initialize the number of connected, available beacon nodes to 0.
         set_gauge(&http_metrics::metrics::AVAILABLE_BEACON_NODES_COUNT, 0);
 
-        let mut beacon_nodes: BeaconNodeFallback<_, T> =
-            BeaconNodeFallback::new(candidates, context.eth2_config.spec.clone(), log.clone());
+        let mut beacon_nodes: BeaconNodeFallback<_, T> = BeaconNodeFallback::new(
+            candidates,
+            config.disable_run_on_all,
+            context.eth2_config.spec.clone(),
+            log.clone(),
+        );
 
         // Perform some potentially long-running initialization tasks.
         let (genesis_time, genesis_validators_root) = tokio::select! {
@@ -331,7 +331,7 @@ impl<T: SlotClock + 'static, E: EthSpec> PreparationService<T, E> {
         let preparation_entries = preparation_data.as_slice();
         match self
             .beacon_nodes
-            .first_success(
+            .run(
                 RequireSynced::Yes,
                 OfflineOnFailure::Yes,
                 |beacon_node| async move {
@@ -349,7 +349,7 @@ impl<T: SlotClock + 'static, E: EthSpec> PreparationService<T, E> {
             ),
             Err(e) => error!(
                 log,
-                "Unable to publish proposer preparation";
+                "Unable to publish proposer preparation to all beacon nodes";
                 "error" => %e,
             ),
         }
@@ -568,7 +568,7 @@ impl<T: SlotClock + 'static, E: EthSpec> SyncCommitteeService<T, E> {
 
         if let Err(e) = self
             .beacon_nodes
-            .first_success(
+            .run(
                 RequireSynced::No,
                 OfflineOnFailure::Yes,
                 |beacon_node| async move {