lighthouse/beacon_node/http_api/src/sync_committees.rs
Paul Hauner 2de26b20f8 Don't return errors on HTTP API for already-known messages (#3341)
## Issue Addressed

- Resolves #3266

## Proposed Changes

Return 200 OK rather than an error when a block, attestation or sync message is already known.

Presently, we log and return an error, which causes a BN to appear "offline" from the VC's perspective. This in turn causes the fallback mechanism to do extra work to avoid, and repeatedly upcheck, the supposedly offline node. This can be observed as instability in the `vc_beacon_nodes_available_count` metric.

The current behaviour also causes scary logs for the user. There's nothing to *actually* be concerned about when we see duplicate messages; this can happen on fallback systems (see code comments).

## Additional Info

NA
2022-08-10 07:52:57 +00:00

354 lines
15 KiB
Rust

//! Handlers for sync committee endpoints.
use crate::publish_pubsub_message;
use beacon_chain::sync_committee_verification::{
Error as SyncVerificationError, VerifiedSyncCommitteeMessage,
};
use beacon_chain::{
validator_monitor::timestamp_now, BeaconChain, BeaconChainError, BeaconChainTypes,
StateSkipConfig, MAXIMUM_GOSSIP_CLOCK_DISPARITY,
};
use eth2::types::{self as api_types};
use lighthouse_network::PubsubMessage;
use network::NetworkMessage;
use slog::{debug, error, warn, Logger};
use slot_clock::SlotClock;
use std::cmp::max;
use std::collections::HashMap;
use tokio::sync::mpsc::UnboundedSender;
use types::{
slot_data::SlotData, BeaconStateError, Epoch, EthSpec, SignedContributionAndProof,
SyncCommitteeMessage, SyncDuty, SyncSubnetId,
};
/// The response type handed back to the requesting HTTP client: a list of `SyncDuty` values
/// wrapped in an `ExecutionOptimisticResponse`.
type SyncDuties = api_types::ExecutionOptimisticResponse<Vec<SyncDuty>>;
/// Serves an HTTP API request for the sync committee duties of `request_indices` at
/// `request_epoch`.
///
/// The duties are first requested from the head; if the head cannot answer (unknown sync
/// committee or wrong state variant) a state is loaded via `duties_from_state_load` instead.
///
/// Returns an empty, non-optimistic response when the chain spec has no Altair fork epoch.
/// Any failure is converted into a `warp` rejection; a `SyncCommitteeNotKnown` error from the
/// slow path becomes a bad-request rejection that names the offending epoch.
pub fn sync_committee_duties<T: BeaconChainTypes>(
    request_epoch: Epoch,
    request_indices: &[u64],
    chain: &BeaconChain<T>,
) -> Result<SyncDuties, warp::reject::Rejection> {
    // Networks with Altair disabled have no sync committees, so there is nothing to report.
    let fork_epoch = match chain.spec.altair_fork_epoch {
        Some(epoch) => epoch,
        None => return Ok(convert_to_response(vec![], false)),
    };

    // Block roots pulled using the request epoch still depend on the head, even when the duties
    // themselves are computed from a loaded state. The result of
    // `is_optimistic_or_invalid_head` is therefore used for both the fast and the slow path.
    let optimistic = chain
        .is_optimistic_or_invalid_head()
        .map_err(warp_utils::reject::beacon_chain_error)?;

    // Fast path: ask the head directly. Instead of predicting whether the head can serve the
    // request (which would be prone to data races), just try it and inspect the error.
    match chain.sync_committee_duties_from_head(request_epoch, request_indices) {
        Ok(duties) => Ok(convert_to_response(duties, optimistic)),
        // The head could not answer; fall back to loading a state.
        Err(BeaconChainError::SyncDutiesError(BeaconStateError::SyncCommitteeNotKnown {
            ..
        }))
        | Err(BeaconChainError::SyncDutiesError(BeaconStateError::IncorrectStateVariant)) => {
            duties_from_state_load(request_epoch, request_indices, fork_epoch, chain)
                .map(|duties| convert_to_response(duties, optimistic))
                .map_err(|err| match err {
                    BeaconChainError::SyncDutiesError(
                        BeaconStateError::SyncCommitteeNotKnown { current_epoch, .. },
                    ) => warp_utils::reject::custom_bad_request(format!(
                        "invalid epoch: {}, current epoch: {}",
                        request_epoch, current_epoch
                    )),
                    other => warp_utils::reject::beacon_chain_error(other),
                })
        }
        Err(other) => Err(warp_utils::reject::beacon_chain_error(other)),
    }
}
/// Slow path for duties: loads a state from the chain and computes the duties from it.
///
/// Returns an empty list when the (clock-disparity tolerant) current epoch is before
/// `altair_fork_epoch`, and `SyncCommitteeNotKnown` when `request_epoch` lies in a sync committee
/// period later than the one following the tolerant current period.
fn duties_from_state_load<T: BeaconChainTypes>(
    request_epoch: Epoch,
    request_indices: &[u64],
    altair_fork_epoch: Epoch,
    chain: &BeaconChain<T>,
) -> Result<Vec<Option<SyncDuty>>, BeaconChainError> {
    let spec = &chain.spec;

    let current_epoch = chain.epoch()?;

    // The epoch we would be in if the system clock were advanced by
    // `MAXIMUM_GOSSIP_CLOCK_DISPARITY`. This usually equals `current_epoch`; only within the
    // final `MAXIMUM_GOSSIP_CLOCK_DISPARITY` of an epoch does it become `current_epoch + 1`.
    let tolerant_slot = chain
        .slot_clock
        .now_with_future_tolerance(MAXIMUM_GOSSIP_CLOCK_DISPARITY)
        .ok_or(BeaconChainError::UnableToReadSlot)?;
    let tolerant_current_epoch = tolerant_slot.epoch(T::EthSpec::slots_per_epoch());

    // Duties can be served up to and including the period after the tolerant current one.
    let latest_servable_period = tolerant_current_epoch.sync_committee_period(spec)? + 1;
    let requested_period = request_epoch.sync_committee_period(spec)?;

    // Pre-Altair there are no sync committees, hence no duties.
    if tolerant_current_epoch < altair_fork_epoch {
        return Ok(vec![]);
    }

    // The request is too far in the future for the committee to be known.
    if requested_period > latest_servable_period {
        return Err(BeaconChainError::SyncDutiesError(
            BeaconStateError::SyncCommitteeNotKnown {
                current_epoch,
                epoch: request_epoch,
            },
        ));
    }

    // Use the state at the start of the sync committee period *before* the requested one. That
    // is enough for historical duties, and it is cheap when the head lags the current epoch and
    // duties for the next period are wanted (the head only has to be advanced to the start of
    // the current period). The load epoch is clamped so it never precedes the Altair fork.
    let previous_period_start =
        spec.epochs_per_sync_committee_period * requested_period.saturating_sub(1);
    let load_epoch = max(previous_period_start, altair_fork_epoch);
    let load_slot = load_epoch.start_slot(T::EthSpec::slots_per_epoch());

    let state = chain.state_at_slot(load_slot, StateSkipConfig::WithoutStateRoots)?;
    state
        .get_sync_committee_duties(request_epoch, request_indices, spec)
        .map_err(BeaconChainError::SyncDutiesError)
}
/// Builds the HTTP response from a list of optional duties: `None` entries are dropped and the
/// result is tagged with the supplied `execution_optimistic` flag.
fn convert_to_response(duties: Vec<Option<SyncDuty>>, execution_optimistic: bool) -> SyncDuties {
    let known_duties: Vec<SyncDuty> = duties.into_iter().flatten().collect();
    api_types::GenericResponse::from(known_duties).add_execution_optimistic(execution_optimistic)
}
/// Receive sync committee messages from the HTTP API, broadcasting them to the network and
/// storing them in the naive sync aggregation pool.
///
/// Each message is verified once per subnet it is assigned to. Every successful verification is
/// published and registered with the validator monitor; the last successful verification of a
/// message is then added to the pool. A failure to add to the pool is only logged.
///
/// Messages that are already known are skipped without error. All other verification failures
/// are collected and returned together as an indexed bad-request rejection. A failure to publish
/// aborts processing immediately with that rejection.
pub fn process_sync_committee_signatures<T: BeaconChainTypes>(
    sync_committee_signatures: Vec<SyncCommitteeMessage>,
    network_tx: UnboundedSender<NetworkMessage<T::EthSpec>>,
    chain: &BeaconChain<T>,
    log: Logger,
) -> Result<(), warp::reject::Rejection> {
    let mut failures = Vec::new();
    let seen_timestamp = timestamp_now();

    for (request_index, message) in sync_committee_signatures.iter().enumerate() {
        let subnet_positions = match get_subnet_positions_for_sync_committee_message(message, chain)
        {
            Ok(positions) => positions,
            Err(err) => {
                error!(
                    log,
                    "Unable to compute subnet positions for sync message";
                    "error" => ?err,
                    "slot" => message.slot,
                );
                failures.push(api_types::Failure::new(
                    request_index,
                    format!("Verification: {:?}", err),
                ));
                continue;
            }
        };

        // Verify and publish once per assigned subnet.
        //
        // On any practical network a validator is assigned to ~1 subnet, so verifying the same
        // message several times is not a meaningful cost.
        let mut verified_for_pool = None;
        for &subnet_id in subnet_positions.keys() {
            let outcome = VerifiedSyncCommitteeMessage::verify(message.clone(), subnet_id, chain);
            match outcome {
                Ok(verified) => {
                    let gossip = PubsubMessage::SyncCommitteeMessage(Box::new((
                        subnet_id,
                        verified.sync_message().clone(),
                    )));
                    publish_pubsub_message(&network_tx, gossip)?;

                    // Let the validator monitor know about the message.
                    chain
                        .validator_monitor
                        .read()
                        .register_api_sync_committee_message(
                            seen_timestamp,
                            verified.sync_message(),
                            &chain.slot_clock,
                        );

                    verified_for_pool = Some(verified);
                }
                // The validator has already published a sync message: skip this one without
                // reporting an error.
                //
                // This is expected when a VC uses fallback BNs. If the first BN publishes the
                // message but is slow to respond, the VC retries against the second BN, which
                // then reports the message as already seen. Neither the network nor the user
                // has anything to worry about in that case.
                Err(SyncVerificationError::PriorSyncCommitteeMessageKnown {
                    validator_index,
                    slot,
                }) => {
                    debug!(
                        log,
                        "Ignoring already-known sync message";
                        "slot" => slot,
                        "validator_index" => validator_index,
                    );
                }
                Err(err) => {
                    error!(
                        log,
                        "Failure verifying sync committee signature for gossip";
                        "error" => ?err,
                        "request_index" => request_index,
                        "slot" => message.slot,
                        "validator_index" => message.validator_index,
                    );
                    failures.push(api_types::Failure::new(
                        request_index,
                        format!("Verification: {:?}", err),
                    ));
                }
            }
        }

        // Nothing was verified for this message, so there is nothing to add to the pool.
        let pool_result = match verified_for_pool {
            Some(verified) => chain.add_to_naive_sync_aggregation_pool(verified),
            None => continue,
        };
        if let Err(err) = pool_result {
            error!(
                log,
                "Unable to add sync committee signature to pool";
                "error" => ?err,
                "slot" => message.slot,
                "validator_index" => message.validator_index,
            );
        }
    }

    if !failures.is_empty() {
        return Err(warp_utils::reject::indexed_bad_request(
            "error processing sync committee signatures".to_string(),
            failures,
        ));
    }
    Ok(())
}
/// Looks up every subnet assignment for the validator that produced `sync_message`.
///
/// The validator's public key is resolved from its index, then matched against the sync
/// committee returned by `sync_committee_at_next_slot` for the message's slot.
///
/// Returns `UnknownValidatorIndex` when the chain has no public key for the validator index.
pub fn get_subnet_positions_for_sync_committee_message<T: BeaconChainTypes>(
    sync_message: &SyncCommitteeMessage,
    chain: &BeaconChain<T>,
) -> Result<HashMap<SyncSubnetId, Vec<usize>>, SyncVerificationError> {
    let validator_index = sync_message.validator_index as usize;

    let pubkey = match chain.validator_pubkey_bytes(validator_index)? {
        Some(bytes) => bytes,
        None => {
            return Err(SyncVerificationError::UnknownValidatorIndex(
                validator_index,
            ))
        }
    };

    let sync_committee = chain.sync_committee_at_next_slot(sync_message.get_slot())?;
    let positions = sync_committee.subcommittee_positions_for_public_key(&pubkey)?;
    Ok(positions)
}
/// Receive signed contributions and proofs from the HTTP API, broadcasting them to the network
/// and storing them in the block inclusion pool.
///
/// All contributions are verified, published and registered with the validator monitor first;
/// only afterwards are the verified ones added to the pool. Contributions or aggregators that
/// are already known are skipped without error. Verification and pool failures are collected
/// and returned together as an indexed bad-request rejection. A failure to publish aborts
/// processing immediately with that rejection.
pub fn process_signed_contribution_and_proofs<T: BeaconChainTypes>(
    signed_contribution_and_proofs: Vec<SignedContributionAndProof<T::EthSpec>>,
    network_tx: UnboundedSender<NetworkMessage<T::EthSpec>>,
    chain: &BeaconChain<T>,
    log: Logger,
) -> Result<(), warp::reject::Rejection> {
    let mut verified_contributions = Vec::with_capacity(signed_contribution_and_proofs.len());
    let mut failures = vec![];
    let seen_timestamp = timestamp_now();

    // Phase 1: verify each contribution and broadcast it to the network.
    for (request_index, signed) in signed_contribution_and_proofs.into_iter().enumerate() {
        // Copy out the fields needed for logging before `signed` is consumed by verification.
        let aggregator_index = signed.message.aggregator_index;
        let subcommittee_index = signed.message.contribution.subcommittee_index;
        let contribution_slot = signed.message.contribution.slot;

        match chain.verify_sync_contribution_for_gossip(signed) {
            Ok(verified) => {
                let gossip =
                    PubsubMessage::SignedContributionAndProof(Box::new(verified.aggregate().clone()));
                publish_pubsub_message(&network_tx, gossip)?;

                // Let the validator monitor know about the contribution.
                chain
                    .validator_monitor
                    .read()
                    .register_api_sync_committee_contribution(
                        seen_timestamp,
                        verified.aggregate(),
                        verified.participant_pubkeys(),
                        &chain.slot_clock,
                    );

                verified_contributions.push((request_index, verified));
            }
            // Both "already known" cases are treated as success and skipped:
            //
            // - The contribution itself is already known, so there is no point broadcasting or
            //   verifying it any further.
            // - This aggregator has already produced an aggregate. This is likely with VCs that
            //   use fallback BNs: the first BN may time out *after* publishing the aggregate,
            //   and the second BN then reports it as already seen. Some other node has
            //   successfully published it, so neither the user nor the network sees an error.
            Err(SyncVerificationError::SyncContributionAlreadyKnown(_))
            | Err(SyncVerificationError::AggregatorAlreadyKnown(_)) => {}
            Err(err) => {
                error!(
                    log,
                    "Failure verifying signed contribution and proof";
                    "error" => ?err,
                    "request_index" => request_index,
                    "aggregator_index" => aggregator_index,
                    "subcommittee_index" => subcommittee_index,
                    "contribution_slot" => contribution_slot,
                );
                failures.push(api_types::Failure::new(
                    request_index,
                    format!("Verification: {:?}", err),
                ));
            }
        }
    }

    // Phase 2: add everything that verified to the block inclusion pool.
    for (request_index, verified) in verified_contributions {
        if let Err(err) = chain.add_contribution_to_block_inclusion_pool(verified) {
            warn!(
                log,
                "Could not add verified sync contribution to the inclusion pool";
                "error" => ?err,
                "request_index" => request_index,
            );
            failures.push(api_types::Failure::new(
                request_index,
                format!("Op pool: {:?}", err),
            ));
        }
    }

    if failures.is_empty() {
        return Ok(());
    }
    Err(warp_utils::reject::indexed_bad_request(
        "error processing contribution and proofs".to_string(),
        failures,
    ))
}