Handle unknown head during attestation publishing (#5010)

* Handle unknown head during attestation publishing

* Merge remote-tracking branch 'origin/unstable' into queue-http-attestations

* Simplify task spawner

* Improve logging

* Add a test

* Improve error logging

* Merge remote-tracking branch 'origin/unstable' into queue-http-attestations

* Fix beta compiler warnings
Michael Sproul 2024-02-15 23:24:47 +11:00 committed by GitHub
parent 49536ff103
commit f17fb291b7
7 changed files with 455 additions and 140 deletions


@@ -505,6 +505,7 @@ where
network_senders: None,
network_globals: None,
beacon_processor_send: None,
beacon_processor_reprocess_send: None,
eth1_service: Some(genesis_service.eth1_service.clone()),
log: context.log().clone(),
sse_logging_components: runtime_context.sse_logging_components.clone(),
@@ -747,6 +748,9 @@ where
network_globals: self.network_globals.clone(),
eth1_service: self.eth1_service.clone(),
beacon_processor_send: Some(beacon_processor_channels.beacon_processor_tx.clone()),
beacon_processor_reprocess_send: Some(
beacon_processor_channels.work_reprocessing_tx.clone(),
),
sse_logging_components: runtime_context.sse_logging_components.clone(),
log: log.clone(),
});


@@ -16,6 +16,7 @@ mod database;
mod metrics;
mod produce_block;
mod proposer_duties;
mod publish_attestations;
mod publish_blocks;
mod standard_block_rewards;
mod state_id;
@@ -35,7 +36,7 @@ use beacon_chain::{
validator_monitor::timestamp_now, AttestationError as AttnError, BeaconChain, BeaconChainError,
BeaconChainTypes, WhenSlotSkipped,
};
use beacon_processor::BeaconProcessorSend;
use beacon_processor::{work_reprocessing_queue::ReprocessQueueMessage, BeaconProcessorSend};
pub use block_id::BlockId;
use builder_states::get_next_withdrawals;
use bytes::Bytes;
@@ -129,6 +130,7 @@ pub struct Context<T: BeaconChainTypes> {
pub network_senders: Option<NetworkSenders<T::EthSpec>>,
pub network_globals: Option<Arc<NetworkGlobals<T::EthSpec>>>,
pub beacon_processor_send: Option<BeaconProcessorSend<T::EthSpec>>,
pub beacon_processor_reprocess_send: Option<Sender<ReprocessQueueMessage>>,
pub eth1_service: Option<eth1::Service>,
pub sse_logging_components: Option<SSELoggingComponents>,
pub log: Logger,
@@ -534,6 +536,11 @@ pub fn serve<T: BeaconChainTypes>(
.filter(|_| config.enable_beacon_processor);
let task_spawner_filter =
warp::any().map(move || TaskSpawner::new(beacon_processor_send.clone()));
let beacon_processor_reprocess_send = ctx
.beacon_processor_reprocess_send
.clone()
.filter(|_| config.enable_beacon_processor);
let reprocess_send_filter = warp::any().map(move || beacon_processor_reprocess_send.clone());
let duplicate_block_status_code = ctx.config.duplicate_block_status_code;
@@ -1756,140 +1763,26 @@ pub fn serve<T: BeaconChainTypes>(
.and(warp::path::end())
.and(warp_utils::json::json())
.and(network_tx_filter.clone())
.and(reprocess_send_filter)
.and(log_filter.clone())
.then(
|task_spawner: TaskSpawner<T::EthSpec>,
chain: Arc<BeaconChain<T>>,
attestations: Vec<Attestation<T::EthSpec>>,
network_tx: UnboundedSender<NetworkMessage<T::EthSpec>>,
log: Logger| {
task_spawner.blocking_json_task(Priority::P0, move || {
let seen_timestamp = timestamp_now();
let mut failures = Vec::new();
let mut num_already_known = 0;
for (index, attestation) in attestations.as_slice().iter().enumerate() {
let attestation = match chain
.verify_unaggregated_attestation_for_gossip(attestation, None)
{
Ok(attestation) => attestation,
Err(AttnError::PriorAttestationKnown { .. }) => {
num_already_known += 1;
// Skip to the next attestation since an attestation for this
// validator is already known in this epoch.
//
// There's little value for the network in validating a second
// attestation for another validator since it is either:
//
// 1. A duplicate.
// 2. Slashable.
// 3. Invalid.
//
// We are likely to get duplicates in the case where a VC is using
// fallback BNs. If the first BN actually publishes some/all of a
// batch of attestations but fails to respond in a timely fashion,
// the VC is likely to try publishing the attestations on another
// BN. That second BN may have already seen the attestations from
// the first BN and therefore indicate that the attestations are
// "already seen". An attestation that has already been seen has
// been published on the network so there's no actual error from
// the perspective of the user.
//
// It's better to prevent slashable attestations from ever
// appearing on the network than trying to slash validators,
// especially those validators connected to the local API.
//
// There might be *some* value in determining that this attestation
// is invalid, but since a valid attestation already exists it
// appears that this validator is capable of producing valid
// attestations and there's no immediate cause for concern.
continue;
}
Err(e) => {
error!(log,
"Failure verifying attestation for gossip";
"error" => ?e,
"request_index" => index,
"committee_index" => attestation.data.index,
"attestation_slot" => attestation.data.slot,
);
failures.push(api_types::Failure::new(
index,
format!("Verification: {:?}", e),
));
// skip to the next attestation so we do not publish this one to gossip
continue;
}
};
// Notify the validator monitor.
chain
.validator_monitor
.read()
.register_api_unaggregated_attestation(
seen_timestamp,
attestation.indexed_attestation(),
&chain.slot_clock,
);
publish_pubsub_message(
&network_tx,
PubsubMessage::Attestation(Box::new((
attestation.subnet_id(),
attestation.attestation().clone(),
))),
)?;
let committee_index = attestation.attestation().data.index;
let slot = attestation.attestation().data.slot;
if let Err(e) = chain.apply_attestation_to_fork_choice(&attestation) {
error!(log,
"Failure applying verified attestation to fork choice";
"error" => ?e,
"request_index" => index,
"committee_index" => committee_index,
"slot" => slot,
);
failures.push(api_types::Failure::new(
index,
format!("Fork choice: {:?}", e),
));
};
if let Err(e) = chain.add_to_naive_aggregation_pool(&attestation) {
error!(log,
"Failure adding verified attestation to the naive aggregation pool";
"error" => ?e,
"request_index" => index,
"committee_index" => committee_index,
"slot" => slot,
);
failures.push(api_types::Failure::new(
index,
format!("Naive aggregation pool: {:?}", e),
));
}
}
if num_already_known > 0 {
debug!(
log,
"Some unagg attestations already known";
"count" => num_already_known
);
}
if failures.is_empty() {
Ok(())
} else {
Err(warp_utils::reject::indexed_bad_request(
"error processing attestations".to_string(),
failures,
))
}
})
reprocess_tx: Option<Sender<ReprocessQueueMessage>>,
log: Logger| async move {
let result = crate::publish_attestations::publish_attestations(
task_spawner,
chain,
attestations,
network_tx,
reprocess_tx,
log,
)
.await
.map(|()| warp::reply::json(&()));
task_spawner::convert_rejection(result).await
},
);
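
A note on the `reprocess_send_filter` consumed above: warp's `any().map(move || value.clone())` idiom is how each request gets its own copy of shared state. A minimal standalone sketch of that injection pattern, with hypothetical names rather than Lighthouse types:

use warp::Filter;

#[tokio::main]
async fn main() {
    // Shared state to inject; an Option mirrors the optional reprocess sender.
    let sender: Option<String> = Some("reprocess-channel".to_string());
    // Clone the captured value into every request that matches the route.
    let inject = warp::any().map(move || sender.clone());
    let route = warp::path("demo")
        .and(inject)
        .map(|s: Option<String>| format!("injected: {s:?}"));
    warp::serve(route).run(([127, 0, 0, 1], 3030)).await;
}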


@@ -0,0 +1,319 @@
//! Import attestations and publish them to the network.
//!
//! This module gracefully handles attestations to unknown blocks by requeuing them and then
//! efficiently waiting for them to finish reprocessing (using an async yield).
//!
//! The following comments relate to the handling of duplicate attestations (relocated here during
//! refactoring):
//!
//! Skip to the next attestation since an attestation for this
//! validator is already known in this epoch.
//!
//! There's little value for the network in validating a second
//! attestation for another validator since it is either:
//!
//! 1. A duplicate.
//! 2. Slashable.
//! 3. Invalid.
//!
//! We are likely to get duplicates in the case where a VC is using
//! fallback BNs. If the first BN actually publishes some/all of a
//! batch of attestations but fails to respond in a timely fashion,
//! the VC is likely to try publishing the attestations on another
//! BN. That second BN may have already seen the attestations from
//! the first BN and therefore indicate that the attestations are
//! "already seen". An attestation that has already been seen has
//! been published on the network so there's no actual error from
//! the perspective of the user.
//!
//! It's better to prevent slashable attestations from ever
//! appearing on the network than trying to slash validators,
//! especially those validators connected to the local API.
//!
//! There might be *some* value in determining that this attestation
//! is invalid, but since a valid attestation already exists it
//! appears that this validator is capable of producing valid
//! attestations and there's no immediate cause for concern.
use crate::task_spawner::{Priority, TaskSpawner};
use beacon_chain::{
validator_monitor::timestamp_now, AttestationError, BeaconChain, BeaconChainError,
BeaconChainTypes,
};
use beacon_processor::work_reprocessing_queue::{QueuedUnaggregate, ReprocessQueueMessage};
use eth2::types::Failure;
use lighthouse_network::PubsubMessage;
use network::NetworkMessage;
use slog::{debug, error, warn, Logger};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{
mpsc::{Sender, UnboundedSender},
oneshot,
};
use types::Attestation;
// Error variants are only used in `Debug` and considered `dead_code` by the compiler.
#[derive(Debug)]
enum Error {
Validation(AttestationError),
Publication,
ForkChoice(#[allow(dead_code)] BeaconChainError),
AggregationPool(#[allow(dead_code)] AttestationError),
ReprocessDisabled,
ReprocessFull,
ReprocessTimeout,
}
enum PublishAttestationResult {
Success,
AlreadyKnown,
Reprocessing(oneshot::Receiver<Result<(), Error>>),
Failure(Error),
}
fn verify_and_publish_attestation<T: BeaconChainTypes>(
chain: &Arc<BeaconChain<T>>,
attestation: &Attestation<T::EthSpec>,
seen_timestamp: Duration,
network_tx: &UnboundedSender<NetworkMessage<T::EthSpec>>,
log: &Logger,
) -> Result<(), Error> {
let attestation = chain
.verify_unaggregated_attestation_for_gossip(attestation, None)
.map_err(Error::Validation)?;
// Publish.
network_tx
.send(NetworkMessage::Publish {
messages: vec![PubsubMessage::Attestation(Box::new((
attestation.subnet_id(),
attestation.attestation().clone(),
)))],
})
.map_err(|_| Error::Publication)?;
// Notify the validator monitor.
chain
.validator_monitor
.read()
.register_api_unaggregated_attestation(
seen_timestamp,
attestation.indexed_attestation(),
&chain.slot_clock,
);
let fc_result = chain.apply_attestation_to_fork_choice(&attestation);
let naive_aggregation_result = chain.add_to_naive_aggregation_pool(&attestation);
if let Err(e) = &fc_result {
warn!(
log,
"Attestation invalid for fork choice";
"err" => ?e,
);
}
if let Err(e) = &naive_aggregation_result {
warn!(
log,
"Attestation invalid for aggregation";
"err" => ?e
);
}
if let Err(e) = fc_result {
Err(Error::ForkChoice(e))
} else if let Err(e) = naive_aggregation_result {
Err(Error::AggregationPool(e))
} else {
Ok(())
}
}
pub async fn publish_attestations<T: BeaconChainTypes>(
task_spawner: TaskSpawner<T::EthSpec>,
chain: Arc<BeaconChain<T>>,
attestations: Vec<Attestation<T::EthSpec>>,
network_tx: UnboundedSender<NetworkMessage<T::EthSpec>>,
reprocess_send: Option<Sender<ReprocessQueueMessage>>,
log: Logger,
) -> Result<(), warp::Rejection> {
// Collect metadata about attestations which we'll use to report failures. We need to
// move the `attestations` vec into the blocking task, so this small overhead is unavoidable.
let attestation_metadata = attestations
.iter()
.map(|att| (att.data.slot, att.data.index))
.collect::<Vec<_>>();
// Gossip validate and publish attestations that can be immediately processed.
let seen_timestamp = timestamp_now();
let inner_log = log.clone();
let mut prelim_results = task_spawner
.blocking_task(Priority::P0, move || {
Ok(attestations
.into_iter()
.map(|attestation| {
match verify_and_publish_attestation(
&chain,
&attestation,
seen_timestamp,
&network_tx,
&inner_log,
) {
Ok(()) => PublishAttestationResult::Success,
Err(Error::Validation(AttestationError::UnknownHeadBlock {
beacon_block_root,
})) => {
let Some(reprocess_tx) = &reprocess_send else {
return PublishAttestationResult::Failure(Error::ReprocessDisabled);
};
// Re-process.
let (tx, rx) = oneshot::channel();
let reprocess_chain = chain.clone();
let reprocess_network_tx = network_tx.clone();
let reprocess_log = inner_log.clone();
let reprocess_fn = move || {
let result = verify_and_publish_attestation(
&reprocess_chain,
&attestation,
seen_timestamp,
&reprocess_network_tx,
&reprocess_log,
);
// Ignore failure on the oneshot that reports the result. This
// shouldn't happen unless some catastrophe befalls the waiting
// thread which causes it to drop.
let _ = tx.send(result);
};
let reprocess_msg =
ReprocessQueueMessage::UnknownBlockUnaggregate(QueuedUnaggregate {
beacon_block_root,
process_fn: Box::new(reprocess_fn),
});
if reprocess_tx.try_send(reprocess_msg).is_err() {
PublishAttestationResult::Failure(Error::ReprocessFull)
} else {
PublishAttestationResult::Reprocessing(rx)
}
}
Err(Error::Validation(AttestationError::PriorAttestationKnown {
..
})) => PublishAttestationResult::AlreadyKnown,
Err(e) => PublishAttestationResult::Failure(e),
}
})
.map(Some)
.collect::<Vec<_>>())
})
.await?;
// Asynchronously wait for re-processing of attestations to unknown blocks. This avoids blocking
// any of the beacon processor workers while we wait for reprocessing.
let (reprocess_indices, reprocess_futures): (Vec<_>, Vec<_>) = prelim_results
.iter_mut()
.enumerate()
.filter_map(|(i, opt_result)| {
if let Some(PublishAttestationResult::Reprocessing(..)) = &opt_result {
let PublishAttestationResult::Reprocessing(rx) = opt_result.take()? else {
// Unreachable.
return None;
};
Some((i, rx))
} else {
None
}
})
.unzip();
let reprocess_results = futures::future::join_all(reprocess_futures).await;
// Join everything back together and construct a response.
// This part should be quick so we just stay in the Tokio executor's async task.
for (i, reprocess_result) in reprocess_indices.into_iter().zip(reprocess_results) {
let Some(result_entry) = prelim_results.get_mut(i) else {
error!(
log,
"Unreachable case in attestation publishing";
"case" => "prelim out of bounds",
"request_index" => i,
);
continue;
};
*result_entry = Some(match reprocess_result {
Ok(Ok(())) => PublishAttestationResult::Success,
// Attestation failed processing on re-process.
Ok(Err(Error::Validation(AttestationError::PriorAttestationKnown { .. }))) => {
PublishAttestationResult::AlreadyKnown
}
Ok(Err(e)) => PublishAttestationResult::Failure(e),
// Oneshot was dropped, indicating that the attestation either timed out in the
// reprocess queue or was dropped due to some error.
Err(_) => PublishAttestationResult::Failure(Error::ReprocessTimeout),
});
}
// Construct the response.
let mut failures = vec![];
let mut num_already_known = 0;
for (index, result) in prelim_results.iter().enumerate() {
match result {
Some(PublishAttestationResult::Success) => {}
Some(PublishAttestationResult::AlreadyKnown) => num_already_known += 1,
Some(PublishAttestationResult::Failure(e)) => {
if let Some((slot, committee_index)) = attestation_metadata.get(index) {
error!(
log,
"Failure verifying attestation for gossip";
"error" => ?e,
"request_index" => index,
"committee_index" => committee_index,
"attestation_slot" => slot,
);
failures.push(Failure::new(index, format!("{e:?}")));
} else {
error!(
log,
"Unreachable case in attestation publishing";
"case" => "out of bounds",
"request_index" => index
);
failures.push(Failure::new(index, "metadata logic error".into()));
}
}
Some(PublishAttestationResult::Reprocessing(_)) => {
error!(
log,
"Unreachable case in attestation publishing";
"case" => "reprocessing",
"request_index" => index
);
failures.push(Failure::new(index, "reprocess logic error".into()));
}
None => {
error!(
log,
"Unreachable case in attestation publishing";
"case" => "result is None",
"request_index" => index
);
failures.push(Failure::new(index, "result logic error".into()));
}
}
}
if num_already_known > 0 {
debug!(
log,
"Some unagg attestations already known";
"count" => num_already_known
);
}
if failures.is_empty() {
Ok(())
} else {
Err(warp_utils::reject::indexed_bad_request(
"error processing attestations".to_string(),
failures,
))
}
}
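
The function above is the heart of the change: gossip verification runs in a blocking task, attestations with unknown head blocks are requeued as closures carrying a oneshot sender, and the async side awaits the receivers without occupying a beacon processor worker. A minimal standalone sketch of that requeue-and-await pattern, using a plain channel of boxed closures as a stand-in for the real reprocess queue:

use tokio::sync::{mpsc, oneshot};

#[tokio::main]
async fn main() {
    // Stand-in for the reprocess queue: boxed closures executed later.
    let (queue_tx, mut queue_rx) = mpsc::channel::<Box<dyn FnOnce() + Send>>(16);
    tokio::spawn(async move {
        while let Some(work) = queue_rx.recv().await {
            work();
        }
    });

    // Phase 1 (blocking context): defer the work, keeping a oneshot receiver.
    let (tx, rx) = oneshot::channel::<Result<(), String>>();
    let deferred = move || {
        // Re-verification would happen here; ignore send failure if the
        // waiting task has been dropped.
        let _ = tx.send(Ok(()));
    };
    if queue_tx.try_send(Box::new(deferred)).is_err() {
        println!("reprocess queue full, report failure immediately");
        return;
    }

    // Phase 2 (async context): yield until the deferred work reports back.
    match rx.await {
        Ok(result) => println!("reprocessed: {result:?}"),
        Err(_) => println!("dropped in queue (timeout analogue)"),
    }
}

The key property, as in `publish_attestations`, is that phase 2 is a plain `.await`: the waiting task yields to the executor instead of parking a thread.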


@@ -60,11 +60,15 @@ impl<E: EthSpec> TaskSpawner<E> {
}
}
/// Executes a "blocking" (non-async) task which returns a `Response`.
pub async fn blocking_response_task<F, T>(self, priority: Priority, func: F) -> Response
/// Executes a "blocking" (non-async) task which returns an arbitrary value.
pub async fn blocking_task<F, T>(
self,
priority: Priority,
func: F,
) -> Result<T, warp::Rejection>
where
F: FnOnce() -> Result<T, warp::Rejection> + Send + Sync + 'static,
T: Reply + Send + 'static,
T: Send + 'static,
{
if let Some(beacon_processor_send) = &self.beacon_processor_send {
// Create a closure that will execute `func` and send the result to
@@ -79,22 +83,31 @@ impl<E: EthSpec> TaskSpawner<E> {
};
// Send the function to the beacon processor for execution at some arbitrary time.
let result = send_to_beacon_processor(
send_to_beacon_processor(
beacon_processor_send,
priority,
BlockingOrAsync::Blocking(Box::new(process_fn)),
rx,
)
.await
.and_then(|x| x);
convert_rejection(result).await
.and_then(|x| x)
} else {
// There is no beacon processor so spawn a task directly on the
// tokio executor.
convert_rejection(warp_utils::task::blocking_response_task(func).await).await
warp_utils::task::blocking_task(func).await
}
}
/// Executes a "blocking" (non-async) task which returns a `Response`.
pub async fn blocking_response_task<F, T>(self, priority: Priority, func: F) -> Response
where
F: FnOnce() -> Result<T, warp::Rejection> + Send + Sync + 'static,
T: Reply + Send + 'static,
{
let result = self.blocking_task(priority, func).await;
convert_rejection(result).await
}
/// Executes a "blocking" (non-async) task which returns a JSON-serializable
/// object.
pub async fn blocking_json_task<F, T>(self, priority: Priority, func: F) -> Response

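The refactor above generalises `blocking_response_task` into a `blocking_task` that can return any `T`, then layers the response conversion on top. A compressed sketch of the same shape, with hypothetical names and `tokio::task::spawn_blocking` standing in for the beacon processor; the `.and_then(|x| x)` flattening mirrors the diff:

async fn blocking_task<F, T>(func: F) -> Result<T, String>
where
    F: FnOnce() -> Result<T, String> + Send + 'static,
    T: Send + 'static,
{
    // Run the closure off the async executor, then flatten the nested Result.
    tokio::task::spawn_blocking(func)
        .await
        .map_err(|e| format!("join error: {e}"))
        .and_then(|x| x)
}

async fn blocking_response_task<F, T>(func: F) -> String
where
    F: FnOnce() -> Result<T, String> + Send + 'static,
    T: std::fmt::Display + Send + 'static,
{
    // Thin wrapper: convert the generic result into a response-like value.
    match blocking_task(func).await {
        Ok(t) => format!("200 {t}"),
        Err(e) => format!("500 {e}"),
    }
}

#[tokio::main]
async fn main() {
    let body = blocking_response_task(|| Ok::<_, String>(42)).await;
    println!("{body}");
}
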

@@ -35,6 +35,7 @@ pub const EXTERNAL_ADDR: &str = "/ip4/0.0.0.0/tcp/9000";
/// HTTP API tester that allows interaction with the underlying beacon chain harness.
pub struct InteractiveTester<E: EthSpec> {
pub ctx: Arc<Context<EphemeralHarnessType<E>>>,
pub harness: BeaconChainHarness<EphemeralHarnessType<E>>,
pub client: BeaconNodeHttpClient,
pub network_rx: NetworkReceivers<E>,
@@ -43,10 +44,11 @@ pub struct InteractiveTester<E: EthSpec> {
/// The result of calling `create_api_server`.
///
/// Glue-type between `tests::ApiTester` and `InteractiveTester`.
pub struct ApiServer<E: EthSpec, SFut: Future<Output = ()>> {
pub struct ApiServer<T: BeaconChainTypes, SFut: Future<Output = ()>> {
pub ctx: Arc<Context<T>>,
pub server: SFut,
pub listening_socket: SocketAddr,
pub network_rx: NetworkReceivers<E>,
pub network_rx: NetworkReceivers<T::EthSpec>,
pub local_enr: Enr,
pub external_peer_id: PeerId,
}
@@ -90,6 +92,7 @@ impl<E: EthSpec> InteractiveTester<E> {
let harness = harness_builder.build();
let ApiServer {
ctx,
server,
listening_socket,
network_rx,
@@ -114,6 +117,7 @@ impl<E: EthSpec> InteractiveTester<E> {
);
Self {
ctx,
harness,
client,
network_rx,
@@ -125,7 +129,7 @@ pub async fn create_api_server<T: BeaconChainTypes>(
chain: Arc<BeaconChain<T>>,
test_runtime: &TestRuntime,
log: Logger,
) -> ApiServer<T::EthSpec, impl Future<Output = ()>> {
) -> ApiServer<T, impl Future<Output = ()>> {
// Use port 0 to allocate a new unused port.
let port = 0;
@@ -187,6 +191,7 @@ pub async fn create_api_server<T: BeaconChainTypes>(
} = BeaconProcessorChannels::new(&beacon_processor_config);
let beacon_processor_send = beacon_processor_tx;
let reprocess_send = work_reprocessing_tx.clone();
BeaconProcessor {
network_globals: network_globals.clone(),
executor: test_runtime.task_executor.clone(),
@@ -216,14 +221,17 @@ pub async fn create_api_server<T: BeaconChainTypes>(
network_senders: Some(network_senders),
network_globals: Some(network_globals),
beacon_processor_send: Some(beacon_processor_send),
beacon_processor_reprocess_send: Some(reprocess_send),
eth1_service: Some(eth1_service),
sse_logging_components: None,
log,
});
let (listening_socket, server) = crate::serve(ctx, test_runtime.task_executor.exit()).unwrap();
let (listening_socket, server) =
crate::serve(ctx.clone(), test_runtime.task_executor.exit()).unwrap();
ApiServer {
ctx,
server,
listening_socket,
network_rx: network_receivers,


@@ -4,6 +4,7 @@ use beacon_chain::{
test_utils::{AttestationStrategy, BlockStrategy, SyncCommitteeStrategy},
ChainConfig,
};
use beacon_processor::work_reprocessing_queue::ReprocessQueueMessage;
use eth2::types::ProduceBlockV3Response;
use eth2::types::{DepositContractData, StateId};
use execution_layer::{ForkchoiceState, PayloadAttributes};
@@ -840,3 +841,78 @@ pub async fn fork_choice_before_proposal() {
// D's parent is B.
assert_eq!(block_d.parent_root(), block_root_b.into());
}
// Test that attestations to unknown blocks are requeued and processed when their block arrives.
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn queue_attestations_from_http() {
let validator_count = 128;
let all_validators = (0..validator_count).collect::<Vec<_>>();
let tester = InteractiveTester::<E>::new(None, validator_count).await;
let harness = &tester.harness;
let client = tester.client.clone();
let num_initial = 5;
// Slot of the block attested to.
let attestation_slot = Slot::new(num_initial) + 1;
// Make some initial blocks.
harness.advance_slot();
harness
.extend_chain(
num_initial as usize,
BlockStrategy::OnCanonicalHead,
AttestationStrategy::AllValidators,
)
.await;
harness.advance_slot();
assert_eq!(harness.get_current_slot(), attestation_slot);
// Make the attested-to block without applying it.
let pre_state = harness.get_current_state();
let (block, post_state) = harness.make_block(pre_state, attestation_slot).await;
let block_root = block.0.canonical_root();
// Make attestations to the block and POST them to the beacon node on a background thread.
let attestations = harness
.make_unaggregated_attestations(
&all_validators,
&post_state,
block.0.state_root(),
block_root.into(),
attestation_slot,
)
.into_iter()
.flat_map(|attestations| attestations.into_iter().map(|(att, _subnet)| att))
.collect::<Vec<_>>();
let attestation_future = tokio::spawn(async move {
client
.post_beacon_pool_attestations(&attestations)
.await
.expect("attestations should be processed successfully")
});
// In parallel, apply the block. We need to manually notify the reprocess queue, because the
// `beacon_chain` does not know about the queue and will not update it for us.
let parent_root = block.0.parent_root();
harness
.process_block(attestation_slot, block_root, block)
.await
.unwrap();
tester
.ctx
.beacon_processor_reprocess_send
.as_ref()
.unwrap()
.send(ReprocessQueueMessage::BlockImported {
block_root,
parent_root,
})
.await
.unwrap();
attestation_future.await.unwrap();
}
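
The manual `BlockImported` send in this test reflects how the reprocess queue works: deferred work for a block root is flushed only when something announces that the root was imported. A toy sketch of that mechanism, with hypothetical types rather than the Lighthouse implementation:

use std::collections::HashMap;

type Root = u64; // stand-in for a block root hash

struct ReprocessQueue {
    // Deferred closures keyed by the unknown block root they wait on.
    waiting: HashMap<Root, Vec<Box<dyn FnOnce() + Send>>>,
}

impl ReprocessQueue {
    fn queue_for_block(&mut self, root: Root, work: Box<dyn FnOnce() + Send>) {
        self.waiting.entry(root).or_default().push(work);
    }

    // Analogue of `ReprocessQueueMessage::BlockImported`: run everything
    // that was waiting on this root.
    fn block_imported(&mut self, root: Root) {
        for work in self.waiting.remove(&root).unwrap_or_default() {
            work();
        }
    }
}

fn main() {
    let mut queue = ReprocessQueue { waiting: HashMap::new() };
    queue.queue_for_block(42, Box::new(|| println!("attestation re-verified")));
    // Nothing runs until the block-imported notification arrives.
    queue.block_imported(42);
}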


@@ -248,6 +248,7 @@ impl ApiTester {
let log = null_logger().unwrap();
let ApiServer {
ctx: _,
server,
listening_socket,
network_rx,
@@ -341,6 +342,7 @@ impl ApiTester {
let log = null_logger().unwrap();
let ApiServer {
ctx: _,
server,
listening_socket,
network_rx,