diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs
index 444a27750..de6e08cc3 100644
--- a/beacon_node/client/src/builder.rs
+++ b/beacon_node/client/src/builder.rs
@@ -505,6 +505,7 @@ where
             network_senders: None,
             network_globals: None,
             beacon_processor_send: None,
+            beacon_processor_reprocess_send: None,
             eth1_service: Some(genesis_service.eth1_service.clone()),
             log: context.log().clone(),
             sse_logging_components: runtime_context.sse_logging_components.clone(),
@@ -747,6 +748,9 @@ where
                 network_globals: self.network_globals.clone(),
                 eth1_service: self.eth1_service.clone(),
                 beacon_processor_send: Some(beacon_processor_channels.beacon_processor_tx.clone()),
+                beacon_processor_reprocess_send: Some(
+                    beacon_processor_channels.work_reprocessing_tx.clone(),
+                ),
                 sse_logging_components: runtime_context.sse_logging_components.clone(),
                 log: log.clone(),
             });
diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs
index ec3fd494d..b39450d73 100644
--- a/beacon_node/http_api/src/lib.rs
+++ b/beacon_node/http_api/src/lib.rs
@@ -16,6 +16,7 @@ mod database;
 mod metrics;
 mod produce_block;
 mod proposer_duties;
+mod publish_attestations;
 mod publish_blocks;
 mod standard_block_rewards;
 mod state_id;
@@ -35,7 +36,7 @@ use beacon_chain::{
     validator_monitor::timestamp_now, AttestationError as AttnError, BeaconChain, BeaconChainError,
     BeaconChainTypes, WhenSlotSkipped,
 };
-use beacon_processor::BeaconProcessorSend;
+use beacon_processor::{work_reprocessing_queue::ReprocessQueueMessage, BeaconProcessorSend};
 pub use block_id::BlockId;
 use builder_states::get_next_withdrawals;
 use bytes::Bytes;
@@ -129,6 +130,7 @@ pub struct Context<T: BeaconChainTypes> {
     pub network_senders: Option<NetworkSenders<T::EthSpec>>,
     pub network_globals: Option<Arc<NetworkGlobals<T::EthSpec>>>,
     pub beacon_processor_send: Option<BeaconProcessorSend<T::EthSpec>>,
+    pub beacon_processor_reprocess_send: Option<Sender<ReprocessQueueMessage>>,
     pub eth1_service: Option<eth1::Service>,
     pub sse_logging_components: Option<SSELoggingComponents>,
     pub log: Logger,
@@ -534,6 +536,11 @@ pub fn serve<T: BeaconChainTypes>(
         .filter(|_| config.enable_beacon_processor);
     let task_spawner_filter =
         warp::any().map(move || TaskSpawner::new(beacon_processor_send.clone()));
+    let beacon_processor_reprocess_send = ctx
+        .beacon_processor_reprocess_send
+        .clone()
+        .filter(|_| config.enable_beacon_processor);
+    let reprocess_send_filter = warp::any().map(move || beacon_processor_reprocess_send.clone());

     let duplicate_block_status_code = ctx.config.duplicate_block_status_code;

@@ -1756,140 +1763,26 @@
         .and(warp::path::end())
         .and(warp_utils::json::json())
         .and(network_tx_filter.clone())
+        .and(reprocess_send_filter)
         .and(log_filter.clone())
         .then(
             |task_spawner: TaskSpawner<T::EthSpec>,
              chain: Arc<BeaconChain<T>>,
              attestations: Vec<Attestation<T::EthSpec>>,
              network_tx: UnboundedSender<NetworkMessage<T::EthSpec>>,
-             log: Logger| {
-                task_spawner.blocking_json_task(Priority::P0, move || {
-                    let seen_timestamp = timestamp_now();
-                    let mut failures = Vec::new();
-                    let mut num_already_known = 0;
-
-                    for (index, attestation) in attestations.as_slice().iter().enumerate() {
-                        let attestation = match chain
-                            .verify_unaggregated_attestation_for_gossip(attestation, None)
-                        {
-                            Ok(attestation) => attestation,
-                            Err(AttnError::PriorAttestationKnown { .. }) => {
-                                num_already_known += 1;
-
-                                // Skip to the next attestation since an attestation for this
-                                // validator is already known in this epoch.
-                                //
-                                // There's little value for the network in validating a second
-                                // attestation for the same validator since it is either:
-                                //
-                                // 1. A duplicate.
-                                // 2. Slashable.
-                                // 3. Invalid.
-                                //
-                                // We are likely to get duplicates in the case where a VC is using
-                                // fallback BNs. If the first BN actually publishes some/all of a
-                                // batch of attestations but fails to respond in a timely fashion,
-                                // the VC is likely to try publishing the attestations on another
-                                // BN. That second BN may have already seen the attestations from
-                                // the first BN and therefore indicate that the attestations are
-                                // "already seen". An attestation that has already been seen has
-                                // been published on the network so there's no actual error from
-                                // the perspective of the user.
-                                //
-                                // It's better to prevent slashable attestations from ever
-                                // appearing on the network than trying to slash validators,
-                                // especially those validators connected to the local API.
-                                //
-                                // There might be *some* value in determining that this attestation
-                                // is invalid, but since a valid attestation already exists it
-                                // appears that this validator is capable of producing valid
-                                // attestations and there's no immediate cause for concern.
-                                continue;
-                            }
-                            Err(e) => {
-                                error!(log,
-                                    "Failure verifying attestation for gossip";
-                                    "error" => ?e,
-                                    "request_index" => index,
-                                    "committee_index" => attestation.data.index,
-                                    "attestation_slot" => attestation.data.slot,
-                                );
-                                failures.push(api_types::Failure::new(
-                                    index,
-                                    format!("Verification: {:?}", e),
-                                ));
-                                // skip to the next attestation so we do not publish this one to gossip
-                                continue;
-                            }
-                        };
-
-                        // Notify the validator monitor.
-                        chain
-                            .validator_monitor
-                            .read()
-                            .register_api_unaggregated_attestation(
-                                seen_timestamp,
-                                attestation.indexed_attestation(),
-                                &chain.slot_clock,
-                            );
-
-                        publish_pubsub_message(
-                            &network_tx,
-                            PubsubMessage::Attestation(Box::new((
-                                attestation.subnet_id(),
-                                attestation.attestation().clone(),
-                            ))),
-                        )?;
-
-                        let committee_index = attestation.attestation().data.index;
-                        let slot = attestation.attestation().data.slot;
-
-                        if let Err(e) = chain.apply_attestation_to_fork_choice(&attestation) {
-                            error!(log,
-                                "Failure applying verified attestation to fork choice";
-                                "error" => ?e,
-                                "request_index" => index,
-                                "committee_index" => committee_index,
-                                "slot" => slot,
-                            );
-                            failures.push(api_types::Failure::new(
-                                index,
-                                format!("Fork choice: {:?}", e),
-                            ));
-                        };
-
-                        if let Err(e) = chain.add_to_naive_aggregation_pool(&attestation) {
-                            error!(log,
-                                "Failure adding verified attestation to the naive aggregation pool";
-                                "error" => ?e,
-                                "request_index" => index,
-                                "committee_index" => committee_index,
-                                "slot" => slot,
-                            );
-                            failures.push(api_types::Failure::new(
-                                index,
-                                format!("Naive aggregation pool: {:?}", e),
-                            ));
-                        }
-                    }
-
-                    if num_already_known > 0 {
-                        debug!(
-                            log,
-                            "Some unagg attestations already known";
-                            "count" => num_already_known
-                        );
-                    }
-
-                    if failures.is_empty() {
-                        Ok(())
-                    } else {
-                        Err(warp_utils::reject::indexed_bad_request(
-                            "error processing attestations".to_string(),
-                            failures,
-                        ))
-                    }
-                })
+             reprocess_tx: Option<Sender<ReprocessQueueMessage>>,
+             log: Logger| async move {
+                let result = crate::publish_attestations::publish_attestations(
+                    task_spawner,
+                    chain,
+                    attestations,
+                    network_tx,
+                    reprocess_tx,
+                    log,
+                )
+                .await
+                .map(|()| warp::reply::json(&()));
+                task_spawner::convert_rejection(result).await
             },
         );
diff --git a/beacon_node/http_api/src/publish_attestations.rs b/beacon_node/http_api/src/publish_attestations.rs
new file mode 100644
index 000000000..ed7f1ed17
--- /dev/null
+++ b/beacon_node/http_api/src/publish_attestations.rs
@@ -0,0 +1,319 @@
+//! Import attestations and publish them to the network.
+//!
+//! This module gracefully handles attestations to unknown blocks by requeuing them and then
+//! efficiently waiting for them to finish reprocessing (using an async yield).
+//!
+//! The following comments relate to the handling of duplicate attestations (relocated
+//! here during refactoring):
+//!
+//! Skip to the next attestation since an attestation for this
+//! validator is already known in this epoch.
+//!
+//! There's little value for the network in validating a second
+//! attestation for the same validator since it is either:
+//!
+//! 1. A duplicate.
+//! 2. Slashable.
+//! 3. Invalid.
+//!
+//! We are likely to get duplicates in the case where a VC is using
+//! fallback BNs. If the first BN actually publishes some/all of a
+//! batch of attestations but fails to respond in a timely fashion,
+//! the VC is likely to try publishing the attestations on another
+//! BN. That second BN may have already seen the attestations from
+//! the first BN and therefore indicate that the attestations are
+//! "already seen". An attestation that has already been seen has
+//! been published on the network so there's no actual error from
+//! the perspective of the user.
+//!
+//! It's better to prevent slashable attestations from ever
+//! appearing on the network than trying to slash validators,
+//! especially those validators connected to the local API.
+//!
+//! There might be *some* value in determining that this attestation
+//! is invalid, but since a valid attestation already exists it
+//! appears that this validator is capable of producing valid
+//! attestations and there's no immediate cause for concern.
+use crate::task_spawner::{Priority, TaskSpawner};
+use beacon_chain::{
+    validator_monitor::timestamp_now, AttestationError, BeaconChain, BeaconChainError,
+    BeaconChainTypes,
+};
+use beacon_processor::work_reprocessing_queue::{QueuedUnaggregate, ReprocessQueueMessage};
+use eth2::types::Failure;
+use lighthouse_network::PubsubMessage;
+use network::NetworkMessage;
+use slog::{debug, error, warn, Logger};
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::sync::{
+    mpsc::{Sender, UnboundedSender},
+    oneshot,
+};
+use types::Attestation;
+
+// Error variants are only used in `Debug` and considered `dead_code` by the compiler.
+#[derive(Debug)]
+enum Error {
+    Validation(AttestationError),
+    Publication,
+    ForkChoice(#[allow(dead_code)] BeaconChainError),
+    AggregationPool(#[allow(dead_code)] AttestationError),
+    ReprocessDisabled,
+    ReprocessFull,
+    ReprocessTimeout,
+}
+
+enum PublishAttestationResult {
+    Success,
+    AlreadyKnown,
+    Reprocessing(oneshot::Receiver<Result<(), Error>>),
+    Failure(Error),
+}
+
+fn verify_and_publish_attestation<T: BeaconChainTypes>(
+    chain: &Arc<BeaconChain<T>>,
+    attestation: &Attestation<T::EthSpec>,
+    seen_timestamp: Duration,
+    network_tx: &UnboundedSender<NetworkMessage<T::EthSpec>>,
+    log: &Logger,
+) -> Result<(), Error> {
+    let attestation = chain
+        .verify_unaggregated_attestation_for_gossip(attestation, None)
+        .map_err(Error::Validation)?;
+
+    // Publish.
+    network_tx
+        .send(NetworkMessage::Publish {
+            messages: vec![PubsubMessage::Attestation(Box::new((
+                attestation.subnet_id(),
+                attestation.attestation().clone(),
+            )))],
+        })
+        .map_err(|_| Error::Publication)?;
+
+    // Notify the validator monitor.
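+    // (This records the attestation against any validators the monitor is tracking,
+    // for per-validator logging and metrics.)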
+    chain
+        .validator_monitor
+        .read()
+        .register_api_unaggregated_attestation(
+            seen_timestamp,
+            attestation.indexed_attestation(),
+            &chain.slot_clock,
+        );
+
+    let fc_result = chain.apply_attestation_to_fork_choice(&attestation);
+    let naive_aggregation_result = chain.add_to_naive_aggregation_pool(&attestation);
+
+    if let Err(e) = &fc_result {
+        warn!(
+            log,
+            "Attestation invalid for fork choice";
+            "err" => ?e,
+        );
+    }
+    if let Err(e) = &naive_aggregation_result {
+        warn!(
+            log,
+            "Attestation invalid for aggregation";
+            "err" => ?e
+        );
+    }
+
+    if let Err(e) = fc_result {
+        Err(Error::ForkChoice(e))
+    } else if let Err(e) = naive_aggregation_result {
+        Err(Error::AggregationPool(e))
+    } else {
+        Ok(())
+    }
+}
+
+pub async fn publish_attestations<T: BeaconChainTypes>(
+    task_spawner: TaskSpawner<T::EthSpec>,
+    chain: Arc<BeaconChain<T>>,
+    attestations: Vec<Attestation<T::EthSpec>>,
+    network_tx: UnboundedSender<NetworkMessage<T::EthSpec>>,
+    reprocess_send: Option<Sender<ReprocessQueueMessage>>,
+    log: Logger,
+) -> Result<(), warp::Rejection> {
+    // Collect metadata about attestations which we'll use to report failures. We need to
+    // move the `attestations` vec into the blocking task, so this small overhead is unavoidable.
+    let attestation_metadata = attestations
+        .iter()
+        .map(|att| (att.data.slot, att.data.index))
+        .collect::<Vec<_>>();
+
+    // Gossip validate and publish attestations that can be immediately processed.
+    let seen_timestamp = timestamp_now();
+    let inner_log = log.clone();
+    let mut prelim_results = task_spawner
+        .blocking_task(Priority::P0, move || {
+            Ok(attestations
+                .into_iter()
+                .map(|attestation| {
+                    match verify_and_publish_attestation(
+                        &chain,
+                        &attestation,
+                        seen_timestamp,
+                        &network_tx,
+                        &inner_log,
+                    ) {
+                        Ok(()) => PublishAttestationResult::Success,
+                        Err(Error::Validation(AttestationError::UnknownHeadBlock {
+                            beacon_block_root,
+                        })) => {
+                            let Some(reprocess_tx) = &reprocess_send else {
+                                return PublishAttestationResult::Failure(Error::ReprocessDisabled);
+                            };
+                            // Re-process.
+                            let (tx, rx) = oneshot::channel();
+                            let reprocess_chain = chain.clone();
+                            let reprocess_network_tx = network_tx.clone();
+                            let reprocess_log = inner_log.clone();
+                            let reprocess_fn = move || {
+                                let result = verify_and_publish_attestation(
+                                    &reprocess_chain,
+                                    &attestation,
+                                    seen_timestamp,
+                                    &reprocess_network_tx,
+                                    &reprocess_log,
+                                );
+                                // Ignore failure on the oneshot that reports the result. This
+                                // shouldn't happen unless some catastrophe befalls the waiting
+                                // thread which causes it to drop.
+                                let _ = tx.send(result);
+                            };
+                            let reprocess_msg =
+                                ReprocessQueueMessage::UnknownBlockUnaggregate(QueuedUnaggregate {
+                                    beacon_block_root,
+                                    process_fn: Box::new(reprocess_fn),
+                                });
+                            if reprocess_tx.try_send(reprocess_msg).is_err() {
+                                PublishAttestationResult::Failure(Error::ReprocessFull)
+                            } else {
+                                PublishAttestationResult::Reprocessing(rx)
+                            }
+                        }
+                        Err(Error::Validation(AttestationError::PriorAttestationKnown {
+                            ..
+                        })) => PublishAttestationResult::AlreadyKnown,
+                        Err(e) => PublishAttestationResult::Failure(e),
+                    }
+                })
+                .map(Some)
+                .collect::<Vec<_>>())
+        })
+        .await?;
+
+    // Asynchronously wait for re-processing of attestations to unknown blocks. This avoids
+    // blocking any of the beacon processor workers while we wait for reprocessing.
+    let (reprocess_indices, reprocess_futures): (Vec<_>, Vec<_>) = prelim_results
+        .iter_mut()
+        .enumerate()
+        .filter_map(|(i, opt_result)| {
+            if let Some(PublishAttestationResult::Reprocessing(..)) = &opt_result {
+                let PublishAttestationResult::Reprocessing(rx) = opt_result.take()? else {
+                    // Unreachable.
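+                    // (The `if let` above guarantees the taken value is the `Reprocessing`
+                    // variant, so this `else` branch can never be hit.)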
+                    return None;
+                };
+                Some((i, rx))
+            } else {
+                None
+            }
+        })
+        .unzip();
+    let reprocess_results = futures::future::join_all(reprocess_futures).await;
+
+    // Join everything back together and construct a response.
+    // This part should be quick so we just stay in the Tokio executor's async task.
+    for (i, reprocess_result) in reprocess_indices.into_iter().zip(reprocess_results) {
+        let Some(result_entry) = prelim_results.get_mut(i) else {
+            error!(
+                log,
+                "Unreachable case in attestation publishing";
+                "case" => "prelim out of bounds",
+                "request_index" => i,
+            );
+            continue;
+        };
+        *result_entry = Some(match reprocess_result {
+            Ok(Ok(())) => PublishAttestationResult::Success,
+            // Attestation failed processing on re-process.
+            Ok(Err(Error::Validation(AttestationError::PriorAttestationKnown { .. }))) => {
+                PublishAttestationResult::AlreadyKnown
+            }
+            Ok(Err(e)) => PublishAttestationResult::Failure(e),
+            // Oneshot was dropped, indicating that the attestation either timed out in the
+            // reprocess queue or was dropped due to some error.
+            Err(_) => PublishAttestationResult::Failure(Error::ReprocessTimeout),
+        });
+    }
+
+    // Construct the response.
+    let mut failures = vec![];
+    let mut num_already_known = 0;
+
+    for (index, result) in prelim_results.iter().enumerate() {
+        match result {
+            Some(PublishAttestationResult::Success) => {}
+            Some(PublishAttestationResult::AlreadyKnown) => num_already_known += 1,
+            Some(PublishAttestationResult::Failure(e)) => {
+                if let Some((slot, committee_index)) = attestation_metadata.get(index) {
+                    error!(
+                        log,
+                        "Failure verifying attestation for gossip";
+                        "error" => ?e,
+                        "request_index" => index,
+                        "committee_index" => committee_index,
+                        "attestation_slot" => slot,
+                    );
+                    failures.push(Failure::new(index, format!("{e:?}")));
+                } else {
+                    error!(
+                        log,
+                        "Unreachable case in attestation publishing";
+                        "case" => "out of bounds",
+                        "request_index" => index
+                    );
+                    failures.push(Failure::new(index, "metadata logic error".into()));
+                }
+            }
+            Some(PublishAttestationResult::Reprocessing(_)) => {
+                error!(
+                    log,
+                    "Unreachable case in attestation publishing";
+                    "case" => "reprocessing",
+                    "request_index" => index
+                );
+                failures.push(Failure::new(index, "reprocess logic error".into()));
+            }
+            None => {
+                error!(
+                    log,
+                    "Unreachable case in attestation publishing";
+                    "case" => "result is None",
+                    "request_index" => index
+                );
+                failures.push(Failure::new(index, "result logic error".into()));
+            }
+        }
+    }
+
+    if num_already_known > 0 {
+        debug!(
+            log,
+            "Some unagg attestations already known";
+            "count" => num_already_known
+        );
+    }
+
+    if failures.is_empty() {
+        Ok(())
+    } else {
+        Err(warp_utils::reject::indexed_bad_request(
+            "error processing attestations".to_string(),
+            failures,
+        ))
+    }
+}
diff --git a/beacon_node/http_api/src/task_spawner.rs b/beacon_node/http_api/src/task_spawner.rs
index 8768e057d..cfee5e01c 100644
--- a/beacon_node/http_api/src/task_spawner.rs
+++ b/beacon_node/http_api/src/task_spawner.rs
@@ -60,11 +60,15 @@ impl<E: EthSpec> TaskSpawner<E> {
         }
     }

-    /// Executes a "blocking" (non-async) task which returns a `Response`.
-    pub async fn blocking_response_task<F, T>(self, priority: Priority, func: F) -> Response
+    /// Executes a "blocking" (non-async) task which returns an arbitrary value.
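+    ///
+    /// This is the building block for the `Response`-returning helpers below, which
+    /// convert any rejection into a `Response` via `convert_rejection`.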
+    pub async fn blocking_task<F, T>(
+        self,
+        priority: Priority,
+        func: F,
+    ) -> Result<T, warp::Rejection>
     where
         F: FnOnce() -> Result<T, warp::Rejection> + Send + Sync + 'static,
-        T: Reply + Send + 'static,
+        T: Send + 'static,
     {
         if let Some(beacon_processor_send) = &self.beacon_processor_send {
             // Create a closure that will execute `func` and send the result to
@@ -79,22 +83,31 @@
             };

             // Send the function to the beacon processor for execution at some arbitrary time.
-            let result = send_to_beacon_processor(
+            send_to_beacon_processor(
                 beacon_processor_send,
                 priority,
                 BlockingOrAsync::Blocking(Box::new(process_fn)),
                 rx,
             )
             .await
-            .and_then(|x| x);
-            convert_rejection(result).await
+            .and_then(|x| x)
         } else {
             // There is no beacon processor so spawn a task directly on the
             // tokio executor.
-            convert_rejection(warp_utils::task::blocking_response_task(func).await).await
+            warp_utils::task::blocking_task(func).await
         }
     }

+    /// Executes a "blocking" (non-async) task which returns a `Response`.
+    pub async fn blocking_response_task<F, T>(self, priority: Priority, func: F) -> Response
+    where
+        F: FnOnce() -> Result<T, warp::Rejection> + Send + Sync + 'static,
+        T: Reply + Send + 'static,
+    {
+        let result = self.blocking_task(priority, func).await;
+        convert_rejection(result).await
+    }
+
     /// Executes a "blocking" (non-async) task which returns a JSON-serializable
     /// object.
     pub async fn blocking_json_task<F, T>(self, priority: Priority, func: F) -> Response
diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs
index b87fdf608..c1313168b 100644
--- a/beacon_node/http_api/src/test_utils.rs
+++ b/beacon_node/http_api/src/test_utils.rs
@@ -35,6 +35,7 @@ pub const EXTERNAL_ADDR: &str = "/ip4/0.0.0.0/tcp/9000";

 /// HTTP API tester that allows interaction with the underlying beacon chain harness.
 pub struct InteractiveTester<E: EthSpec> {
+    pub ctx: Arc<Context<EphemeralHarnessType<E>>>,
     pub harness: BeaconChainHarness<EphemeralHarnessType<E>>,
     pub client: BeaconNodeHttpClient,
     pub network_rx: NetworkReceivers<E>,
@@ -43,10 +44,11 @@ pub struct InteractiveTester<E: EthSpec> {
 /// The result of calling `create_api_server`.
 ///
 /// Glue-type between `tests::ApiTester` and `InteractiveTester`.
-pub struct ApiServer<E: EthSpec, SFut: Future<Output = ()>> {
+pub struct ApiServer<T: BeaconChainTypes, SFut: Future<Output = ()>> {
+    pub ctx: Arc<Context<T>>,
     pub server: SFut,
     pub listening_socket: SocketAddr,
-    pub network_rx: NetworkReceivers<E>,
+    pub network_rx: NetworkReceivers<T::EthSpec>,
     pub local_enr: Enr,
     pub external_peer_id: PeerId,
 }
@@ -90,6 +92,7 @@ impl<E: EthSpec> InteractiveTester<E> {
         let harness = harness_builder.build();

         let ApiServer {
+            ctx,
             server,
             listening_socket,
             network_rx,
@@ -114,6 +117,7 @@
         );

         Self {
+            ctx,
             harness,
             client,
             network_rx,
@@ -125,7 +129,7 @@
 pub async fn create_api_server<E: EthSpec>(
     chain: Arc<BeaconChain<EphemeralHarnessType<E>>>,
     test_runtime: &TestRuntime,
     log: Logger,
-) -> ApiServer<E, impl Future<Output = ()>> {
+) -> ApiServer<EphemeralHarnessType<E>, impl Future<Output = ()>> {
     // Use port 0 to allocate a new unused port.
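     // (Binding to port 0 lets the OS assign a free ephemeral port, so concurrently
     // running test servers never collide.)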
     let port = 0;

@@ -187,6 +191,7 @@
     } = BeaconProcessorChannels::new(&beacon_processor_config);

     let beacon_processor_send = beacon_processor_tx;
+    let reprocess_send = work_reprocessing_tx.clone();
     BeaconProcessor {
         network_globals: network_globals.clone(),
         executor: test_runtime.task_executor.clone(),
@@ -216,14 +221,17 @@
         network_senders: Some(network_senders),
         network_globals: Some(network_globals),
         beacon_processor_send: Some(beacon_processor_send),
+        beacon_processor_reprocess_send: Some(reprocess_send),
         eth1_service: Some(eth1_service),
         sse_logging_components: None,
         log,
     });

-    let (listening_socket, server) = crate::serve(ctx, test_runtime.task_executor.exit()).unwrap();
+    let (listening_socket, server) =
+        crate::serve(ctx.clone(), test_runtime.task_executor.exit()).unwrap();

     ApiServer {
+        ctx,
         server,
         listening_socket,
         network_rx: network_receivers,
diff --git a/beacon_node/http_api/tests/interactive_tests.rs b/beacon_node/http_api/tests/interactive_tests.rs
index 6fb197b41..d63d04fce 100644
--- a/beacon_node/http_api/tests/interactive_tests.rs
+++ b/beacon_node/http_api/tests/interactive_tests.rs
@@ -4,6 +4,7 @@ use beacon_chain::{
     test_utils::{AttestationStrategy, BlockStrategy, SyncCommitteeStrategy},
     ChainConfig,
 };
+use beacon_processor::work_reprocessing_queue::ReprocessQueueMessage;
 use eth2::types::ProduceBlockV3Response;
 use eth2::types::{DepositContractData, StateId};
 use execution_layer::{ForkchoiceState, PayloadAttributes};
@@ -840,3 +841,78 @@ pub async fn fork_choice_before_proposal() {
     // D's parent is B.
     assert_eq!(block_d.parent_root(), block_root_b.into());
 }
+
+// Test that attestations to unknown blocks are requeued and processed when their block arrives.
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+async fn queue_attestations_from_http() {
+    let validator_count = 128;
+    let all_validators = (0..validator_count).collect::<Vec<_>>();
+
+    let tester = InteractiveTester::<E>::new(None, validator_count).await;
+    let harness = &tester.harness;
+    let client = tester.client.clone();
+
+    let num_initial = 5;
+
+    // Slot of the block attested to.
+    let attestation_slot = Slot::new(num_initial) + 1;
+
+    // Make some initial blocks.
+    harness.advance_slot();
+    harness
+        .extend_chain(
+            num_initial as usize,
+            BlockStrategy::OnCanonicalHead,
+            AttestationStrategy::AllValidators,
+        )
+        .await;
+
+    harness.advance_slot();
+    assert_eq!(harness.get_current_slot(), attestation_slot);
+
+    // Make the attested-to block without applying it.
+    let pre_state = harness.get_current_state();
+    let (block, post_state) = harness.make_block(pre_state, attestation_slot).await;
+    let block_root = block.0.canonical_root();
+
+    // Make attestations to the block and POST them to the beacon node on a background thread.
+    let attestations = harness
+        .make_unaggregated_attestations(
+            &all_validators,
+            &post_state,
+            block.0.state_root(),
+            block_root.into(),
+            attestation_slot,
+        )
+        .into_iter()
+        .flat_map(|attestations| attestations.into_iter().map(|(att, _subnet)| att))
+        .collect::<Vec<_>>();
+
+    let attestation_future = tokio::spawn(async move {
+        client
+            .post_beacon_pool_attestations(&attestations)
+            .await
+            .expect("attestations should be processed successfully")
+    });
+
+    // In parallel, apply the block. We need to manually notify the reprocess queue, because the
+    // `beacon_chain` does not know about the queue and will not update it for us.
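+    // (In the full client the network's beacon processor sends `BlockImported` after each
+    // block import; the test harness has no network stack, hence the manual send below.)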
+ let parent_root = block.0.parent_root(); + harness + .process_block(attestation_slot, block_root, block) + .await + .unwrap(); + tester + .ctx + .beacon_processor_reprocess_send + .as_ref() + .unwrap() + .send(ReprocessQueueMessage::BlockImported { + block_root, + parent_root, + }) + .await + .unwrap(); + + attestation_future.await.unwrap(); +} diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index 2d946f309..a7ba2c1ab 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -248,6 +248,7 @@ impl ApiTester { let log = null_logger().unwrap(); let ApiServer { + ctx: _, server, listening_socket, network_rx, @@ -341,6 +342,7 @@ impl ApiTester { let log = null_logger().unwrap(); let ApiServer { + ctx: _, server, listening_socket, network_rx,