Handle processing results of non faulty batches (#3439)

## Issue Addressed
Solves #3390 

So after checking some logs @pawanjay176 got, we conclude that this happened because we blacklisted a chain after trying it "too much". Now here, in all occurrences it seems that "too much" means we got too many download failures. This happened very slowly, exactly because the batch is allowed to stay alive for very long times after not counting penalties when the ee is offline. The error here then was not that the batch failed because of offline ee errors, but that we blacklisted a chain because of download errors, which we can't pin on the chain but on the peer. This PR fixes that.

## Proposed Changes

Adds a missing piece of logic so that if a chain fails for errors that can't be attributed to an objectively bad behavior from the peer, it is not blacklisted. The issue at hand occurred when new peers arrived claiming a head that had wrongfully blacklisted, even if the original peers participating in the chain were not penalized.

Another notable change is that we need to consider a batch invalid if it processed correctly but its next non empty batch fails processing. Now since a batch can fail processing in non empty ways, there is no need to mark as invalid previous batches.

Improves some logging as well.

## Additional Info

We should do this regardless of pausing sync on ee offline/unsynced state. This is because I think it's almost impossible to ensure a processing result will reach in a predictable order with a synced notification from the ee. Doing this handles what I think are inevitable data races when we actually pause sync

This also fixes a return that reports which batch failed and caused us some confusion checking the logs
This commit is contained in:
Divma 2022-08-12 00:56:38 +00:00
parent a476ae4907
commit f4ffa9e0b4
12 changed files with 298 additions and 274 deletions

View File

@ -76,9 +76,7 @@ mod work_reprocessing_queue;
mod worker; mod worker;
use crate::beacon_processor::work_reprocessing_queue::QueuedGossipBlock; use crate::beacon_processor::work_reprocessing_queue::QueuedGossipBlock;
pub use worker::{ pub use worker::{ChainSegmentProcessId, GossipAggregatePackage, GossipAttestationPackage};
ChainSegmentProcessId, FailureMode, GossipAggregatePackage, GossipAttestationPackage,
};
/// The maximum size of the channel for work events to the `BeaconProcessor`. /// The maximum size of the channel for work events to the `BeaconProcessor`.
/// ///

View File

@ -10,7 +10,7 @@ mod rpc_methods;
mod sync_methods; mod sync_methods;
pub use gossip_methods::{GossipAggregatePackage, GossipAttestationPackage}; pub use gossip_methods::{GossipAggregatePackage, GossipAttestationPackage};
pub use sync_methods::{ChainSegmentProcessId, FailureMode}; pub use sync_methods::ChainSegmentProcessId;
pub(crate) const FUTURE_SLOT_TOLERANCE: u64 = 1; pub(crate) const FUTURE_SLOT_TOLERANCE: u64 = 1;

View File

@ -34,15 +34,6 @@ struct ChainSegmentFailed {
message: String, message: String,
/// Used to penalize peers. /// Used to penalize peers.
peer_action: Option<PeerAction>, peer_action: Option<PeerAction>,
/// Failure mode
mode: FailureMode,
}
/// Represents if a block processing failure was on the consensus or execution side.
#[derive(Debug)]
pub enum FailureMode {
ExecutionLayer { pause_sync: bool },
ConsensusLayer,
} }
impl<T: BeaconChainTypes> Worker<T> { impl<T: BeaconChainTypes> Worker<T> {
@ -150,7 +141,9 @@ impl<T: BeaconChainTypes> Worker<T> {
"last_block_slot" => end_slot, "last_block_slot" => end_slot,
"processed_blocks" => sent_blocks, "processed_blocks" => sent_blocks,
"service"=> "sync"); "service"=> "sync");
BatchProcessResult::Success(sent_blocks > 0) BatchProcessResult::Success {
was_non_empty: sent_blocks > 0,
}
} }
(imported_blocks, Err(e)) => { (imported_blocks, Err(e)) => {
debug!(self.log, "Batch processing failed"; debug!(self.log, "Batch processing failed";
@ -161,11 +154,12 @@ impl<T: BeaconChainTypes> Worker<T> {
"imported_blocks" => imported_blocks, "imported_blocks" => imported_blocks,
"error" => %e.message, "error" => %e.message,
"service" => "sync"); "service" => "sync");
match e.peer_action {
BatchProcessResult::Failed { Some(penalty) => BatchProcessResult::FaultyFailure {
imported_blocks: imported_blocks > 0, imported_blocks: imported_blocks > 0,
peer_action: e.peer_action, penalty,
mode: e.mode, },
None => BatchProcessResult::NonFaultyFailure,
} }
} }
} }
@ -184,7 +178,9 @@ impl<T: BeaconChainTypes> Worker<T> {
"last_block_slot" => end_slot, "last_block_slot" => end_slot,
"processed_blocks" => sent_blocks, "processed_blocks" => sent_blocks,
"service"=> "sync"); "service"=> "sync");
BatchProcessResult::Success(sent_blocks > 0) BatchProcessResult::Success {
was_non_empty: sent_blocks > 0,
}
} }
(_, Err(e)) => { (_, Err(e)) => {
debug!(self.log, "Backfill batch processing failed"; debug!(self.log, "Backfill batch processing failed";
@ -193,10 +189,12 @@ impl<T: BeaconChainTypes> Worker<T> {
"last_block_slot" => end_slot, "last_block_slot" => end_slot,
"error" => %e.message, "error" => %e.message,
"service" => "sync"); "service" => "sync");
BatchProcessResult::Failed { match e.peer_action {
imported_blocks: false, Some(penalty) => BatchProcessResult::FaultyFailure {
peer_action: e.peer_action, imported_blocks: false,
mode: e.mode, penalty,
},
None => BatchProcessResult::NonFaultyFailure,
} }
} }
} }
@ -216,15 +214,19 @@ impl<T: BeaconChainTypes> Worker<T> {
{ {
(imported_blocks, Err(e)) => { (imported_blocks, Err(e)) => {
debug!(self.log, "Parent lookup failed"; "error" => %e.message); debug!(self.log, "Parent lookup failed"; "error" => %e.message);
BatchProcessResult::Failed { match e.peer_action {
imported_blocks: imported_blocks > 0, Some(penalty) => BatchProcessResult::FaultyFailure {
peer_action: e.peer_action, imported_blocks: imported_blocks > 0,
mode: e.mode, penalty,
},
None => BatchProcessResult::NonFaultyFailure,
} }
} }
(imported_blocks, Ok(_)) => { (imported_blocks, Ok(_)) => {
debug!(self.log, "Parent lookup processed successfully"); debug!(self.log, "Parent lookup processed successfully");
BatchProcessResult::Success(imported_blocks > 0) BatchProcessResult::Success {
was_non_empty: imported_blocks > 0,
}
} }
} }
} }
@ -307,7 +309,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: String::from("mismatched_block_root"), message: String::from("mismatched_block_root"),
// The peer is faulty if they send blocks with bad roots. // The peer is faulty if they send blocks with bad roots.
peer_action: Some(PeerAction::LowToleranceError), peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::InvalidSignature HistoricalBlockError::InvalidSignature
@ -322,7 +323,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: "invalid_signature".into(), message: "invalid_signature".into(),
// The peer is faulty if they bad signatures. // The peer is faulty if they bad signatures.
peer_action: Some(PeerAction::LowToleranceError), peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::ValidatorPubkeyCacheTimeout => { HistoricalBlockError::ValidatorPubkeyCacheTimeout => {
@ -336,7 +336,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: "pubkey_cache_timeout".into(), message: "pubkey_cache_timeout".into(),
// This is an internal error, do not penalize the peer. // This is an internal error, do not penalize the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::NoAnchorInfo => { HistoricalBlockError::NoAnchorInfo => {
@ -347,7 +346,6 @@ impl<T: BeaconChainTypes> Worker<T> {
// There is no need to do a historical sync, this is not a fault of // There is no need to do a historical sync, this is not a fault of
// the peer. // the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::IndexOutOfBounds => { HistoricalBlockError::IndexOutOfBounds => {
@ -360,7 +358,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: String::from("logic_error"), message: String::from("logic_error"),
// This should never occur, don't penalize the peer. // This should never occur, don't penalize the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::BlockOutOfRange { .. } => { HistoricalBlockError::BlockOutOfRange { .. } => {
@ -373,7 +370,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: String::from("unexpected_error"), message: String::from("unexpected_error"),
// This should never occur, don't penalize the peer. // This should never occur, don't penalize the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
}, },
@ -383,7 +379,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("{:?}", other), message: format!("{:?}", other),
// This is an internal error, don't penalize the peer. // This is an internal error, don't penalize the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
}; };
@ -404,7 +399,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Block has an unknown parent: {}", block.parent_root()), message: format!("Block has an unknown parent: {}", block.parent_root()),
// Peers are faulty if they send non-sequential blocks. // Peers are faulty if they send non-sequential blocks.
peer_action: Some(PeerAction::LowToleranceError), peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
}) })
} }
BlockError::BlockIsAlreadyKnown => { BlockError::BlockIsAlreadyKnown => {
@ -442,7 +436,6 @@ impl<T: BeaconChainTypes> Worker<T> {
), ),
// Peers are faulty if they send blocks from the future. // Peers are faulty if they send blocks from the future.
peer_action: Some(PeerAction::LowToleranceError), peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
}) })
} }
BlockError::WouldRevertFinalizedSlot { .. } => { BlockError::WouldRevertFinalizedSlot { .. } => {
@ -464,7 +457,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Internal error whilst processing block: {:?}", e), message: format!("Internal error whilst processing block: {:?}", e),
// Do not penalize peers for internal errors. // Do not penalize peers for internal errors.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
}) })
} }
ref err @ BlockError::ExecutionPayloadError(ref epe) => { ref err @ BlockError::ExecutionPayloadError(ref epe) => {
@ -480,7 +472,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Execution layer offline. Reason: {:?}", err), message: format!("Execution layer offline. Reason: {:?}", err),
// Do not penalize peers for internal errors. // Do not penalize peers for internal errors.
peer_action: None, peer_action: None,
mode: FailureMode::ExecutionLayer { pause_sync: true },
}) })
} else { } else {
debug!(self.log, debug!(self.log,
@ -493,7 +484,6 @@ impl<T: BeaconChainTypes> Worker<T> {
err err
), ),
peer_action: Some(PeerAction::LowToleranceError), peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ExecutionLayer { pause_sync: false },
}) })
} }
} }
@ -508,7 +498,6 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Peer sent invalid block. Reason: {:?}", other), message: format!("Peer sent invalid block. Reason: {:?}", other),
// Do not penalize peers for internal errors. // Do not penalize peers for internal errors.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
}) })
} }
} }

View File

@ -8,10 +8,12 @@
//! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill //! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill
//! sync as failed, log an error and attempt to retry once a new peer joins the node. //! sync as failed, log an error and attempt to retry once a new peer joins the node.
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent as BeaconWorkEvent}; use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent};
use crate::sync::manager::{BatchProcessResult, Id}; use crate::sync::manager::{BatchProcessResult, Id};
use crate::sync::network_context::SyncNetworkContext; use crate::sync::network_context::SyncNetworkContext;
use crate::sync::range_sync::{BatchConfig, BatchId, BatchInfo, BatchProcessingResult, BatchState}; use crate::sync::range_sync::{
BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState,
};
use beacon_chain::{BeaconChain, BeaconChainTypes}; use beacon_chain::{BeaconChain, BeaconChainTypes};
use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::types::{BackFillState, NetworkGlobals};
use lighthouse_network::{PeerAction, PeerId}; use lighthouse_network::{PeerAction, PeerId};
@ -324,10 +326,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
for id in batch_ids { for id in batch_ids {
if let Some(batch) = self.batches.get_mut(&id) { if let Some(batch) = self.batches.get_mut(&id) {
match batch.download_failed(false) { match batch.download_failed(false) {
Ok(true) => { Ok(BatchOperationOutcome::Failed { blacklist: _ }) => {
self.fail_sync(BackFillError::BatchDownloadFailed(id))?; self.fail_sync(BackFillError::BatchDownloadFailed(id))?;
} }
Ok(false) => {} Ok(BatchOperationOutcome::Continue) => {}
Err(e) => { Err(e) => {
self.fail_sync(BackFillError::BatchInvalidState(id, e.0))?; self.fail_sync(BackFillError::BatchInvalidState(id, e.0))?;
} }
@ -371,8 +373,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
} }
match batch.download_failed(true) { match batch.download_failed(true) {
Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)),
Ok(true) => self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)), Ok(BatchOperationOutcome::Failed { blacklist: _ }) => {
Ok(false) => self.retry_batch_download(network, batch_id), self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))
}
Ok(BatchOperationOutcome::Continue) => self.retry_batch_download(network, batch_id),
} }
} else { } else {
// this could be an error for an old batch, removed when the chain advances // this could be an error for an old batch, removed when the chain advances
@ -439,7 +443,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
self.process_completed_batches(network) self.process_completed_batches(network)
} }
Err(result) => { Err(result) => {
let (expected_boundary, received_boundary, is_failed) = match result { let (expected_boundary, received_boundary, outcome) = match result {
Err(e) => { Err(e) => {
return self return self
.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) .fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))
@ -450,7 +454,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
warn!(self.log, "Batch received out of range blocks"; "expected_boundary" => expected_boundary, "received_boundary" => received_boundary, warn!(self.log, "Batch received out of range blocks"; "expected_boundary" => expected_boundary, "received_boundary" => received_boundary,
"peer_id" => %peer_id, batch); "peer_id" => %peer_id, batch);
if is_failed { if let BatchOperationOutcome::Failed { blacklist: _ } = outcome {
error!(self.log, "Backfill failed"; "epoch" => batch_id, "received_boundary" => received_boundary, "expected_boundary" => expected_boundary); error!(self.log, "Backfill failed"; "epoch" => batch_id, "received_boundary" => received_boundary, "expected_boundary" => expected_boundary);
return self return self
.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) .fail_sync(BackFillError::BatchDownloadFailed(batch_id))
@ -547,16 +551,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// blocks to continue, and the chain is expecting a processing result that won't // blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is // arrive. To mitigate this, (fake) fail this processing so that the batch is
// re-downloaded. // re-downloaded.
self.on_batch_process_result( self.on_batch_process_result(network, batch_id, &BatchProcessResult::NonFaultyFailure)
network,
batch_id,
&BatchProcessResult::Failed {
imported_blocks: false,
// The beacon processor queue is full, no need to penalize the peer.
peer_action: None,
mode: FailureMode::ConsensusLayer,
},
)
} else { } else {
Ok(ProcessResult::Successful) Ok(ProcessResult::Successful)
} }
@ -575,7 +570,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// The first two cases are possible in regular sync, should not occur in backfill, but we // The first two cases are possible in regular sync, should not occur in backfill, but we
// keep this logic for handling potential processing race conditions. // keep this logic for handling potential processing race conditions.
// result // result
match &self.current_processing_batch { let batch = match &self.current_processing_batch {
Some(processing_id) if *processing_id != batch_id => { Some(processing_id) if *processing_id != batch_id => {
debug!(self.log, "Unexpected batch result"; debug!(self.log, "Unexpected batch result";
"batch_epoch" => batch_id, "expected_batch_epoch" => processing_id); "batch_epoch" => batch_id, "expected_batch_epoch" => processing_id);
@ -589,13 +584,9 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
_ => { _ => {
// batch_id matches, continue // batch_id matches, continue
self.current_processing_batch = None; self.current_processing_batch = None;
}
}
match result { match self.batches.get_mut(&batch_id) {
BatchProcessResult::Success(was_non_empty) => { Some(batch) => batch,
let batch = match self.batches.get_mut(&batch_id) {
Some(v) => v,
None => { None => {
// This is an error. Fail the sync algorithm. // This is an error. Fail the sync algorithm.
return self return self
@ -605,8 +596,27 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
))) )))
.map(|_| ProcessResult::Successful); .map(|_| ProcessResult::Successful);
} }
}; }
}
};
let peer = match batch.current_peer() {
Some(v) => *v,
None => {
return self
.fail_sync(BackFillError::BatchInvalidState(
batch_id,
String::from("Peer does not exist"),
))
.map(|_| ProcessResult::Successful)
}
};
debug!(self.log, "Backfill batch processed"; "result" => ?result, &batch,
"batch_epoch" => batch_id, "peer" => %peer, "client" => %network.client_type(&peer));
match result {
BatchProcessResult::Success { was_non_empty } => {
if let Err(e) = batch.processing_completed(BatchProcessingResult::Success) { if let Err(e) = batch.processing_completed(BatchProcessingResult::Success) {
self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?;
} }
@ -636,45 +646,17 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
self.process_completed_batches(network) self.process_completed_batches(network)
} }
} }
BatchProcessResult::Failed { BatchProcessResult::FaultyFailure {
imported_blocks, imported_blocks,
peer_action, penalty,
mode: _,
} => { } => {
let batch = match self.batches.get_mut(&batch_id) { match batch.processing_completed(BatchProcessingResult::FaultyFailure) {
Some(v) => v,
None => {
return self
.fail_sync(BackFillError::InvalidSyncState(format!(
"Batch not found for current processing target {}",
batch_id
)))
.map(|_| ProcessResult::Successful)
}
};
let peer = match batch.current_peer() {
Some(v) => *v,
None => {
return self
.fail_sync(BackFillError::BatchInvalidState(
batch_id,
String::from("Peer does not exist"),
))
.map(|_| ProcessResult::Successful)
}
};
debug!(self.log, "Batch processing failed"; "imported_blocks" => imported_blocks,
"batch_epoch" => batch_id, "peer" => %peer, "client" => %network.client_type(&peer));
match batch.processing_completed(BatchProcessingResult::Failed {
count_attempt: peer_action.is_some(),
}) {
Err(e) => { Err(e) => {
// Batch was in the wrong state // Batch was in the wrong state
self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))
.map(|_| ProcessResult::Successful) .map(|_| ProcessResult::Successful)
} }
Ok(true) => { Ok(BatchOperationOutcome::Failed { blacklist: _ }) => {
// check that we have not exceeded the re-process retry counter // check that we have not exceeded the re-process retry counter
// If a batch has exceeded the invalid batch lookup attempts limit, it means // If a batch has exceeded the invalid batch lookup attempts limit, it means
// that it is likely all peers are sending invalid batches // that it is likely all peers are sending invalid batches
@ -683,23 +665,18 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
warn!( warn!(
self.log, self.log,
"Backfill batch failed to download. Penalizing peers"; "Backfill batch failed to download. Penalizing peers";
"score_adjustment" => %peer_action "score_adjustment" => %penalty,
.as_ref()
.map(ToString::to_string)
.unwrap_or_else(|| "None".into()),
"batch_epoch"=> batch_id "batch_epoch"=> batch_id
); );
if let Some(peer_action) = peer_action { for peer in self.participating_peers.drain() {
for peer in self.participating_peers.drain() { network.report_peer(peer, *penalty, "backfill_batch_failed");
network.report_peer(peer, *peer_action, "backfill_batch_failed");
}
} }
self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) self.fail_sync(BackFillError::BatchProcessingFailed(batch_id))
.map(|_| ProcessResult::Successful) .map(|_| ProcessResult::Successful)
} }
Ok(false) => { Ok(BatchOperationOutcome::Continue) => {
// chain can continue. Check if it can be progressed // chain can continue. Check if it can be progressed
if *imported_blocks { if *imported_blocks {
// At least one block was successfully verified and imported, then we can be sure all // At least one block was successfully verified and imported, then we can be sure all
@ -713,6 +690,14 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
} }
} }
} }
BatchProcessResult::NonFaultyFailure => {
if let Err(e) = batch.processing_completed(BatchProcessingResult::NonFaultyFailure)
{
self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?;
}
self.retry_batch_download(network, batch_id)
.map(|_| ProcessResult::Successful)
}
} }
} }
@ -905,11 +890,11 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
.validation_failed() .validation_failed()
.map_err(|e| BackFillError::BatchInvalidState(batch_id, e.0))? .map_err(|e| BackFillError::BatchInvalidState(batch_id, e.0))?
{ {
true => { BatchOperationOutcome::Failed { blacklist: _ } => {
// Batch has failed and cannot be redownloaded. // Batch has failed and cannot be redownloaded.
return self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)); return self.fail_sync(BackFillError::BatchProcessingFailed(batch_id));
} }
false => { BatchOperationOutcome::Continue => {
redownload_queue.push(*id); redownload_queue.push(*id);
} }
} }
@ -1010,8 +995,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
Err(e) => { Err(e) => {
self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?
} }
Ok(true) => self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))?, Ok(BatchOperationOutcome::Failed { blacklist: _ }) => {
Ok(false) => return self.retry_batch_download(network, batch_id), self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))?
}
Ok(BatchOperationOutcome::Continue) => {
return self.retry_batch_download(network, batch_id)
}
} }
} }
} }

View File

@ -11,7 +11,7 @@ use std::sync::Arc;
use store::{Hash256, SignedBeaconBlock}; use store::{Hash256, SignedBeaconBlock};
use tokio::sync::mpsc; use tokio::sync::mpsc;
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent}; use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent};
use crate::metrics; use crate::metrics;
use self::{ use self::{
@ -610,35 +610,26 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
chain_hash chain_hash
); );
#[cfg(not(debug_assertions))] #[cfg(not(debug_assertions))]
return crit!(self.log, "Chain process response for a parent lookup request that was not found"; "chain_hash" => %chain_hash); return debug!(self.log, "Chain process response for a parent lookup request that was not found"; "chain_hash" => %chain_hash);
}; };
debug!(self.log, "Parent chain processed"; "chain_hash" => %chain_hash, "result" => ?result); debug!(self.log, "Parent chain processed"; "chain_hash" => %chain_hash, "result" => ?result);
match result { match result {
BatchProcessResult::Success(_) => { BatchProcessResult::Success { .. } => {
// nothing to do. // nothing to do.
} }
BatchProcessResult::Failed { BatchProcessResult::FaultyFailure {
imported_blocks: _, imported_blocks: _,
peer_action, penalty,
mode,
} => { } => {
if let FailureMode::ExecutionLayer { pause_sync: _ } = mode { self.failed_chains.insert(parent_lookup.chain_hash());
debug!( for &peer_id in parent_lookup.used_peers() {
self.log, cx.report_peer(peer_id, penalty, "parent_chain_failure")
"Chain segment processing failed. Execution layer is offline";
"chain_hash" => %chain_hash,
"error" => ?mode
);
} else {
self.failed_chains.insert(parent_lookup.chain_hash());
if let Some(peer_action) = peer_action {
for &peer_id in parent_lookup.used_peers() {
cx.report_peer(peer_id, peer_action, "parent_chain_failure")
}
}
} }
} }
BatchProcessResult::NonFaultyFailure => {
// We might request this chain again if there is need but otherwise, don't try again
}
} }
metrics::set_gauge( metrics::set_gauge(

View File

@ -284,7 +284,10 @@ fn test_parent_lookup_happy_path() {
// Processing succeeds, now the rest of the chain should be sent for processing. // Processing succeeds, now the rest of the chain should be sent for processing.
bl.parent_block_processed(chain_hash, BlockError::BlockIsAlreadyKnown.into(), &mut cx); bl.parent_block_processed(chain_hash, BlockError::BlockIsAlreadyKnown.into(), &mut cx);
rig.expect_parent_chain_process(); rig.expect_parent_chain_process();
bl.parent_chain_processed(chain_hash, BatchProcessResult::Success(true), &mut cx); let process_result = BatchProcessResult::Success {
was_non_empty: true,
};
bl.parent_chain_processed(chain_hash, process_result, &mut cx);
assert_eq!(bl.parent_queue.len(), 0); assert_eq!(bl.parent_queue.len(), 0);
} }
@ -318,7 +321,10 @@ fn test_parent_lookup_wrong_response() {
// Processing succeeds, now the rest of the chain should be sent for processing. // Processing succeeds, now the rest of the chain should be sent for processing.
bl.parent_block_processed(chain_hash, Ok(()).into(), &mut cx); bl.parent_block_processed(chain_hash, Ok(()).into(), &mut cx);
rig.expect_parent_chain_process(); rig.expect_parent_chain_process();
bl.parent_chain_processed(chain_hash, BatchProcessResult::Success(true), &mut cx); let process_result = BatchProcessResult::Success {
was_non_empty: true,
};
bl.parent_chain_processed(chain_hash, process_result, &mut cx);
assert_eq!(bl.parent_queue.len(), 0); assert_eq!(bl.parent_queue.len(), 0);
} }
@ -347,7 +353,10 @@ fn test_parent_lookup_empty_response() {
// Processing succeeds, now the rest of the chain should be sent for processing. // Processing succeeds, now the rest of the chain should be sent for processing.
bl.parent_block_processed(chain_hash, Ok(()).into(), &mut cx); bl.parent_block_processed(chain_hash, Ok(()).into(), &mut cx);
rig.expect_parent_chain_process(); rig.expect_parent_chain_process();
bl.parent_chain_processed(chain_hash, BatchProcessResult::Success(true), &mut cx); let process_result = BatchProcessResult::Success {
was_non_empty: true,
};
bl.parent_chain_processed(chain_hash, process_result, &mut cx);
assert_eq!(bl.parent_queue.len(), 0); assert_eq!(bl.parent_queue.len(), 0);
} }
@ -375,7 +384,10 @@ fn test_parent_lookup_rpc_failure() {
// Processing succeeds, now the rest of the chain should be sent for processing. // Processing succeeds, now the rest of the chain should be sent for processing.
bl.parent_block_processed(chain_hash, Ok(()).into(), &mut cx); bl.parent_block_processed(chain_hash, Ok(()).into(), &mut cx);
rig.expect_parent_chain_process(); rig.expect_parent_chain_process();
bl.parent_chain_processed(chain_hash, BatchProcessResult::Success(true), &mut cx); let process_result = BatchProcessResult::Success {
was_non_empty: true,
};
bl.parent_chain_processed(chain_hash, process_result, &mut cx);
assert_eq!(bl.parent_queue.len(), 0); assert_eq!(bl.parent_queue.len(), 0);
} }

View File

@ -38,7 +38,7 @@ use super::block_lookups::BlockLookups;
use super::network_context::SyncNetworkContext; use super::network_context::SyncNetworkContext;
use super::peer_sync_info::{remote_sync_type, PeerSyncType}; use super::peer_sync_info::{remote_sync_type, PeerSyncType};
use super::range_sync::{RangeSync, RangeSyncType, EPOCHS_PER_BATCH}; use super::range_sync::{RangeSync, RangeSyncType, EPOCHS_PER_BATCH};
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent as BeaconWorkEvent}; use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent};
use crate::service::NetworkMessage; use crate::service::NetworkMessage;
use crate::status::ToStatusMessage; use crate::status::ToStatusMessage;
use beacon_chain::{BeaconChain, BeaconChainTypes, BlockError}; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockError};
@ -139,13 +139,15 @@ pub enum BlockProcessResult<T: EthSpec> {
#[derive(Debug)] #[derive(Debug)]
pub enum BatchProcessResult { pub enum BatchProcessResult {
/// The batch was completed successfully. It carries whether the sent batch contained blocks. /// The batch was completed successfully. It carries whether the sent batch contained blocks.
Success(bool), Success {
/// The batch processing failed. It carries whether the processing imported any block. was_non_empty: bool,
Failed {
imported_blocks: bool,
peer_action: Option<PeerAction>,
mode: FailureMode,
}, },
/// The batch processing failed. It carries whether the processing imported any block.
FaultyFailure {
imported_blocks: bool,
penalty: PeerAction,
},
NonFaultyFailure,
} }
/// The primary object for handling and driving all the current syncing logic. It maintains the /// The primary object for handling and driving all the current syncing logic. It maintains the

View File

@ -9,4 +9,4 @@ mod peer_sync_info;
mod range_sync; mod range_sync;
pub use manager::{BatchProcessResult, SyncMessage}; pub use manager::{BatchProcessResult, SyncMessage};
pub use range_sync::ChainId; pub use range_sync::{BatchOperationOutcome, ChainId};

View File

@ -70,12 +70,16 @@ impl BatchConfig for RangeSyncBatchConfig {
// Such errors should never be encountered. // Such errors should never be encountered.
pub struct WrongState(pub(crate) String); pub struct WrongState(pub(crate) String);
/// Auxiliary type alias for readability. /// After batch operations, we use this to communicate whether a batch can continue or not
type IsFailed = bool; pub enum BatchOperationOutcome {
Continue,
Failed { blacklist: bool },
}
pub enum BatchProcessingResult { pub enum BatchProcessingResult {
Success, Success,
Failed { count_attempt: bool }, FaultyFailure,
NonFaultyFailure,
} }
/// A segment of a chain. /// A segment of a chain.
@ -87,7 +91,7 @@ pub struct BatchInfo<T: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
/// The `Attempts` that have been made and failed to send us this batch. /// The `Attempts` that have been made and failed to send us this batch.
failed_processing_attempts: Vec<Attempt>, failed_processing_attempts: Vec<Attempt>,
/// Number of processing attempts that have failed but we do not count. /// Number of processing attempts that have failed but we do not count.
other_failed_processing_attempts: u8, non_faulty_processing_attempts: u8,
/// The number of download retries this batch has undergone due to a failed request. /// The number of download retries this batch has undergone due to a failed request.
failed_download_attempts: Vec<PeerId>, failed_download_attempts: Vec<PeerId>,
/// State of the batch. /// State of the batch.
@ -124,14 +128,6 @@ impl<T: EthSpec> BatchState<T> {
pub fn poison(&mut self) -> BatchState<T> { pub fn poison(&mut self) -> BatchState<T> {
std::mem::replace(self, BatchState::Poisoned) std::mem::replace(self, BatchState::Poisoned)
} }
pub fn is_failed(&self) -> IsFailed {
match self {
BatchState::Failed => true,
BatchState::Poisoned => unreachable!("Poisoned batch"),
_ => false,
}
}
} }
impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> { impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
@ -151,7 +147,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
end_slot, end_slot,
failed_processing_attempts: Vec::new(), failed_processing_attempts: Vec::new(),
failed_download_attempts: Vec::new(), failed_download_attempts: Vec::new(),
other_failed_processing_attempts: 0, non_faulty_processing_attempts: 0,
state: BatchState::AwaitingDownload, state: BatchState::AwaitingDownload,
marker: std::marker::PhantomData, marker: std::marker::PhantomData,
} }
@ -175,7 +171,16 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
peers peers
} }
/// Verifies if an incomming block belongs to this batch. /// Return the number of times this batch has failed downloading and failed processing, in this
/// order.
pub fn failed_attempts(&self) -> (usize, usize) {
(
self.failed_download_attempts.len(),
self.failed_processing_attempts.len(),
)
}
/// Verifies if an incoming block belongs to this batch.
pub fn is_expecting_block(&self, peer_id: &PeerId, request_id: &Id) -> bool { pub fn is_expecting_block(&self, peer_id: &PeerId, request_id: &Id) -> bool {
if let BatchState::Downloading(expected_peer, _, expected_id) = &self.state { if let BatchState::Downloading(expected_peer, _, expected_id) = &self.state {
return peer_id == expected_peer && expected_id == request_id; return peer_id == expected_peer && expected_id == request_id;
@ -203,6 +208,20 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
} }
} }
/// After different operations over a batch, this could be in a state that allows it to
/// continue, or in failed state. When the batch has failed, we check if it did mainly due to
/// processing failures. In this case the batch is considered failed and faulty.
pub fn outcome(&self) -> BatchOperationOutcome {
match self.state {
BatchState::Poisoned => unreachable!("Poisoned batch"),
BatchState::Failed => BatchOperationOutcome::Failed {
blacklist: self.failed_processing_attempts.len()
> self.failed_download_attempts.len(),
},
_ => BatchOperationOutcome::Continue,
}
}
pub fn state(&self) -> &BatchState<T> { pub fn state(&self) -> &BatchState<T> {
&self.state &self.state
} }
@ -235,7 +254,10 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
#[must_use = "Batch may have failed"] #[must_use = "Batch may have failed"]
pub fn download_completed( pub fn download_completed(
&mut self, &mut self,
) -> Result<usize /* Received blocks */, Result<(Slot, Slot, IsFailed), WrongState>> { ) -> Result<
usize, /* Received blocks */
Result<(Slot, Slot, BatchOperationOutcome), WrongState>,
> {
match self.state.poison() { match self.state.poison() {
BatchState::Downloading(peer, blocks, _request_id) => { BatchState::Downloading(peer, blocks, _request_id) => {
// verify that blocks are in range // verify that blocks are in range
@ -264,7 +286,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
BatchState::AwaitingDownload BatchState::AwaitingDownload
}; };
return Err(Ok((expected, received, self.state.is_failed()))); return Err(Ok((expected, received, self.outcome())));
} }
} }
@ -289,7 +311,10 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
/// THe `mark_failed` parameter, when set to false, does not increment the failed attempts of /// THe `mark_failed` parameter, when set to false, does not increment the failed attempts of
/// this batch and register the peer, rather attempts a re-download. /// this batch and register the peer, rather attempts a re-download.
#[must_use = "Batch may have failed"] #[must_use = "Batch may have failed"]
pub fn download_failed(&mut self, mark_failed: bool) -> Result<IsFailed, WrongState> { pub fn download_failed(
&mut self,
mark_failed: bool,
) -> Result<BatchOperationOutcome, WrongState> {
match self.state.poison() { match self.state.poison() {
BatchState::Downloading(peer, _, _request_id) => { BatchState::Downloading(peer, _, _request_id) => {
// register the attempt and check if the batch can be tried again // register the attempt and check if the batch can be tried again
@ -304,7 +329,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
// drop the blocks // drop the blocks
BatchState::AwaitingDownload BatchState::AwaitingDownload
}; };
Ok(self.state.is_failed()) Ok(self.outcome())
} }
BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Poisoned => unreachable!("Poisoned batch"),
other => { other => {
@ -359,32 +384,31 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
pub fn processing_completed( pub fn processing_completed(
&mut self, &mut self,
procesing_result: BatchProcessingResult, procesing_result: BatchProcessingResult,
) -> Result<IsFailed, WrongState> { ) -> Result<BatchOperationOutcome, WrongState> {
match self.state.poison() { match self.state.poison() {
BatchState::Processing(attempt) => { BatchState::Processing(attempt) => {
self.state = match procesing_result { self.state = match procesing_result {
BatchProcessingResult::Success => BatchState::AwaitingValidation(attempt), BatchProcessingResult::Success => BatchState::AwaitingValidation(attempt),
BatchProcessingResult::Failed { count_attempt } => { BatchProcessingResult::FaultyFailure => {
if count_attempt { // register the failed attempt
// register the failed attempt self.failed_processing_attempts.push(attempt);
self.failed_processing_attempts.push(attempt);
// check if the batch can be downloaded again // check if the batch can be downloaded again
if self.failed_processing_attempts.len() if self.failed_processing_attempts.len()
>= B::max_batch_processing_attempts() as usize >= B::max_batch_processing_attempts() as usize
{ {
BatchState::Failed BatchState::Failed
} else {
BatchState::AwaitingDownload
}
} else { } else {
self.other_failed_processing_attempts =
self.other_failed_processing_attempts.saturating_add(1);
BatchState::AwaitingDownload BatchState::AwaitingDownload
} }
} }
BatchProcessingResult::NonFaultyFailure => {
self.non_faulty_processing_attempts =
self.non_faulty_processing_attempts.saturating_add(1);
BatchState::AwaitingDownload
}
}; };
Ok(self.state.is_failed()) Ok(self.outcome())
} }
BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Poisoned => unreachable!("Poisoned batch"),
other => { other => {
@ -398,7 +422,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
} }
#[must_use = "Batch may have failed"] #[must_use = "Batch may have failed"]
pub fn validation_failed(&mut self) -> Result<IsFailed, WrongState> { pub fn validation_failed(&mut self) -> Result<BatchOperationOutcome, WrongState> {
match self.state.poison() { match self.state.poison() {
BatchState::AwaitingValidation(attempt) => { BatchState::AwaitingValidation(attempt) => {
self.failed_processing_attempts.push(attempt); self.failed_processing_attempts.push(attempt);
@ -411,7 +435,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
} else { } else {
BatchState::AwaitingDownload BatchState::AwaitingDownload
}; };
Ok(self.state.is_failed()) Ok(self.outcome())
} }
BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Poisoned => unreachable!("Poisoned batch"),
other => { other => {
@ -472,10 +496,7 @@ impl<T: EthSpec, B: BatchConfig> slog::KV for BatchInfo<T, B> {
)?; )?;
serializer.emit_usize("downloaded", self.failed_download_attempts.len())?; serializer.emit_usize("downloaded", self.failed_download_attempts.len())?;
serializer.emit_usize("processed", self.failed_processing_attempts.len())?; serializer.emit_usize("processed", self.failed_processing_attempts.len())?;
serializer.emit_u8( serializer.emit_u8("processed_no_penalty", self.non_faulty_processing_attempts)?;
"processed_no_penalty",
self.other_failed_processing_attempts,
)?;
serializer.emit_arguments("state", &format_args!("{:?}", self.state))?; serializer.emit_arguments("state", &format_args!("{:?}", self.state))?;
slog::Result::Ok(()) slog::Result::Ok(())
} }

View File

@ -1,7 +1,8 @@
use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
use crate::beacon_processor::WorkEvent as BeaconWorkEvent; use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent};
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode}; use crate::sync::{
use crate::sync::{manager::Id, network_context::SyncNetworkContext, BatchProcessResult}; manager::Id, network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult,
};
use beacon_chain::{BeaconChainTypes, CountUnrealized}; use beacon_chain::{BeaconChainTypes, CountUnrealized};
use fnv::FnvHashMap; use fnv::FnvHashMap;
use lighthouse_network::{PeerAction, PeerId}; use lighthouse_network::{PeerAction, PeerId};
@ -37,7 +38,11 @@ pub type ProcessingResult = Result<KeepChain, RemoveChain>;
pub enum RemoveChain { pub enum RemoveChain {
EmptyPeerPool, EmptyPeerPool,
ChainCompleted, ChainCompleted,
ChainFailed(BatchId), /// A chain has failed. This boolean signals whether the chain should be blacklisted.
ChainFailed {
blacklist: bool,
failing_batch: BatchId,
},
WrongBatchState(String), WrongBatchState(String),
WrongChainState(String), WrongChainState(String),
} }
@ -187,8 +192,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// fail the batches // fail the batches
for id in batch_ids { for id in batch_ids {
if let Some(batch) = self.batches.get_mut(&id) { if let Some(batch) = self.batches.get_mut(&id) {
if batch.download_failed(true)? { if let BatchOperationOutcome::Failed { blacklist } =
return Err(RemoveChain::ChainFailed(id)); batch.download_failed(true)?
{
return Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: id,
});
} }
self.retry_batch_download(network, id)?; self.retry_batch_download(network, id)?;
} else { } else {
@ -265,12 +275,15 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.process_completed_batches(network) self.process_completed_batches(network)
} }
Err(result) => { Err(result) => {
let (expected_boundary, received_boundary, is_failed) = result?; let (expected_boundary, received_boundary, outcome) = result?;
warn!(self.log, "Batch received out of range blocks"; "expected_boundary" => expected_boundary, "received_boundary" => received_boundary, warn!(self.log, "Batch received out of range blocks"; "expected_boundary" => expected_boundary, "received_boundary" => received_boundary,
"peer_id" => %peer_id, batch); "peer_id" => %peer_id, batch);
if is_failed { if let BatchOperationOutcome::Failed { blacklist } = outcome {
return Err(RemoveChain::ChainFailed(batch_id)); return Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: batch_id,
});
} }
// this batch can't be used, so we need to request it again. // this batch can't be used, so we need to request it again.
self.retry_batch_download(network, batch_id) self.retry_batch_download(network, batch_id)
@ -324,15 +337,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// blocks to continue, and the chain is expecting a processing result that won't // blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is // arrive. To mitigate this, (fake) fail this processing so that the batch is
// re-downloaded. // re-downloaded.
self.on_batch_process_result( self.on_batch_process_result(network, batch_id, &BatchProcessResult::NonFaultyFailure)
network,
batch_id,
&BatchProcessResult::Failed {
imported_blocks: false,
peer_action: None,
mode: FailureMode::ConsensusLayer,
},
)
} else { } else {
Ok(KeepChain) Ok(KeepChain)
} }
@ -448,7 +453,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
) -> ProcessingResult { ) -> ProcessingResult {
// the first two cases are possible if the chain advances while waiting for a processing // the first two cases are possible if the chain advances while waiting for a processing
// result // result
match &self.current_processing_batch { let batch = match &self.current_processing_batch {
Some(processing_id) if *processing_id != batch_id => { Some(processing_id) if *processing_id != batch_id => {
debug!(self.log, "Unexpected batch result"; debug!(self.log, "Unexpected batch result";
"batch_epoch" => batch_id, "expected_batch_epoch" => processing_id); "batch_epoch" => batch_id, "expected_batch_epoch" => processing_id);
@ -462,22 +467,35 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
_ => { _ => {
// batch_id matches, continue // batch_id matches, continue
self.current_processing_batch = None; self.current_processing_batch = None;
} self.batches.get_mut(&batch_id).ok_or_else(|| {
}
match result {
BatchProcessResult::Success(was_non_empty) => {
let batch = self.batches.get_mut(&batch_id).ok_or_else(|| {
RemoveChain::WrongChainState(format!( RemoveChain::WrongChainState(format!(
"Current processing batch not found: {}", "Current processing batch not found: {}",
batch_id batch_id
)) ))
})?; })?
}
};
let peer = batch.current_peer().cloned().ok_or_else(|| {
RemoveChain::WrongBatchState(format!(
"Processing target is in wrong state: {:?}",
batch.state(),
))
})?;
// Log the process result and the batch for debugging purposes.
debug!(self.log, "Batch processing result"; "result" => ?result, &batch,
"batch_epoch" => batch_id, "client" => %network.client_type(&peer));
// We consider three cases. Batch was successfully processed, Batch failed processing due
// to a faulty peer, or batch failed processing but the peer can't be deemed faulty.
match result {
BatchProcessResult::Success { was_non_empty } => {
batch.processing_completed(BatchProcessingResult::Success)?; batch.processing_completed(BatchProcessingResult::Success)?;
// If the processed batch was not empty, we can validate previous unvalidated
// blocks.
if *was_non_empty { if *was_non_empty {
// If the processed batch was not empty, we can validate previous unvalidated
// blocks.
self.advance_chain(network, batch_id); self.advance_chain(network, batch_id);
// we register so that on chain switching we don't try it again // we register so that on chain switching we don't try it again
self.attempted_optimistic_starts.insert(batch_id); self.attempted_optimistic_starts.insert(batch_id);
@ -507,64 +525,56 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.process_completed_batches(network) self.process_completed_batches(network)
} }
} }
BatchProcessResult::Failed { BatchProcessResult::FaultyFailure {
imported_blocks, imported_blocks,
peer_action, penalty,
mode: _,
} => { } => {
let batch = self.batches.get_mut(&batch_id).ok_or_else(|| { // Penalize the peer appropiately.
RemoveChain::WrongChainState(format!( network.report_peer(peer, *penalty, "faulty_batch");
"Batch not found for current processing target {}",
batch_id
))
})?;
let peer = batch.current_peer().cloned().ok_or_else(|| {
RemoveChain::WrongBatchState(format!(
"Processing target is in wrong state: {:?}",
batch.state(),
))
})?;
debug!(self.log, "Batch processing failed"; "imported_blocks" => imported_blocks, "peer_penalty" => ?peer_action,
"batch_epoch" => batch_id, "peer" => %peer, "client" => %network.client_type(&peer));
if batch.processing_completed(BatchProcessingResult::Failed { // Check if this batch is allowed to continue
count_attempt: peer_action.is_some(), match batch.processing_completed(BatchProcessingResult::FaultyFailure)? {
})? { BatchOperationOutcome::Continue => {
// check that we have not exceeded the re-process retry counter // Chain can continue. Check if it can be moved forward.
// If a batch has exceeded the invalid batch lookup attempts limit, it means if *imported_blocks {
// that it is likely all peers in this chain are are sending invalid batches // At least one block was successfully verified and imported, so we can be sure all
// repeatedly and are either malicious or faulty. We drop the chain and // previous batches are valid and we only need to download the current failed
// report all peers. // batch.
// There are some edge cases with forks that could land us in this situation. self.advance_chain(network, batch_id);
// This should be unlikely, so we tolerate these errors, but not often.
warn!(
self.log,
"Batch failed to download. Dropping chain scoring peers";
"score_adjustment" => %peer_action
.as_ref()
.map(ToString::to_string)
.unwrap_or_else(|| "None".into()),
"batch_epoch"=> batch_id
);
if let Some(peer_action) = peer_action {
for (peer, _) in self.peers.drain() {
network.report_peer(peer, *peer_action, "batch_failed");
} }
// Handle this invalid batch, that is within the re-process retries limit.
self.handle_invalid_batch(network, batch_id)
} }
Err(RemoveChain::ChainFailed(batch_id)) BatchOperationOutcome::Failed { blacklist } => {
} else { // Check that we have not exceeded the re-process retry counter,
// chain can continue. Check if it can be moved forward // If a batch has exceeded the invalid batch lookup attempts limit, it means
if *imported_blocks { // that it is likely all peers in this chain are are sending invalid batches
// At least one block was successfully verified and imported, so we can be sure all // repeatedly and are either malicious or faulty. We drop the chain and
// previous batches are valid and we only need to download the current failed // report all peers.
// batch. // There are some edge cases with forks that could land us in this situation.
self.advance_chain(network, batch_id); // This should be unlikely, so we tolerate these errors, but not often.
warn!(
self.log,
"Batch failed to download. Dropping chain scoring peers";
"score_adjustment" => %penalty,
"batch_epoch"=> batch_id,
);
for (peer, _) in self.peers.drain() {
network.report_peer(peer, *penalty, "faulty_chain");
}
Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: batch_id,
})
} }
// Handle this invalid batch, that is within the re-process retries limit.
self.handle_invalid_batch(network, batch_id)
} }
} }
BatchProcessResult::NonFaultyFailure => {
batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?;
// Simply redownload the batch.
self.retry_batch_download(network, batch_id)
}
} }
} }
@ -737,9 +747,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
let mut redownload_queue = Vec::new(); let mut redownload_queue = Vec::new();
for (id, batch) in self.batches.range_mut(..batch_id) { for (id, batch) in self.batches.range_mut(..batch_id) {
if batch.validation_failed()? { if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? {
// remove the chain early // remove the chain early
return Err(RemoveChain::ChainFailed(batch_id)); return Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: *id,
});
} }
redownload_queue.push(*id); redownload_queue.push(*id);
} }
@ -836,8 +849,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
if let Some(active_requests) = self.peers.get_mut(peer_id) { if let Some(active_requests) = self.peers.get_mut(peer_id) {
active_requests.remove(&batch_id); active_requests.remove(&batch_id);
} }
if batch.download_failed(true)? { if let BatchOperationOutcome::Failed { blacklist } = batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(batch_id)); return Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: batch_id,
});
} }
self.retry_batch_download(network, batch_id) self.retry_batch_download(network, batch_id)
} else { } else {
@ -925,10 +941,16 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.peers self.peers
.get_mut(&peer) .get_mut(&peer)
.map(|request| request.remove(&batch_id)); .map(|request| request.remove(&batch_id));
if batch.download_failed(true)? { match batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(batch_id)); BatchOperationOutcome::Failed { blacklist } => {
} else { return Err(RemoveChain::ChainFailed {
return self.retry_batch_download(network, batch_id); blacklist,
failing_batch: batch_id,
})
}
BatchOperationOutcome::Continue => {
return self.retry_batch_download(network, batch_id)
}
} }
} }
} }

View File

@ -8,7 +8,7 @@ mod chain_collection;
mod range; mod range;
mod sync_type; mod sync_type;
pub use batch::{BatchConfig, BatchInfo, BatchProcessingResult, BatchState}; pub use batch::{BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState};
pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH};
pub use range::RangeSync; pub use range::RangeSync;
pub use sync_type::RangeSyncType; pub use sync_type::RangeSyncType;

View File

@ -356,8 +356,8 @@ where
debug!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op); debug!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
} }
if let RemoveChain::ChainFailed(_) = remove_reason { if let RemoveChain::ChainFailed { blacklist, .. } = remove_reason {
if RangeSyncType::Finalized == sync_type { if RangeSyncType::Finalized == sync_type && blacklist {
warn!(self.log, "Chain failed! Syncing to its head won't be retried for at least the next {} seconds", FAILED_CHAINS_EXPIRY_SECONDS; &chain); warn!(self.log, "Chain failed! Syncing to its head won't be retried for at least the next {} seconds", FAILED_CHAINS_EXPIRY_SECONDS; &chain);
self.failed_chains.insert(chain.target_head_root); self.failed_chains.insert(chain.target_head_root);
} }