Avoid peer penalties on internal errors for batch block import (#2898)

## Issue Addressed

NA

## Proposed Changes

I've observed some Prater nodes (and potentially some mainnet nodes) banning peers due to validator pubkey cache lock timeouts. For the `BeaconChainError`-type of errors, they're caused by internal faults and we can't necessarily tell if the peer is bad or not. I think this is causing us to ban peers unnecessarily when running on under-resourced machines.

## Additional Info

NA
This commit is contained in:
Paul Hauner 2022-01-11 05:33:28 +00:00
parent 6976796162
commit 4848e53155
4 changed files with 156 additions and 46 deletions

View File

@ -7,7 +7,7 @@ use crate::sync::{BatchProcessResult, ChainId};
use beacon_chain::{ use beacon_chain::{
BeaconChainError, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, BeaconChainError, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError,
}; };
use lighthouse_network::PeerId; use lighthouse_network::{PeerAction, PeerId};
use slog::{crit, debug, error, info, trace, warn}; use slog::{crit, debug, error, info, trace, warn};
use tokio::sync::mpsc; use tokio::sync::mpsc;
use types::{Epoch, Hash256, SignedBeaconBlock}; use types::{Epoch, Hash256, SignedBeaconBlock};
@ -23,6 +23,14 @@ pub enum ProcessId {
ParentLookup(PeerId, Hash256), ParentLookup(PeerId, Hash256),
} }
/// Returned when a chain segment import fails.
struct ChainSegmentFailed {
/// To be displayed in logs.
message: String,
/// Used to penalize peers.
peer_action: Option<PeerAction>,
}
impl<T: BeaconChainTypes> Worker<T> { impl<T: BeaconChainTypes> Worker<T> {
/// Attempt to process a block received from a direct RPC request, returning the processing /// Attempt to process a block received from a direct RPC request, returning the processing
/// result on the `result_tx` channel. /// result on the `result_tx` channel.
@ -123,9 +131,13 @@ impl<T: BeaconChainTypes> Worker<T> {
"chain" => chain_id, "chain" => chain_id,
"last_block_slot" => end_slot, "last_block_slot" => end_slot,
"imported_blocks" => imported_blocks, "imported_blocks" => imported_blocks,
"error" => e, "error" => %e.message,
"service" => "sync"); "service" => "sync");
BatchProcessResult::Failed(imported_blocks > 0)
BatchProcessResult::Failed {
imported_blocks: imported_blocks > 0,
peer_action: e.peer_action,
}
} }
}; };
@ -154,9 +166,12 @@ impl<T: BeaconChainTypes> Worker<T> {
"batch_epoch" => epoch, "batch_epoch" => epoch,
"first_block_slot" => start_slot, "first_block_slot" => start_slot,
"last_block_slot" => end_slot, "last_block_slot" => end_slot,
"error" => e, "error" => %e.message,
"service" => "sync"); "service" => "sync");
BatchProcessResult::Failed(false) BatchProcessResult::Failed {
imported_blocks: false,
peer_action: e.peer_action,
}
} }
}; };
@ -175,7 +190,7 @@ impl<T: BeaconChainTypes> Worker<T> {
// reverse // reverse
match self.process_blocks(downloaded_blocks.iter().rev()) { match self.process_blocks(downloaded_blocks.iter().rev()) {
(_, Err(e)) => { (_, Err(e)) => {
debug!(self.log, "Parent lookup failed"; "last_peer_id" => %peer_id, "error" => e); debug!(self.log, "Parent lookup failed"; "last_peer_id" => %peer_id, "error" => %e.message);
self.send_sync_message(SyncMessage::ParentLookupFailed { self.send_sync_message(SyncMessage::ParentLookupFailed {
peer_id, peer_id,
chain_head, chain_head,
@ -193,7 +208,7 @@ impl<T: BeaconChainTypes> Worker<T> {
fn process_blocks<'a>( fn process_blocks<'a>(
&self, &self,
downloaded_blocks: impl Iterator<Item = &'a SignedBeaconBlock<T::EthSpec>>, downloaded_blocks: impl Iterator<Item = &'a SignedBeaconBlock<T::EthSpec>>,
) -> (usize, Result<(), String>) { ) -> (usize, Result<(), ChainSegmentFailed>) {
let blocks = downloaded_blocks.cloned().collect::<Vec<_>>(); let blocks = downloaded_blocks.cloned().collect::<Vec<_>>();
match self.chain.process_chain_segment(blocks) { match self.chain.process_chain_segment(blocks) {
ChainSegmentResult::Successful { imported_blocks } => { ChainSegmentResult::Successful { imported_blocks } => {
@ -223,7 +238,7 @@ impl<T: BeaconChainTypes> Worker<T> {
fn process_backfill_blocks( fn process_backfill_blocks(
&self, &self,
blocks: &[SignedBeaconBlock<T::EthSpec>], blocks: &[SignedBeaconBlock<T::EthSpec>],
) -> (usize, Result<(), String>) { ) -> (usize, Result<(), ChainSegmentFailed>) {
match self.chain.import_historical_block_batch(blocks) { match self.chain.import_historical_block_batch(blocks) {
Ok(imported_blocks) => { Ok(imported_blocks) => {
metrics::inc_counter( metrics::inc_counter(
@ -250,7 +265,12 @@ impl<T: BeaconChainTypes> Worker<T> {
"block_root" => ?block_root, "block_root" => ?block_root,
"expected_root" => ?expected_block_root "expected_root" => ?expected_block_root
); );
String::from("mismatched_block_root")
ChainSegmentFailed {
message: String::from("mismatched_block_root"),
// The peer is faulty if they send blocks with bad roots.
peer_action: Some(PeerAction::LowToleranceError),
}
} }
HistoricalBlockError::InvalidSignature HistoricalBlockError::InvalidSignature
| HistoricalBlockError::SignatureSet(_) => { | HistoricalBlockError::SignatureSet(_) => {
@ -259,7 +279,12 @@ impl<T: BeaconChainTypes> Worker<T> {
"Backfill batch processing error"; "Backfill batch processing error";
"error" => ?e "error" => ?e
); );
"invalid_signature".into()
ChainSegmentFailed {
message: "invalid_signature".into(),
// The peer is faulty if they bad signatures.
peer_action: Some(PeerAction::LowToleranceError),
}
} }
HistoricalBlockError::ValidatorPubkeyCacheTimeout => { HistoricalBlockError::ValidatorPubkeyCacheTimeout => {
warn!( warn!(
@ -267,25 +292,55 @@ impl<T: BeaconChainTypes> Worker<T> {
"Backfill batch processing error"; "Backfill batch processing error";
"error" => "pubkey_cache_timeout" "error" => "pubkey_cache_timeout"
); );
"pubkey_cache_timeout".into()
ChainSegmentFailed {
message: "pubkey_cache_timeout".into(),
// This is an internal error, do not penalize the peer.
peer_action: None,
}
} }
HistoricalBlockError::NoAnchorInfo => { HistoricalBlockError::NoAnchorInfo => {
warn!(self.log, "Backfill not required"); warn!(self.log, "Backfill not required");
String::from("no_anchor_info")
ChainSegmentFailed {
message: String::from("no_anchor_info"),
// There is no need to do a historical sync, this is not a fault of
// the peer.
peer_action: None,
}
} }
HistoricalBlockError::IndexOutOfBounds HistoricalBlockError::IndexOutOfBounds => {
| HistoricalBlockError::BlockOutOfRange { .. } => {
error!( error!(
self.log, self.log,
"Backfill batch processing error"; "Backfill batch OOB error";
"error" => ?e, "error" => ?e,
); );
String::from("logic_error") ChainSegmentFailed {
message: String::from("logic_error"),
// This should never occur, don't penalize the peer.
peer_action: None,
}
}
HistoricalBlockError::BlockOutOfRange { .. } => {
error!(
self.log,
"Backfill batch error";
"error" => ?e,
);
ChainSegmentFailed {
message: String::from("unexpected_error"),
// This should never occur, don't penalize the peer.
peer_action: None,
}
} }
}, },
other => { other => {
warn!(self.log, "Backfill batch processing error"; "error" => ?other); warn!(self.log, "Backfill batch processing error"; "error" => ?other);
format!("{:?}", other) ChainSegmentFailed {
message: format!("{:?}", other),
// This is an internal error, don't penalize the peer.
peer_action: None,
}
} }
}; };
(0, Err(err)) (0, Err(err))
@ -312,15 +367,18 @@ impl<T: BeaconChainTypes> Worker<T> {
} }
/// Helper function to handle a `BlockError` from `process_chain_segment` /// Helper function to handle a `BlockError` from `process_chain_segment`
fn handle_failed_chain_segment(&self, error: BlockError<T::EthSpec>) -> Result<(), String> { fn handle_failed_chain_segment(
&self,
error: BlockError<T::EthSpec>,
) -> Result<(), ChainSegmentFailed> {
match error { match error {
BlockError::ParentUnknown(block) => { BlockError::ParentUnknown(block) => {
// blocks should be sequential and all parents should exist // blocks should be sequential and all parents should exist
Err(ChainSegmentFailed {
Err(format!( message: format!("Block has an unknown parent: {}", block.parent_root()),
"Block has an unknown parent: {}", // Peers are faulty if they send non-sequential blocks.
block.parent_root() peer_action: Some(PeerAction::LowToleranceError),
)) })
} }
BlockError::BlockIsAlreadyKnown => { BlockError::BlockIsAlreadyKnown => {
// This can happen for many reasons. Head sync's can download multiples and parent // This can happen for many reasons. Head sync's can download multiples and parent
@ -350,10 +408,14 @@ impl<T: BeaconChainTypes> Worker<T> {
); );
} }
Err(format!( Err(ChainSegmentFailed {
"Block with slot {} is higher than the current slot {}", message: format!(
block_slot, present_slot "Block with slot {} is higher than the current slot {}",
)) block_slot, present_slot
),
// Peers are faulty if they send blocks from the future.
peer_action: Some(PeerAction::LowToleranceError),
})
} }
BlockError::WouldRevertFinalizedSlot { .. } => { BlockError::WouldRevertFinalizedSlot { .. } => {
debug!(self.log, "Finalized or earlier block processed";); debug!(self.log, "Finalized or earlier block processed";);
@ -370,7 +432,11 @@ impl<T: BeaconChainTypes> Worker<T> {
"outcome" => ?e, "outcome" => ?e,
); );
Err(format!("Internal error whilst processing block: {:?}", e)) Err(ChainSegmentFailed {
message: format!("Internal error whilst processing block: {:?}", e),
// Do not penalize peers for internal errors.
peer_action: None,
})
} }
other => { other => {
debug!( debug!(
@ -379,7 +445,11 @@ impl<T: BeaconChainTypes> Worker<T> {
"outcome" => %other, "outcome" => %other,
); );
Err(format!("Peer sent invalid block. Reason: {:?}", other)) Err(ChainSegmentFailed {
message: format!("Peer sent invalid block. Reason: {:?}", other),
// Do not penalize peers for internal errors.
peer_action: None,
})
} }
} }
} }

View File

@ -541,7 +541,15 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// blocks to continue, and the chain is expecting a processing result that won't // blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is // arrive. To mitigate this, (fake) fail this processing so that the batch is
// re-downloaded. // re-downloaded.
self.on_batch_process_result(network, batch_id, &BatchProcessResult::Failed(false)) self.on_batch_process_result(
network,
batch_id,
&BatchProcessResult::Failed {
imported_blocks: false,
// The beacon processor queue is full, no need to penalize the peer.
peer_action: None,
},
)
} else { } else {
Ok(ProcessResult::Successful) Ok(ProcessResult::Successful)
} }
@ -621,7 +629,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
self.process_completed_batches(network) self.process_completed_batches(network)
} }
} }
BatchProcessResult::Failed(imported_blocks) => { BatchProcessResult::Failed {
imported_blocks,
peer_action,
} => {
let batch = match self.batches.get_mut(&batch_id) { let batch = match self.batches.get_mut(&batch_id) {
Some(v) => v, Some(v) => v,
None => { None => {
@ -659,12 +670,20 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// that it is likely all peers are sending invalid batches // that it is likely all peers are sending invalid batches
// repeatedly and are either malicious or faulty. We stop the backfill sync and // repeatedly and are either malicious or faulty. We stop the backfill sync and
// report all synced peers that have participated. // report all synced peers that have participated.
let action = PeerAction::LowToleranceError; warn!(
warn!(self.log, "Backfill batch failed to download. Penalizing peers"; self.log,
"score_adjustment" => %action, "Backfill batch failed to download. Penalizing peers";
"batch_epoch"=> batch_id); "score_adjustment" => %peer_action
for peer in self.participating_peers.drain() { .as_ref()
network.report_peer(peer, action); .map(ToString::to_string)
.unwrap_or_else(|| "None".into()),
"batch_epoch"=> batch_id
);
if let Some(peer_action) = peer_action {
for peer in self.participating_peers.drain() {
network.report_peer(peer, *peer_action);
}
} }
self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) self.fail_sync(BackFillError::BatchProcessingFailed(batch_id))
.map(|_| ProcessResult::Successful) .map(|_| ProcessResult::Successful)

View File

@ -137,7 +137,10 @@ pub enum BatchProcessResult {
/// The batch was completed successfully. It carries whether the sent batch contained blocks. /// The batch was completed successfully. It carries whether the sent batch contained blocks.
Success(bool), Success(bool),
/// The batch processing failed. It carries whether the processing imported any block. /// The batch processing failed. It carries whether the processing imported any block.
Failed(bool), Failed {
imported_blocks: bool,
peer_action: Option<PeerAction>,
},
} }
/// Maintains a sequential list of parents to lookup and the lookup's current state. /// Maintains a sequential list of parents to lookup and the lookup's current state.

View File

@ -313,7 +313,14 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// blocks to continue, and the chain is expecting a processing result that won't // blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is // arrive. To mitigate this, (fake) fail this processing so that the batch is
// re-downloaded. // re-downloaded.
self.on_batch_process_result(network, batch_id, &BatchProcessResult::Failed(false)) self.on_batch_process_result(
network,
batch_id,
&BatchProcessResult::Failed {
imported_blocks: false,
peer_action: None,
},
)
} else { } else {
Ok(KeepChain) Ok(KeepChain)
} }
@ -488,7 +495,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.process_completed_batches(network) self.process_completed_batches(network)
} }
} }
BatchProcessResult::Failed(imported_blocks) => { BatchProcessResult::Failed {
imported_blocks,
peer_action,
} => {
let batch = self.batches.get_mut(&batch_id).ok_or_else(|| { let batch = self.batches.get_mut(&batch_id).ok_or_else(|| {
RemoveChain::WrongChainState(format!( RemoveChain::WrongChainState(format!(
"Batch not found for current processing target {}", "Batch not found for current processing target {}",
@ -511,12 +521,20 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// report all peers. // report all peers.
// There are some edge cases with forks that could land us in this situation. // There are some edge cases with forks that could land us in this situation.
// This should be unlikely, so we tolerate these errors, but not often. // This should be unlikely, so we tolerate these errors, but not often.
let action = PeerAction::LowToleranceError; warn!(
warn!(self.log, "Batch failed to download. Dropping chain scoring peers"; self.log,
"score_adjustment" => %action, "Batch failed to download. Dropping chain scoring peers";
"batch_epoch"=> batch_id); "score_adjustment" => %peer_action
for (peer, _) in self.peers.drain() { .as_ref()
network.report_peer(peer, action); .map(ToString::to_string)
.unwrap_or_else(|| "None".into()),
"batch_epoch"=> batch_id
);
if let Some(peer_action) = peer_action {
for (peer, _) in self.peers.drain() {
network.report_peer(peer, *peer_action);
}
} }
Err(RemoveChain::ChainFailed(batch_id)) Err(RemoveChain::ChainFailed(batch_id))
} else { } else {