Do not penalize peers on execution layer offline errors (#3258)

## Issue Addressed

Partly resolves https://github.com/sigp/lighthouse/issues/3032

## Proposed Changes

Extracts some of the functionality of #3094 into a separate PR as the original PR requires a bit more work.
Avoid unnecessarily penalizing peers when validation of a received execution payload fails only because our own execution layer is offline or a request to it failed.
This commit is contained in:
Pawan Dhananjay 2022-06-19 23:13:40 +00:00
parent 21b3425a12
commit f428719761
8 changed files with 121 additions and 11 deletions

View File

@ -75,7 +75,9 @@ mod work_reprocessing_queue;
mod worker; mod worker;
use crate::beacon_processor::work_reprocessing_queue::QueuedBlock; use crate::beacon_processor::work_reprocessing_queue::QueuedBlock;
pub use worker::{ChainSegmentProcessId, GossipAggregatePackage, GossipAttestationPackage}; pub use worker::{
ChainSegmentProcessId, FailureMode, GossipAggregatePackage, GossipAttestationPackage,
};
/// The maximum size of the channel for work events to the `BeaconProcessor`. /// The maximum size of the channel for work events to the `BeaconProcessor`.
/// ///

View File

@ -943,6 +943,16 @@ impl<T: BeaconChainTypes> Worker<T> {
); );
self.send_sync_message(SyncMessage::UnknownBlock(peer_id, block)); self.send_sync_message(SyncMessage::UnknownBlock(peer_id, block));
} }
Err(e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::RequestFailed(_)))
| Err(
e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::NoExecutionConnection),
) => {
debug!(
self.log,
"Failed to verify execution payload";
"error" => %e
);
}
other => { other => {
debug!( debug!(
self.log, self.log,

View File

@ -10,7 +10,7 @@ mod rpc_methods;
mod sync_methods; mod sync_methods;
pub use gossip_methods::{GossipAggregatePackage, GossipAttestationPackage}; pub use gossip_methods::{GossipAggregatePackage, GossipAttestationPackage};
pub use sync_methods::ChainSegmentProcessId; pub use sync_methods::{ChainSegmentProcessId, FailureMode};
pub(crate) const FUTURE_SLOT_TOLERANCE: u64 = 1; pub(crate) const FUTURE_SLOT_TOLERANCE: u64 = 1;

View File

@ -6,6 +6,7 @@ use crate::beacon_processor::DuplicateCache;
use crate::metrics; use crate::metrics;
use crate::sync::manager::{BlockProcessType, SyncMessage}; use crate::sync::manager::{BlockProcessType, SyncMessage};
use crate::sync::{BatchProcessResult, ChainId}; use crate::sync::{BatchProcessResult, ChainId};
use beacon_chain::ExecutionPayloadError;
use beacon_chain::{ use beacon_chain::{
BeaconChainError, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, BeaconChainError, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError,
}; };
@ -31,6 +32,15 @@ struct ChainSegmentFailed {
message: String, message: String,
/// Used to penalize peers. /// Used to penalize peers.
peer_action: Option<PeerAction>, peer_action: Option<PeerAction>,
/// Whether the failure originated on the consensus or the execution side.
mode: FailureMode,
}
/// Indicates whether a block processing failure occurred on the consensus or the execution side.
#[derive(Debug)]
pub enum FailureMode {
ExecutionLayer { pause_sync: bool },
ConsensusLayer,
} }
impl<T: BeaconChainTypes> Worker<T> { impl<T: BeaconChainTypes> Worker<T> {
@ -128,6 +138,7 @@ impl<T: BeaconChainTypes> Worker<T> {
BatchProcessResult::Failed { BatchProcessResult::Failed {
imported_blocks: imported_blocks > 0, imported_blocks: imported_blocks > 0,
peer_action: e.peer_action, peer_action: e.peer_action,
mode: e.mode,
} }
} }
} }
@ -158,6 +169,7 @@ impl<T: BeaconChainTypes> Worker<T> {
BatchProcessResult::Failed { BatchProcessResult::Failed {
imported_blocks: false, imported_blocks: false,
peer_action: e.peer_action, peer_action: e.peer_action,
mode: e.mode,
} }
} }
} }
@ -177,6 +189,7 @@ impl<T: BeaconChainTypes> Worker<T> {
BatchProcessResult::Failed { BatchProcessResult::Failed {
imported_blocks: imported_blocks > 0, imported_blocks: imported_blocks > 0,
peer_action: e.peer_action, peer_action: e.peer_action,
mode: e.mode,
} }
} }
(imported_blocks, Ok(_)) => { (imported_blocks, Ok(_)) => {
@ -257,6 +270,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: String::from("mismatched_block_root"), message: String::from("mismatched_block_root"),
// The peer is faulty if they send blocks with bad roots. // The peer is faulty if they send blocks with bad roots.
peer_action: Some(PeerAction::LowToleranceError), peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::InvalidSignature HistoricalBlockError::InvalidSignature
@ -271,6 +285,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: "invalid_signature".into(), message: "invalid_signature".into(),
// The peer is faulty if they send bad signatures. // The peer is faulty if they send bad signatures.
peer_action: Some(PeerAction::LowToleranceError), peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::ValidatorPubkeyCacheTimeout => { HistoricalBlockError::ValidatorPubkeyCacheTimeout => {
@ -284,6 +299,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: "pubkey_cache_timeout".into(), message: "pubkey_cache_timeout".into(),
// This is an internal error, do not penalize the peer. // This is an internal error, do not penalize the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::NoAnchorInfo => { HistoricalBlockError::NoAnchorInfo => {
@ -294,6 +310,7 @@ impl<T: BeaconChainTypes> Worker<T> {
// There is no need to do a historical sync, this is not a fault of // There is no need to do a historical sync, this is not a fault of
// the peer. // the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::IndexOutOfBounds => { HistoricalBlockError::IndexOutOfBounds => {
@ -306,6 +323,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: String::from("logic_error"), message: String::from("logic_error"),
// This should never occur, don't penalize the peer. // This should never occur, don't penalize the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
HistoricalBlockError::BlockOutOfRange { .. } => { HistoricalBlockError::BlockOutOfRange { .. } => {
@ -318,6 +336,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: String::from("unexpected_error"), message: String::from("unexpected_error"),
// This should never occur, don't penalize the peer. // This should never occur, don't penalize the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
}, },
@ -327,6 +346,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("{:?}", other), message: format!("{:?}", other),
// This is an internal error, don't penalize the peer. // This is an internal error, don't penalize the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
} }
} }
}; };
@ -365,6 +385,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Block has an unknown parent: {}", block.parent_root()), message: format!("Block has an unknown parent: {}", block.parent_root()),
// Peers are faulty if they send non-sequential blocks. // Peers are faulty if they send non-sequential blocks.
peer_action: Some(PeerAction::LowToleranceError), peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
}) })
} }
BlockError::BlockIsAlreadyKnown => { BlockError::BlockIsAlreadyKnown => {
@ -402,6 +423,7 @@ impl<T: BeaconChainTypes> Worker<T> {
), ),
// Peers are faulty if they send blocks from the future. // Peers are faulty if they send blocks from the future.
peer_action: Some(PeerAction::LowToleranceError), peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
}) })
} }
BlockError::WouldRevertFinalizedSlot { .. } => { BlockError::WouldRevertFinalizedSlot { .. } => {
@ -423,8 +445,41 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Internal error whilst processing block: {:?}", e), message: format!("Internal error whilst processing block: {:?}", e),
// Do not penalize peers for internal errors. // Do not penalize peers for internal errors.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
}) })
} }
BlockError::ExecutionPayloadError(e) => match &e {
ExecutionPayloadError::NoExecutionConnection { .. }
| ExecutionPayloadError::RequestFailed { .. } => {
// These errors indicate an issue with the EL and not the `ChainSegment`.
// Pause the syncing while the EL recovers
debug!(self.log,
"Execution layer verification failed";
"outcome" => "pausing sync",
"err" => ?e
);
Err(ChainSegmentFailed {
message: format!("Execution layer offline. Reason: {:?}", e),
// Do not penalize peers for internal errors.
peer_action: None,
mode: FailureMode::ExecutionLayer { pause_sync: true },
})
}
err => {
debug!(self.log,
"Invalid execution payload";
"error" => ?err
);
Err(ChainSegmentFailed {
message: format!(
"Peer sent a block containing invalid execution payload. Reason: {:?}",
err
),
peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ExecutionLayer { pause_sync: false },
})
}
},
other => { other => {
debug!( debug!(
self.log, "Invalid block received"; self.log, "Invalid block received";
@ -436,6 +491,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Peer sent invalid block. Reason: {:?}", other), message: format!("Peer sent invalid block. Reason: {:?}", other),
// Do not penalize peers for internal errors. // Do not penalize peers for internal errors.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
}) })
} }
} }

View File

@ -8,7 +8,7 @@
//! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill //! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill
//! sync as failed, log an error and attempt to retry once a new peer joins the node. //! sync as failed, log an error and attempt to retry once a new peer joins the node.
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent}; use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent as BeaconWorkEvent};
use crate::sync::manager::{BatchProcessResult, Id}; use crate::sync::manager::{BatchProcessResult, Id};
use crate::sync::network_context::SyncNetworkContext; use crate::sync::network_context::SyncNetworkContext;
use crate::sync::range_sync::{BatchConfig, BatchId, BatchInfo, BatchProcessingResult, BatchState}; use crate::sync::range_sync::{BatchConfig, BatchId, BatchInfo, BatchProcessingResult, BatchState};
@ -554,6 +554,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
imported_blocks: false, imported_blocks: false,
// The beacon processor queue is full, no need to penalize the peer. // The beacon processor queue is full, no need to penalize the peer.
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
}, },
) )
} else { } else {
@ -638,6 +639,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
BatchProcessResult::Failed { BatchProcessResult::Failed {
imported_blocks, imported_blocks,
peer_action, peer_action,
mode: _,
} => { } => {
let batch = match self.batches.get_mut(&batch_id) { let batch = match self.batches.get_mut(&batch_id) {
Some(v) => v, Some(v) => v,

View File

@ -1,7 +1,7 @@
use std::collections::hash_map::Entry; use std::collections::hash_map::Entry;
use std::time::Duration; use std::time::Duration;
use beacon_chain::{BeaconChainTypes, BlockError}; use beacon_chain::{BeaconChainTypes, BlockError, ExecutionPayloadError};
use fnv::FnvHashMap; use fnv::FnvHashMap;
use lighthouse_network::{PeerAction, PeerId}; use lighthouse_network::{PeerAction, PeerId};
use lru_cache::LRUTimeCache; use lru_cache::LRUTimeCache;
@ -10,7 +10,7 @@ use smallvec::SmallVec;
use store::{Hash256, SignedBeaconBlock}; use store::{Hash256, SignedBeaconBlock};
use tokio::sync::mpsc; use tokio::sync::mpsc;
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent}; use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent};
use crate::metrics; use crate::metrics;
use self::{ use self::{
@ -420,6 +420,20 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
BlockError::ParentUnknown(block) => { BlockError::ParentUnknown(block) => {
self.search_parent(block, peer_id, cx); self.search_parent(block, peer_id, cx);
} }
e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::RequestFailed(_))
| e @ BlockError::ExecutionPayloadError(
ExecutionPayloadError::NoExecutionConnection,
) => {
// These errors indicate that the execution layer is offline
// and failed to validate the execution payload. Do not downscore peer.
debug!(
self.log,
"Single block lookup failed. Execution layer is offline";
"root" => %root,
"error" => ?e
);
}
other => { other => {
warn!(self.log, "Peer sent invalid block in single block lookup"; "root" => %root, "error" => ?other, "peer_id" => %peer_id); warn!(self.log, "Peer sent invalid block in single block lookup"; "root" => %root, "error" => ?other, "peer_id" => %peer_id);
cx.report_peer( cx.report_peer(
@ -506,6 +520,19 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
} }
} }
} }
Err(e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::RequestFailed(_)))
| Err(
e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::NoExecutionConnection),
) => {
// These errors indicate that the execution layer is offline
// and failed to validate the execution payload. Do not downscore peer.
debug!(
self.log,
"Parent lookup failed. Execution layer is offline";
"chain_hash" => %chain_hash,
"error" => ?e
);
}
Err(outcome) => { Err(outcome) => {
// all else we consider the chain a failure and downvote the peer that sent // all else we consider the chain a failure and downvote the peer that sent
// us the last block // us the last block
@ -561,7 +588,16 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
BatchProcessResult::Failed { BatchProcessResult::Failed {
imported_blocks: _, imported_blocks: _,
peer_action, peer_action,
mode,
} => { } => {
if let FailureMode::ExecutionLayer { pause_sync: _ } = mode {
debug!(
self.log,
"Chain segment processing failed. Execution layer is offline";
"chain_hash" => %chain_hash,
"error" => ?mode
);
} else {
self.failed_chains.insert(parent_lookup.chain_hash()); self.failed_chains.insert(parent_lookup.chain_hash());
if let Some(peer_action) = peer_action { if let Some(peer_action) = peer_action {
for &peer_id in parent_lookup.used_peers() { for &peer_id in parent_lookup.used_peers() {
@ -570,6 +606,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
} }
} }
} }
}
metrics::set_gauge( metrics::set_gauge(
&metrics::SYNC_PARENT_BLOCK_LOOKUPS, &metrics::SYNC_PARENT_BLOCK_LOOKUPS,

View File

@ -38,7 +38,7 @@ use super::block_lookups::BlockLookups;
use super::network_context::SyncNetworkContext; use super::network_context::SyncNetworkContext;
use super::peer_sync_info::{remote_sync_type, PeerSyncType}; use super::peer_sync_info::{remote_sync_type, PeerSyncType};
use super::range_sync::{RangeSync, RangeSyncType, EPOCHS_PER_BATCH}; use super::range_sync::{RangeSync, RangeSyncType, EPOCHS_PER_BATCH};
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent}; use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent as BeaconWorkEvent};
use crate::service::NetworkMessage; use crate::service::NetworkMessage;
use crate::status::ToStatusMessage; use crate::status::ToStatusMessage;
use beacon_chain::{BeaconChain, BeaconChainTypes, BlockError}; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockError};
@ -137,6 +137,7 @@ pub enum BatchProcessResult {
Failed { Failed {
imported_blocks: bool, imported_blocks: bool,
peer_action: Option<PeerAction>, peer_action: Option<PeerAction>,
mode: FailureMode,
}, },
} }

View File

@ -1,6 +1,6 @@
use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
use crate::beacon_processor::ChainSegmentProcessId;
use crate::beacon_processor::WorkEvent as BeaconWorkEvent; use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode};
use crate::sync::{manager::Id, network_context::SyncNetworkContext, BatchProcessResult}; use crate::sync::{manager::Id, network_context::SyncNetworkContext, BatchProcessResult};
use beacon_chain::BeaconChainTypes; use beacon_chain::BeaconChainTypes;
use fnv::FnvHashMap; use fnv::FnvHashMap;
@ -320,6 +320,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
&BatchProcessResult::Failed { &BatchProcessResult::Failed {
imported_blocks: false, imported_blocks: false,
peer_action: None, peer_action: None,
mode: FailureMode::ConsensusLayer,
}, },
) )
} else { } else {
@ -499,6 +500,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
BatchProcessResult::Failed { BatchProcessResult::Failed {
imported_blocks, imported_blocks,
peer_action, peer_action,
mode: _,
} => { } => {
let batch = self.batches.get_mut(&batch_id).ok_or_else(|| { let batch = self.batches.get_mut(&batch_id).ok_or_else(|| {
RemoveChain::WrongChainState(format!( RemoveChain::WrongChainState(format!(