Do not penalize peers on execution layer offline errors (#3258)

## Issue Addressed

Partly resolves https://github.com/sigp/lighthouse/issues/3032

## Proposed Changes

Extracts some of the functionality of #3094 into a separate PR as the original PR requires a bit more work.
Do not unnecessarily penalize peers when we fail to validate received execution payloads because our execution layer is offline.
This commit is contained in:
Pawan Dhananjay 2022-06-19 23:13:40 +00:00
parent 21b3425a12
commit f428719761
8 changed files with 121 additions and 11 deletions

View File

@ -75,7 +75,9 @@ mod work_reprocessing_queue;
mod worker;
use crate::beacon_processor::work_reprocessing_queue::QueuedBlock;
pub use worker::{ChainSegmentProcessId, GossipAggregatePackage, GossipAttestationPackage};
pub use worker::{
ChainSegmentProcessId, FailureMode, GossipAggregatePackage, GossipAttestationPackage,
};
/// The maximum size of the channel for work events to the `BeaconProcessor`.
///

View File

@ -943,6 +943,16 @@ impl<T: BeaconChainTypes> Worker<T> {
);
self.send_sync_message(SyncMessage::UnknownBlock(peer_id, block));
}
Err(e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::RequestFailed(_)))
| Err(
e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::NoExecutionConnection),
) => {
debug!(
self.log,
"Failed to verify execution payload";
"error" => %e
);
}
other => {
debug!(
self.log,

View File

@ -10,7 +10,7 @@ mod rpc_methods;
mod sync_methods;
pub use gossip_methods::{GossipAggregatePackage, GossipAttestationPackage};
pub use sync_methods::ChainSegmentProcessId;
pub use sync_methods::{ChainSegmentProcessId, FailureMode};
pub(crate) const FUTURE_SLOT_TOLERANCE: u64 = 1;

View File

@ -6,6 +6,7 @@ use crate::beacon_processor::DuplicateCache;
use crate::metrics;
use crate::sync::manager::{BlockProcessType, SyncMessage};
use crate::sync::{BatchProcessResult, ChainId};
use beacon_chain::ExecutionPayloadError;
use beacon_chain::{
BeaconChainError, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError,
};
@ -31,6 +32,15 @@ struct ChainSegmentFailed {
message: String,
/// Used to penalize peers.
peer_action: Option<PeerAction>,
/// Failure mode
mode: FailureMode,
}
/// Represents if a block processing failure was on the consensus or execution side.
#[derive(Debug)]
pub enum FailureMode {
ExecutionLayer { pause_sync: bool },
ConsensusLayer,
}
impl<T: BeaconChainTypes> Worker<T> {
@ -128,6 +138,7 @@ impl<T: BeaconChainTypes> Worker<T> {
BatchProcessResult::Failed {
imported_blocks: imported_blocks > 0,
peer_action: e.peer_action,
mode: e.mode,
}
}
}
@ -158,6 +169,7 @@ impl<T: BeaconChainTypes> Worker<T> {
BatchProcessResult::Failed {
imported_blocks: false,
peer_action: e.peer_action,
mode: e.mode,
}
}
}
@ -177,6 +189,7 @@ impl<T: BeaconChainTypes> Worker<T> {
BatchProcessResult::Failed {
imported_blocks: imported_blocks > 0,
peer_action: e.peer_action,
mode: e.mode,
}
}
(imported_blocks, Ok(_)) => {
@ -257,6 +270,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: String::from("mismatched_block_root"),
// The peer is faulty if they send blocks with bad roots.
peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
}
}
HistoricalBlockError::InvalidSignature
@ -271,6 +285,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: "invalid_signature".into(),
// The peer is faulty if they bad signatures.
peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
}
}
HistoricalBlockError::ValidatorPubkeyCacheTimeout => {
@ -284,6 +299,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: "pubkey_cache_timeout".into(),
// This is an internal error, do not penalize the peer.
peer_action: None,
mode: FailureMode::ConsensusLayer,
}
}
HistoricalBlockError::NoAnchorInfo => {
@ -294,6 +310,7 @@ impl<T: BeaconChainTypes> Worker<T> {
// There is no need to do a historical sync, this is not a fault of
// the peer.
peer_action: None,
mode: FailureMode::ConsensusLayer,
}
}
HistoricalBlockError::IndexOutOfBounds => {
@ -306,6 +323,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: String::from("logic_error"),
// This should never occur, don't penalize the peer.
peer_action: None,
mode: FailureMode::ConsensusLayer,
}
}
HistoricalBlockError::BlockOutOfRange { .. } => {
@ -318,6 +336,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: String::from("unexpected_error"),
// This should never occur, don't penalize the peer.
peer_action: None,
mode: FailureMode::ConsensusLayer,
}
}
},
@ -327,6 +346,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("{:?}", other),
// This is an internal error, don't penalize the peer.
peer_action: None,
mode: FailureMode::ConsensusLayer,
}
}
};
@ -365,6 +385,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Block has an unknown parent: {}", block.parent_root()),
// Peers are faulty if they send non-sequential blocks.
peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
})
}
BlockError::BlockIsAlreadyKnown => {
@ -402,6 +423,7 @@ impl<T: BeaconChainTypes> Worker<T> {
),
// Peers are faulty if they send blocks from the future.
peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ConsensusLayer,
})
}
BlockError::WouldRevertFinalizedSlot { .. } => {
@ -423,8 +445,41 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Internal error whilst processing block: {:?}", e),
// Do not penalize peers for internal errors.
peer_action: None,
mode: FailureMode::ConsensusLayer,
})
}
BlockError::ExecutionPayloadError(e) => match &e {
ExecutionPayloadError::NoExecutionConnection { .. }
| ExecutionPayloadError::RequestFailed { .. } => {
// These errors indicate an issue with the EL and not the `ChainSegment`.
// Pause the syncing while the EL recovers
debug!(self.log,
"Execution layer verification failed";
"outcome" => "pausing sync",
"err" => ?e
);
Err(ChainSegmentFailed {
message: format!("Execution layer offline. Reason: {:?}", e),
// Do not penalize peers for internal errors.
peer_action: None,
mode: FailureMode::ExecutionLayer { pause_sync: true },
})
}
err => {
debug!(self.log,
"Invalid execution payload";
"error" => ?err
);
Err(ChainSegmentFailed {
message: format!(
"Peer sent a block containing invalid execution payload. Reason: {:?}",
err
),
peer_action: Some(PeerAction::LowToleranceError),
mode: FailureMode::ExecutionLayer { pause_sync: false },
})
}
},
other => {
debug!(
self.log, "Invalid block received";
@ -436,6 +491,7 @@ impl<T: BeaconChainTypes> Worker<T> {
message: format!("Peer sent invalid block. Reason: {:?}", other),
// Do not penalize peers for internal errors.
peer_action: None,
mode: FailureMode::ConsensusLayer,
})
}
}

View File

@ -8,7 +8,7 @@
//! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill
//! sync as failed, log an error and attempt to retry once a new peer joins the node.
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent};
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent as BeaconWorkEvent};
use crate::sync::manager::{BatchProcessResult, Id};
use crate::sync::network_context::SyncNetworkContext;
use crate::sync::range_sync::{BatchConfig, BatchId, BatchInfo, BatchProcessingResult, BatchState};
@ -554,6 +554,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
imported_blocks: false,
// The beacon processor queue is full, no need to penalize the peer.
peer_action: None,
mode: FailureMode::ConsensusLayer,
},
)
} else {
@ -638,6 +639,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
BatchProcessResult::Failed {
imported_blocks,
peer_action,
mode: _,
} => {
let batch = match self.batches.get_mut(&batch_id) {
Some(v) => v,

View File

@ -1,7 +1,7 @@
use std::collections::hash_map::Entry;
use std::time::Duration;
use beacon_chain::{BeaconChainTypes, BlockError};
use beacon_chain::{BeaconChainTypes, BlockError, ExecutionPayloadError};
use fnv::FnvHashMap;
use lighthouse_network::{PeerAction, PeerId};
use lru_cache::LRUTimeCache;
@ -10,7 +10,7 @@ use smallvec::SmallVec;
use store::{Hash256, SignedBeaconBlock};
use tokio::sync::mpsc;
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent};
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent};
use crate::metrics;
use self::{
@ -420,6 +420,20 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
BlockError::ParentUnknown(block) => {
self.search_parent(block, peer_id, cx);
}
e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::RequestFailed(_))
| e @ BlockError::ExecutionPayloadError(
ExecutionPayloadError::NoExecutionConnection,
) => {
// These errors indicate that the execution layer is offline
// and failed to validate the execution payload. Do not downscore peer.
debug!(
self.log,
"Single block lookup failed. Execution layer is offline";
"root" => %root,
"error" => ?e
);
}
other => {
warn!(self.log, "Peer sent invalid block in single block lookup"; "root" => %root, "error" => ?other, "peer_id" => %peer_id);
cx.report_peer(
@ -506,6 +520,19 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
}
}
Err(e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::RequestFailed(_)))
| Err(
e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::NoExecutionConnection),
) => {
// These errors indicate that the execution layer is offline
// and failed to validate the execution payload. Do not downscore peer.
debug!(
self.log,
"Parent lookup failed. Execution layer is offline";
"chain_hash" => %chain_hash,
"error" => ?e
);
}
Err(outcome) => {
// all else we consider the chain a failure and downvote the peer that sent
// us the last block
@ -561,11 +588,21 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
BatchProcessResult::Failed {
imported_blocks: _,
peer_action,
mode,
} => {
self.failed_chains.insert(parent_lookup.chain_hash());
if let Some(peer_action) = peer_action {
for &peer_id in parent_lookup.used_peers() {
cx.report_peer(peer_id, peer_action, "parent_chain_failure")
if let FailureMode::ExecutionLayer { pause_sync: _ } = mode {
debug!(
self.log,
"Chain segment processing failed. Execution layer is offline";
"chain_hash" => %chain_hash,
"error" => ?mode
);
} else {
self.failed_chains.insert(parent_lookup.chain_hash());
if let Some(peer_action) = peer_action {
for &peer_id in parent_lookup.used_peers() {
cx.report_peer(peer_id, peer_action, "parent_chain_failure")
}
}
}
}

View File

@ -38,7 +38,7 @@ use super::block_lookups::BlockLookups;
use super::network_context::SyncNetworkContext;
use super::peer_sync_info::{remote_sync_type, PeerSyncType};
use super::range_sync::{RangeSync, RangeSyncType, EPOCHS_PER_BATCH};
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent};
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent as BeaconWorkEvent};
use crate::service::NetworkMessage;
use crate::status::ToStatusMessage;
use beacon_chain::{BeaconChain, BeaconChainTypes, BlockError};
@ -137,6 +137,7 @@ pub enum BatchProcessResult {
Failed {
imported_blocks: bool,
peer_action: Option<PeerAction>,
mode: FailureMode,
},
}

View File

@ -1,6 +1,6 @@
use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
use crate::beacon_processor::ChainSegmentProcessId;
use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode};
use crate::sync::{manager::Id, network_context::SyncNetworkContext, BatchProcessResult};
use beacon_chain::BeaconChainTypes;
use fnv::FnvHashMap;
@ -320,6 +320,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
&BatchProcessResult::Failed {
imported_blocks: false,
peer_action: None,
mode: FailureMode::ConsensusLayer,
},
)
} else {
@ -499,6 +500,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
BatchProcessResult::Failed {
imported_blocks,
peer_action,
mode: _,
} => {
let batch = self.batches.get_mut(&batch_id).ok_or_else(|| {
RemoveChain::WrongChainState(format!(