Do not penalize peers on execution layer offline errors (#3258)
## Issue Addressed Partly resolves https://github.com/sigp/lighthouse/issues/3032 ## Proposed Changes Extracts some of the functionality of #3094 into a separate PR as the original PR requires a bit more work. Do not unnecessarily penalize peers when we fail to validate received execution payloads because our execution layer is offline.
This commit is contained in:
parent
21b3425a12
commit
f428719761
@ -75,7 +75,9 @@ mod work_reprocessing_queue;
|
||||
mod worker;
|
||||
|
||||
use crate::beacon_processor::work_reprocessing_queue::QueuedBlock;
|
||||
pub use worker::{ChainSegmentProcessId, GossipAggregatePackage, GossipAttestationPackage};
|
||||
pub use worker::{
|
||||
ChainSegmentProcessId, FailureMode, GossipAggregatePackage, GossipAttestationPackage,
|
||||
};
|
||||
|
||||
/// The maximum size of the channel for work events to the `BeaconProcessor`.
|
||||
///
|
||||
|
@ -943,6 +943,16 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
);
|
||||
self.send_sync_message(SyncMessage::UnknownBlock(peer_id, block));
|
||||
}
|
||||
Err(e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::RequestFailed(_)))
|
||||
| Err(
|
||||
e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::NoExecutionConnection),
|
||||
) => {
|
||||
debug!(
|
||||
self.log,
|
||||
"Failed to verify execution payload";
|
||||
"error" => %e
|
||||
);
|
||||
}
|
||||
other => {
|
||||
debug!(
|
||||
self.log,
|
||||
|
@ -10,7 +10,7 @@ mod rpc_methods;
|
||||
mod sync_methods;
|
||||
|
||||
pub use gossip_methods::{GossipAggregatePackage, GossipAttestationPackage};
|
||||
pub use sync_methods::ChainSegmentProcessId;
|
||||
pub use sync_methods::{ChainSegmentProcessId, FailureMode};
|
||||
|
||||
pub(crate) const FUTURE_SLOT_TOLERANCE: u64 = 1;
|
||||
|
||||
|
@ -6,6 +6,7 @@ use crate::beacon_processor::DuplicateCache;
|
||||
use crate::metrics;
|
||||
use crate::sync::manager::{BlockProcessType, SyncMessage};
|
||||
use crate::sync::{BatchProcessResult, ChainId};
|
||||
use beacon_chain::ExecutionPayloadError;
|
||||
use beacon_chain::{
|
||||
BeaconChainError, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError,
|
||||
};
|
||||
@ -31,6 +32,15 @@ struct ChainSegmentFailed {
|
||||
message: String,
|
||||
/// Used to penalize peers.
|
||||
peer_action: Option<PeerAction>,
|
||||
/// Failure mode
|
||||
mode: FailureMode,
|
||||
}
|
||||
|
||||
/// Represents if a block processing failure was on the consensus or execution side.
|
||||
#[derive(Debug)]
|
||||
pub enum FailureMode {
|
||||
ExecutionLayer { pause_sync: bool },
|
||||
ConsensusLayer,
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> Worker<T> {
|
||||
@ -128,6 +138,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
BatchProcessResult::Failed {
|
||||
imported_blocks: imported_blocks > 0,
|
||||
peer_action: e.peer_action,
|
||||
mode: e.mode,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -158,6 +169,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
BatchProcessResult::Failed {
|
||||
imported_blocks: false,
|
||||
peer_action: e.peer_action,
|
||||
mode: e.mode,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -177,6 +189,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
BatchProcessResult::Failed {
|
||||
imported_blocks: imported_blocks > 0,
|
||||
peer_action: e.peer_action,
|
||||
mode: e.mode,
|
||||
}
|
||||
}
|
||||
(imported_blocks, Ok(_)) => {
|
||||
@ -257,6 +270,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
message: String::from("mismatched_block_root"),
|
||||
// The peer is faulty if they send blocks with bad roots.
|
||||
peer_action: Some(PeerAction::LowToleranceError),
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
}
|
||||
}
|
||||
HistoricalBlockError::InvalidSignature
|
||||
@ -271,6 +285,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
message: "invalid_signature".into(),
|
||||
// The peer is faulty if they bad signatures.
|
||||
peer_action: Some(PeerAction::LowToleranceError),
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
}
|
||||
}
|
||||
HistoricalBlockError::ValidatorPubkeyCacheTimeout => {
|
||||
@ -284,6 +299,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
message: "pubkey_cache_timeout".into(),
|
||||
// This is an internal error, do not penalize the peer.
|
||||
peer_action: None,
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
}
|
||||
}
|
||||
HistoricalBlockError::NoAnchorInfo => {
|
||||
@ -294,6 +310,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
// There is no need to do a historical sync, this is not a fault of
|
||||
// the peer.
|
||||
peer_action: None,
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
}
|
||||
}
|
||||
HistoricalBlockError::IndexOutOfBounds => {
|
||||
@ -306,6 +323,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
message: String::from("logic_error"),
|
||||
// This should never occur, don't penalize the peer.
|
||||
peer_action: None,
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
}
|
||||
}
|
||||
HistoricalBlockError::BlockOutOfRange { .. } => {
|
||||
@ -318,6 +336,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
message: String::from("unexpected_error"),
|
||||
// This should never occur, don't penalize the peer.
|
||||
peer_action: None,
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
}
|
||||
}
|
||||
},
|
||||
@ -327,6 +346,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
message: format!("{:?}", other),
|
||||
// This is an internal error, don't penalize the peer.
|
||||
peer_action: None,
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -365,6 +385,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
message: format!("Block has an unknown parent: {}", block.parent_root()),
|
||||
// Peers are faulty if they send non-sequential blocks.
|
||||
peer_action: Some(PeerAction::LowToleranceError),
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
})
|
||||
}
|
||||
BlockError::BlockIsAlreadyKnown => {
|
||||
@ -402,6 +423,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
),
|
||||
// Peers are faulty if they send blocks from the future.
|
||||
peer_action: Some(PeerAction::LowToleranceError),
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
})
|
||||
}
|
||||
BlockError::WouldRevertFinalizedSlot { .. } => {
|
||||
@ -423,8 +445,41 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
message: format!("Internal error whilst processing block: {:?}", e),
|
||||
// Do not penalize peers for internal errors.
|
||||
peer_action: None,
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
})
|
||||
}
|
||||
BlockError::ExecutionPayloadError(e) => match &e {
|
||||
ExecutionPayloadError::NoExecutionConnection { .. }
|
||||
| ExecutionPayloadError::RequestFailed { .. } => {
|
||||
// These errors indicate an issue with the EL and not the `ChainSegment`.
|
||||
// Pause the syncing while the EL recovers
|
||||
debug!(self.log,
|
||||
"Execution layer verification failed";
|
||||
"outcome" => "pausing sync",
|
||||
"err" => ?e
|
||||
);
|
||||
Err(ChainSegmentFailed {
|
||||
message: format!("Execution layer offline. Reason: {:?}", e),
|
||||
// Do not penalize peers for internal errors.
|
||||
peer_action: None,
|
||||
mode: FailureMode::ExecutionLayer { pause_sync: true },
|
||||
})
|
||||
}
|
||||
err => {
|
||||
debug!(self.log,
|
||||
"Invalid execution payload";
|
||||
"error" => ?err
|
||||
);
|
||||
Err(ChainSegmentFailed {
|
||||
message: format!(
|
||||
"Peer sent a block containing invalid execution payload. Reason: {:?}",
|
||||
err
|
||||
),
|
||||
peer_action: Some(PeerAction::LowToleranceError),
|
||||
mode: FailureMode::ExecutionLayer { pause_sync: false },
|
||||
})
|
||||
}
|
||||
},
|
||||
other => {
|
||||
debug!(
|
||||
self.log, "Invalid block received";
|
||||
@ -436,6 +491,7 @@ impl<T: BeaconChainTypes> Worker<T> {
|
||||
message: format!("Peer sent invalid block. Reason: {:?}", other),
|
||||
// Do not penalize peers for internal errors.
|
||||
peer_action: None,
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -8,7 +8,7 @@
|
||||
//! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill
|
||||
//! sync as failed, log an error and attempt to retry once a new peer joins the node.
|
||||
|
||||
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent};
|
||||
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent as BeaconWorkEvent};
|
||||
use crate::sync::manager::{BatchProcessResult, Id};
|
||||
use crate::sync::network_context::SyncNetworkContext;
|
||||
use crate::sync::range_sync::{BatchConfig, BatchId, BatchInfo, BatchProcessingResult, BatchState};
|
||||
@ -554,6 +554,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
imported_blocks: false,
|
||||
// The beacon processor queue is full, no need to penalize the peer.
|
||||
peer_action: None,
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
},
|
||||
)
|
||||
} else {
|
||||
@ -638,6 +639,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
BatchProcessResult::Failed {
|
||||
imported_blocks,
|
||||
peer_action,
|
||||
mode: _,
|
||||
} => {
|
||||
let batch = match self.batches.get_mut(&batch_id) {
|
||||
Some(v) => v,
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::time::Duration;
|
||||
|
||||
use beacon_chain::{BeaconChainTypes, BlockError};
|
||||
use beacon_chain::{BeaconChainTypes, BlockError, ExecutionPayloadError};
|
||||
use fnv::FnvHashMap;
|
||||
use lighthouse_network::{PeerAction, PeerId};
|
||||
use lru_cache::LRUTimeCache;
|
||||
@ -10,7 +10,7 @@ use smallvec::SmallVec;
|
||||
use store::{Hash256, SignedBeaconBlock};
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent};
|
||||
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent};
|
||||
use crate::metrics;
|
||||
|
||||
use self::{
|
||||
@ -420,6 +420,20 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
BlockError::ParentUnknown(block) => {
|
||||
self.search_parent(block, peer_id, cx);
|
||||
}
|
||||
|
||||
e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::RequestFailed(_))
|
||||
| e @ BlockError::ExecutionPayloadError(
|
||||
ExecutionPayloadError::NoExecutionConnection,
|
||||
) => {
|
||||
// These errors indicate that the execution layer is offline
|
||||
// and failed to validate the execution payload. Do not downscore peer.
|
||||
debug!(
|
||||
self.log,
|
||||
"Single block lookup failed. Execution layer is offline";
|
||||
"root" => %root,
|
||||
"error" => ?e
|
||||
);
|
||||
}
|
||||
other => {
|
||||
warn!(self.log, "Peer sent invalid block in single block lookup"; "root" => %root, "error" => ?other, "peer_id" => %peer_id);
|
||||
cx.report_peer(
|
||||
@ -506,6 +520,19 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::RequestFailed(_)))
|
||||
| Err(
|
||||
e @ BlockError::ExecutionPayloadError(ExecutionPayloadError::NoExecutionConnection),
|
||||
) => {
|
||||
// These errors indicate that the execution layer is offline
|
||||
// and failed to validate the execution payload. Do not downscore peer.
|
||||
debug!(
|
||||
self.log,
|
||||
"Parent lookup failed. Execution layer is offline";
|
||||
"chain_hash" => %chain_hash,
|
||||
"error" => ?e
|
||||
);
|
||||
}
|
||||
Err(outcome) => {
|
||||
// all else we consider the chain a failure and downvote the peer that sent
|
||||
// us the last block
|
||||
@ -561,11 +588,21 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
BatchProcessResult::Failed {
|
||||
imported_blocks: _,
|
||||
peer_action,
|
||||
mode,
|
||||
} => {
|
||||
self.failed_chains.insert(parent_lookup.chain_hash());
|
||||
if let Some(peer_action) = peer_action {
|
||||
for &peer_id in parent_lookup.used_peers() {
|
||||
cx.report_peer(peer_id, peer_action, "parent_chain_failure")
|
||||
if let FailureMode::ExecutionLayer { pause_sync: _ } = mode {
|
||||
debug!(
|
||||
self.log,
|
||||
"Chain segment processing failed. Execution layer is offline";
|
||||
"chain_hash" => %chain_hash,
|
||||
"error" => ?mode
|
||||
);
|
||||
} else {
|
||||
self.failed_chains.insert(parent_lookup.chain_hash());
|
||||
if let Some(peer_action) = peer_action {
|
||||
for &peer_id in parent_lookup.used_peers() {
|
||||
cx.report_peer(peer_id, peer_action, "parent_chain_failure")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -38,7 +38,7 @@ use super::block_lookups::BlockLookups;
|
||||
use super::network_context::SyncNetworkContext;
|
||||
use super::peer_sync_info::{remote_sync_type, PeerSyncType};
|
||||
use super::range_sync::{RangeSync, RangeSyncType, EPOCHS_PER_BATCH};
|
||||
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent};
|
||||
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode, WorkEvent as BeaconWorkEvent};
|
||||
use crate::service::NetworkMessage;
|
||||
use crate::status::ToStatusMessage;
|
||||
use beacon_chain::{BeaconChain, BeaconChainTypes, BlockError};
|
||||
@ -137,6 +137,7 @@ pub enum BatchProcessResult {
|
||||
Failed {
|
||||
imported_blocks: bool,
|
||||
peer_action: Option<PeerAction>,
|
||||
mode: FailureMode,
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
|
||||
use crate::beacon_processor::ChainSegmentProcessId;
|
||||
use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
|
||||
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode};
|
||||
use crate::sync::{manager::Id, network_context::SyncNetworkContext, BatchProcessResult};
|
||||
use beacon_chain::BeaconChainTypes;
|
||||
use fnv::FnvHashMap;
|
||||
@ -320,6 +320,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
&BatchProcessResult::Failed {
|
||||
imported_blocks: false,
|
||||
peer_action: None,
|
||||
mode: FailureMode::ConsensusLayer,
|
||||
},
|
||||
)
|
||||
} else {
|
||||
@ -499,6 +500,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
BatchProcessResult::Failed {
|
||||
imported_blocks,
|
||||
peer_action,
|
||||
mode: _,
|
||||
} => {
|
||||
let batch = self.batches.get_mut(&batch_id).ok_or_else(|| {
|
||||
RemoveChain::WrongChainState(format!(
|
||||
|
Loading…
Reference in New Issue
Block a user