Metrics and DEBG log for late gossip blocks (#2533)

## Issue Addressed

Which issue # does this PR address?

## Proposed Changes

- Add a counter metric to log when a block is received late from gossip.
- Also push a `DEBG` log for the above condition.
- Use Debug (`?`) instead of Display (`%`) for a bunch of logs in the beacon processor, so we don't have to deal with concatenated block roots.
- Add new ERRO and CRIT to HTTP API to alert users when they're publishing late blocks.

## Additional Info

NA
This commit is contained in:
Paul Hauner 2021-08-23 00:59:14 +00:00
parent 12fe72bd37
commit f2a8c6229c
5 changed files with 60 additions and 29 deletions

View File

@ -2932,6 +2932,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
.beacon_state .beacon_state
.previous_epoch() .previous_epoch()
.start_slot(T::EthSpec::slots_per_epoch()); .start_slot(T::EthSpec::slots_per_epoch());
let head_proposer_index = new_head.beacon_block.message().proposer_index();
// Update the snapshot that stores the head of the chain at the time it received the // Update the snapshot that stores the head of the chain at the time it received the
// block. // block.
@ -2962,9 +2963,10 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
debug!( debug!(
self.log, self.log,
"Delayed head block"; "Delayed head block";
"delay" => ?block_delay, "block_root" => ?beacon_block_root,
"root" => ?beacon_block_root, "proposer_index" => head_proposer_index,
"slot" => head_slot, "slot" => head_slot,
"block_delay" => ?block_delay,
); );
} }

View File

@ -895,7 +895,10 @@ pub fn serve<T: BeaconChainTypes>(
info!( info!(
log, log,
"Valid block from HTTP API"; "Valid block from HTTP API";
"root" => format!("{}", root) "block_delay" => ?delay,
"root" => format!("{}", root),
"proposer_index" => block.message().proposer_index(),
"slot" => block.slot(),
); );
// Notify the validator monitor. // Notify the validator monitor.
@ -917,25 +920,25 @@ pub fn serve<T: BeaconChainTypes>(
// //
// Check to see the thresholds are non-zero to avoid logging errors with small // Check to see the thresholds are non-zero to avoid logging errors with small
// slot times (e.g., during testing) // slot times (e.g., during testing)
let crit_threshold = chain.spec.seconds_per_slot / 3; let crit_threshold = chain.slot_clock.unagg_attestation_production_delay();
let warn_threshold = chain.spec.seconds_per_slot / 6; let error_threshold = crit_threshold / 2;
if crit_threshold > 0 && delay.as_secs() > crit_threshold { if delay >= crit_threshold {
crit!( crit!(
log, log,
"Block was broadcast too late"; "Block was broadcast too late";
"root" => ?root,
"slot" => block.slot(),
"delay_ms" => delay.as_millis(),
"msg" => "system may be overloaded, block likely to be orphaned", "msg" => "system may be overloaded, block likely to be orphaned",
"delay_ms" => delay.as_millis(),
"slot" => block.slot(),
"root" => ?root,
) )
} else if warn_threshold > 0 && delay.as_secs() > warn_threshold { } else if delay >= error_threshold {
warn!( error!(
log, log,
"Block broadcast was delayed"; "Block broadcast was delayed";
"root" => ?root,
"slot" => block.slot(),
"delay_ms" => delay.as_millis(),
"msg" => "system may be overloaded, block may be orphaned", "msg" => "system may be overloaded, block may be orphaned",
"delay_ms" => delay.as_millis(),
"slot" => block.slot(),
"root" => ?root,
) )
} }

View File

@ -33,4 +33,12 @@ lazy_static::lazy_static! {
"http_api_block_broadcast_delay_times", "http_api_block_broadcast_delay_times",
"Time between start of the slot and when the block was broadcast" "Time between start of the slot and when the block was broadcast"
); );
pub static ref HTTP_API_BLOCK_PUBLISHED_LATE_TOTAL: Result<IntCounter> = try_create_int_counter(
"http_api_block_published_late_total",
"The count of times a block was published beyond more than half way to the attestation deadline"
);
pub static ref HTTP_API_BLOCK_PUBLISHED_VERY_LATE_TOTAL: Result<IntCounter> = try_create_int_counter(
"http_api_block_published_very_late_total",
"The count of times a block was published beyond the attestation deadline"
);
} }

View File

@ -291,19 +291,33 @@ impl<T: BeaconChainTypes> Worker<T> {
reprocess_tx: mpsc::Sender<ReprocessQueueMessage<T>>, reprocess_tx: mpsc::Sender<ReprocessQueueMessage<T>>,
seen_duration: Duration, seen_duration: Duration,
) { ) {
let block_delay =
get_block_delay_ms(seen_duration, block.message(), &self.chain.slot_clock);
// Log metrics to track delay from other nodes on the network. // Log metrics to track delay from other nodes on the network.
metrics::observe_duration( metrics::observe_duration(
&metrics::BEACON_BLOCK_GOSSIP_SLOT_START_DELAY_TIME, &metrics::BEACON_BLOCK_GOSSIP_SLOT_START_DELAY_TIME,
get_block_delay_ms(seen_duration, block.message(), &self.chain.slot_clock), block_delay,
); );
let verified_block = match self.chain.verify_block_for_gossip(block) { let verified_block = match self.chain.verify_block_for_gossip(block) {
Ok(verified_block) => { Ok(verified_block) => {
if block_delay >= self.chain.slot_clock.unagg_attestation_production_delay() {
metrics::inc_counter(&metrics::BEACON_BLOCK_GOSSIP_ARRIVED_LATE_TOTAL);
debug!(
self.log,
"Gossip block arrived late";
"block_root" => ?verified_block.block_root,
"proposer_index" => verified_block.block.message().proposer_index(),
"slot" => verified_block.block.slot(),
"block_delay" => ?block_delay,
);
}
info!( info!(
self.log, self.log,
"New block received"; "New block received";
"slot" => verified_block.block.slot(), "slot" => verified_block.block.slot(),
"hash" => %verified_block.block_root "hash" => ?verified_block.block_root
); );
self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Accept); self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Accept);
@ -325,7 +339,7 @@ impl<T: BeaconChainTypes> Worker<T> {
debug!( debug!(
self.log, self.log,
"Unknown parent for gossip block"; "Unknown parent for gossip block";
"root" => %block.canonical_root() "root" => ?block.canonical_root()
); );
self.send_sync_message(SyncMessage::UnknownBlock(peer_id, block)); self.send_sync_message(SyncMessage::UnknownBlock(peer_id, block));
return; return;
@ -392,7 +406,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Block arrived early"; "Block arrived early";
"block_slot" => %block_slot, "block_slot" => %block_slot,
"block_root" => %block_root, "block_root" => ?block_root,
"msg" => "if this happens consistently, check system clock" "msg" => "if this happens consistently, check system clock"
); );
@ -423,7 +437,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Failed to defer block import"; "Failed to defer block import";
"block_slot" => %block_slot, "block_slot" => %block_slot,
"block_root" => %block_root, "block_root" => ?block_root,
"location" => "block gossip" "location" => "block gossip"
) )
} }
@ -440,7 +454,7 @@ impl<T: BeaconChainTypes> Worker<T> {
"Failed to defer block import"; "Failed to defer block import";
"error" => ?e, "error" => ?e,
"block_slot" => %block_slot, "block_slot" => %block_slot,
"block_root" => %block_root, "block_root" => ?block_root,
"location" => "block gossip" "location" => "block gossip"
) )
} }
@ -472,7 +486,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Failed to inform block import"; "Failed to inform block import";
"source" => "gossip", "source" => "gossip",
"block_root" => %block_root, "block_root" => ?block_root,
) )
}; };
@ -511,7 +525,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Invalid gossip beacon block"; "Invalid gossip beacon block";
"outcome" => ?other, "outcome" => ?other,
"block root" => %block.canonical_root(), "block root" => ?block.canonical_root(),
"block slot" => block.slot() "block slot" => block.slot()
); );
self.gossip_penalize_peer(peer_id, PeerAction::MidToleranceError); self.gossip_penalize_peer(peer_id, PeerAction::MidToleranceError);
@ -843,7 +857,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Attestation is not within the last ATTESTATION_PROPAGATION_SLOT_RANGE slots"; "Attestation is not within the last ATTESTATION_PROPAGATION_SLOT_RANGE slots";
"peer_id" => %peer_id, "peer_id" => %peer_id,
"block" => %beacon_block_root, "block" => ?beacon_block_root,
"type" => ?attestation_type, "type" => ?attestation_type,
); );
@ -918,7 +932,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Attestation already known"; "Attestation already known";
"peer_id" => %peer_id, "peer_id" => %peer_id,
"block" => %beacon_block_root, "block" => ?beacon_block_root,
"type" => ?attestation_type, "type" => ?attestation_type,
); );
self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Ignore); self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Ignore);
@ -935,7 +949,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Aggregator already known"; "Aggregator already known";
"peer_id" => %peer_id, "peer_id" => %peer_id,
"block" => %beacon_block_root, "block" => ?beacon_block_root,
"type" => ?attestation_type, "type" => ?attestation_type,
); );
// This is an allowed behaviour. // This is an allowed behaviour.
@ -956,7 +970,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Prior attestation known"; "Prior attestation known";
"peer_id" => %peer_id, "peer_id" => %peer_id,
"block" => %beacon_block_root, "block" => ?beacon_block_root,
"epoch" => %epoch, "epoch" => %epoch,
"validator_index" => validator_index, "validator_index" => validator_index,
"type" => ?attestation_type, "type" => ?attestation_type,
@ -980,7 +994,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Validation Index too high"; "Validation Index too high";
"peer_id" => %peer_id, "peer_id" => %peer_id,
"block" => %beacon_block_root, "block" => ?beacon_block_root,
"type" => ?attestation_type, "type" => ?attestation_type,
); );
self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Reject); self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Reject);
@ -991,7 +1005,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Attestation for unknown block"; "Attestation for unknown block";
"peer_id" => %peer_id, "peer_id" => %peer_id,
"block" => %beacon_block_root "block" => ?beacon_block_root
); );
if let Some(sender) = reprocess_tx { if let Some(sender) = reprocess_tx {
// We don't know the block, get the sync manager to handle the block lookup, and // We don't know the block, get the sync manager to handle the block lookup, and
@ -1200,7 +1214,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log, self.log,
"Invalid attestation from network"; "Invalid attestation from network";
"reason" => ?error, "reason" => ?error,
"block" => %beacon_block_root, "block" => ?beacon_block_root,
"peer_id" => %peer_id, "peer_id" => %peer_id,
"type" => ?attestation_type, "type" => ?attestation_type,
); );

View File

@ -465,6 +465,10 @@ lazy_static! {
"beacon_block_gossip_slot_start_delay_time", "beacon_block_gossip_slot_start_delay_time",
"Duration between when the block is received and the start of the slot it belongs to.", "Duration between when the block is received and the start of the slot it belongs to.",
); );
pub static ref BEACON_BLOCK_GOSSIP_ARRIVED_LATE_TOTAL: Result<IntCounter> = try_create_int_counter(
"beacon_block_gossip_arrived_late_total",
"Count of times when a gossip block arrived from the network later than the attestation deadline.",
);
/* /*
* Attestation reprocessing queue metrics. * Attestation reprocessing queue metrics.