Metrics and DEBG log for late gossip blocks (#2533)

## Issue Addressed

Which issue # does this PR address?

## Proposed Changes

- Add a counter metric to log when a block is received late from gossip.
- Also push a `DEBG` log for the above condition.
- Use Debug (`?`) instead of Display (`%`) for a bunch of logs in the beacon processor, so we don't have to deal with concatenated block roots.
- Add new ERRO and CRIT to HTTP API to alert users when they're publishing late blocks.

## Additional Info

NA
This commit is contained in:
Paul Hauner 2021-08-23 00:59:14 +00:00
parent 12fe72bd37
commit f2a8c6229c
5 changed files with 60 additions and 29 deletions

View File

@ -2932,6 +2932,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
.beacon_state
.previous_epoch()
.start_slot(T::EthSpec::slots_per_epoch());
let head_proposer_index = new_head.beacon_block.message().proposer_index();
// Update the snapshot that stores the head of the chain at the time it received the
// block.
@ -2962,9 +2963,10 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
debug!(
self.log,
"Delayed head block";
"delay" => ?block_delay,
"root" => ?beacon_block_root,
"block_root" => ?beacon_block_root,
"proposer_index" => head_proposer_index,
"slot" => head_slot,
"block_delay" => ?block_delay,
);
}

View File

@ -895,7 +895,10 @@ pub fn serve<T: BeaconChainTypes>(
info!(
log,
"Valid block from HTTP API";
"root" => format!("{}", root)
"block_delay" => ?delay,
"root" => format!("{}", root),
"proposer_index" => block.message().proposer_index(),
"slot" => block.slot(),
);
// Notify the validator monitor.
@ -917,25 +920,25 @@ pub fn serve<T: BeaconChainTypes>(
//
// Check to see the thresholds are non-zero to avoid logging errors with small
// slot times (e.g., during testing)
let crit_threshold = chain.spec.seconds_per_slot / 3;
let warn_threshold = chain.spec.seconds_per_slot / 6;
if crit_threshold > 0 && delay.as_secs() > crit_threshold {
let crit_threshold = chain.slot_clock.unagg_attestation_production_delay();
let error_threshold = crit_threshold / 2;
if delay >= crit_threshold {
crit!(
log,
"Block was broadcast too late";
"root" => ?root,
"slot" => block.slot(),
"delay_ms" => delay.as_millis(),
"msg" => "system may be overloaded, block likely to be orphaned",
"delay_ms" => delay.as_millis(),
"slot" => block.slot(),
"root" => ?root,
)
} else if warn_threshold > 0 && delay.as_secs() > warn_threshold {
warn!(
} else if delay >= error_threshold {
error!(
log,
"Block broadcast was delayed";
"root" => ?root,
"slot" => block.slot(),
"delay_ms" => delay.as_millis(),
"msg" => "system may be overloaded, block may be orphaned",
"delay_ms" => delay.as_millis(),
"slot" => block.slot(),
"root" => ?root,
)
}

View File

@ -33,4 +33,12 @@ lazy_static::lazy_static! {
"http_api_block_broadcast_delay_times",
"Time between start of the slot and when the block was broadcast"
);
pub static ref HTTP_API_BLOCK_PUBLISHED_LATE_TOTAL: Result<IntCounter> = try_create_int_counter(
"http_api_block_published_late_total",
"The count of times a block was published beyond more than half way to the attestation deadline"
);
pub static ref HTTP_API_BLOCK_PUBLISHED_VERY_LATE_TOTAL: Result<IntCounter> = try_create_int_counter(
"http_api_block_published_very_late_total",
"The count of times a block was published beyond the attestation deadline"
);
}

View File

@ -291,19 +291,33 @@ impl<T: BeaconChainTypes> Worker<T> {
reprocess_tx: mpsc::Sender<ReprocessQueueMessage<T>>,
seen_duration: Duration,
) {
let block_delay =
get_block_delay_ms(seen_duration, block.message(), &self.chain.slot_clock);
// Log metrics to track delay from other nodes on the network.
metrics::observe_duration(
&metrics::BEACON_BLOCK_GOSSIP_SLOT_START_DELAY_TIME,
get_block_delay_ms(seen_duration, block.message(), &self.chain.slot_clock),
block_delay,
);
let verified_block = match self.chain.verify_block_for_gossip(block) {
Ok(verified_block) => {
if block_delay >= self.chain.slot_clock.unagg_attestation_production_delay() {
metrics::inc_counter(&metrics::BEACON_BLOCK_GOSSIP_ARRIVED_LATE_TOTAL);
debug!(
self.log,
"Gossip block arrived late";
"block_root" => ?verified_block.block_root,
"proposer_index" => verified_block.block.message().proposer_index(),
"slot" => verified_block.block.slot(),
"block_delay" => ?block_delay,
);
}
info!(
self.log,
"New block received";
"slot" => verified_block.block.slot(),
"hash" => %verified_block.block_root
"hash" => ?verified_block.block_root
);
self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Accept);
@ -325,7 +339,7 @@ impl<T: BeaconChainTypes> Worker<T> {
debug!(
self.log,
"Unknown parent for gossip block";
"root" => %block.canonical_root()
"root" => ?block.canonical_root()
);
self.send_sync_message(SyncMessage::UnknownBlock(peer_id, block));
return;
@ -392,7 +406,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Block arrived early";
"block_slot" => %block_slot,
"block_root" => %block_root,
"block_root" => ?block_root,
"msg" => "if this happens consistently, check system clock"
);
@ -423,7 +437,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Failed to defer block import";
"block_slot" => %block_slot,
"block_root" => %block_root,
"block_root" => ?block_root,
"location" => "block gossip"
)
}
@ -440,7 +454,7 @@ impl<T: BeaconChainTypes> Worker<T> {
"Failed to defer block import";
"error" => ?e,
"block_slot" => %block_slot,
"block_root" => %block_root,
"block_root" => ?block_root,
"location" => "block gossip"
)
}
@ -472,7 +486,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Failed to inform block import";
"source" => "gossip",
"block_root" => %block_root,
"block_root" => ?block_root,
)
};
@ -511,7 +525,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Invalid gossip beacon block";
"outcome" => ?other,
"block root" => %block.canonical_root(),
"block root" => ?block.canonical_root(),
"block slot" => block.slot()
);
self.gossip_penalize_peer(peer_id, PeerAction::MidToleranceError);
@ -843,7 +857,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Attestation is not within the last ATTESTATION_PROPAGATION_SLOT_RANGE slots";
"peer_id" => %peer_id,
"block" => %beacon_block_root,
"block" => ?beacon_block_root,
"type" => ?attestation_type,
);
@ -918,7 +932,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Attestation already known";
"peer_id" => %peer_id,
"block" => %beacon_block_root,
"block" => ?beacon_block_root,
"type" => ?attestation_type,
);
self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Ignore);
@ -935,7 +949,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Aggregator already known";
"peer_id" => %peer_id,
"block" => %beacon_block_root,
"block" => ?beacon_block_root,
"type" => ?attestation_type,
);
// This is an allowed behaviour.
@ -956,7 +970,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Prior attestation known";
"peer_id" => %peer_id,
"block" => %beacon_block_root,
"block" => ?beacon_block_root,
"epoch" => %epoch,
"validator_index" => validator_index,
"type" => ?attestation_type,
@ -980,7 +994,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Validation Index too high";
"peer_id" => %peer_id,
"block" => %beacon_block_root,
"block" => ?beacon_block_root,
"type" => ?attestation_type,
);
self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Reject);
@ -991,7 +1005,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Attestation for unknown block";
"peer_id" => %peer_id,
"block" => %beacon_block_root
"block" => ?beacon_block_root
);
if let Some(sender) = reprocess_tx {
// We don't know the block, get the sync manager to handle the block lookup, and
@ -1200,7 +1214,7 @@ impl<T: BeaconChainTypes> Worker<T> {
self.log,
"Invalid attestation from network";
"reason" => ?error,
"block" => %beacon_block_root,
"block" => ?beacon_block_root,
"peer_id" => %peer_id,
"type" => ?attestation_type,
);

View File

@ -465,6 +465,10 @@ lazy_static! {
"beacon_block_gossip_slot_start_delay_time",
"Duration between when the block is received and the start of the slot it belongs to.",
);
pub static ref BEACON_BLOCK_GOSSIP_ARRIVED_LATE_TOTAL: Result<IntCounter> = try_create_int_counter(
"beacon_block_gossip_arrived_late_total",
"Count of times when a gossip block arrived from the network later than the attestation deadline.",
);
/*
* Attestation reprocessing queue metrics.