From 84843d67d790e91721a66a1afb2466a885ec46a7 Mon Sep 17 00:00:00 2001 From: Paul Hauner Date: Sun, 12 Feb 2023 23:14:08 +0000 Subject: [PATCH] Reduce some EE and builder related ERRO logs to WARN (#3966) ## Issue Addressed NA ## Proposed Changes Our `ERRO` stream has been rather noisy since the merge due to some unexpected behaviours of builders and EEs. Now that we've been running post-merge for a while, I think we can drop some of these `ERRO` to `WARN` so we're not "crying wolf". The modified logs are: #### `ERRO Execution engine call failed` I'm seeing this quite frequently on Geth nodes. They seem to timeout when they're busy and it rarely indicates a serious issue. We also have logging across block import, fork choice updating and payload production that raise `ERRO` or `CRIT` when the EE times out, so I think we're not at risk of silencing actual issues. #### `ERRO "Builder failed to reveal payload"` In #3775 we reduced this log from `CRIT` to `ERRO` since it's common for builders to fail to reveal the block to the producer directly whilst still broadcasting it to the networ. I think it's worth dropping this to `WARN` since it's rarely interesting. I elected to stay with `WARN` since I really do wish builders would fulfill their API promises by returning the block to us. Perhaps I'm just being pedantic here, I could be convinced otherwise. #### `ERRO "Relay error when registering validator(s)"` It seems like builders and/or mev-boost struggle to handle heavy loads of validator registrations. I haven't observed issues with validators not actually being registered, but I see timeouts on these endpoints many times a day. It doesn't seem like this `ERRO` is worth it. #### `ERRO Error fetching block for peer ExecutionLayerErrorPayloadReconstruction` This means we failed to respond to a peer on the P2P network with a block they requested because of an error in the `execution_layer`. It's very common to see timeouts or incomplete responses on this endpoint whilst the EE is busy and I don't think it's important enough for an `ERRO`. As long as the peer count stays high, I don't think the user needs to be actively concerned about how we're responding to peers. ## Additional Info NA --- Cargo.lock | 1 + beacon_node/execution_layer/src/engines.rs | 4 +-- beacon_node/execution_layer/src/lib.rs | 4 +-- beacon_node/http_api/src/lib.rs | 2 +- beacon_node/network/Cargo.toml | 1 + .../beacon_processor/worker/rpc_methods.rs | 28 ++++++++++++++----- 6 files changed, 28 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 69651fb7e..6159778b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4988,6 +4988,7 @@ dependencies = [ "eth2_ssz", "eth2_ssz_types", "ethereum-types 0.14.1", + "execution_layer", "exit-future", "fnv", "futures", diff --git a/beacon_node/execution_layer/src/engines.rs b/beacon_node/execution_layer/src/engines.rs index 339006c1b..eaaa271c5 100644 --- a/beacon_node/execution_layer/src/engines.rs +++ b/beacon_node/execution_layer/src/engines.rs @@ -5,7 +5,7 @@ use crate::engine_api::{ }; use crate::HttpJsonRpc; use lru::LruCache; -use slog::{debug, error, info, Logger}; +use slog::{debug, error, info, warn, Logger}; use std::future::Future; use std::sync::Arc; use task_executor::TaskExecutor; @@ -325,7 +325,7 @@ impl Engine { Ok(result) } Err(error) => { - error!( + warn!( self.log, "Execution engine call failed"; "error" => ?error, diff --git a/beacon_node/execution_layer/src/lib.rs b/beacon_node/execution_layer/src/lib.rs index a4d15abb3..5b0fecbf2 100644 --- a/beacon_node/execution_layer/src/lib.rs +++ b/beacon_node/execution_layer/src/lib.rs @@ -1575,10 +1575,10 @@ impl ExecutionLayer { &metrics::EXECUTION_LAYER_BUILDER_REVEAL_PAYLOAD_OUTCOME, &[metrics::FAILURE], ); - error!( + warn!( self.log(), "Builder failed to reveal payload"; - "info" => "this relay failure may cause a missed proposal", + "info" => "this is common behaviour for some builders and may not indicate an issue", "error" => ?e, "relay_response_ms" => duration.as_millis(), "block_root" => ?block_root, diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index 973be2d49..60e5d2adf 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -2855,7 +2855,7 @@ pub fn serve( .await .map(|resp| warp::reply::json(&resp)) .map_err(|e| { - error!( + warn!( log, "Relay error when registering validator(s)"; "num_registrations" => filtered_registration_data.len(), diff --git a/beacon_node/network/Cargo.toml b/beacon_node/network/Cargo.toml index b1d928eec..1b036b32c 100644 --- a/beacon_node/network/Cargo.toml +++ b/beacon_node/network/Cargo.toml @@ -45,6 +45,7 @@ tokio-util = { version = "0.6.3", features = ["time"] } derivative = "2.2.0" delay_map = "0.1.1" ethereum-types = { version = "0.14.1", optional = true } +execution_layer = { path = "../execution_layer" } [features] deterministic_long_lived_attnets = [ "ethereum-types" ] diff --git a/beacon_node/network/src/beacon_processor/worker/rpc_methods.rs b/beacon_node/network/src/beacon_processor/worker/rpc_methods.rs index bfa0ea516..afcc15280 100644 --- a/beacon_node/network/src/beacon_processor/worker/rpc_methods.rs +++ b/beacon_node/network/src/beacon_processor/worker/rpc_methods.rs @@ -7,7 +7,7 @@ use itertools::process_results; use lighthouse_network::rpc::StatusMessage; use lighthouse_network::rpc::*; use lighthouse_network::{PeerId, PeerRequestId, ReportSource, Response, SyncInfo}; -use slog::{debug, error}; +use slog::{debug, error, warn}; use slot_clock::SlotClock; use std::sync::Arc; use task_executor::TaskExecutor; @@ -392,12 +392,26 @@ impl Worker { break; } Err(e) => { - error!( - self.log, - "Error fetching block for peer"; - "block_root" => ?root, - "error" => ?e - ); + if matches!( + e, + BeaconChainError::ExecutionLayerErrorPayloadReconstruction(_block_hash, ref boxed_error) + if matches!(**boxed_error, execution_layer::Error::EngineError(_)) + ) { + warn!( + self.log, + "Error rebuilding payload for peer"; + "info" => "this may occur occasionally when the EE is busy", + "block_root" => ?root, + "error" => ?e, + ); + } else { + error!( + self.log, + "Error fetching block for peer"; + "block_root" => ?root, + "error" => ?e + ); + } // send the stream terminator self.send_error_response(