Shutdown gracefully on panic (#2596)
## Proposed Changes
* Modify the `TaskExecutor` so that it spawns a "monitor" future for each future spawned by `spawn` or `spawn_blocking`. This monitor future joins the handle of the child future and shuts down the executor if it detects a panic.
* Enable backtraces by default by setting the environment variable `RUST_BACKTRACE`.
* Spawn the `ProductionBeaconNode` on the `TaskExecutor` so that if a panic occurs during start-up it will take down the whole process. Previously we were using a raw Tokio `spawn`, but I can't see any reason not to use the executor (perhaps someone else can).
## Additional Info
I considered using [`std::panic::set_hook`](https://doc.rust-lang.org/std/panic/fn.set_hook.html) to install a custom panic handler; however, this doesn't allow us to send a shutdown signal, because `Fn` closures can't move captured variables (i.e. the shutdown sender) out of their environment. This also prevents the handler from receiving a `Logger`. Hence I decided to leave the panic handler untouched, but with backtraces turned on by default.
I did a run through the code base with all the raw Tokio spawn functions disallowed by Clippy, and found only two instances where we bypass the `TaskExecutor`: the HTTP API and `InitializedValidators` in the VC. In both places we use `spawn_blocking` and handle the return value, so I figured that was OK for now.
In terms of performance I think the overhead should be minimal. The monitor tasks will just get parked by the executor until their child resolves.
I've checked that this covers Discv5, as the `TaskExecutor` gets injected into Discv5 here: f9bba92db3/beacon_node/src/lib.rs (L125-L126)
This commit is contained in:
parent
95b17137a8
commit
58012f85e1
@ -2,7 +2,7 @@ mod metrics;
|
|||||||
|
|
||||||
use futures::channel::mpsc::Sender;
|
use futures::channel::mpsc::Sender;
|
||||||
use futures::prelude::*;
|
use futures::prelude::*;
|
||||||
use slog::{debug, o, trace};
|
use slog::{crit, debug, o, trace};
|
||||||
use std::sync::Weak;
|
use std::sync::Weak;
|
||||||
use tokio::runtime::Runtime;
|
use tokio::runtime::Runtime;
|
||||||
|
|
||||||
@ -83,34 +83,56 @@ impl TaskExecutor {
|
|||||||
self.spawn(task.map(|_| ()), name)
|
self.spawn(task.map(|_| ()), name)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Spawn a future on the tokio runtime wrapped in an `exit_future::Exit`. The task is canceled
|
/// Spawn a task to monitor the completion of another task.
|
||||||
/// when the corresponding exit_future `Signal` is fired/dropped.
|
///
|
||||||
|
/// If the other task exits by panicking, then the monitor task will shut down the executor.
|
||||||
|
fn spawn_monitor<R: Send>(
|
||||||
|
&self,
|
||||||
|
task_handle: impl Future<Output = Result<R, tokio::task::JoinError>> + Send + 'static,
|
||||||
|
name: &'static str,
|
||||||
|
) {
|
||||||
|
let mut shutdown_sender = self.shutdown_sender();
|
||||||
|
let log = self.log.clone();
|
||||||
|
|
||||||
|
if let Some(runtime) = self.runtime.upgrade() {
|
||||||
|
runtime.spawn(async move {
|
||||||
|
if let Err(join_error) = task_handle.await {
|
||||||
|
if let Ok(panic) = join_error.try_into_panic() {
|
||||||
|
let message = panic.downcast_ref::<&str>().unwrap_or(&"<none>");
|
||||||
|
|
||||||
|
crit!(
|
||||||
|
log,
|
||||||
|
"Task panic. This is a bug!";
|
||||||
|
"task_name" => name,
|
||||||
|
"message" => message,
|
||||||
|
"advice" => "Please check above for a backtrace and notify \
|
||||||
|
the developers"
|
||||||
|
);
|
||||||
|
let _ = shutdown_sender
|
||||||
|
.try_send(ShutdownReason::Failure("Panic (fatal error)"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
debug!(
|
||||||
|
self.log,
|
||||||
|
"Couldn't spawn monitor task. Runtime shutting down"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Spawn a future on the tokio runtime.
|
||||||
|
///
|
||||||
|
/// The future is wrapped in an `exit_future::Exit`. The task is canceled when the corresponding
|
||||||
|
/// exit_future `Signal` is fired/dropped.
|
||||||
|
///
|
||||||
|
/// The future is monitored via another spawned future to ensure that it doesn't panic. In case
|
||||||
|
/// of a panic, the executor will be shut down via `self.signal_tx`.
|
||||||
///
|
///
|
||||||
/// This function generates prometheus metrics on number of tasks and task duration.
|
/// This function generates prometheus metrics on number of tasks and task duration.
|
||||||
pub fn spawn(&self, task: impl Future<Output = ()> + Send + 'static, name: &'static str) {
|
pub fn spawn(&self, task: impl Future<Output = ()> + Send + 'static, name: &'static str) {
|
||||||
let exit = self.exit.clone();
|
if let Some(task_handle) = self.spawn_handle(task, name) {
|
||||||
let log = self.log.clone();
|
self.spawn_monitor(task_handle, name)
|
||||||
|
|
||||||
if let Some(int_gauge) = metrics::get_int_gauge(&metrics::ASYNC_TASKS_COUNT, &[name]) {
|
|
||||||
// Task is shutdown before it completes if `exit` receives
|
|
||||||
let int_gauge_1 = int_gauge.clone();
|
|
||||||
let future = future::select(Box::pin(task), exit).then(move |either| {
|
|
||||||
match either {
|
|
||||||
future::Either::Left(_) => trace!(log, "Async task completed"; "task" => name),
|
|
||||||
future::Either::Right(_) => {
|
|
||||||
debug!(log, "Async task shutdown, exit received"; "task" => name)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
int_gauge_1.dec();
|
|
||||||
futures::future::ready(())
|
|
||||||
});
|
|
||||||
|
|
||||||
int_gauge.inc();
|
|
||||||
if let Some(runtime) = self.runtime.upgrade() {
|
|
||||||
runtime.spawn(future);
|
|
||||||
} else {
|
|
||||||
debug!(self.log, "Couldn't spawn task. Runtime shutting down");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -150,38 +172,11 @@ impl TaskExecutor {
|
|||||||
where
|
where
|
||||||
F: FnOnce() + Send + 'static,
|
F: FnOnce() + Send + 'static,
|
||||||
{
|
{
|
||||||
let log = self.log.clone();
|
if let Some(task_handle) = self.spawn_blocking_handle(task, name) {
|
||||||
|
self.spawn_monitor(task_handle, name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(metric) = metrics::get_histogram(&metrics::BLOCKING_TASKS_HISTOGRAM, &[name]) {
|
|
||||||
if let Some(int_gauge) = metrics::get_int_gauge(&metrics::BLOCKING_TASKS_COUNT, &[name])
|
|
||||||
{
|
|
||||||
let int_gauge_1 = int_gauge.clone();
|
|
||||||
let timer = metric.start_timer();
|
|
||||||
let join_handle = if let Some(runtime) = self.runtime.upgrade() {
|
|
||||||
runtime.spawn_blocking(task)
|
|
||||||
} else {
|
|
||||||
debug!(self.log, "Couldn't spawn task. Runtime shutting down");
|
|
||||||
return;
|
|
||||||
};
|
|
||||||
|
|
||||||
let future = async move {
|
|
||||||
match join_handle.await {
|
|
||||||
Ok(_) => trace!(log, "Blocking task completed"; "task" => name),
|
|
||||||
Err(e) => debug!(log, "Blocking task failed"; "error" => %e),
|
|
||||||
};
|
|
||||||
timer.observe_duration();
|
|
||||||
int_gauge_1.dec();
|
|
||||||
};
|
|
||||||
|
|
||||||
int_gauge.inc();
|
|
||||||
if let Some(runtime) = self.runtime.upgrade() {
|
|
||||||
runtime.spawn(future);
|
|
||||||
} else {
|
|
||||||
debug!(self.log, "Couldn't spawn task. Runtime shutting down");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/// Spawn a future on the tokio runtime wrapped in an `exit_future::Exit` returning an optional
|
/// Spawn a future on the tokio runtime wrapped in an `exit_future::Exit` returning an optional
|
||||||
/// join handle to the future.
|
/// join handle to the future.
|
||||||
/// The task is canceled when the corresponding exit_future `Signal` is fired/dropped.
|
/// The task is canceled when the corresponding exit_future `Signal` is fired/dropped.
|
||||||
@ -200,9 +195,9 @@ impl TaskExecutor {
|
|||||||
let int_gauge_1 = int_gauge.clone();
|
let int_gauge_1 = int_gauge.clone();
|
||||||
let future = future::select(Box::pin(task), exit).then(move |either| {
|
let future = future::select(Box::pin(task), exit).then(move |either| {
|
||||||
let result = match either {
|
let result = match either {
|
||||||
future::Either::Left((task, _)) => {
|
future::Either::Left((value, _)) => {
|
||||||
trace!(log, "Async task completed"; "task" => name);
|
trace!(log, "Async task completed"; "task" => name);
|
||||||
Some(task)
|
Some(value)
|
||||||
}
|
}
|
||||||
future::Either::Right(_) => {
|
future::Either::Right(_) => {
|
||||||
debug!(log, "Async task shutdown, exit received"; "task" => name);
|
debug!(log, "Async task shutdown, exit received"; "task" => name);
|
||||||
|
@ -32,6 +32,11 @@ fn bls_library_name() -> &'static str {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
|
// Enable backtraces unless a RUST_BACKTRACE value has already been explicitly provided.
|
||||||
|
if std::env::var("RUST_BACKTRACE").is_err() {
|
||||||
|
std::env::set_var("RUST_BACKTRACE", "1");
|
||||||
|
}
|
||||||
|
|
||||||
// Parse the CLI parameters.
|
// Parse the CLI parameters.
|
||||||
let matches = App::new("Lighthouse")
|
let matches = App::new("Lighthouse")
|
||||||
.version(VERSION.replace("Lighthouse/", "").as_str())
|
.version(VERSION.replace("Lighthouse/", "").as_str())
|
||||||
@ -344,7 +349,8 @@ fn run<E: EthSpec>(
|
|||||||
.map_err(|e| format!("Error serializing config: {:?}", e))?;
|
.map_err(|e| format!("Error serializing config: {:?}", e))?;
|
||||||
};
|
};
|
||||||
|
|
||||||
environment.runtime().spawn(async move {
|
executor.clone().spawn(
|
||||||
|
async move {
|
||||||
if let Err(e) = ProductionBeaconNode::new(context.clone(), config).await {
|
if let Err(e) = ProductionBeaconNode::new(context.clone(), config).await {
|
||||||
crit!(log, "Failed to start beacon node"; "reason" => e);
|
crit!(log, "Failed to start beacon node"; "reason" => e);
|
||||||
// Ignore the error since it always occurs during normal operation when
|
// Ignore the error since it always occurs during normal operation when
|
||||||
@ -357,7 +363,9 @@ fn run<E: EthSpec>(
|
|||||||
"Beacon node immediate shutdown triggered.",
|
"Beacon node immediate shutdown triggered.",
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
});
|
},
|
||||||
|
"beacon_node",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
("validator_client", Some(matches)) => {
|
("validator_client", Some(matches)) => {
|
||||||
let context = environment.core_context();
|
let context = environment.core_context();
|
||||||
@ -374,7 +382,8 @@ fn run<E: EthSpec>(
|
|||||||
.map_err(|e| format!("Error serializing config: {:?}", e))?;
|
.map_err(|e| format!("Error serializing config: {:?}", e))?;
|
||||||
};
|
};
|
||||||
if !shutdown_flag {
|
if !shutdown_flag {
|
||||||
environment.runtime().spawn(async move {
|
executor.clone().spawn(
|
||||||
|
async move {
|
||||||
if let Err(e) = ProductionValidatorClient::new(context, config)
|
if let Err(e) = ProductionValidatorClient::new(context, config)
|
||||||
.await
|
.await
|
||||||
.and_then(|mut vc| vc.start_service())
|
.and_then(|mut vc| vc.start_service())
|
||||||
@ -382,11 +391,13 @@ fn run<E: EthSpec>(
|
|||||||
crit!(log, "Failed to start validator client"; "reason" => e);
|
crit!(log, "Failed to start validator client"; "reason" => e);
|
||||||
// Ignore the error since it always occurs during normal operation when
|
// Ignore the error since it always occurs during normal operation when
|
||||||
// shutting down.
|
// shutting down.
|
||||||
let _ = executor
|
let _ = executor.shutdown_sender().try_send(ShutdownReason::Failure(
|
||||||
.shutdown_sender()
|
"Failed to start validator client",
|
||||||
.try_send(ShutdownReason::Failure("Failed to start validator client"));
|
));
|
||||||
}
|
}
|
||||||
});
|
},
|
||||||
|
"validator_client",
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
let _ = executor.shutdown_sender().try_send(ShutdownReason::Success(
|
let _ = executor.shutdown_sender().try_send(ShutdownReason::Success(
|
||||||
"Validator client immediate shutdown triggered.",
|
"Validator client immediate shutdown triggered.",
|
||||||
|
Loading…
Reference in New Issue
Block a user