Fix validator lockfiles (#1586)

## Issue Addressed

- Resolves #1313 

## Proposed Changes

Changes the way we start the validator client and beacon node to ensure that we cleanly drop the validator keystores (and therefore clean up their lockfiles).

Previously we held the validator keystores in a tokio task that was forcefully killed on shutdown (i.e., without `Drop` being called). Now we hold them in a task that can handle a shutdown gracefully.
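
To illustrate the difference, here is a minimal sketch (not the actual Lighthouse code; the `Keystore` type, lockfile path, and shutdown channel are stand-ins) of holding a `Drop`-guarded value in a task that only returns once a shutdown is signalled:

```rust
use tokio::sync::oneshot;

/// Stand-in for a validator keystore whose lockfile is removed in `Drop`.
struct Keystore {
    lockfile: std::path::PathBuf,
}

impl Drop for Keystore {
    fn drop(&mut self) {
        // Best-effort removal of the lockfile during a clean shutdown.
        let _ = std::fs::remove_file(&self.lockfile);
    }
}

#[tokio::main]
async fn main() {
    let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();

    let task = tokio::spawn(async move {
        let _keystore = Keystore {
            lockfile: "voting-keystore.json.lock".into(),
        };
        // Hold the keystore until a shutdown is requested, then return so
        // that `Drop` runs and the lockfile is cleaned up.
        let _ = shutdown_rx.await;
    });

    // ... later, on ctrl-c or an internal error:
    let _ = shutdown_tx.send(());
    let _ = task.await;
    // Had the task simply been abandoned when the runtime was torn down (or
    // the process exited), `Drop` would never run and the lockfile would
    // remain on disk.
}
```

When shutdown is requested the task is signalled, it returns, and the keystore (and its lockfile) is dropped cleanly rather than being left behind.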

Also, switches the `--strict-lockfiles` flag to `--delete-lockfiles`. This means two things:

1. We are now strict on lockfiles by default (before we weren't).
1. There's a simple way for people to delete the lockfiles if they experience a crash (a sketch of the new behaviour is shown below this list).
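
As a rough sketch of what this looks like at start up (simplified from the diff below; the function name and error handling here are illustrative, not the real API):

```rust
use std::{fs, io, path::Path};

/// Simplified start-up check for a single keystore lockfile.
fn check_lockfile(lockfile: &Path, delete_lockfiles: bool) -> io::Result<()> {
    if lockfile.exists() {
        if delete_lockfiles {
            // `--delete-lockfiles` was supplied: warn, remove the stale
            // lockfile and carry on.
            eprintln!("Deleting validator lockfile: {:?}", lockfile);
            fs::remove_file(lockfile)?;
        } else {
            // Default (strict) behaviour: refuse to start.
            return Err(io::Error::new(
                io::ErrorKind::AlreadyExists,
                format!("lockfile exists: {:?}", lockfile),
            ));
        }
    }
    // Create a fresh lockfile for this run.
    fs::File::create(lockfile)?;
    Ok(())
}
```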

## Additional Info

I've only given the option to ignore *and* delete lockfiles, not just ignore them. I can't see a strong need for ignore-only but could easily add it, if the need arises.

I've flagged this as `api-breaking` since users who have lockfiles lingering around will be required to supply `--delete-lockfiles` the next time they run.
Commit dffc56ef1d (parent 996887376d) by Paul Hauner, 2020-09-24 04:06:02 +00:00.
6 changed files with 168 additions and 130 deletions


@@ -7,7 +7,7 @@ mod config;
pub use beacon_chain;
pub use cli::cli_app;
pub use client::{Client, ClientBuilder, ClientConfig, ClientGenesis};
pub use config::{get_data_dir, get_eth2_testnet_config, set_network_config};
pub use config::{get_config, get_data_dir, get_eth2_testnet_config, set_network_config};
pub use eth2_config::Eth2Config;
use beacon_chain::events::TeeEventHandler;
@@ -17,7 +17,6 @@ use beacon_chain::{
builder::Witness, eth1_chain::CachingEth1Backend, slot_clock::SystemTimeSlotClock,
};
use clap::ArgMatches;
use config::get_config;
use environment::RuntimeContext;
use slog::{info, warn};
use std::ops::{Deref, DerefMut};
@@ -54,7 +53,7 @@ impl<E: EthSpec> ProductionBeaconNode<E> {
/// configurations hosted remotely.
pub async fn new_from_cli(
context: RuntimeContext<E>,
matches: &ArgMatches<'_>,
matches: ArgMatches<'static>,
) -> Result<Self, String> {
let client_config = get_config::<E>(
&matches,


@@ -255,61 +255,63 @@ fn run<E: EthSpec>(
"name" => testnet_name
);
let beacon_node = if let Some(sub_matches) = matches.subcommand_matches("beacon_node") {
let runtime_context = environment.core_context();
match matches.subcommand() {
("beacon_node", Some(matches)) => {
let context = environment.core_context();
let log = context.log().clone();
let executor = context.executor.clone();
let config = beacon_node::get_config::<E>(
matches,
&context.eth2_config.spec_constants,
&context.eth2_config().spec,
context.log().clone(),
)?;
environment.runtime().spawn(async move {
if let Err(e) = ProductionBeaconNode::new(context.clone(), config).await {
crit!(log, "Failed to start beacon node"; "reason" => e);
// Ignore the error since it always occurs during normal operation when
// shutting down.
let _ = executor
.shutdown_sender()
.try_send("Failed to start beacon node");
}
})
}
("validator_client", Some(matches)) => {
let context = environment.core_context();
let log = context.log().clone();
let executor = context.executor.clone();
let config = validator_client::Config::from_cli(&matches)
.map_err(|e| format!("Unable to initialize validator config: {}", e))?;
environment.runtime().spawn(async move {
let run = async {
ProductionValidatorClient::new(context, config)
.await?
.start_service()?;
let beacon = environment
.runtime()
.block_on(ProductionBeaconNode::new_from_cli(
runtime_context,
sub_matches,
))
.map_err(|e| format!("Failed to start beacon node: {}", e))?;
Some(beacon)
} else {
None
Ok::<(), String>(())
};
let validator_client = if let Some(sub_matches) = matches.subcommand_matches("validator_client")
{
let runtime_context = environment.core_context();
let mut validator = environment
.runtime()
.block_on(ProductionValidatorClient::new_from_cli(
runtime_context,
sub_matches,
))
.map_err(|e| format!("Failed to init validator client: {}", e))?;
environment
.core_context()
.executor
.runtime_handle()
.enter(|| {
validator
.start_service()
.map_err(|e| format!("Failed to start validator client service: {}", e))
})?;
Some(validator)
} else {
None
};
if beacon_node.is_none() && validator_client.is_none() {
if let Err(e) = run.await {
crit!(log, "Failed to start validator client"; "reason" => e);
// Ignore the error since it always occurs during normal operation when
// shutting down.
let _ = executor
.shutdown_sender()
.try_send("Failed to start validator client");
}
})
}
_ => {
crit!(log, "No subcommand supplied. See --help .");
return Err("No subcommand supplied.".into());
}
};
// Block this thread until we get a ctrl-c or a task sends a shutdown signal.
environment.block_until_shutdown_requested()?;
info!(log, "Shutting down..");
environment.fire_signal();
drop(beacon_node);
drop(validator_client);
// Shutdown the environment once all tasks have completed.
environment.shutdown_on_idle();


@@ -37,11 +37,15 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
nodes using the same key. Automatically enabled unless `--strict` is specified",
))
.arg(
Arg::with_name("strict-lockfiles")
.long("strict-lockfiles")
Arg::with_name("delete-lockfiles")
.long("delete-lockfiles")
.help(
"If present, do not load validators that are guarded by a lockfile. Note: for \
Eth2 mainnet, this flag will likely be removed and its behaviour will become default."
"If present, ignore and delete any keystore lockfiles encountered during start up. \
This is useful if the validator client did not exit gracefully on the last run. \
WARNING: lockfiles help prevent users from accidentally running the same validator \
using two different validator clients, an action that likely leads to slashing. \
Ensure you are certain that there are no other validator client instances running \
that might also be using the same keystores."
)
)
.arg(


@@ -24,8 +24,8 @@ pub struct Config {
/// If true, the validator client will still poll for duties and produce blocks even if the
/// beacon node is not synced at startup.
pub allow_unsynced_beacon_node: bool,
/// If true, refuse to unlock a keypair that is guarded by a lockfile.
pub strict_lockfiles: bool,
/// If true, delete any validator keystore lockfiles that would prevent starting.
pub delete_lockfiles: bool,
/// If true, don't scan the validators dir for new keystores.
pub disable_auto_discover: bool,
/// Graffiti to be inserted every time we create a block.
@@ -46,7 +46,7 @@ impl Default for Config {
secrets_dir,
http_server: DEFAULT_HTTP_SERVER.to_string(),
allow_unsynced_beacon_node: false,
strict_lockfiles: false,
delete_lockfiles: false,
disable_auto_discover: false,
graffiti: None,
}
@@ -77,7 +77,7 @@ impl Config {
}
config.allow_unsynced_beacon_node = cli_args.is_present("allow-unsynced");
config.strict_lockfiles = cli_args.is_present("strict-lockfiles");
config.delete_lockfiles = cli_args.is_present("delete-lockfiles");
config.disable_auto_discover = cli_args.is_present("disable-auto-discover");
if let Some(secrets_dir) = parse_optional(cli_args, "secrets-dir")? {


@@ -54,6 +54,10 @@ pub enum Error {
PasswordUnknown(PathBuf),
/// There was an error reading from stdin.
UnableToReadPasswordFromUser(String),
/// There was an error running a tokio async task.
TokioJoin(tokio::task::JoinError),
/// There was a filesystem error when deleting a lockfile.
UnableToDeleteLockfile(io::Error),
}
/// A method used by a validator to sign messages.
@@ -86,7 +90,7 @@ impl InitializedValidator {
/// If the validator is unable to be initialized for whatever reason.
pub fn from_definition(
def: ValidatorDefinition,
strict_lockfiles: bool,
delete_lockfiles: bool,
log: &Logger,
) -> Result<Self, Error> {
if !def.enabled {
@@ -150,16 +154,17 @@ impl InitializedValidator {
})?;
if voting_keystore_lockfile_path.exists() {
if strict_lockfiles {
return Err(Error::LockfileExists(voting_keystore_lockfile_path));
} else {
// If **not** respecting lockfiles, just raise a warning if the voting
// keypair cannot be unlocked.
if delete_lockfiles {
warn!(
log,
"Ignoring validator lockfile";
"Deleting validator lockfile";
"file" => format!("{:?}", voting_keystore_lockfile_path)
);
fs::remove_file(&voting_keystore_lockfile_path)
.map_err(Error::UnableToDeleteLockfile)?;
} else {
return Err(Error::LockfileExists(voting_keystore_lockfile_path));
}
} else {
// Create a new lockfile.
@@ -279,7 +284,7 @@ pub struct InitializedValidators {
impl InitializedValidators {
/// Instantiates `Self`, initializing all validators in `definitions`.
pub fn from_definitions(
pub async fn from_definitions(
definitions: ValidatorDefinitions,
validators_dir: PathBuf,
strict_lockfiles: bool,
@@ -292,7 +297,7 @@ impl InitializedValidators {
validators: HashMap::default(),
log,
};
this.update_validators()?;
this.update_validators().await?;
Ok(this)
}
@@ -328,7 +333,7 @@ impl InitializedValidators {
/// validator will be removed from `self.validators`.
///
/// Saves the `ValidatorDefinitions` to file, even if no definitions were changed.
pub fn set_validator_status(
pub async fn set_validator_status(
&mut self,
voting_public_key: &PublicKey,
enabled: bool,
@@ -342,7 +347,7 @@ impl InitializedValidators {
def.enabled = enabled;
}
self.update_validators()?;
self.update_validators().await?;
self.definitions
.save(&self.validators_dir)
@@ -362,7 +367,7 @@ impl InitializedValidators {
/// A validator is considered "already known" and skipped if the public key is already known.
/// I.e., if there are two different definitions with the same public key then the second will
/// be ignored.
fn update_validators(&mut self) -> Result<(), Error> {
async fn update_validators(&mut self) -> Result<(), Error> {
for def in self.definitions.as_slice() {
if def.enabled {
match &def.signing_definition {
@@ -371,11 +376,23 @@ impl InitializedValidators {
continue;
}
match InitializedValidator::from_definition(
def.clone(),
self.strict_lockfiles,
&self.log,
) {
// Decoding a local keystore can take several seconds, therefore it's best
// to keep it off the core executor. This also has the fortunate effect of
// interrupting the potentially long-running task during shut down.
let inner_def = def.clone();
let strict_lockfiles = self.strict_lockfiles;
let inner_log = self.log.clone();
let result = tokio::task::spawn_blocking(move || {
InitializedValidator::from_definition(
inner_def,
strict_lockfiles,
&inner_log,
)
})
.await
.map_err(Error::TokioJoin)?;
match result {
Ok(init) => {
self.validators
.insert(init.voting_public_key().clone(), init);


@@ -18,6 +18,7 @@ use block_service::{BlockService, BlockServiceBuilder};
use clap::ArgMatches;
use duties_service::{DutiesService, DutiesServiceBuilder};
use environment::RuntimeContext;
use eth2_config::Eth2Config;
use fork_service::{ForkService, ForkServiceBuilder};
use futures::channel::mpsc;
use initialized_validators::InitializedValidators;
@@ -28,7 +29,7 @@ use slot_clock::SlotClock;
use slot_clock::SystemTimeSlotClock;
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::time::{delay_for, Duration};
use types::EthSpec;
use types::{EthSpec, Hash256};
use validator_store::ValidatorStore;
/// The interval between attempts to contact the beacon node during startup.
@@ -90,9 +91,10 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
let validators = InitializedValidators::from_definitions(
validator_defs,
config.data_dir.clone(),
config.strict_lockfiles,
config.delete_lockfiles,
log.clone(),
)
.await
.map_err(|e| format!("Unable to initialize validators: {:?}", e))?;
info!(
@@ -106,56 +108,11 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
RemoteBeaconNode::new_with_timeout(config.http_server.clone(), HTTP_TIMEOUT)
.map_err(|e| format!("Unable to init beacon node http client: {}", e))?;
// TODO: check if all logs in wait_for_node are produced while awaiting
let beacon_node = wait_for_node(beacon_node, &log).await?;
let eth2_config = beacon_node
.http
.spec()
.get_eth2_config()
.await
.map_err(|e| format!("Unable to read eth2 config from beacon node: {:?}", e))?;
let genesis_time = beacon_node
.http
.beacon()
.get_genesis_time()
.await
.map_err(|e| format!("Unable to read genesis time from beacon node: {:?}", e))?;
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map_err(|e| format!("Unable to read system time: {:?}", e))?;
let genesis = Duration::from_secs(genesis_time);
// If the time now is less than (prior to) genesis, then delay until the
// genesis instant.
//
// If the validator client starts before genesis, it will get errors from
// the slot clock.
if now < genesis {
info!(
log,
"Starting node prior to genesis";
"seconds_to_wait" => (genesis - now).as_secs()
);
delay_for(genesis - now).await
} else {
info!(
log,
"Genesis has already occurred";
"seconds_ago" => (now - genesis).as_secs()
);
}
let genesis_validators_root = beacon_node
.http
.beacon()
.get_genesis_validators_root()
.await
.map_err(|e| {
format!(
"Unable to read genesis validators root from beacon node: {:?}",
e
)
})?;
// Perform some potentially long-running initialization tasks.
let (eth2_config, genesis_time, genesis_validators_root) = tokio::select! {
tuple = init_from_beacon_node(&beacon_node, &context) => tuple?,
() = context.executor.exit() => return Err("Shutting down".to_string())
};
// Do not permit a connection to a beacon node using different spec constants.
if context.eth2_config.spec_constants != eth2_config.spec_constants {
@@ -270,12 +227,71 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
}
}
async fn init_from_beacon_node<E: EthSpec>(
beacon_node: &RemoteBeaconNode<E>,
context: &RuntimeContext<E>,
) -> Result<(Eth2Config, u64, Hash256), String> {
// Wait for the beacon node to come online.
wait_for_node(beacon_node, context.log()).await?;
let eth2_config = beacon_node
.http
.spec()
.get_eth2_config()
.await
.map_err(|e| format!("Unable to read eth2 config from beacon node: {:?}", e))?;
let genesis_time = beacon_node
.http
.beacon()
.get_genesis_time()
.await
.map_err(|e| format!("Unable to read genesis time from beacon node: {:?}", e))?;
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map_err(|e| format!("Unable to read system time: {:?}", e))?;
let genesis = Duration::from_secs(genesis_time);
// If the time now is less than (prior to) genesis, then delay until the
// genesis instant.
//
// If the validator client starts before genesis, it will get errors from
// the slot clock.
if now < genesis {
info!(
context.log(),
"Starting node prior to genesis";
"seconds_to_wait" => (genesis - now).as_secs()
);
delay_for(genesis - now).await;
} else {
info!(
context.log(),
"Genesis has already occurred";
"seconds_ago" => (now - genesis).as_secs()
);
}
let genesis_validators_root = beacon_node
.http
.beacon()
.get_genesis_validators_root()
.await
.map_err(|e| {
format!(
"Unable to read genesis validators root from beacon node: {:?}",
e
)
})?;
Ok((eth2_config, genesis_time, genesis_validators_root))
}
/// Request the version from the node, looping back and trying again on failure. Exit once the node
/// has been contacted.
async fn wait_for_node<E: EthSpec>(
beacon_node: RemoteBeaconNode<E>,
beacon_node: &RemoteBeaconNode<E>,
log: &Logger,
) -> Result<RemoteBeaconNode<E>, String> {
) -> Result<(), String> {
// Try to get the version string from the node, looping until success is returned.
loop {
let log = log.clone();
@@ -295,7 +311,7 @@ async fn wait_for_node<E: EthSpec>(
"version" => version,
);
return Ok(beacon_node);
return Ok(());
}
Err(e) => {
error!(