Fix validator lockfiles (#1586)

## Issue Addressed

- Resolves #1313 

## Proposed Changes

Changes the way we start the validator client and beacon node to ensure that we cleanly drop the validator keystores (which therefore ensures we cleanup their lockfiles).

Previously we were holding the validator keystores in a tokio task that was being forcefully killed (i.e., without `Drop`). Now, we hold them in a task that can gracefully handle a shutdown.

Also, switches the `--strict-lockfiles` flag to `--delete-lockfiles`. This means two things:

1. We are now strict on lockfiles by default (before we weren't).
1. There's a simple way for people delete the lockfiles if they experience a crash.

## Additional Info

I've only given the option to ignore *and* delete lockfiles, not just ignore them. I can't see a strong need for ignore-only but could easily add it, if the need arises.

I've flagged this as `api-breaking` since users that have lockfiles lingering around will be required to supply `--delete-lockfiles` next time they run.
This commit is contained in:
Paul Hauner 2020-09-24 04:06:02 +00:00
parent 996887376d
commit dffc56ef1d
No known key found for this signature in database
GPG Key ID: 5E2CFF9B75FA63DF
6 changed files with 168 additions and 130 deletions

View File

@ -7,7 +7,7 @@ mod config;
pub use beacon_chain; pub use beacon_chain;
pub use cli::cli_app; pub use cli::cli_app;
pub use client::{Client, ClientBuilder, ClientConfig, ClientGenesis}; pub use client::{Client, ClientBuilder, ClientConfig, ClientGenesis};
pub use config::{get_data_dir, get_eth2_testnet_config, set_network_config}; pub use config::{get_config, get_data_dir, get_eth2_testnet_config, set_network_config};
pub use eth2_config::Eth2Config; pub use eth2_config::Eth2Config;
use beacon_chain::events::TeeEventHandler; use beacon_chain::events::TeeEventHandler;
@ -17,7 +17,6 @@ use beacon_chain::{
builder::Witness, eth1_chain::CachingEth1Backend, slot_clock::SystemTimeSlotClock, builder::Witness, eth1_chain::CachingEth1Backend, slot_clock::SystemTimeSlotClock,
}; };
use clap::ArgMatches; use clap::ArgMatches;
use config::get_config;
use environment::RuntimeContext; use environment::RuntimeContext;
use slog::{info, warn}; use slog::{info, warn};
use std::ops::{Deref, DerefMut}; use std::ops::{Deref, DerefMut};
@ -54,7 +53,7 @@ impl<E: EthSpec> ProductionBeaconNode<E> {
/// configurations hosted remotely. /// configurations hosted remotely.
pub async fn new_from_cli( pub async fn new_from_cli(
context: RuntimeContext<E>, context: RuntimeContext<E>,
matches: &ArgMatches<'_>, matches: ArgMatches<'static>,
) -> Result<Self, String> { ) -> Result<Self, String> {
let client_config = get_config::<E>( let client_config = get_config::<E>(
&matches, &matches,

View File

@ -255,61 +255,63 @@ fn run<E: EthSpec>(
"name" => testnet_name "name" => testnet_name
); );
let beacon_node = if let Some(sub_matches) = matches.subcommand_matches("beacon_node") { match matches.subcommand() {
let runtime_context = environment.core_context(); ("beacon_node", Some(matches)) => {
let context = environment.core_context();
let log = context.log().clone();
let executor = context.executor.clone();
let config = beacon_node::get_config::<E>(
matches,
&context.eth2_config.spec_constants,
&context.eth2_config().spec,
context.log().clone(),
)?;
environment.runtime().spawn(async move {
if let Err(e) = ProductionBeaconNode::new(context.clone(), config).await {
crit!(log, "Failed to start beacon node"; "reason" => e);
// Ignore the error since it always occurs during normal operation when
// shutting down.
let _ = executor
.shutdown_sender()
.try_send("Failed to start beacon node");
}
})
}
("validator_client", Some(matches)) => {
let context = environment.core_context();
let log = context.log().clone();
let executor = context.executor.clone();
let config = validator_client::Config::from_cli(&matches)
.map_err(|e| format!("Unable to initialize validator config: {}", e))?;
environment.runtime().spawn(async move {
let run = async {
ProductionValidatorClient::new(context, config)
.await?
.start_service()?;
let beacon = environment Ok::<(), String>(())
.runtime() };
.block_on(ProductionBeaconNode::new_from_cli( if let Err(e) = run.await {
runtime_context, crit!(log, "Failed to start validator client"; "reason" => e);
sub_matches, // Ignore the error since it always occurs during normal operation when
)) // shutting down.
.map_err(|e| format!("Failed to start beacon node: {}", e))?; let _ = executor
.shutdown_sender()
Some(beacon) .try_send("Failed to start validator client");
} else { }
None })
}
_ => {
crit!(log, "No subcommand supplied. See --help .");
return Err("No subcommand supplied.".into());
}
}; };
let validator_client = if let Some(sub_matches) = matches.subcommand_matches("validator_client")
{
let runtime_context = environment.core_context();
let mut validator = environment
.runtime()
.block_on(ProductionValidatorClient::new_from_cli(
runtime_context,
sub_matches,
))
.map_err(|e| format!("Failed to init validator client: {}", e))?;
environment
.core_context()
.executor
.runtime_handle()
.enter(|| {
validator
.start_service()
.map_err(|e| format!("Failed to start validator client service: {}", e))
})?;
Some(validator)
} else {
None
};
if beacon_node.is_none() && validator_client.is_none() {
crit!(log, "No subcommand supplied. See --help .");
return Err("No subcommand supplied.".into());
}
// Block this thread until we get a ctrl-c or a task sends a shutdown signal. // Block this thread until we get a ctrl-c or a task sends a shutdown signal.
environment.block_until_shutdown_requested()?; environment.block_until_shutdown_requested()?;
info!(log, "Shutting down.."); info!(log, "Shutting down..");
environment.fire_signal(); environment.fire_signal();
drop(beacon_node);
drop(validator_client);
// Shutdown the environment once all tasks have completed. // Shutdown the environment once all tasks have completed.
environment.shutdown_on_idle(); environment.shutdown_on_idle();

View File

@ -37,11 +37,15 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
nodes using the same key. Automatically enabled unless `--strict` is specified", nodes using the same key. Automatically enabled unless `--strict` is specified",
)) ))
.arg( .arg(
Arg::with_name("strict-lockfiles") Arg::with_name("delete-lockfiles")
.long("strict-lockfiles") .long("delete-lockfiles")
.help( .help(
"If present, do not load validators that are guarded by a lockfile. Note: for \ "If present, ignore and delete any keystore lockfiles encountered during start up. \
Eth2 mainnet, this flag will likely be removed and its behaviour will become default." This is useful if the validator client did not exit gracefully on the last run. \
WARNING: lockfiles help prevent users from accidentally running the same validator \
using two different validator clients, an action that likely leads to slashing. \
Ensure you are certain that there are no other validator client instances running \
that might also be using the same keystores."
) )
) )
.arg( .arg(

View File

@ -24,8 +24,8 @@ pub struct Config {
/// If true, the validator client will still poll for duties and produce blocks even if the /// If true, the validator client will still poll for duties and produce blocks even if the
/// beacon node is not synced at startup. /// beacon node is not synced at startup.
pub allow_unsynced_beacon_node: bool, pub allow_unsynced_beacon_node: bool,
/// If true, refuse to unlock a keypair that is guarded by a lockfile. /// If true, delete any validator keystore lockfiles that would prevent starting.
pub strict_lockfiles: bool, pub delete_lockfiles: bool,
/// If true, don't scan the validators dir for new keystores. /// If true, don't scan the validators dir for new keystores.
pub disable_auto_discover: bool, pub disable_auto_discover: bool,
/// Graffiti to be inserted everytime we create a block. /// Graffiti to be inserted everytime we create a block.
@ -46,7 +46,7 @@ impl Default for Config {
secrets_dir, secrets_dir,
http_server: DEFAULT_HTTP_SERVER.to_string(), http_server: DEFAULT_HTTP_SERVER.to_string(),
allow_unsynced_beacon_node: false, allow_unsynced_beacon_node: false,
strict_lockfiles: false, delete_lockfiles: false,
disable_auto_discover: false, disable_auto_discover: false,
graffiti: None, graffiti: None,
} }
@ -77,7 +77,7 @@ impl Config {
} }
config.allow_unsynced_beacon_node = cli_args.is_present("allow-unsynced"); config.allow_unsynced_beacon_node = cli_args.is_present("allow-unsynced");
config.strict_lockfiles = cli_args.is_present("strict-lockfiles"); config.delete_lockfiles = cli_args.is_present("delete-lockfiles");
config.disable_auto_discover = cli_args.is_present("disable-auto-discover"); config.disable_auto_discover = cli_args.is_present("disable-auto-discover");
if let Some(secrets_dir) = parse_optional(cli_args, "secrets-dir")? { if let Some(secrets_dir) = parse_optional(cli_args, "secrets-dir")? {

View File

@ -54,6 +54,10 @@ pub enum Error {
PasswordUnknown(PathBuf), PasswordUnknown(PathBuf),
/// There was an error reading from stdin. /// There was an error reading from stdin.
UnableToReadPasswordFromUser(String), UnableToReadPasswordFromUser(String),
/// There was an error running a tokio async task.
TokioJoin(tokio::task::JoinError),
/// There was a filesystem error when deleting a lockfile.
UnableToDeleteLockfile(io::Error),
} }
/// A method used by a validator to sign messages. /// A method used by a validator to sign messages.
@ -86,7 +90,7 @@ impl InitializedValidator {
/// If the validator is unable to be initialized for whatever reason. /// If the validator is unable to be initialized for whatever reason.
pub fn from_definition( pub fn from_definition(
def: ValidatorDefinition, def: ValidatorDefinition,
strict_lockfiles: bool, delete_lockfiles: bool,
log: &Logger, log: &Logger,
) -> Result<Self, Error> { ) -> Result<Self, Error> {
if !def.enabled { if !def.enabled {
@ -150,16 +154,17 @@ impl InitializedValidator {
})?; })?;
if voting_keystore_lockfile_path.exists() { if voting_keystore_lockfile_path.exists() {
if strict_lockfiles { if delete_lockfiles {
return Err(Error::LockfileExists(voting_keystore_lockfile_path));
} else {
// If **not** respecting lockfiles, just raise a warning if the voting
// keypair cannot be unlocked.
warn!( warn!(
log, log,
"Ignoring validator lockfile"; "Deleting validator lockfile";
"file" => format!("{:?}", voting_keystore_lockfile_path) "file" => format!("{:?}", voting_keystore_lockfile_path)
); );
fs::remove_file(&voting_keystore_lockfile_path)
.map_err(Error::UnableToDeleteLockfile)?;
} else {
return Err(Error::LockfileExists(voting_keystore_lockfile_path));
} }
} else { } else {
// Create a new lockfile. // Create a new lockfile.
@ -279,7 +284,7 @@ pub struct InitializedValidators {
impl InitializedValidators { impl InitializedValidators {
/// Instantiates `Self`, initializing all validators in `definitions`. /// Instantiates `Self`, initializing all validators in `definitions`.
pub fn from_definitions( pub async fn from_definitions(
definitions: ValidatorDefinitions, definitions: ValidatorDefinitions,
validators_dir: PathBuf, validators_dir: PathBuf,
strict_lockfiles: bool, strict_lockfiles: bool,
@ -292,7 +297,7 @@ impl InitializedValidators {
validators: HashMap::default(), validators: HashMap::default(),
log, log,
}; };
this.update_validators()?; this.update_validators().await?;
Ok(this) Ok(this)
} }
@ -328,7 +333,7 @@ impl InitializedValidators {
/// validator will be removed from `self.validators`. /// validator will be removed from `self.validators`.
/// ///
/// Saves the `ValidatorDefinitions` to file, even if no definitions were changed. /// Saves the `ValidatorDefinitions` to file, even if no definitions were changed.
pub fn set_validator_status( pub async fn set_validator_status(
&mut self, &mut self,
voting_public_key: &PublicKey, voting_public_key: &PublicKey,
enabled: bool, enabled: bool,
@ -342,7 +347,7 @@ impl InitializedValidators {
def.enabled = enabled; def.enabled = enabled;
} }
self.update_validators()?; self.update_validators().await?;
self.definitions self.definitions
.save(&self.validators_dir) .save(&self.validators_dir)
@ -362,7 +367,7 @@ impl InitializedValidators {
/// A validator is considered "already known" and skipped if the public key is already known. /// A validator is considered "already known" and skipped if the public key is already known.
/// I.e., if there are two different definitions with the same public key then the second will /// I.e., if there are two different definitions with the same public key then the second will
/// be ignored. /// be ignored.
fn update_validators(&mut self) -> Result<(), Error> { async fn update_validators(&mut self) -> Result<(), Error> {
for def in self.definitions.as_slice() { for def in self.definitions.as_slice() {
if def.enabled { if def.enabled {
match &def.signing_definition { match &def.signing_definition {
@ -371,11 +376,23 @@ impl InitializedValidators {
continue; continue;
} }
match InitializedValidator::from_definition( // Decoding a local keystore can take several seconds, therefore it's best
def.clone(), // to keep if off the core executor. This also has the fortunate effect of
self.strict_lockfiles, // interrupting the potentially long-running task during shut down.
&self.log, let inner_def = def.clone();
) { let strict_lockfiles = self.strict_lockfiles;
let inner_log = self.log.clone();
let result = tokio::task::spawn_blocking(move || {
InitializedValidator::from_definition(
inner_def,
strict_lockfiles,
&inner_log,
)
})
.await
.map_err(Error::TokioJoin)?;
match result {
Ok(init) => { Ok(init) => {
self.validators self.validators
.insert(init.voting_public_key().clone(), init); .insert(init.voting_public_key().clone(), init);

View File

@ -18,6 +18,7 @@ use block_service::{BlockService, BlockServiceBuilder};
use clap::ArgMatches; use clap::ArgMatches;
use duties_service::{DutiesService, DutiesServiceBuilder}; use duties_service::{DutiesService, DutiesServiceBuilder};
use environment::RuntimeContext; use environment::RuntimeContext;
use eth2_config::Eth2Config;
use fork_service::{ForkService, ForkServiceBuilder}; use fork_service::{ForkService, ForkServiceBuilder};
use futures::channel::mpsc; use futures::channel::mpsc;
use initialized_validators::InitializedValidators; use initialized_validators::InitializedValidators;
@ -28,7 +29,7 @@ use slot_clock::SlotClock;
use slot_clock::SystemTimeSlotClock; use slot_clock::SystemTimeSlotClock;
use std::time::{SystemTime, UNIX_EPOCH}; use std::time::{SystemTime, UNIX_EPOCH};
use tokio::time::{delay_for, Duration}; use tokio::time::{delay_for, Duration};
use types::EthSpec; use types::{EthSpec, Hash256};
use validator_store::ValidatorStore; use validator_store::ValidatorStore;
/// The interval between attempts to contact the beacon node during startup. /// The interval between attempts to contact the beacon node during startup.
@ -90,9 +91,10 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
let validators = InitializedValidators::from_definitions( let validators = InitializedValidators::from_definitions(
validator_defs, validator_defs,
config.data_dir.clone(), config.data_dir.clone(),
config.strict_lockfiles, config.delete_lockfiles,
log.clone(), log.clone(),
) )
.await
.map_err(|e| format!("Unable to initialize validators: {:?}", e))?; .map_err(|e| format!("Unable to initialize validators: {:?}", e))?;
info!( info!(
@ -106,56 +108,11 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
RemoteBeaconNode::new_with_timeout(config.http_server.clone(), HTTP_TIMEOUT) RemoteBeaconNode::new_with_timeout(config.http_server.clone(), HTTP_TIMEOUT)
.map_err(|e| format!("Unable to init beacon node http client: {}", e))?; .map_err(|e| format!("Unable to init beacon node http client: {}", e))?;
// TODO: check if all logs in wait_for_node are produed while awaiting // Perform some potentially long-running initialization tasks.
let beacon_node = wait_for_node(beacon_node, &log).await?; let (eth2_config, genesis_time, genesis_validators_root) = tokio::select! {
let eth2_config = beacon_node tuple = init_from_beacon_node(&beacon_node, &context) => tuple?,
.http () = context.executor.exit() => return Err("Shutting down".to_string())
.spec() };
.get_eth2_config()
.await
.map_err(|e| format!("Unable to read eth2 config from beacon node: {:?}", e))?;
let genesis_time = beacon_node
.http
.beacon()
.get_genesis_time()
.await
.map_err(|e| format!("Unable to read genesis time from beacon node: {:?}", e))?;
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map_err(|e| format!("Unable to read system time: {:?}", e))?;
let genesis = Duration::from_secs(genesis_time);
// If the time now is less than (prior to) genesis, then delay until the
// genesis instant.
//
// If the validator client starts before genesis, it will get errors from
// the slot clock.
if now < genesis {
info!(
log,
"Starting node prior to genesis";
"seconds_to_wait" => (genesis - now).as_secs()
);
delay_for(genesis - now).await
} else {
info!(
log,
"Genesis has already occurred";
"seconds_ago" => (now - genesis).as_secs()
);
}
let genesis_validators_root = beacon_node
.http
.beacon()
.get_genesis_validators_root()
.await
.map_err(|e| {
format!(
"Unable to read genesis validators root from beacon node: {:?}",
e
)
})?;
// Do not permit a connection to a beacon node using different spec constants. // Do not permit a connection to a beacon node using different spec constants.
if context.eth2_config.spec_constants != eth2_config.spec_constants { if context.eth2_config.spec_constants != eth2_config.spec_constants {
@ -270,12 +227,71 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
} }
} }
async fn init_from_beacon_node<E: EthSpec>(
beacon_node: &RemoteBeaconNode<E>,
context: &RuntimeContext<E>,
) -> Result<(Eth2Config, u64, Hash256), String> {
// Wait for the beacon node to come online.
wait_for_node(beacon_node, context.log()).await?;
let eth2_config = beacon_node
.http
.spec()
.get_eth2_config()
.await
.map_err(|e| format!("Unable to read eth2 config from beacon node: {:?}", e))?;
let genesis_time = beacon_node
.http
.beacon()
.get_genesis_time()
.await
.map_err(|e| format!("Unable to read genesis time from beacon node: {:?}", e))?;
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map_err(|e| format!("Unable to read system time: {:?}", e))?;
let genesis = Duration::from_secs(genesis_time);
// If the time now is less than (prior to) genesis, then delay until the
// genesis instant.
//
// If the validator client starts before genesis, it will get errors from
// the slot clock.
if now < genesis {
info!(
context.log(),
"Starting node prior to genesis";
"seconds_to_wait" => (genesis - now).as_secs()
);
delay_for(genesis - now).await;
} else {
info!(
context.log(),
"Genesis has already occurred";
"seconds_ago" => (now - genesis).as_secs()
);
}
let genesis_validators_root = beacon_node
.http
.beacon()
.get_genesis_validators_root()
.await
.map_err(|e| {
format!(
"Unable to read genesis validators root from beacon node: {:?}",
e
)
})?;
Ok((eth2_config, genesis_time, genesis_validators_root))
}
/// Request the version from the node, looping back and trying again on failure. Exit once the node /// Request the version from the node, looping back and trying again on failure. Exit once the node
/// has been contacted. /// has been contacted.
async fn wait_for_node<E: EthSpec>( async fn wait_for_node<E: EthSpec>(
beacon_node: RemoteBeaconNode<E>, beacon_node: &RemoteBeaconNode<E>,
log: &Logger, log: &Logger,
) -> Result<RemoteBeaconNode<E>, String> { ) -> Result<(), String> {
// Try to get the version string from the node, looping until success is returned. // Try to get the version string from the node, looping until success is returned.
loop { loop {
let log = log.clone(); let log = log.clone();
@ -295,7 +311,7 @@ async fn wait_for_node<E: EthSpec>(
"version" => version, "version" => version,
); );
return Ok(beacon_node); return Ok(());
} }
Err(e) => { Err(e) => {
error!( error!(