lighthouse/beacon_node/beacon_chain/src/migrate.rs
Michael Sproul 20067b9465 Remove checkpoint alignment requirements and enable historic state pruning (#4610)
## Issue Addressed

Closes #3210
Closes #3211

## Proposed Changes

- Checkpoint sync from the latest finalized state regardless of its alignment.
- Add the `block_root` to the database's split point. This is _only_ added to the in-memory split in order to avoid a schema migration. See `load_split`.
- Add a new method to the DB called `get_advanced_state`, which looks up a state _by block root_, with a `state_root` as fallback. Using this method prevents accidental accesses of the split's unadvanced state, which does not exist in the hot DB and is not guaranteed to exist in the freezer DB at all. Previously Lighthouse would look up this state _from the freezer DB_, even if it was required for block/attestation processing, which was suboptimal.
- Replace several state look-ups in block and attestation processing with `get_advanced_state` so that they can't hit the split block's unadvanced state.
- Do not store any states in the freezer database by default. All states will be deleted upon being evicted from the hot database unless `--reconstruct-historic-states` is set. The anchor info which was previously used for checkpoint sync is used to implement this, including when syncing from genesis.

## Additional Info

Needs further testing. I want to stress-test the pruned database under Hydra.

The `get_advanced_state` method is intended to become more relevant over time: `tree-states` includes an identically named method that returns advanced states from its in-memory cache.

Co-authored-by: realbigsean <seananderson33@gmail.com>
2023-08-21 05:02:32 +00:00

717 lines
28 KiB
Rust

use crate::beacon_chain::BEACON_CHAIN_DB_KEY;
use crate::errors::BeaconChainError;
use crate::head_tracker::{HeadTracker, SszHeadTracker};
use crate::persisted_beacon_chain::{PersistedBeaconChain, DUMMY_CANONICAL_HEAD_BLOCK_ROOT};
use parking_lot::Mutex;
use slog::{debug, error, info, warn, Logger};
use std::collections::{HashMap, HashSet};
use std::mem;
use std::sync::{mpsc, Arc};
use std::thread;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use store::hot_cold_store::{migrate_database, HotColdDBError};
use store::iter::RootsIterator;
use store::{Error, ItemStore, StoreItem, StoreOp};
pub use store::{HotColdDB, MemoryStore};
use types::{
BeaconState, BeaconStateError, BeaconStateHash, Checkpoint, Epoch, EthSpec, Hash256,
SignedBeaconBlockHash, Slot,
};
/// Compact at least this frequently, finalization permitting (7 days).
const MAX_COMPACTION_PERIOD_SECONDS: u64 = 604800;
/// Compact at *most* this frequently, to prevent over-compaction during sync (2 hours).
const MIN_COMPACTION_PERIOD_SECONDS: u64 = 7200;
/// Compact after a large finality gap, if we respect `MIN_COMPACTION_PERIOD_SECONDS`.
const COMPACTION_FINALITY_DISTANCE: u64 = 1024;
/// Default number of epochs to wait between finalization migrations.
pub const DEFAULT_EPOCHS_PER_MIGRATION: u64 = 1;
/// The background migrator runs a thread to perform pruning and migrate state from the hot
/// to the cold database.
pub struct BackgroundMigrator<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> {
db: Arc<HotColdDB<E, Hot, Cold>>,
/// Record of when the last migration ran, for enforcing `epochs_per_migration`.
prev_migration: Arc<Mutex<PrevMigration>>,
#[allow(clippy::type_complexity)]
tx_thread: Option<Mutex<(mpsc::Sender<Notification>, thread::JoinHandle<()>)>>,
/// Genesis block root, for persisting the `PersistedBeaconChain`.
genesis_block_root: Hash256,
log: Logger,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MigratorConfig {
pub blocking: bool,
/// Run migrations at most once per `epochs_per_migration`.
///
/// If set to 0 or 1, then run every finalization.
pub epochs_per_migration: u64,
}
impl Default for MigratorConfig {
fn default() -> Self {
Self {
blocking: false,
epochs_per_migration: DEFAULT_EPOCHS_PER_MIGRATION,
}
}
}
impl MigratorConfig {
pub fn blocking(mut self) -> Self {
self.blocking = true;
self
}
pub fn epochs_per_migration(mut self, epochs_per_migration: u64) -> Self {
self.epochs_per_migration = epochs_per_migration;
self
}
}
/// Record of when the last migration ran.
pub struct PrevMigration {
/// The epoch at which the last finalization migration ran.
epoch: Epoch,
/// The number of epochs to wait between runs.
epochs_per_migration: u64,
}
/// Pruning can be successful, or in rare cases deferred to a later point.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PruningOutcome {
/// The pruning succeeded and updated the pruning checkpoint from `old_finalized_checkpoint`.
Successful {
old_finalized_checkpoint: Checkpoint,
},
/// The run was aborted because the new finalized checkpoint is older than the previous one.
OutOfOrderFinalization {
old_finalized_checkpoint: Checkpoint,
new_finalized_checkpoint: Checkpoint,
},
/// The run was aborted due to a concurrent mutation of the head tracker.
DeferredConcurrentHeadTrackerMutation,
}
/// Logic errors that can occur during pruning, none of these should ever happen.
#[derive(Debug)]
pub enum PruningError {
IncorrectFinalizedState {
state_slot: Slot,
new_finalized_slot: Slot,
},
MissingInfoForCanonicalChain {
slot: Slot,
},
FinalizedStateOutOfOrder {
old_finalized_checkpoint: Checkpoint,
new_finalized_checkpoint: Checkpoint,
},
UnexpectedEqualStateRoots,
UnexpectedUnequalStateRoots,
}
/// Message sent to the migration thread containing the information it needs to run.
pub enum Notification {
Finalization(FinalizationNotification),
Reconstruction,
}
pub struct FinalizationNotification {
finalized_state_root: BeaconStateHash,
finalized_checkpoint: Checkpoint,
head_tracker: Arc<HeadTracker>,
prev_migration: Arc<Mutex<PrevMigration>>,
genesis_block_root: Hash256,
}
impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Hot, Cold> {
/// Create a new `BackgroundMigrator` and spawn its thread if necessary.
pub fn new(
db: Arc<HotColdDB<E, Hot, Cold>>,
config: MigratorConfig,
genesis_block_root: Hash256,
log: Logger,
) -> Self {
// Estimate last migration run from DB split slot.
let prev_migration = Arc::new(Mutex::new(PrevMigration {
epoch: db.get_split_slot().epoch(E::slots_per_epoch()),
epochs_per_migration: config.epochs_per_migration,
}));
let tx_thread = if config.blocking {
None
} else {
Some(Mutex::new(Self::spawn_thread(db.clone(), log.clone())))
};
Self {
db,
tx_thread,
prev_migration,
genesis_block_root,
log,
}
}
/// Process a finalized checkpoint from the `BeaconChain`.
///
/// If successful, all forks descending from before the `finalized_checkpoint` will be
/// pruned, and the split point of the database will be advanced to the slot of the finalized
/// checkpoint.
pub fn process_finalization(
&self,
finalized_state_root: BeaconStateHash,
finalized_checkpoint: Checkpoint,
head_tracker: Arc<HeadTracker>,
) -> Result<(), BeaconChainError> {
let notif = FinalizationNotification {
finalized_state_root,
finalized_checkpoint,
head_tracker,
prev_migration: self.prev_migration.clone(),
genesis_block_root: self.genesis_block_root,
};
// Send to background thread if configured, otherwise run in foreground.
if let Some(Notification::Finalization(notif)) =
self.send_background_notification(Notification::Finalization(notif))
{
Self::run_migration(self.db.clone(), notif, &self.log);
}
Ok(())
}
pub fn process_reconstruction(&self) {
if let Some(Notification::Reconstruction) =
self.send_background_notification(Notification::Reconstruction)
{
Self::run_reconstruction(self.db.clone(), &self.log);
}
}
pub fn run_reconstruction(db: Arc<HotColdDB<E, Hot, Cold>>, log: &Logger) {
if let Err(e) = db.reconstruct_historic_states() {
error!(
log,
"State reconstruction failed";
"error" => ?e,
);
}
}
/// If configured to run in the background, send `notif` to the background thread.
///
/// Return `None` if the message was sent to the background thread, `Some(notif)` otherwise.
#[must_use = "Message is not processed when this function returns `Some`"]
fn send_background_notification(&self, notif: Notification) -> Option<Notification> {
// Async path, on the background thread.
if let Some(tx_thread) = &self.tx_thread {
let (ref mut tx, ref mut thread) = *tx_thread.lock();
// Restart the background thread if it has crashed.
if let Err(tx_err) = tx.send(notif) {
let (new_tx, new_thread) = Self::spawn_thread(self.db.clone(), self.log.clone());
*tx = new_tx;
let old_thread = mem::replace(thread, new_thread);
// Join the old thread, which will probably have panicked, or may have
// halted normally just now as a result of us dropping the old `mpsc::Sender`.
if let Err(thread_err) = old_thread.join() {
warn!(
self.log,
"Migration thread died, so it was restarted";
"reason" => format!("{:?}", thread_err)
);
}
// Retry at most once, we could recurse but that would risk overflowing the stack.
let _ = tx.send(tx_err.0);
}
None
// Synchronous path, on the current thread.
} else {
Some(notif)
}
}
/// Perform the actual work of `process_finalization`.
fn run_migration(
db: Arc<HotColdDB<E, Hot, Cold>>,
notif: FinalizationNotification,
log: &Logger,
) {
// Do not run too frequently.
let epoch = notif.finalized_checkpoint.epoch;
let mut prev_migration = notif.prev_migration.lock();
if epoch < prev_migration.epoch + prev_migration.epochs_per_migration {
debug!(
log,
"Database consolidation deferred";
"last_finalized_epoch" => prev_migration.epoch,
"new_finalized_epoch" => epoch,
"epochs_per_migration" => prev_migration.epochs_per_migration,
);
return;
}
// Update the previous migration epoch immediately to avoid holding the lock. If the
// migration doesn't succeed then the next migration will be retried at the next scheduled
// run.
prev_migration.epoch = epoch;
drop(prev_migration);
debug!(log, "Database consolidation started");
let finalized_state_root = notif.finalized_state_root;
let finalized_block_root = notif.finalized_checkpoint.root;
let finalized_state = match db.get_state(&finalized_state_root.into(), None) {
Ok(Some(state)) => state,
other => {
error!(
log,
"Migrator failed to load state";
"state_root" => ?finalized_state_root,
"error" => ?other
);
return;
}
};
let old_finalized_checkpoint = match Self::prune_abandoned_forks(
db.clone(),
notif.head_tracker,
finalized_state_root,
&finalized_state,
notif.finalized_checkpoint,
notif.genesis_block_root,
log,
) {
Ok(PruningOutcome::Successful {
old_finalized_checkpoint,
}) => old_finalized_checkpoint,
Ok(PruningOutcome::DeferredConcurrentHeadTrackerMutation) => {
warn!(
log,
"Pruning deferred because of a concurrent mutation";
"message" => "this is expected only very rarely!"
);
return;
}
Ok(PruningOutcome::OutOfOrderFinalization {
old_finalized_checkpoint,
new_finalized_checkpoint,
}) => {
warn!(
log,
"Ignoring out of order finalization request";
"old_finalized_epoch" => old_finalized_checkpoint.epoch,
"new_finalized_epoch" => new_finalized_checkpoint.epoch,
"message" => "this is expected occasionally due to a (harmless) race condition"
);
return;
}
Err(e) => {
warn!(log, "Block pruning failed"; "error" => ?e);
return;
}
};
match migrate_database(
db.clone(),
finalized_state_root.into(),
finalized_block_root,
&finalized_state,
) {
Ok(()) => {}
Err(Error::HotColdDBError(HotColdDBError::FreezeSlotUnaligned(slot))) => {
debug!(
log,
"Database migration postponed, unaligned finalized block";
"slot" => slot.as_u64()
);
}
Err(e) => {
warn!(
log,
"Database migration failed";
"error" => format!("{:?}", e)
);
return;
}
};
// Finally, compact the database so that new free space is properly reclaimed.
if let Err(e) = Self::run_compaction(
db,
old_finalized_checkpoint.epoch,
notif.finalized_checkpoint.epoch,
log,
) {
warn!(log, "Database compaction failed"; "error" => format!("{:?}", e));
}
debug!(log, "Database consolidation complete");
}
/// Spawn a new child thread to run the migration process.
///
/// Return a channel handle for sending requests to the thread.
fn spawn_thread(
db: Arc<HotColdDB<E, Hot, Cold>>,
log: Logger,
) -> (mpsc::Sender<Notification>, thread::JoinHandle<()>) {
let (tx, rx) = mpsc::channel();
let thread = thread::spawn(move || {
while let Ok(notif) = rx.recv() {
// Read the rest of the messages in the channel, preferring any reconstruction
// notification, or the finalization notification with the greatest finalized epoch.
let notif =
rx.try_iter()
.fold(notif, |best, other: Notification| match (&best, &other) {
(Notification::Reconstruction, _)
| (_, Notification::Reconstruction) => Notification::Reconstruction,
(
Notification::Finalization(fin1),
Notification::Finalization(fin2),
) => {
if fin2.finalized_checkpoint.epoch > fin1.finalized_checkpoint.epoch
{
other
} else {
best
}
}
});
match notif {
Notification::Reconstruction => Self::run_reconstruction(db.clone(), &log),
Notification::Finalization(fin) => Self::run_migration(db.clone(), fin, &log),
}
}
});
(tx, thread)
}
/// Traverses live heads and prunes blocks and states of chains that we know can't be built
/// upon because finalization would prohibit it. This is an optimisation intended to save disk
/// space.
#[allow(clippy::too_many_arguments)]
fn prune_abandoned_forks(
store: Arc<HotColdDB<E, Hot, Cold>>,
head_tracker: Arc<HeadTracker>,
new_finalized_state_hash: BeaconStateHash,
new_finalized_state: &BeaconState<E>,
new_finalized_checkpoint: Checkpoint,
genesis_block_root: Hash256,
log: &Logger,
) -> Result<PruningOutcome, BeaconChainError> {
let old_finalized_checkpoint =
store
.load_pruning_checkpoint()?
.unwrap_or_else(|| Checkpoint {
epoch: Epoch::new(0),
root: Hash256::zero(),
});
let old_finalized_slot = old_finalized_checkpoint
.epoch
.start_slot(E::slots_per_epoch());
let new_finalized_slot = new_finalized_checkpoint
.epoch
.start_slot(E::slots_per_epoch());
let new_finalized_block_hash = new_finalized_checkpoint.root.into();
// The finalized state must be for the epoch boundary slot, not the slot of the finalized
// block.
if new_finalized_state.slot() != new_finalized_slot {
return Err(PruningError::IncorrectFinalizedState {
state_slot: new_finalized_state.slot(),
new_finalized_slot,
}
.into());
}
// The new finalized state must be newer than the previous finalized state.
// I think this can happen sometimes currently due to `fork_choice` running in parallel
// with itself and sending us notifications out of order.
if old_finalized_slot > new_finalized_slot {
return Ok(PruningOutcome::OutOfOrderFinalization {
old_finalized_checkpoint,
new_finalized_checkpoint,
});
}
debug!(
log,
"Starting database pruning";
"old_finalized_epoch" => old_finalized_checkpoint.epoch,
"new_finalized_epoch" => new_finalized_checkpoint.epoch,
);
// For each slot between the new finalized checkpoint and the old finalized checkpoint,
// collect the beacon block root and state root of the canonical chain.
let newly_finalized_chain: HashMap<Slot, (SignedBeaconBlockHash, BeaconStateHash)> =
std::iter::once(Ok((
new_finalized_slot,
(new_finalized_block_hash, new_finalized_state_hash),
)))
.chain(RootsIterator::new(&store, new_finalized_state).map(|res| {
res.map(|(block_root, state_root, slot)| {
(slot, (block_root.into(), state_root.into()))
})
}))
.take_while(|res| {
res.as_ref()
.map_or(true, |(slot, _)| *slot >= old_finalized_slot)
})
.collect::<Result<_, _>>()?;
// We don't know which blocks are shared among abandoned chains, so we buffer and delete
// everything in one fell swoop.
let mut abandoned_blocks: HashSet<SignedBeaconBlockHash> = HashSet::new();
let mut abandoned_states: HashSet<(Slot, BeaconStateHash)> = HashSet::new();
let mut abandoned_heads: HashSet<Hash256> = HashSet::new();
let heads = head_tracker.heads();
debug!(
log,
"Extra pruning information";
"old_finalized_root" => format!("{:?}", old_finalized_checkpoint.root),
"new_finalized_root" => format!("{:?}", new_finalized_checkpoint.root),
"head_count" => heads.len(),
);
for (head_hash, head_slot) in heads {
// Load head block. If it fails with a decode error, it's likely a reverted block,
// so delete it from the head tracker but leave it and its states in the database
// This is suboptimal as it wastes disk space, but it's difficult to fix. A re-sync
// can be used to reclaim the space.
let head_state_root = match store.get_blinded_block(&head_hash) {
Ok(Some(block)) => block.state_root(),
Ok(None) => {
return Err(BeaconStateError::MissingBeaconBlock(head_hash.into()).into())
}
Err(Error::SszDecodeError(e)) => {
warn!(
log,
"Forgetting invalid head block";
"block_root" => ?head_hash,
"error" => ?e,
);
abandoned_heads.insert(head_hash);
continue;
}
Err(e) => return Err(e.into()),
};
let mut potentially_abandoned_head = Some(head_hash);
let mut potentially_abandoned_blocks = vec![];
// Iterate backwards from this head, staging blocks and states for deletion.
let iter = std::iter::once(Ok((head_hash, head_state_root, head_slot)))
.chain(RootsIterator::from_block(&store, head_hash)?);
for maybe_tuple in iter {
let (block_root, state_root, slot) = maybe_tuple?;
let block_root = SignedBeaconBlockHash::from(block_root);
let state_root = BeaconStateHash::from(state_root);
match newly_finalized_chain.get(&slot) {
// If there's no information about a slot on the finalized chain, then
// it should be because it's ahead of the new finalized slot. Stage
// the fork's block and state for possible deletion.
None => {
if slot > new_finalized_slot {
potentially_abandoned_blocks.push((
slot,
Some(block_root),
Some(state_root),
));
} else if slot >= old_finalized_slot {
return Err(PruningError::MissingInfoForCanonicalChain { slot }.into());
} else {
// We must assume here any candidate chains include the old finalized
// checkpoint, i.e. there aren't any forks starting at a block that is a
// strict ancestor of old_finalized_checkpoint.
warn!(
log,
"Found a chain that should already have been pruned";
"head_block_root" => format!("{:?}", head_hash),
"head_slot" => head_slot,
);
potentially_abandoned_head.take();
break;
}
}
Some((finalized_block_root, finalized_state_root)) => {
// This fork descends from a newly finalized block, we can stop.
if block_root == *finalized_block_root {
// Sanity check: if the slot and block root match, then the
// state roots should match too.
if state_root != *finalized_state_root {
return Err(PruningError::UnexpectedUnequalStateRoots.into());
}
// If the fork descends from the whole finalized chain,
// do not prune it. Otherwise continue to delete all
// of the blocks and states that have been staged for
// deletion so far.
if slot == new_finalized_slot {
potentially_abandoned_blocks.clear();
potentially_abandoned_head.take();
}
// If there are skipped slots on the fork to be pruned, then
// we will have just staged the common block for deletion.
// Unstage it.
else {
for (_, block_root, _) in
potentially_abandoned_blocks.iter_mut().rev()
{
if block_root.as_ref() == Some(finalized_block_root) {
*block_root = None;
} else {
break;
}
}
}
break;
} else {
if state_root == *finalized_state_root {
return Err(PruningError::UnexpectedEqualStateRoots.into());
}
potentially_abandoned_blocks.push((
slot,
Some(block_root),
Some(state_root),
));
}
}
}
}
if let Some(abandoned_head) = potentially_abandoned_head {
debug!(
log,
"Pruning head";
"head_block_root" => format!("{:?}", abandoned_head),
"head_slot" => head_slot,
);
abandoned_heads.insert(abandoned_head);
abandoned_blocks.extend(
potentially_abandoned_blocks
.iter()
.filter_map(|(_, maybe_block_hash, _)| *maybe_block_hash),
);
abandoned_states.extend(potentially_abandoned_blocks.iter().filter_map(
|(slot, _, maybe_state_hash)| maybe_state_hash.map(|sr| (*slot, sr)),
));
}
}
// Update the head tracker before the database, so that we maintain the invariant
// that a block present in the head tracker is present in the database.
// See https://github.com/sigp/lighthouse/issues/1557
let mut head_tracker_lock = head_tracker.0.write();
// Check that all the heads to be deleted are still present. The absence of any
// head indicates a race, that will likely resolve itself, so we defer pruning until
// later.
for head_hash in &abandoned_heads {
if !head_tracker_lock.contains_key(head_hash) {
return Ok(PruningOutcome::DeferredConcurrentHeadTrackerMutation);
}
}
// Then remove them for real.
for head_hash in abandoned_heads {
head_tracker_lock.remove(&head_hash);
}
let batch: Vec<StoreOp<E>> = abandoned_blocks
.into_iter()
.map(Into::into)
.flat_map(|block_root: Hash256| {
[
StoreOp::DeleteBlock(block_root),
StoreOp::DeleteExecutionPayload(block_root),
]
})
.chain(
abandoned_states
.into_iter()
.map(|(slot, state_hash)| StoreOp::DeleteState(state_hash.into(), Some(slot))),
)
.collect();
let mut kv_batch = store.convert_to_kv_batch(batch)?;
// Persist the head in case the process is killed or crashes here. This prevents
// the head tracker reverting after our mutation above.
let persisted_head = PersistedBeaconChain {
_canonical_head_block_root: DUMMY_CANONICAL_HEAD_BLOCK_ROOT,
genesis_block_root,
ssz_head_tracker: SszHeadTracker::from_map(&head_tracker_lock),
};
drop(head_tracker_lock);
kv_batch.push(persisted_head.as_kv_store_op(BEACON_CHAIN_DB_KEY));
// Persist the new finalized checkpoint as the pruning checkpoint.
kv_batch.push(store.pruning_checkpoint_store_op(new_finalized_checkpoint));
store.hot_db.do_atomically(kv_batch)?;
debug!(log, "Database pruning complete");
Ok(PruningOutcome::Successful {
old_finalized_checkpoint,
})
}
/// Compact the database if it has been more than `COMPACTION_PERIOD_SECONDS` since it
/// was last compacted.
pub fn run_compaction(
db: Arc<HotColdDB<E, Hot, Cold>>,
old_finalized_epoch: Epoch,
new_finalized_epoch: Epoch,
log: &Logger,
) -> Result<(), Error> {
if !db.compact_on_prune() {
return Ok(());
}
let last_compaction_timestamp = db
.load_compaction_timestamp()?
.unwrap_or_else(|| Duration::from_secs(0));
let start_time = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or(last_compaction_timestamp);
let seconds_since_last_compaction = start_time
.checked_sub(last_compaction_timestamp)
.as_ref()
.map_or(0, Duration::as_secs);
if seconds_since_last_compaction > MAX_COMPACTION_PERIOD_SECONDS
|| (new_finalized_epoch - old_finalized_epoch > COMPACTION_FINALITY_DISTANCE
&& seconds_since_last_compaction > MIN_COMPACTION_PERIOD_SECONDS)
{
info!(
log,
"Starting database compaction";
"old_finalized_epoch" => old_finalized_epoch,
"new_finalized_epoch" => new_finalized_epoch,
);
db.compact()?;
let finish_time = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or(start_time);
db.store_compaction_timestamp(finish_time)?;
info!(log, "Database compaction complete");
}
Ok(())
}
}