Conserve disk space by raising default SPRP (#3137)

## Proposed Changes

Increase the default `--slots-per-restore-point` to 8192 for a 4x reduction in freezer DB disk usage.

Existing nodes that use the previous default of 2048 will be left unchanged. Newly synced nodes (with or without checkpoint sync) will use the new 8192 default.

Long-term we could do away with the freezer DB entirely for validator-only nodes, but this change is much simpler and grants us some extra space in the short term. We can also roll it out gradually across our nodes by purging databases one by one, while keeping the Ansible config the same.

## Additional Info

We ignore a change from 2048 to 8192 if the user hasn't set 8192 explicitly. We fire a debug log in the case where we do ignore:

```
DEBG Ignoring slots-per-restore-point config in favour of on-disk value, on_disk: 2048, config: 8192
```
2022-04-01 07:16:25 +00:00 · 2022-04-01 07:16:25 +00:00 · 375e2b49b3
commit 375e2b49b3
parent 414197b06d
9 changed files with 114 additions and 17 deletions
--- a/beacon_node/http_api/src/database.rs
+++ b/beacon_node/http_api/src/database.rs
@ -9,10 +9,12 @@ pub fn info<T: BeaconChainTypes>(
 ) -> Result<DatabaseInfo, warp::Rejection> {
    let store = &chain.store;
    let split = store.get_split_info();
+    let config = store.get_config().clone();
    let anchor = store.get_anchor_info();

    Ok(DatabaseInfo {
        schema_version: CURRENT_SCHEMA_VERSION.as_u64(),
+        config,
        split,
        anchor,
    })
--- a/beacon_node/src/config.rs
+++ b/beacon_node/src/config.rs
@ -284,7 +284,9 @@ pub fn get_config<E: EthSpec>(
        client_config.freezer_db_path = Some(PathBuf::from(freezer_dir));
    }

-    client_config.store.slots_per_restore_point = get_slots_per_restore_point::<E>(cli_args)?;
+    let (sprp, sprp_explicit) = get_slots_per_restore_point::<E>(cli_args)?;
+    client_config.store.slots_per_restore_point = sprp;
+    client_config.store.slots_per_restore_point_set_explicitly = sprp_explicit;

    if let Some(block_cache_size) = cli_args.value_of("block-cache-size") {
        client_config.store.block_cache_size = block_cache_size
@ -813,15 +815,20 @@ pub fn get_data_dir(cli_args: &ArgMatches) -> PathBuf {
 }

 /// Get the `slots_per_restore_point` value to use for the database.
-pub fn get_slots_per_restore_point<E: EthSpec>(cli_args: &ArgMatches) -> Result<u64, String> {
+///
+/// Return `(sprp, set_explicitly)` where `set_explicitly` is `true` if the user provided the value.
+pub fn get_slots_per_restore_point<E: EthSpec>(
+    cli_args: &ArgMatches,
+) -> Result<(u64, bool), String> {
    if let Some(slots_per_restore_point) =
        clap_utils::parse_optional(cli_args, "slots-per-restore-point")?
    {
-        Ok(slots_per_restore_point)
+        Ok((slots_per_restore_point, true))
    } else {
-        Ok(std::cmp::min(
+        let default = std::cmp::min(
            E::slots_per_historical_root() as u64,
            store::config::DEFAULT_SLOTS_PER_RESTORE_POINT,
-        ))
+        );
+        Ok((default, false))
    }
 }
--- a/beacon_node/store/src/config.rs
+++ b/beacon_node/store/src/config.rs
@ -4,7 +4,8 @@ use ssz::{Decode, Encode};
 use ssz_derive::{Decode, Encode};
 use types::{EthSpec, MinimalEthSpec};

-pub const DEFAULT_SLOTS_PER_RESTORE_POINT: u64 = 2048;
+pub const PREV_DEFAULT_SLOTS_PER_RESTORE_POINT: u64 = 2048;
+pub const DEFAULT_SLOTS_PER_RESTORE_POINT: u64 = 8192;
 pub const DEFAULT_BLOCK_CACHE_SIZE: usize = 5;

 /// Database configuration parameters.
@ -12,6 +13,8 @@ pub const DEFAULT_BLOCK_CACHE_SIZE: usize = 5;
 pub struct StoreConfig {
    /// Number of slots to wait between storing restore points in the freezer database.
    pub slots_per_restore_point: u64,
+    /// Flag indicating whether the `slots_per_restore_point` was set explicitly by the user.
+    pub slots_per_restore_point_set_explicitly: bool,
    /// Maximum number of blocks to store in the in-memory block cache.
    pub block_cache_size: usize,
    /// Whether to compact the database on initialization.
@ -36,6 +39,7 @@ impl Default for StoreConfig {
        Self {
            // Safe default for tests, shouldn't ever be read by a CLI node.
            slots_per_restore_point: MinimalEthSpec::slots_per_historical_root() as u64,
+            slots_per_restore_point_set_explicitly: false,
            block_cache_size: DEFAULT_BLOCK_CACHE_SIZE,
            compact_on_init: false,
            compact_on_prune: true,
--- a/beacon_node/store/src/hot_cold_store.rs
+++ b/beacon_node/store/src/hot_cold_store.rs
@ -1,7 +1,10 @@
 use crate::chunked_vector::{
    store_updated_vector, BlockRoots, HistoricalRoots, RandaoMixes, StateRoots,
 };
-use crate::config::{OnDiskStoreConfig, StoreConfig};
+use crate::config::{
+    OnDiskStoreConfig, StoreConfig, DEFAULT_SLOTS_PER_RESTORE_POINT,
+    PREV_DEFAULT_SLOTS_PER_RESTORE_POINT,
+};
 use crate::forwards_iter::{HybridForwardsBlockRootsIterator, HybridForwardsStateRootsIterator};
 use crate::impls::beacon_state::{get_full_state, store_full_state};
 use crate::iter::{ParentRootBlockIterator, StateRootsIterator};
@ -150,7 +153,7 @@ impl<E: EthSpec> HotColdDB<E, LevelDB<E>, LevelDB<E>> {
    ) -> Result<Arc<Self>, Error> {
        Self::verify_slots_per_restore_point(config.slots_per_restore_point)?;

-        let db = Arc::new(HotColdDB {
+        let mut db = HotColdDB {
            split: RwLock::new(Split::default()),
            anchor_info: RwLock::new(None),
            cold_db: LevelDB::open(cold_path)?,
@ -160,10 +163,31 @@ impl<E: EthSpec> HotColdDB<E, LevelDB<E>, LevelDB<E>> {
            spec,
            log,
            _phantom: PhantomData,
-        });
+        };
+
+        // Allow the slots-per-restore-point value to stay at the previous default if the config
+        // uses the new default. Don't error on a failed read because the config itself may need
+        // migrating.
+        if let Ok(Some(disk_config)) = db.load_config() {
+            if !db.config.slots_per_restore_point_set_explicitly
+                && disk_config.slots_per_restore_point == PREV_DEFAULT_SLOTS_PER_RESTORE_POINT
+                && db.config.slots_per_restore_point == DEFAULT_SLOTS_PER_RESTORE_POINT
+            {
+                debug!(
+                    db.log,
+                    "Ignoring slots-per-restore-point config in favour of on-disk value";
+                    "config" => db.config.slots_per_restore_point,
+                    "on_disk" => disk_config.slots_per_restore_point,
+                );
+
+                // Mutate the in-memory config so that it's compatible.
+                db.config.slots_per_restore_point = PREV_DEFAULT_SLOTS_PER_RESTORE_POINT;
+            }
+        }

        // Ensure that the schema version of the on-disk database matches the software.
        // If the version is mismatched, an automatic migration will be attempted.
+        let db = Arc::new(db);
        if let Some(schema_version) = db.load_schema_version()? {
            debug!(
                db.log,
@ -1108,6 +1132,11 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>
            .map_or(self.spec.genesis_slot, |anchor| anchor.oldest_block_slot)
    }

+    /// Return the in-memory configuration used by the database.
+    pub fn get_config(&self) -> &StoreConfig {
+        &self.config
+    }
+
    /// Load previously-stored config from disk.
    fn load_config(&self) -> Result<Option<OnDiskStoreConfig>, Error> {
        self.hot_db.get(&CONFIG_KEY)
--- a/book/src/advanced_database.md
+++ b/book/src/advanced_database.md
@ -23,27 +23,39 @@ states to slow down dramatically. A lower _slots per restore point_ value (SPRP)
 frequent restore points, while a higher SPRP corresponds to less frequent. The table below shows
 some example values.

-| Use Case                | SPRP           | Yearly Disk Usage | Load Historical State |
-| ----------------------  | -------------- | ----------------- | --------------------- |
-| Block explorer/analysis | 32             | 1.4 TB            | 155 ms                |
-| Default                 | 2048           | 23.1 GB           | 10.2 s                |
-| Validator only          | 8192           | 5.7 GB            | 41 s                  |
+| Use Case                 | SPRP           | Yearly Disk Usage | Load Historical State |
+| ----------------------   | -------------- | ----------------- | --------------------- |
+| Block explorer/analysis  | 32             | 1.4 TB            | 155 ms                |
+| Hobbyist (prev. default) | 2048           | 23.1 GB           | 10.2 s                |
+| Validator only (default) | 8192           | 5.7 GB            | 41 s                  |

 As you can see, it's a high-stakes trade-off! The relationships to disk usage and historical state
 load time are both linear – doubling SPRP halves disk usage and doubles load time. The minimum SPRP
 is 32, and the maximum is 8192.

+The default value is 8192 for databases synced from scratch using Lighthouse v2.2.0 or later,
+or 2048 for databases created by prior versions. See the [Defaults](#defaults) section below.
+
 The values shown in the table are approximate, calculated using a simple heuristic: each
 `BeaconState` consumes around 18MB of disk space, and each block replayed takes around 5ms.  The
 **Yearly Disk Usage** column shows the approx size of the freezer DB _alone_ (hot DB not included),
 and the **Load Historical State** time is the worst-case load time for a state in the last slot
 before a restore point.

+### Defaults
+
+As of Lighthouse v2.2.0, the default slots-per-restore-point value has been increased from 2048
+to 8192 in order to conserve disk space. Existing nodes will continue to use SPRP=2048 unless
+re-synced. Note that it is currently not possible to change the SPRP without re-syncing, although
+fast re-syncing may be achieved with [Checkpoint Sync](./checkpoint-sync.md).
+
+### CLI Configuration
+
 To configure your Lighthouse node's database with a non-default SPRP, run your Beacon Node with
 the `--slots-per-restore-point` flag:

 ```bash
-lighthouse beacon_node --slots-per-restore-point 8192
+lighthouse beacon_node --slots-per-restore-point 32
 ```

 ## Glossary
--- a/book/src/api-lighthouse.md
+++ b/book/src/api-lighthouse.md
@ -366,6 +366,12 @@ curl "http://localhost:5052/lighthouse/database/info" | jq
 ```json
 {
  "schema_version": 5,
+  "config": {
+    "slots_per_restore_point": 2048,
+    "block_cache_size": 5,
+    "compact_on_init": false,
+    "compact_on_prune": true
+  },
  "split": {
    "slot": "2034912",
    "state_root": "0x11c8516aa7d4d1613e84121e3a557ceca34618b4c1a38f05b66ad045ff82b33b"
--- a/common/eth2/src/lighthouse.rs
+++ b/common/eth2/src/lighthouse.rs
@ -14,7 +14,7 @@ use reqwest::IntoUrl;
 use serde::{Deserialize, Serialize};
 use ssz::four_byte_option_impl;
 use ssz_derive::{Decode, Encode};
-use store::{AnchorInfo, Split};
+use store::{AnchorInfo, Split, StoreConfig};

 pub use attestation_performance::{
    AttestationPerformance, AttestationPerformanceQuery, AttestationPerformanceStatistics,
@ -334,6 +334,7 @@ impl Eth1Block {
 #[derive(Debug, Serialize, Deserialize)]
 pub struct DatabaseInfo {
    pub schema_version: u64,
+    pub config: StoreConfig,
    pub split: Split,
    pub anchor: Option<AnchorInfo>,
 }
--- a/database_manager/src/lib.rs
+++ b/database_manager/src/lib.rs
@ -100,7 +100,9 @@ fn parse_client_config<E: EthSpec>(
        client_config.freezer_db_path = Some(freezer_dir);
    }

-    client_config.store.slots_per_restore_point = get_slots_per_restore_point::<E>(cli_args)?;
+    let (sprp, sprp_explicit) = get_slots_per_restore_point::<E>(cli_args)?;
+    client_config.store.slots_per_restore_point = sprp;
+    client_config.store.slots_per_restore_point_set_explicitly = sprp_explicit;

    Ok(client_config)
 }
--- a/lighthouse/tests/beacon_node.rs
+++ b/lighthouse/tests/beacon_node.rs
@ -804,6 +804,40 @@ fn slots_per_restore_point_flag() {
        .run_with_zero_port()
        .with_config(|config| assert_eq!(config.store.slots_per_restore_point, 64));
 }
+#[test]
+fn slots_per_restore_point_update_prev_default() {
+    use beacon_node::beacon_chain::store::config::{
+        DEFAULT_SLOTS_PER_RESTORE_POINT, PREV_DEFAULT_SLOTS_PER_RESTORE_POINT,
+    };
+
+    CommandLineTest::new()
+        .flag("slots-per-restore-point", Some("2048"))
+        .run_with_zero_port()
+        .with_config_and_dir(|config, dir| {
+            // Check that 2048 is the previous default.
+            assert_eq!(
+                config.store.slots_per_restore_point,
+                PREV_DEFAULT_SLOTS_PER_RESTORE_POINT
+            );
+
+            // Restart the BN with the same datadir and the new default SPRP. It should
+            // allow this.
+            CommandLineTest::new()
+                .flag("datadir", Some(&dir.path().display().to_string()))
+                .flag("zero-ports", None)
+                .run_with_no_datadir()
+                .with_config(|config| {
+                    // The dumped config will have the new default 8192 value, but the fact that
+                    // the BN started and ran (with the same datadir) means that the override
+                    // was successful.
+                    assert_eq!(
+                        config.store.slots_per_restore_point,
+                        DEFAULT_SLOTS_PER_RESTORE_POINT
+                    );
+                });
+        })
+}
+
 #[test]
 fn block_cache_size_flag() {
    CommandLineTest::new()