Fix broken beacon chain metrics, add slot clock metrics

This commit is contained in:
Paul Hauner 2019-08-12 17:44:47 +10:00
parent 95a320817e
commit d7c546844c
No known key found for this signature in database
GPG Key ID: 5E2CFF9B75FA63DF
9 changed files with 68 additions and 28 deletions

View File

@ -1,3 +1,4 @@
#![recursion_limit = "128"] // For lazy-static
#[macro_use]
extern crate lazy_static;

View File

@ -1,6 +1,6 @@
use crate::{BeaconChain, BeaconChainTypes};
pub use lighthouse_metrics::*;
use types::{BeaconState, Epoch, EthSpec, Hash256, Slot};
use types::{BeaconState, Epoch, Hash256, Slot};
lazy_static! {
/*
@ -140,17 +140,6 @@ lazy_static! {
*/
pub static ref PERSIST_CHAIN: Result<Histogram> =
try_create_histogram("beacon_persist_chain", "Time taken to update the canonical head");
}
// Lazy-static is split so we don't reach the crate-level recursion limit.
lazy_static! {
/*
* Slot Clock
*/
pub static ref PRESENT_SLOT: Result<IntGauge> =
try_create_int_gauge("beacon_present_slot", "The present slot, according to system time");
pub static ref PRESENT_EPOCH: Result<IntGauge> =
try_create_int_gauge("beacon_present_epoch", "The present epoch, according to system time");
/*
* Chain Head
@ -194,21 +183,6 @@ lazy_static! {
/// Scrape the `beacon_chain` for metrics that are not constantly updated (e.g., the present slot,
/// head state info, etc) and update the Prometheus `DEFAULT_REGISTRY`.
pub fn scrape_for_metrics<T: BeaconChainTypes>(beacon_chain: &BeaconChain<T>) {
set_gauge_by_slot(
&PRESENT_SLOT,
beacon_chain
.read_slot_clock()
.unwrap_or_else(|| Slot::new(0)),
);
set_gauge_by_epoch(
&PRESENT_EPOCH,
beacon_chain
.read_slot_clock()
.map(|s| s.epoch(T::EthSpec::slots_per_epoch()))
.unwrap_or_else(|| Epoch::new(0)),
);
scrape_head_state::<T>(
&beacon_chain.head().beacon_state,
beacon_chain.head().beacon_state_root,

View File

@ -26,3 +26,4 @@ tokio = "0.1.17"
url = "2.0"
lazy_static = "1.3.0"
lighthouse_metrics = { path = "../../eth2/utils/lighthouse_metrics" }
slot_clock = { path = "../../eth2/utils/slot_clock" }

View File

@ -39,6 +39,23 @@ pub fn get_prometheus<T: BeaconChainTypes + 'static>(req: Request<Body>) -> ApiR
.get::<DBPath>()
.ok_or_else(|| ApiError::ServerError("DBPath extension missing".to_string()))?;
// There are two categories of metrics:
//
// - Dynamically updated: things like histograms and event counters that are updated on the
// fly.
// - Statically updated: things which are only updated at the time of the scrape (used where we
// can avoid cluttering up code with metrics calls).
//
// The `prometheus` crate has a `DEFAULT_REGISTRY` global singleton (via `lazy_static`) which
// keeps the state of all the metrics. Dynamically updated things will already be up-to-date in
// the registry (because they update themselves); however, statically updated things need to be
// "scraped".
//
// We proceed by first updating all the static metrics using `scrape_for_metrics(..)`, then
// using `prometheus::gather(..)` to collect the global `DEFAULT_REGISTRY` metrics into a
// string that can be returned via HTTP.
slot_clock::scrape_for_metrics::<T::EthSpec, T::SlotClock>(&beacon_chain.slot_clock);
store::scrape_for_metrics(&db_path);
beacon_chain::scrape_for_metrics(&beacon_chain);

View File

@ -6,3 +6,5 @@ edition = "2018"
[dependencies]
types = { path = "../../types" }
lazy_static = "1.3.0"
lighthouse_metrics = { path = "../lighthouse_metrics" }

View File

@ -1,9 +1,15 @@
#[macro_use]
extern crate lazy_static;
mod metrics;
mod system_time_slot_clock;
mod testing_slot_clock;
use std::time::Duration;
pub use crate::system_time_slot_clock::{Error as SystemTimeSlotClockError, SystemTimeSlotClock};
pub use crate::testing_slot_clock::{Error as TestingSlotClockError, TestingSlotClock};
use std::time::Duration;
pub use metrics::scrape_for_metrics;
pub use types::Slot;
pub trait SlotClock: Send + Sync + Sized {
@ -17,4 +23,6 @@ pub trait SlotClock: Send + Sync + Sized {
fn present_slot(&self) -> Result<Option<Slot>, Self::Error>;
fn duration_to_next_slot(&self) -> Result<Option<Duration>, Self::Error>;
fn slot_duration_millis(&self) -> u64;
}

View File

@ -0,0 +1,29 @@
use crate::SlotClock;
pub use lighthouse_metrics::*;
use types::{EthSpec, Slot};
// Slot-clock gauges. Each static holds the `Result` returned by `try_create_int_gauge`, so a
// gauge that could not be created is kept as an error value rather than aborting here.
// The values are written by `scrape_for_metrics` below.
lazy_static! {
// Gauge `slotclock_present_slot`: the present slot reported by the slot clock.
pub static ref PRESENT_SLOT: Result<IntGauge> =
try_create_int_gauge("slotclock_present_slot", "The present wall-clock slot");
// Gauge `slotclock_present_epoch`: the epoch derived from the present slot.
pub static ref PRESENT_EPOCH: Result<IntGauge> =
try_create_int_gauge("slotclock_present_epoch", "The present wall-clock epoch");
// Gauge `slotclock_slot_time_milliseconds`: the clock's slot duration in milliseconds.
pub static ref MILLISECONDS_PER_SLOT: Result<IntGauge> = try_create_int_gauge(
"slotclock_slot_time_milliseconds",
"The duration in milliseconds between each slot"
);
}
/// Refresh the slot-clock gauges in the global metrics `DEFAULT_REGISTRY`.
///
/// The present slot is read from `clock`. If the clock returns an error, or reports no
/// slot (`Ok(None)`), slot `0` is used instead. The epoch gauge is derived from that slot
/// using `T::slots_per_epoch()`, and the slot-duration gauge is taken from
/// `clock.slot_duration_millis()`.
pub fn scrape_for_metrics<T: EthSpec, U: SlotClock>(clock: &U) {
    // Collapse `Result<Option<Slot>, _>` into a plain `Slot`, defaulting to slot zero.
    let slot_now = clock
        .present_slot()
        .ok()
        .and_then(|maybe_slot| maybe_slot)
        .unwrap_or_else(|| Slot::new(0));

    // Gauges are `i64`-valued, so the `u64` readings are cast before being stored.
    let slot_value = slot_now.as_u64() as i64;
    let epoch_value = slot_now.epoch(T::slots_per_epoch()).as_u64() as i64;
    let slot_millis = clock.slot_duration_millis() as i64;

    set_gauge(&PRESENT_SLOT, slot_value);
    set_gauge(&PRESENT_EPOCH, epoch_value);
    set_gauge(&MILLISECONDS_PER_SLOT, slot_millis);
}

View File

@ -52,6 +52,10 @@ impl SlotClock for SystemTimeSlotClock {
/// Delegates to the free function `duration_to_next_slot`, passing this clock's
/// `genesis_seconds` and `slot_duration_seconds`, and returns its result unchanged.
fn duration_to_next_slot(&self) -> Result<Option<Duration>, Error> {
duration_to_next_slot(self.genesis_seconds, self.slot_duration_seconds)
}
/// Returns the slot duration in milliseconds.
///
/// Converts `slot_duration_seconds` to milliseconds. The multiplication saturates at
/// `u64::MAX` rather than overflowing, which would otherwise panic in debug builds and
/// silently wrap in release builds for very large configured durations.
fn slot_duration_millis(&self) -> u64 {
    self.slot_duration_seconds.saturating_mul(1000)
}
}
impl From<SystemTimeError> for Error {

View File

@ -40,6 +40,10 @@ impl SlotClock for TestingSlotClock {
/// Always reports a fixed duration of one second; no state of the testing clock is
/// consulted.
fn duration_to_next_slot(&self) -> Result<Option<Duration>, Error> {
Ok(Some(Duration::from_secs(1)))
}
/// Always reports a slot duration of `0` milliseconds.
///
/// NOTE(review): `duration_to_next_slot` on this clock returns a fixed one second, so the
/// two values are not consistent with each other — confirm that consumers of this value
/// (e.g. the slot-duration metric) tolerate a zero duration.
fn slot_duration_millis(&self) -> u64 {
0
}
}
#[cfg(test)]