Monitoring service api (#2251)
## Issue Addressed N/A ## Proposed Changes Adds a client side api for collecting system and process metrics and pushing it to a monitoring service.
This commit is contained in:
parent
55aada006f
commit
fdaeec631b
24
Cargo.lock
generated
24
Cargo.lock
generated
@ -651,6 +651,7 @@ dependencies = [
|
|||||||
"hyper 0.14.7",
|
"hyper 0.14.7",
|
||||||
"lighthouse_version",
|
"lighthouse_version",
|
||||||
"logging",
|
"logging",
|
||||||
|
"monitoring_api",
|
||||||
"node_test_rig",
|
"node_test_rig",
|
||||||
"rand 0.7.3",
|
"rand 0.7.3",
|
||||||
"sensitive_url",
|
"sensitive_url",
|
||||||
@ -1100,6 +1101,7 @@ dependencies = [
|
|||||||
"http_metrics",
|
"http_metrics",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"lighthouse_metrics",
|
"lighthouse_metrics",
|
||||||
|
"monitoring_api",
|
||||||
"network",
|
"network",
|
||||||
"parking_lot",
|
"parking_lot",
|
||||||
"prometheus",
|
"prometheus",
|
||||||
@ -4036,6 +4038,27 @@ dependencies = [
|
|||||||
"winapi 0.3.9",
|
"winapi 0.3.9",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "monitoring_api"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"eth2",
|
||||||
|
"futures 0.3.14",
|
||||||
|
"lazy_static",
|
||||||
|
"lighthouse_metrics",
|
||||||
|
"lighthouse_version",
|
||||||
|
"regex",
|
||||||
|
"reqwest",
|
||||||
|
"sensitive_url",
|
||||||
|
"serde",
|
||||||
|
"serde_derive",
|
||||||
|
"serde_json",
|
||||||
|
"slog",
|
||||||
|
"store",
|
||||||
|
"task_executor",
|
||||||
|
"tokio 1.5.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "multihash"
|
name = "multihash"
|
||||||
version = "0.13.2"
|
version = "0.13.2"
|
||||||
@ -7135,6 +7158,7 @@ dependencies = [
|
|||||||
"lighthouse_version",
|
"lighthouse_version",
|
||||||
"lockfile",
|
"lockfile",
|
||||||
"logging",
|
"logging",
|
||||||
|
"monitoring_api",
|
||||||
"parking_lot",
|
"parking_lot",
|
||||||
"rand 0.7.3",
|
"rand 0.7.3",
|
||||||
"rayon",
|
"rayon",
|
||||||
|
@ -40,6 +40,7 @@ members = [
|
|||||||
"common/validator_dir",
|
"common/validator_dir",
|
||||||
"common/warp_utils",
|
"common/warp_utils",
|
||||||
"common/fallback",
|
"common/fallback",
|
||||||
|
"common/monitoring_api",
|
||||||
|
|
||||||
"consensus/cached_tree_hash",
|
"consensus/cached_tree_hash",
|
||||||
"consensus/int_to_bytes",
|
"consensus/int_to_bytes",
|
||||||
|
@ -44,4 +44,5 @@ hyper = "0.14.4"
|
|||||||
lighthouse_version = { path = "../common/lighthouse_version" }
|
lighthouse_version = { path = "../common/lighthouse_version" }
|
||||||
hex = "0.4.2"
|
hex = "0.4.2"
|
||||||
slasher = { path = "../slasher" }
|
slasher = { path = "../slasher" }
|
||||||
|
monitoring_api = { path = "../common/monitoring_api" }
|
||||||
sensitive_url = { path = "../common/sensitive_url" }
|
sensitive_url = { path = "../common/sensitive_url" }
|
||||||
|
@ -44,3 +44,4 @@ http_api = { path = "../http_api" }
|
|||||||
http_metrics = { path = "../http_metrics" }
|
http_metrics = { path = "../http_metrics" }
|
||||||
slasher = { path = "../../slasher" }
|
slasher = { path = "../../slasher" }
|
||||||
slasher_service = { path = "../../slasher/service" }
|
slasher_service = { path = "../../slasher/service" }
|
||||||
|
monitoring_api = {path = "../../common/monitoring_api"}
|
||||||
|
@ -14,6 +14,7 @@ use environment::RuntimeContext;
|
|||||||
use eth1::{Config as Eth1Config, Service as Eth1Service};
|
use eth1::{Config as Eth1Config, Service as Eth1Service};
|
||||||
use eth2_libp2p::NetworkGlobals;
|
use eth2_libp2p::NetworkGlobals;
|
||||||
use genesis::{interop_genesis_state, Eth1GenesisService};
|
use genesis::{interop_genesis_state, Eth1GenesisService};
|
||||||
|
use monitoring_api::{MonitoringHttpClient, ProcessType};
|
||||||
use network::{NetworkConfig, NetworkMessage, NetworkService};
|
use network::{NetworkConfig, NetworkMessage, NetworkService};
|
||||||
use slasher::Slasher;
|
use slasher::Slasher;
|
||||||
use slasher_service::SlasherService;
|
use slasher_service::SlasherService;
|
||||||
@ -374,6 +375,22 @@ where
|
|||||||
SlasherService::new(beacon_chain, network_send).run(&context.executor)
|
SlasherService::new(beacon_chain, network_send).run(&context.executor)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Start the explorer client which periodically sends beacon
|
||||||
|
/// and system metrics to the configured endpoint.
|
||||||
|
pub fn monitoring_client(self, config: &monitoring_api::Config) -> Result<Self, String> {
|
||||||
|
let context = self
|
||||||
|
.runtime_context
|
||||||
|
.as_ref()
|
||||||
|
.ok_or("monitoring_client requires a runtime_context")?
|
||||||
|
.service_context("monitoring_client".into());
|
||||||
|
let monitoring_client = MonitoringHttpClient::new(config, context.log().clone())?;
|
||||||
|
monitoring_client.auto_update(
|
||||||
|
context.executor,
|
||||||
|
vec![ProcessType::BeaconNode, ProcessType::System],
|
||||||
|
);
|
||||||
|
Ok(self)
|
||||||
|
}
|
||||||
|
|
||||||
/// Immediately starts the service that periodically logs information each slot.
|
/// Immediately starts the service that periodically logs information each slot.
|
||||||
pub fn notifier(self) -> Result<Self, String> {
|
pub fn notifier(self) -> Result<Self, String> {
|
||||||
let context = self
|
let context = self
|
||||||
|
@ -66,6 +66,7 @@ pub struct Config {
|
|||||||
pub eth1: eth1::Config,
|
pub eth1: eth1::Config,
|
||||||
pub http_api: http_api::Config,
|
pub http_api: http_api::Config,
|
||||||
pub http_metrics: http_metrics::Config,
|
pub http_metrics: http_metrics::Config,
|
||||||
|
pub monitoring_api: Option<monitoring_api::Config>,
|
||||||
pub slasher: Option<slasher::Config>,
|
pub slasher: Option<slasher::Config>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -87,6 +88,7 @@ impl Default for Config {
|
|||||||
graffiti: Graffiti::default(),
|
graffiti: Graffiti::default(),
|
||||||
http_api: <_>::default(),
|
http_api: <_>::default(),
|
||||||
http_metrics: <_>::default(),
|
http_metrics: <_>::default(),
|
||||||
|
monitoring_api: None,
|
||||||
slasher: None,
|
slasher: None,
|
||||||
validator_monitor_auto: false,
|
validator_monitor_auto: false,
|
||||||
validator_monitor_pubkeys: vec![],
|
validator_monitor_pubkeys: vec![],
|
||||||
|
@ -6,4 +6,14 @@ lazy_static! {
|
|||||||
"sync_slots_per_second",
|
"sync_slots_per_second",
|
||||||
"The number of blocks being imported per second"
|
"The number of blocks being imported per second"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
pub static ref IS_SYNCED: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"sync_eth2_synced",
|
||||||
|
"Metric to check if the beacon chain is synced to head. 0 if not synced and non-zero if synced"
|
||||||
|
);
|
||||||
|
|
||||||
|
pub static ref NOTIFIER_HEAD_SLOT: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"notifier_head_slot",
|
||||||
|
"The head slot sourced from the beacon chain notifier"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
@ -77,6 +77,9 @@ pub fn spawn_notifier<T: BeaconChainTypes>(
|
|||||||
};
|
};
|
||||||
|
|
||||||
let head_slot = head_info.slot;
|
let head_slot = head_info.slot;
|
||||||
|
|
||||||
|
metrics::set_gauge(&metrics::NOTIFIER_HEAD_SLOT, head_slot.as_u64() as i64);
|
||||||
|
|
||||||
let current_slot = match beacon_chain.slot() {
|
let current_slot = match beacon_chain.slot() {
|
||||||
Ok(slot) => slot,
|
Ok(slot) => slot,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@ -123,6 +126,7 @@ pub fn spawn_notifier<T: BeaconChainTypes>(
|
|||||||
|
|
||||||
// Log if we are syncing
|
// Log if we are syncing
|
||||||
if sync_state.is_syncing() {
|
if sync_state.is_syncing() {
|
||||||
|
metrics::set_gauge(&metrics::IS_SYNCED, 0);
|
||||||
let distance = format!(
|
let distance = format!(
|
||||||
"{} slots ({})",
|
"{} slots ({})",
|
||||||
head_distance.as_u64(),
|
head_distance.as_u64(),
|
||||||
@ -151,6 +155,7 @@ pub fn spawn_notifier<T: BeaconChainTypes>(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
} else if sync_state.is_synced() {
|
} else if sync_state.is_synced() {
|
||||||
|
metrics::set_gauge(&metrics::IS_SYNCED, 1);
|
||||||
let block_info = if current_slot > head_slot {
|
let block_info = if current_slot > head_slot {
|
||||||
" … empty".to_string()
|
" … empty".to_string()
|
||||||
} else {
|
} else {
|
||||||
@ -167,6 +172,7 @@ pub fn spawn_notifier<T: BeaconChainTypes>(
|
|||||||
"slot" => current_slot,
|
"slot" => current_slot,
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
|
metrics::set_gauge(&metrics::IS_SYNCED, 0);
|
||||||
info!(
|
info!(
|
||||||
log,
|
log,
|
||||||
"Searching for peers";
|
"Searching for peers";
|
||||||
|
@ -26,4 +26,23 @@ lazy_static! {
|
|||||||
pub static ref ENDPOINT_REQUESTS: Result<IntCounterVec> = try_create_int_counter_vec(
|
pub static ref ENDPOINT_REQUESTS: Result<IntCounterVec> = try_create_int_counter_vec(
|
||||||
"eth1_endpoint_requests", "The number of eth1 requests for each endpoint", &["endpoint"]
|
"eth1_endpoint_requests", "The number of eth1 requests for each endpoint", &["endpoint"]
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Eth1 rpc connection
|
||||||
|
*/
|
||||||
|
|
||||||
|
pub static ref ETH1_CONNECTED: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"sync_eth1_connected", "Set to 1 if connected to an eth1 node, otherwise set to 0"
|
||||||
|
);
|
||||||
|
|
||||||
|
pub static ref ETH1_FALLBACK_CONFIGURED: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"sync_eth1_fallback_configured", "Number of configured eth1 fallbacks"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Note: This metric only checks if an eth1 fallback is configured, not if it is connected and synced.
|
||||||
|
// Checking for liveness of the fallback would require moving away from lazy checking of fallbacks.
|
||||||
|
pub static ref ETH1_FALLBACK_CONNECTED: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"eth1_sync_fallback_connected", "Set to 1 if an eth1 fallback is connected, otherwise set to 0"
|
||||||
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -94,6 +94,9 @@ impl EndpointsCache {
|
|||||||
&crate::metrics::ENDPOINT_ERRORS,
|
&crate::metrics::ENDPOINT_ERRORS,
|
||||||
&[&endpoint.0.to_string()],
|
&[&endpoint.0.to_string()],
|
||||||
);
|
);
|
||||||
|
crate::metrics::set_gauge(&metrics::ETH1_CONNECTED, 0);
|
||||||
|
} else {
|
||||||
|
crate::metrics::set_gauge(&metrics::ETH1_CONNECTED, 1);
|
||||||
}
|
}
|
||||||
state
|
state
|
||||||
}
|
}
|
||||||
@ -730,6 +733,7 @@ impl Service {
|
|||||||
|
|
||||||
let mut interval = interval_at(Instant::now(), update_interval);
|
let mut interval = interval_at(Instant::now(), update_interval);
|
||||||
|
|
||||||
|
let num_fallbacks = self.config().endpoints.len() - 1;
|
||||||
let update_future = async move {
|
let update_future = async move {
|
||||||
loop {
|
loop {
|
||||||
interval.tick().await;
|
interval.tick().await;
|
||||||
@ -737,6 +741,15 @@ impl Service {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Set the number of configured eth1 servers
|
||||||
|
metrics::set_gauge(&metrics::ETH1_FALLBACK_CONFIGURED, num_fallbacks as i64);
|
||||||
|
// Since we lazily update eth1 fallbacks, it's not possible to know connection status of fallback.
|
||||||
|
// Hence, we set it to 1 if we have atleast one configured fallback.
|
||||||
|
if num_fallbacks > 0 {
|
||||||
|
metrics::set_gauge(&metrics::ETH1_FALLBACK_CONNECTED, 1);
|
||||||
|
} else {
|
||||||
|
metrics::set_gauge(&metrics::ETH1_FALLBACK_CONNECTED, 0);
|
||||||
|
}
|
||||||
handle.spawn(update_future, "eth1");
|
handle.spawn(update_future, "eth1");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -232,6 +232,23 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
|
|||||||
.takes_value(true),
|
.takes_value(true),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Monitoring metrics
|
||||||
|
*/
|
||||||
|
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("monitoring-endpoint")
|
||||||
|
.long("monitoring-endpoint")
|
||||||
|
.value_name("ADDRESS")
|
||||||
|
.help("Enables the monitoring service for sending system metrics to a remote endpoint. \
|
||||||
|
This can be used to monitor your setup on certain services (e.g. beaconcha.in). \
|
||||||
|
This flag sets the endpoint where the beacon node metrics will be sent. \
|
||||||
|
Note: This will send information to a remote sever which may identify and associate your \
|
||||||
|
validators, IP address and other personal information. Always use a HTTPS connection \
|
||||||
|
and never provide an untrusted URL.")
|
||||||
|
.takes_value(true),
|
||||||
|
)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Standard staking flags
|
* Standard staking flags
|
||||||
*/
|
*/
|
||||||
|
@ -136,6 +136,17 @@ pub fn get_config<E: EthSpec>(
|
|||||||
client_config.http_metrics.allow_origin = Some(allow_origin.to_string());
|
client_config.http_metrics.allow_origin = Some(allow_origin.to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Explorer metrics
|
||||||
|
*/
|
||||||
|
if let Some(monitoring_endpoint) = cli_args.value_of("monitoring-endpoint") {
|
||||||
|
client_config.monitoring_api = Some(monitoring_api::Config {
|
||||||
|
db_path: None,
|
||||||
|
freezer_db_path: None,
|
||||||
|
monitoring_endpoint: monitoring_endpoint.to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// Log a warning indicating an open HTTP server if it wasn't specified explicitly
|
// Log a warning indicating an open HTTP server if it wasn't specified explicitly
|
||||||
// (e.g. using the --staking flag).
|
// (e.g. using the --staking flag).
|
||||||
if cli_args.is_present("staking") {
|
if cli_args.is_present("staking") {
|
||||||
|
@ -63,14 +63,14 @@ impl<E: EthSpec> ProductionBeaconNode<E> {
|
|||||||
let log = context.log().clone();
|
let log = context.log().clone();
|
||||||
let datadir = client_config.create_data_dir()?;
|
let datadir = client_config.create_data_dir()?;
|
||||||
let db_path = client_config.create_db_path()?;
|
let db_path = client_config.create_db_path()?;
|
||||||
let freezer_db_path_res = client_config.create_freezer_db_path();
|
let freezer_db_path = client_config.create_freezer_db_path()?;
|
||||||
let executor = context.executor.clone();
|
let executor = context.executor.clone();
|
||||||
|
|
||||||
let builder = ClientBuilder::new(context.eth_spec_instance.clone())
|
let builder = ClientBuilder::new(context.eth_spec_instance.clone())
|
||||||
.runtime_context(context)
|
.runtime_context(context)
|
||||||
.chain_spec(spec)
|
.chain_spec(spec)
|
||||||
.http_api_config(client_config.http_api.clone())
|
.http_api_config(client_config.http_api.clone())
|
||||||
.disk_store(&datadir, &db_path, &freezer_db_path_res?, store_config)?;
|
.disk_store(&datadir, &db_path, &freezer_db_path, store_config)?;
|
||||||
|
|
||||||
let builder = if let Some(slasher_config) = client_config.slasher.clone() {
|
let builder = if let Some(slasher_config) = client_config.slasher.clone() {
|
||||||
let slasher = Arc::new(
|
let slasher = Arc::new(
|
||||||
@ -82,6 +82,14 @@ impl<E: EthSpec> ProductionBeaconNode<E> {
|
|||||||
builder
|
builder
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let builder = if let Some(monitoring_config) = &mut client_config.monitoring_api {
|
||||||
|
monitoring_config.db_path = Some(db_path);
|
||||||
|
monitoring_config.freezer_db_path = Some(freezer_db_path);
|
||||||
|
builder.monitoring_client(monitoring_config)?
|
||||||
|
} else {
|
||||||
|
builder
|
||||||
|
};
|
||||||
|
|
||||||
let builder = builder
|
let builder = builder
|
||||||
.beacon_chain_builder(client_genesis, client_config.clone())
|
.beacon_chain_builder(client_genesis, client_config.clone())
|
||||||
.await?;
|
.await?;
|
||||||
|
@ -21,7 +21,7 @@ mod impls;
|
|||||||
mod leveldb_store;
|
mod leveldb_store;
|
||||||
mod memory_store;
|
mod memory_store;
|
||||||
pub mod metadata;
|
pub mod metadata;
|
||||||
mod metrics;
|
pub mod metrics;
|
||||||
mod partial_beacon_state;
|
mod partial_beacon_state;
|
||||||
|
|
||||||
pub mod iter;
|
pub mod iter;
|
||||||
|
@ -27,20 +27,39 @@ curl -X GET "http://localhost:5052/lighthouse/health" -H "accept: application/j
|
|||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"pid": 1728254,
|
"sys_virt_mem_total": 16671133696,
|
||||||
"pid_num_threads": 47,
|
"sys_virt_mem_available": 8273715200,
|
||||||
"pid_mem_resident_set_size": 510054400,
|
"sys_virt_mem_used": 7304818688,
|
||||||
"pid_mem_virtual_memory_size": 3963158528,
|
"sys_virt_mem_free": 2998190080,
|
||||||
"sys_virt_mem_total": 16715530240,
|
"sys_virt_mem_percent": 50.37101,
|
||||||
"sys_virt_mem_available": 4065374208,
|
"sys_virt_mem_cached": 5013975040,
|
||||||
"sys_virt_mem_used": 11383402496,
|
"sys_virt_mem_buffers": 1354149888,
|
||||||
"sys_virt_mem_free": 1368662016,
|
"sys_loadavg_1": 2.29,
|
||||||
"sys_virt_mem_percent": 75.67906,
|
"sys_loadavg_5": 3.48,
|
||||||
"sys_loadavg_1": 4.92,
|
"sys_loadavg_15": 3.72,
|
||||||
"sys_loadavg_5": 5.53,
|
"cpu_cores": 4,
|
||||||
"sys_loadavg_15": 5.58
|
"cpu_threads": 8,
|
||||||
|
"system_seconds_total": 5728,
|
||||||
|
"user_seconds_total": 33680,
|
||||||
|
"iowait_seconds_total": 873,
|
||||||
|
"idle_seconds_total": 177530,
|
||||||
|
"cpu_time_total": 217447,
|
||||||
|
"disk_node_bytes_total": 358443397120,
|
||||||
|
"disk_node_bytes_free": 70025089024,
|
||||||
|
"disk_node_reads_total": 1141863,
|
||||||
|
"disk_node_writes_total": 1377993,
|
||||||
|
"network_node_bytes_total_received": 2405639308,
|
||||||
|
"network_node_bytes_total_transmit": 328304685,
|
||||||
|
"misc_node_boot_ts_seconds": 1620629638,
|
||||||
|
"misc_os": "linux",
|
||||||
|
"pid": 4698,
|
||||||
|
"pid_num_threads": 25,
|
||||||
|
"pid_mem_resident_set_size": 783757312,
|
||||||
|
"pid_mem_virtual_memory_size": 2564665344,
|
||||||
|
"pid_process_seconds_total": 22
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### `/lighthouse/syncing`
|
### `/lighthouse/syncing`
|
||||||
|
@ -76,11 +76,147 @@ pub struct ValidatorInclusionData {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
use {procinfo::pid, psutil::process::Process};
|
use {
|
||||||
|
procinfo::pid, psutil::cpu::os::linux::CpuTimesExt,
|
||||||
|
psutil::memory::os::linux::VirtualMemoryExt, psutil::process::Process,
|
||||||
|
};
|
||||||
|
|
||||||
/// Reports on the health of the Lighthouse instance.
|
/// Reports on the health of the Lighthouse instance.
|
||||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct Health {
|
pub struct Health {
|
||||||
|
#[serde(flatten)]
|
||||||
|
pub system: SystemHealth,
|
||||||
|
#[serde(flatten)]
|
||||||
|
pub process: ProcessHealth,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// System related health.
|
||||||
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
|
pub struct SystemHealth {
|
||||||
|
/// Total virtual memory on the system
|
||||||
|
pub sys_virt_mem_total: u64,
|
||||||
|
/// Total virtual memory available for new processes.
|
||||||
|
pub sys_virt_mem_available: u64,
|
||||||
|
/// Total virtual memory used on the system.
|
||||||
|
pub sys_virt_mem_used: u64,
|
||||||
|
/// Total virtual memory not used on the system.
|
||||||
|
pub sys_virt_mem_free: u64,
|
||||||
|
/// Percentage of virtual memory used on the system.
|
||||||
|
pub sys_virt_mem_percent: f32,
|
||||||
|
/// Total cached virtual memory on the system.
|
||||||
|
pub sys_virt_mem_cached: u64,
|
||||||
|
/// Total buffered virtual memory on the system.
|
||||||
|
pub sys_virt_mem_buffers: u64,
|
||||||
|
|
||||||
|
/// System load average over 1 minute.
|
||||||
|
pub sys_loadavg_1: f64,
|
||||||
|
/// System load average over 5 minutes.
|
||||||
|
pub sys_loadavg_5: f64,
|
||||||
|
/// System load average over 15 minutes.
|
||||||
|
pub sys_loadavg_15: f64,
|
||||||
|
|
||||||
|
/// Total cpu cores.
|
||||||
|
pub cpu_cores: u64,
|
||||||
|
/// Total cpu threads.
|
||||||
|
pub cpu_threads: u64,
|
||||||
|
|
||||||
|
/// Total time spent in kernel mode.
|
||||||
|
pub system_seconds_total: u64,
|
||||||
|
/// Total time spent in user mode.
|
||||||
|
pub user_seconds_total: u64,
|
||||||
|
/// Total time spent in waiting for io.
|
||||||
|
pub iowait_seconds_total: u64,
|
||||||
|
/// Total idle cpu time.
|
||||||
|
pub idle_seconds_total: u64,
|
||||||
|
/// Total cpu time.
|
||||||
|
pub cpu_time_total: u64,
|
||||||
|
|
||||||
|
/// Total capacity of disk.
|
||||||
|
pub disk_node_bytes_total: u64,
|
||||||
|
/// Free space in disk.
|
||||||
|
pub disk_node_bytes_free: u64,
|
||||||
|
/// Number of disk reads.
|
||||||
|
pub disk_node_reads_total: u64,
|
||||||
|
/// Number of disk writes.
|
||||||
|
pub disk_node_writes_total: u64,
|
||||||
|
|
||||||
|
/// Total bytes received over all network interfaces.
|
||||||
|
pub network_node_bytes_total_received: u64,
|
||||||
|
/// Total bytes sent over all network interfaces.
|
||||||
|
pub network_node_bytes_total_transmit: u64,
|
||||||
|
|
||||||
|
/// Boot time
|
||||||
|
pub misc_node_boot_ts_seconds: u64,
|
||||||
|
/// OS
|
||||||
|
pub misc_os: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SystemHealth {
|
||||||
|
#[cfg(not(target_os = "linux"))]
|
||||||
|
pub fn observe() -> Result<Self, String> {
|
||||||
|
Err("Health is only available on Linux".into())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
pub fn observe() -> Result<Self, String> {
|
||||||
|
let vm = psutil::memory::virtual_memory()
|
||||||
|
.map_err(|e| format!("Unable to get virtual memory: {:?}", e))?;
|
||||||
|
let loadavg =
|
||||||
|
psutil::host::loadavg().map_err(|e| format!("Unable to get loadavg: {:?}", e))?;
|
||||||
|
|
||||||
|
let cpu =
|
||||||
|
psutil::cpu::cpu_times().map_err(|e| format!("Unable to get cpu times: {:?}", e))?;
|
||||||
|
|
||||||
|
let disk_usage = psutil::disk::disk_usage("/")
|
||||||
|
.map_err(|e| format!("Unable to disk usage info: {:?}", e))?;
|
||||||
|
|
||||||
|
let disk = psutil::disk::DiskIoCountersCollector::default()
|
||||||
|
.disk_io_counters()
|
||||||
|
.map_err(|e| format!("Unable to get disk counters: {:?}", e))?;
|
||||||
|
|
||||||
|
let net = psutil::network::NetIoCountersCollector::default()
|
||||||
|
.net_io_counters()
|
||||||
|
.map_err(|e| format!("Unable to get network io counters: {:?}", e))?;
|
||||||
|
|
||||||
|
let boot_time = psutil::host::boot_time()
|
||||||
|
.map_err(|e| format!("Unable to get system boot time: {:?}", e))?
|
||||||
|
.duration_since(std::time::UNIX_EPOCH)
|
||||||
|
.map_err(|e| format!("Boot time is lower than unix epoch: {}", e))?
|
||||||
|
.as_secs();
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
sys_virt_mem_total: vm.total(),
|
||||||
|
sys_virt_mem_available: vm.available(),
|
||||||
|
sys_virt_mem_used: vm.used(),
|
||||||
|
sys_virt_mem_free: vm.free(),
|
||||||
|
sys_virt_mem_cached: vm.cached(),
|
||||||
|
sys_virt_mem_buffers: vm.buffers(),
|
||||||
|
sys_virt_mem_percent: vm.percent(),
|
||||||
|
sys_loadavg_1: loadavg.one,
|
||||||
|
sys_loadavg_5: loadavg.five,
|
||||||
|
sys_loadavg_15: loadavg.fifteen,
|
||||||
|
cpu_cores: psutil::cpu::cpu_count_physical(),
|
||||||
|
cpu_threads: psutil::cpu::cpu_count(),
|
||||||
|
system_seconds_total: cpu.system().as_secs(),
|
||||||
|
cpu_time_total: cpu.total().as_secs(),
|
||||||
|
user_seconds_total: cpu.user().as_secs(),
|
||||||
|
iowait_seconds_total: cpu.iowait().as_secs(),
|
||||||
|
idle_seconds_total: cpu.idle().as_secs(),
|
||||||
|
disk_node_bytes_total: disk_usage.total(),
|
||||||
|
disk_node_bytes_free: disk_usage.free(),
|
||||||
|
disk_node_reads_total: disk.read_count(),
|
||||||
|
disk_node_writes_total: disk.write_count(),
|
||||||
|
network_node_bytes_total_received: net.bytes_recv(),
|
||||||
|
network_node_bytes_total_transmit: net.bytes_sent(),
|
||||||
|
misc_node_boot_ts_seconds: boot_time,
|
||||||
|
misc_os: std::env::consts::OS.to_string(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Process specific health
|
||||||
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
|
pub struct ProcessHealth {
|
||||||
/// The pid of this process.
|
/// The pid of this process.
|
||||||
pub pid: u32,
|
pub pid: u32,
|
||||||
/// The number of threads used by this pid.
|
/// The number of threads used by this pid.
|
||||||
@ -89,25 +225,11 @@ pub struct Health {
|
|||||||
pub pid_mem_resident_set_size: u64,
|
pub pid_mem_resident_set_size: u64,
|
||||||
/// The total virtual memory used by this pid.
|
/// The total virtual memory used by this pid.
|
||||||
pub pid_mem_virtual_memory_size: u64,
|
pub pid_mem_virtual_memory_size: u64,
|
||||||
/// Total virtual memory on the system
|
/// Number of cpu seconds consumed by this pid.
|
||||||
pub sys_virt_mem_total: u64,
|
pub pid_process_seconds_total: u64,
|
||||||
/// Total virtual memory available for new processes.
|
|
||||||
pub sys_virt_mem_available: u64,
|
|
||||||
/// Total virtual memory used on the system
|
|
||||||
pub sys_virt_mem_used: u64,
|
|
||||||
/// Total virtual memory not used on the system
|
|
||||||
pub sys_virt_mem_free: u64,
|
|
||||||
/// Percentage of virtual memory used on the system
|
|
||||||
pub sys_virt_mem_percent: f32,
|
|
||||||
/// System load average over 1 minute.
|
|
||||||
pub sys_loadavg_1: f64,
|
|
||||||
/// System load average over 5 minutes.
|
|
||||||
pub sys_loadavg_5: f64,
|
|
||||||
/// System load average over 15 minutes.
|
|
||||||
pub sys_loadavg_15: f64,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Health {
|
impl ProcessHealth {
|
||||||
#[cfg(not(target_os = "linux"))]
|
#[cfg(not(target_os = "linux"))]
|
||||||
pub fn observe() -> Result<Self, String> {
|
pub fn observe() -> Result<Self, String> {
|
||||||
Err("Health is only available on Linux".into())
|
Err("Health is only available on Linux".into())
|
||||||
@ -123,25 +245,33 @@ impl Health {
|
|||||||
.map_err(|e| format!("Unable to get process memory info: {:?}", e))?;
|
.map_err(|e| format!("Unable to get process memory info: {:?}", e))?;
|
||||||
|
|
||||||
let stat = pid::stat_self().map_err(|e| format!("Unable to get stat: {:?}", e))?;
|
let stat = pid::stat_self().map_err(|e| format!("Unable to get stat: {:?}", e))?;
|
||||||
|
let process_times = process
|
||||||
let vm = psutil::memory::virtual_memory()
|
.cpu_times()
|
||||||
.map_err(|e| format!("Unable to get virtual memory: {:?}", e))?;
|
.map_err(|e| format!("Unable to get process cpu times : {:?}", e))?;
|
||||||
let loadavg =
|
|
||||||
psutil::host::loadavg().map_err(|e| format!("Unable to get loadavg: {:?}", e))?;
|
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
pid: process.pid(),
|
pid: process.pid(),
|
||||||
pid_num_threads: stat.num_threads,
|
pid_num_threads: stat.num_threads,
|
||||||
pid_mem_resident_set_size: process_mem.rss(),
|
pid_mem_resident_set_size: process_mem.rss(),
|
||||||
pid_mem_virtual_memory_size: process_mem.vms(),
|
pid_mem_virtual_memory_size: process_mem.vms(),
|
||||||
sys_virt_mem_total: vm.total(),
|
pid_process_seconds_total: process_times.busy().as_secs()
|
||||||
sys_virt_mem_available: vm.available(),
|
+ process_times.children_system().as_secs()
|
||||||
sys_virt_mem_used: vm.used(),
|
+ process_times.children_system().as_secs(),
|
||||||
sys_virt_mem_free: vm.free(),
|
})
|
||||||
sys_virt_mem_percent: vm.percent(),
|
}
|
||||||
sys_loadavg_1: loadavg.one,
|
}
|
||||||
sys_loadavg_5: loadavg.five,
|
|
||||||
sys_loadavg_15: loadavg.fifteen,
|
impl Health {
|
||||||
|
#[cfg(not(target_os = "linux"))]
|
||||||
|
pub fn observe() -> Result<Self, String> {
|
||||||
|
Err("Health is only available on Linux".into())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
pub fn observe() -> Result<Self, String> {
|
||||||
|
Ok(Self {
|
||||||
|
process: ProcessHealth::observe()?,
|
||||||
|
system: SystemHealth::observe()?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -59,6 +59,7 @@ use std::time::Duration;
|
|||||||
|
|
||||||
use prometheus::core::{Atomic, GenericGauge, GenericGaugeVec};
|
use prometheus::core::{Atomic, GenericGauge, GenericGaugeVec};
|
||||||
pub use prometheus::{
|
pub use prometheus::{
|
||||||
|
proto::{Metric, MetricFamily, MetricType},
|
||||||
Encoder, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge,
|
Encoder, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge,
|
||||||
IntGaugeVec, Result, TextEncoder,
|
IntGaugeVec, Result, TextEncoder,
|
||||||
};
|
};
|
||||||
|
24
common/monitoring_api/Cargo.toml
Normal file
24
common/monitoring_api/Cargo.toml
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
[package]
|
||||||
|
name = "monitoring_api"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["pawan <pawandhananjay@gmail.com>"]
|
||||||
|
edition = "2018"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
reqwest = { version = "0.11.0", features = ["json","stream"] }
|
||||||
|
futures = "0.3.7"
|
||||||
|
task_executor = { path = "../task_executor" }
|
||||||
|
tokio = "1.1.0"
|
||||||
|
eth2 = {path = "../eth2"}
|
||||||
|
serde_json = "1.0.58"
|
||||||
|
serde = "1.0.116"
|
||||||
|
serde_derive = "1.0.116"
|
||||||
|
lighthouse_version = { path = "../lighthouse_version"}
|
||||||
|
lighthouse_metrics = { path = "../lighthouse_metrics" }
|
||||||
|
slog = "2.5.2"
|
||||||
|
store = { path = "../../beacon_node/store" }
|
||||||
|
lazy_static = "1.4.0"
|
||||||
|
regex = "1"
|
||||||
|
sensitive_url = { path = "../sensitive_url" }
|
193
common/monitoring_api/src/gather.rs
Normal file
193
common/monitoring_api/src/gather.rs
Normal file
@ -0,0 +1,193 @@
|
|||||||
|
use super::types::{BeaconProcessMetrics, ValidatorProcessMetrics};
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use lighthouse_metrics::{MetricFamily, MetricType};
|
||||||
|
use serde_json::json;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
/// Represents a metric that needs to be fetched from lighthouse metrics registry
|
||||||
|
/// and sent to the remote monitoring service.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct JsonMetric {
|
||||||
|
/// Name of the metric as used in Lighthouse metrics.
|
||||||
|
lighthouse_metric_name: &'static str,
|
||||||
|
/// Json key for the metric that we send to the remote monitoring endpoint.
|
||||||
|
json_output_key: &'static str,
|
||||||
|
/// Type of the json value to be sent to the remote monitoring endpoint
|
||||||
|
ty: JsonType,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl JsonMetric {
|
||||||
|
const fn new(
|
||||||
|
lighthouse_metric_name: &'static str,
|
||||||
|
json_output_key: &'static str,
|
||||||
|
ty: JsonType,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
lighthouse_metric_name,
|
||||||
|
json_output_key,
|
||||||
|
ty,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a json value given given the metric type.
|
||||||
|
fn get_typed_value(&self, value: i64) -> serde_json::Value {
|
||||||
|
match self.ty {
|
||||||
|
JsonType::Integer => json!(value),
|
||||||
|
JsonType::Boolean => {
|
||||||
|
if value > 0 {
|
||||||
|
json!(true)
|
||||||
|
} else {
|
||||||
|
json!(false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The set of lighthouse metrics collected for the beacon node process,
/// mapped to the JSON keys expected by the remote monitoring service.
const BEACON_PROCESS_METRICS: &[JsonMetric] = &[
    // Eth1 sync status flags.
    JsonMetric::new(
        "sync_eth1_fallback_configured",
        "sync_eth1_fallback_configured",
        JsonType::Boolean,
    ),
    JsonMetric::new(
        "sync_eth1_fallback_connected",
        "sync_eth1_fallback_connected",
        JsonType::Boolean,
    ),
    JsonMetric::new(
        "sync_eth1_connected",
        "sync_eth1_connected",
        JsonType::Boolean,
    ),
    // On-disk size of the beacon chain database.
    JsonMetric::new(
        "store_disk_db_size",
        "disk_beaconchain_bytes_total",
        JsonType::Integer,
    ),
    // libp2p networking counters.
    JsonMetric::new(
        "libp2p_peer_connected_peers_total",
        "network_peers_connected",
        JsonType::Integer,
    ),
    JsonMetric::new(
        "libp2p_outbound_bytes",
        "network_libp2p_bytes_total_transmit",
        JsonType::Integer,
    ),
    JsonMetric::new(
        "libp2p_inbound_bytes",
        "network_libp2p_bytes_total_receive",
        JsonType::Integer,
    ),
    // Head slot as reported by the notifier.
    JsonMetric::new(
        "notifier_head_slot",
        "sync_beacon_head_slot",
        JsonType::Integer,
    ),
    JsonMetric::new("sync_eth2_synced", "sync_eth2_synced", JsonType::Boolean),
];
|
||||||
|
|
||||||
|
/// The set of lighthouse metrics collected for the validator client process,
/// mapped to the JSON keys expected by the remote monitoring service.
const VALIDATOR_PROCESS_METRICS: &[JsonMetric] = &[
    // Validator counts.
    JsonMetric::new(
        "vc_validators_enabled_count",
        "validator_active",
        JsonType::Integer,
    ),
    JsonMetric::new(
        "vc_validators_total_count",
        "validator_total",
        JsonType::Integer,
    ),
    // Eth2 (beacon node) fallback status flags.
    JsonMetric::new(
        "sync_eth2_fallback_configured",
        "sync_eth2_fallback_configured",
        JsonType::Boolean,
    ),
    JsonMetric::new(
        "sync_eth2_fallback_connected",
        "sync_eth2_fallback_connected",
        JsonType::Boolean,
    ),
];
|
||||||
|
|
||||||
|
/// Represents the type for the JSON output value.
#[derive(Debug, Clone)]
pub enum JsonType {
    /// Emit the metric's raw integer value.
    Integer,
    /// Emit `true` when the metric value is positive, otherwise `false`.
    Boolean,
}
|
||||||
|
|
||||||
|
lazy_static! {
    /// HashMap representing the `BEACON_PROCESS_METRICS`, keyed by lighthouse metric name.
    pub static ref BEACON_METRICS_MAP: HashMap<String, JsonMetric> = BEACON_PROCESS_METRICS
        .iter()
        .map(|metric| (metric.lighthouse_metric_name.to_string(), metric.clone()))
        .collect();
    /// HashMap representing the `VALIDATOR_PROCESS_METRICS`, keyed by lighthouse metric name.
    pub static ref VALIDATOR_METRICS_MAP: HashMap<String,JsonMetric> =
        VALIDATOR_PROCESS_METRICS
        .iter()
        .map(|metric| (metric.lighthouse_metric_name.to_string(), metric.clone()))
        .collect();
}
|
||||||
|
|
||||||
|
/// Returns the value from a Counter/Gauge `MetricType` assuming that it has no associated labels
|
||||||
|
/// else it returns `None`.
|
||||||
|
fn get_value(mf: &MetricFamily) -> Option<i64> {
|
||||||
|
let metric = mf.get_metric().first()?;
|
||||||
|
match mf.get_field_type() {
|
||||||
|
MetricType::COUNTER => Some(metric.get_counter().get_value() as i64),
|
||||||
|
MetricType::GAUGE => Some(metric.get_gauge().get_value() as i64),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collects all metrics and returns a `serde_json::Value` object with the required metrics
|
||||||
|
/// from the metrics hashmap.
|
||||||
|
pub fn gather_metrics(metrics_map: &HashMap<String, JsonMetric>) -> Option<serde_json::Value> {
|
||||||
|
let metric_families = lighthouse_metrics::gather();
|
||||||
|
let mut res = serde_json::Map::with_capacity(metrics_map.len());
|
||||||
|
for mf in metric_families.iter() {
|
||||||
|
let metric_name = mf.get_name();
|
||||||
|
if metrics_map.contains_key(metric_name) {
|
||||||
|
let value = get_value(&mf).unwrap_or_default();
|
||||||
|
let metric = metrics_map.get(metric_name)?;
|
||||||
|
let value = metric.get_typed_value(value);
|
||||||
|
let _ = res.insert(metric.json_output_key.to_string(), value);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
Some(serde_json::Value::Object(res))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gathers and returns the lighthouse beacon metrics.
|
||||||
|
pub fn gather_beacon_metrics(
|
||||||
|
db_path: &Path,
|
||||||
|
freezer_db_path: &Path,
|
||||||
|
) -> Result<BeaconProcessMetrics, String> {
|
||||||
|
// Update db size metrics
|
||||||
|
store::metrics::scrape_for_metrics(db_path, freezer_db_path);
|
||||||
|
|
||||||
|
let beacon_metrics = gather_metrics(&BEACON_METRICS_MAP)
|
||||||
|
.ok_or_else(|| "Failed to gather beacon metrics".to_string())?;
|
||||||
|
let process = eth2::lighthouse::ProcessHealth::observe()?.into();
|
||||||
|
|
||||||
|
Ok(BeaconProcessMetrics {
|
||||||
|
beacon: beacon_metrics,
|
||||||
|
common: process,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gathers and returns the lighthouse validator metrics.
|
||||||
|
pub fn gather_validator_metrics() -> Result<ValidatorProcessMetrics, String> {
|
||||||
|
let validator_metrics = gather_metrics(&VALIDATOR_METRICS_MAP)
|
||||||
|
.ok_or_else(|| "Failed to gather validator metrics".to_string())?;
|
||||||
|
|
||||||
|
let process = eth2::lighthouse::ProcessHealth::observe()?.into();
|
||||||
|
Ok(ValidatorProcessMetrics {
|
||||||
|
validator: validator_metrics,
|
||||||
|
common: process,
|
||||||
|
})
|
||||||
|
}
|
208
common/monitoring_api/src/lib.rs
Normal file
208
common/monitoring_api/src/lib.rs
Normal file
@ -0,0 +1,208 @@
|
|||||||
|
mod gather;
|
||||||
|
mod types;
|
||||||
|
use std::{path::PathBuf, time::Duration};
|
||||||
|
|
||||||
|
use eth2::lighthouse::SystemHealth;
|
||||||
|
use gather::{gather_beacon_metrics, gather_validator_metrics};
|
||||||
|
use reqwest::{IntoUrl, Response};
|
||||||
|
pub use reqwest::{StatusCode, Url};
|
||||||
|
use sensitive_url::SensitiveUrl;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use slog::{debug, error, info};
|
||||||
|
use task_executor::TaskExecutor;
|
||||||
|
use tokio::time::{interval_at, Instant};
|
||||||
|
use types::*;
|
||||||
|
|
||||||
|
pub use types::ProcessType;
|
||||||
|
|
||||||
|
/// Duration (in seconds) after which we collect and send metrics to the remote endpoint.
pub const UPDATE_DURATION: u64 = 60;
/// Timeout (in seconds) for HTTP requests to the remote endpoint.
pub const TIMEOUT_DURATION: u64 = 5;
|
||||||
|
|
||||||
|
/// Errors that can occur while collecting or sending monitoring metrics.
#[derive(Debug)]
pub enum Error {
    /// The `reqwest` client raised an error.
    Reqwest(reqwest::Error),
    /// The supplied URL is badly formatted. It should look something like `http://127.0.0.1:5052`.
    InvalidUrl(SensitiveUrl),
    /// Failed to gather system (host-level) metrics.
    SystemMetricsFailed(String),
    /// Failed to gather beacon node process metrics.
    BeaconMetricsFailed(String),
    /// Failed to gather validator client process metrics.
    ValidatorMetricsFailed(String),
    /// The server returned an error message where the body was able to be parsed.
    ServerMessage(ErrorMessage),
    /// The server returned an error message where the body was unable to be parsed.
    StatusCode(StatusCode),
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for Error {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match &self {
|
||||||
|
Error::Reqwest(e) => write!(f, "Reqwest error: {}", e),
|
||||||
|
// Print the debug value
|
||||||
|
e => write!(f, "{:?}", e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Configuration for the monitoring service client.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct Config {
    /// Remote monitoring endpoint that metrics are sent to.
    pub monitoring_endpoint: String,
    /// Path for the hot database required for fetching beacon db size metrics.
    /// Note: not relevant for validator and system metrics.
    pub db_path: Option<PathBuf>,
    /// Path for the cold database required for fetching beacon db size metrics.
    /// Note: not relevant for validator and system metrics.
    pub freezer_db_path: Option<PathBuf>,
}
|
||||||
|
|
||||||
|
/// Client that gathers process/system metrics and pushes them to a remote
/// monitoring endpoint over HTTP.
#[derive(Clone)]
pub struct MonitoringHttpClient {
    client: reqwest::Client,
    /// Path to the hot database. Required for getting db size metrics.
    db_path: Option<PathBuf>,
    /// Path to the freezer database.
    freezer_db_path: Option<PathBuf>,
    /// Remote endpoint that metrics payloads are POSTed to.
    monitoring_endpoint: SensitiveUrl,
    log: slog::Logger,
}
|
||||||
|
|
||||||
|
impl MonitoringHttpClient {
|
||||||
|
pub fn new(config: &Config, log: slog::Logger) -> Result<Self, String> {
|
||||||
|
Ok(Self {
|
||||||
|
client: reqwest::Client::new(),
|
||||||
|
db_path: config.db_path.clone(),
|
||||||
|
freezer_db_path: config.freezer_db_path.clone(),
|
||||||
|
monitoring_endpoint: SensitiveUrl::parse(&config.monitoring_endpoint)
|
||||||
|
.map_err(|e| format!("Invalid monitoring endpoint: {:?}", e))?,
|
||||||
|
log,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Perform a HTTP POST request.
|
||||||
|
async fn post<T: Serialize, U: IntoUrl>(&self, url: U, body: &T) -> Result<(), Error> {
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.post(url)
|
||||||
|
.json(body)
|
||||||
|
.timeout(Duration::from_secs(TIMEOUT_DURATION))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(Error::Reqwest)?;
|
||||||
|
ok_or_error(response).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a task which periodically sends the provided process metrics
|
||||||
|
/// to the configured remote endpoint.
|
||||||
|
pub fn auto_update(self, executor: TaskExecutor, processes: Vec<ProcessType>) {
|
||||||
|
let mut interval = interval_at(
|
||||||
|
// Have some initial delay for the metrics to get initialized
|
||||||
|
Instant::now() + Duration::from_secs(25),
|
||||||
|
Duration::from_secs(UPDATE_DURATION),
|
||||||
|
);
|
||||||
|
|
||||||
|
info!(self.log, "Starting monitoring api"; "endpoint" => %self.monitoring_endpoint);
|
||||||
|
|
||||||
|
let update_future = async move {
|
||||||
|
loop {
|
||||||
|
interval.tick().await;
|
||||||
|
match self.send_metrics(&processes).await {
|
||||||
|
Ok(()) => {
|
||||||
|
debug!(self.log, "Metrics sent to remote server"; "endpoint" => %self.monitoring_endpoint);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!(self.log, "Failed to send metrics to remote endpoint"; "error" => %e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
executor.spawn(update_future, "monitoring_api");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets beacon metrics and updates the metrics struct
|
||||||
|
pub fn get_beacon_metrics(&self) -> Result<MonitoringMetrics, Error> {
|
||||||
|
let db_path = self.db_path.as_ref().ok_or_else(|| {
|
||||||
|
Error::BeaconMetricsFailed("Beacon metrics require db path".to_string())
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let freezer_db_path = self.db_path.as_ref().ok_or_else(|| {
|
||||||
|
Error::BeaconMetricsFailed("Beacon metrics require freezer db path".to_string())
|
||||||
|
})?;
|
||||||
|
let metrics = gather_beacon_metrics(&db_path, &freezer_db_path)
|
||||||
|
.map_err(Error::BeaconMetricsFailed)?;
|
||||||
|
Ok(MonitoringMetrics {
|
||||||
|
metadata: Metadata::new(ProcessType::BeaconNode),
|
||||||
|
process_metrics: Process::Beacon(metrics),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets validator process metrics by querying the validator metrics endpoint
|
||||||
|
pub fn get_validator_metrics(&self) -> Result<MonitoringMetrics, Error> {
|
||||||
|
let metrics = gather_validator_metrics().map_err(Error::BeaconMetricsFailed)?;
|
||||||
|
Ok(MonitoringMetrics {
|
||||||
|
metadata: Metadata::new(ProcessType::Validator),
|
||||||
|
process_metrics: Process::Validator(metrics),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets system metrics by observing capturing the SystemHealth metrics.
|
||||||
|
pub fn get_system_metrics(&self) -> Result<MonitoringMetrics, Error> {
|
||||||
|
let system_health = SystemHealth::observe().map_err(Error::SystemMetricsFailed)?;
|
||||||
|
Ok(MonitoringMetrics {
|
||||||
|
metadata: Metadata::new(ProcessType::System),
|
||||||
|
process_metrics: Process::System(system_health.into()),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return metric based on process type.
|
||||||
|
pub async fn get_metrics(
|
||||||
|
&self,
|
||||||
|
process_type: &ProcessType,
|
||||||
|
) -> Result<MonitoringMetrics, Error> {
|
||||||
|
match process_type {
|
||||||
|
ProcessType::BeaconNode => self.get_beacon_metrics(),
|
||||||
|
ProcessType::System => self.get_system_metrics(),
|
||||||
|
ProcessType::Validator => self.get_validator_metrics(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send metrics to the remote endpoint
|
||||||
|
pub async fn send_metrics(&self, processes: &[ProcessType]) -> Result<(), Error> {
|
||||||
|
let mut metrics = Vec::new();
|
||||||
|
for process in processes {
|
||||||
|
match self.get_metrics(process).await {
|
||||||
|
Err(e) => error!(
|
||||||
|
self.log,
|
||||||
|
"Failed to get metrics";
|
||||||
|
"process_type" => ?process,
|
||||||
|
"error" => %e
|
||||||
|
),
|
||||||
|
Ok(metric) => metrics.push(metric),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
info!(
|
||||||
|
self.log,
|
||||||
|
"Sending metrics to remote endpoint";
|
||||||
|
"endpoint" => %self.monitoring_endpoint
|
||||||
|
);
|
||||||
|
self.post(self.monitoring_endpoint.full.clone(), &metrics)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `Ok(response)` if the response is a `200 OK` response. Otherwise, creates an
|
||||||
|
/// appropriate error message.
|
||||||
|
async fn ok_or_error(response: Response) -> Result<Response, Error> {
|
||||||
|
let status = response.status();
|
||||||
|
|
||||||
|
if status == StatusCode::OK {
|
||||||
|
Ok(response)
|
||||||
|
} else if let Ok(message) = response.json().await {
|
||||||
|
Err(Error::ServerMessage(message))
|
||||||
|
} else {
|
||||||
|
Err(Error::StatusCode(status))
|
||||||
|
}
|
||||||
|
}
|
177
common/monitoring_api/src/types.rs
Normal file
177
common/monitoring_api/src/types.rs
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
use std::time::{SystemTime, UNIX_EPOCH};
|
||||||
|
|
||||||
|
use eth2::lighthouse::{ProcessHealth, SystemHealth};
|
||||||
|
use serde_derive::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// Version of the metrics payload format sent to the remote endpoint.
pub const VERSION: u64 = 1;
/// Client name reported in the `client_name` field of process metrics.
pub const CLIENT_NAME: &str = "lighthouse";
|
||||||
|
|
||||||
|
/// An API error serializable to JSON, as returned by the remote monitoring server.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ErrorMessage {
    /// HTTP-style error code.
    pub code: u16,
    /// Human-readable error description.
    pub message: String,
    /// Optional stacktraces; empty when the server omits them.
    #[serde(default)]
    pub stacktraces: Vec<String>,
}
|
||||||
|
|
||||||
|
/// Complete metrics payload for a single process: metadata plus the
/// process-specific metrics, flattened into one JSON object.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct MonitoringMetrics {
    #[serde(flatten)]
    pub metadata: Metadata,
    #[serde(flatten)]
    pub process_metrics: Process,
}
|
||||||
|
|
||||||
|
/// The kind of process a metrics payload describes.
/// Variants serialize in lowercase (e.g. `beaconnode`, `validator`, `system`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ProcessType {
    BeaconNode,
    Validator,
    System,
}
|
||||||
|
|
||||||
|
/// Metadata attached to every metrics payload sent to the remote endpoint.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Metadata {
    /// Payload format version (see `VERSION`).
    version: u64,
    /// Unix timestamp in milliseconds at which the payload was created.
    timestamp: u128,
    /// The process this payload describes.
    process: ProcessType,
}
|
||||||
|
|
||||||
|
impl Metadata {
|
||||||
|
pub fn new(process: ProcessType) -> Self {
|
||||||
|
Self {
|
||||||
|
version: VERSION,
|
||||||
|
timestamp: SystemTime::now()
|
||||||
|
.duration_since(UNIX_EPOCH)
|
||||||
|
.expect("time should be greater than unix epoch")
|
||||||
|
.as_millis(),
|
||||||
|
process,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Process-specific metrics. Marked `untagged` so each variant's fields are
/// serialized directly into the payload without a discriminant key.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Process {
    Beacon(BeaconProcessMetrics),
    System(SystemMetrics),
    Validator(ValidatorProcessMetrics),
}
|
||||||
|
|
||||||
|
/// Common metrics for all processes.
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct ProcessMetrics {
    /// Cumulative CPU seconds used by this process.
    cpu_process_seconds_total: u64,
    /// Resident set size of this process in bytes.
    memory_process_bytes: u64,

    /// Client name, always `CLIENT_NAME` ("lighthouse").
    client_name: String,
    /// Semver portion of the lighthouse version string (see `client_version`).
    client_version: String,
    /// Build number; always 0 (see `client_build`).
    client_build: u64,
}
|
||||||
|
|
||||||
|
// Converts an observed process health snapshot into the export format,
// attaching the static client identification fields.
impl From<ProcessHealth> for ProcessMetrics {
    fn from(health: ProcessHealth) -> Self {
        Self {
            cpu_process_seconds_total: health.pid_process_seconds_total,
            memory_process_bytes: health.pid_mem_resident_set_size,
            client_name: CLIENT_NAME.to_string(),
            client_version: client_version().unwrap_or_default(),
            client_build: client_build(),
        }
    }
}
|
||||||
|
|
||||||
|
/// Metrics related to the system (host), in the remote endpoint's export format.
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct SystemMetrics {
    // CPU topology and cumulative per-mode CPU time.
    cpu_cores: u64,
    cpu_threads: u64,
    cpu_node_system_seconds_total: u64,
    cpu_node_user_seconds_total: u64,
    cpu_node_iowait_seconds_total: u64,
    cpu_node_idle_seconds_total: u64,

    // Virtual memory totals in bytes.
    memory_node_bytes_total: u64,
    memory_node_bytes_free: u64,
    memory_node_bytes_cached: u64,
    memory_node_bytes_buffers: u64,

    // Disk capacity and usage in bytes.
    disk_node_bytes_total: u64,
    disk_node_bytes_free: u64,

    // Disk activity counters; `disk_node_io_seconds` is currently always 0
    // (not collected — see the `From<SystemHealth>` impl).
    disk_node_io_seconds: u64,
    disk_node_reads_total: u64,
    disk_node_writes_total: u64,

    // Aggregate network traffic in bytes across all interfaces.
    network_node_bytes_total_receive: u64,
    network_node_bytes_total_transmit: u64,

    // Boot time (unix epoch seconds) and a 3-letter OS identifier.
    misc_node_boot_ts_seconds: u64,
    misc_os: String,
}
|
||||||
|
|
||||||
|
// Converts an observed system health snapshot into the remote endpoint's
// export format, renaming fields and truncating the OS name.
impl From<SystemHealth> for SystemMetrics {
    fn from(health: SystemHealth) -> Self {
        // Export format uses 3 letter os names; fall back to "unk" for short names.
        let misc_os = health.misc_os.get(0..3).unwrap_or("unk").to_string();
        Self {
            cpu_cores: health.cpu_cores,
            cpu_threads: health.cpu_threads,
            cpu_node_system_seconds_total: health.cpu_time_total,
            cpu_node_user_seconds_total: health.user_seconds_total,
            cpu_node_iowait_seconds_total: health.iowait_seconds_total,
            cpu_node_idle_seconds_total: health.idle_seconds_total,

            memory_node_bytes_total: health.sys_virt_mem_total,
            memory_node_bytes_free: health.sys_virt_mem_free,
            memory_node_bytes_cached: health.sys_virt_mem_cached,
            memory_node_bytes_buffers: health.sys_virt_mem_buffers,

            disk_node_bytes_total: health.disk_node_bytes_total,
            disk_node_bytes_free: health.disk_node_bytes_free,

            // Unavailable for now; reported as 0.
            disk_node_io_seconds: 0,
            disk_node_reads_total: health.disk_node_reads_total,
            disk_node_writes_total: health.disk_node_writes_total,

            network_node_bytes_total_receive: health.network_node_bytes_total_received,
            network_node_bytes_total_transmit: health.network_node_bytes_total_transmit,

            misc_node_boot_ts_seconds: health.misc_node_boot_ts_seconds,
            misc_os,
        }
    }
}
|
||||||
|
|
||||||
|
/// All beacon process metrics: common process metrics plus the gathered
/// beacon-specific values, flattened into one JSON object.
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct BeaconProcessMetrics {
    #[serde(flatten)]
    pub common: ProcessMetrics,
    #[serde(flatten)]
    pub beacon: serde_json::Value,
}
|
||||||
|
|
||||||
|
/// All validator process metrics: common process metrics plus the gathered
/// validator-specific values, flattened into one JSON object.
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct ValidatorProcessMetrics {
    #[serde(flatten)]
    pub common: ProcessMetrics,
    #[serde(flatten)]
    pub validator: serde_json::Value,
}
|
||||||
|
|
||||||
|
/// Returns the client version
|
||||||
|
fn client_version() -> Option<String> {
|
||||||
|
let re = regex::Regex::new(r"\d+\.\d+\.\d+").expect("Regex is valid");
|
||||||
|
re.find(lighthouse_version::VERSION)
|
||||||
|
.map(|m| m.as_str().to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the client build number.
///
/// Lighthouse does not support build numbers, so this is effectively a
/// null-value placeholder for the export format.
fn client_build() -> u64 {
    0
}
|
@ -1,4 +1,4 @@
|
|||||||
use eth2::lighthouse::Health;
|
use eth2::lighthouse::{ProcessHealth, SystemHealth};
|
||||||
use lighthouse_metrics::*;
|
use lighthouse_metrics::*;
|
||||||
|
|
||||||
lazy_static::lazy_static! {
|
lazy_static::lazy_static! {
|
||||||
@ -14,6 +14,10 @@ lazy_static::lazy_static! {
|
|||||||
"process_virtual_memory_bytes",
|
"process_virtual_memory_bytes",
|
||||||
"Virtual memory used by the current process"
|
"Virtual memory used by the current process"
|
||||||
);
|
);
|
||||||
|
pub static ref PROCESS_SECONDS: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"process_cpu_seconds_total",
|
||||||
|
"Total cpu time taken by the current process"
|
||||||
|
);
|
||||||
pub static ref SYSTEM_VIRT_MEM_TOTAL: Result<IntGauge> =
|
pub static ref SYSTEM_VIRT_MEM_TOTAL: Result<IntGauge> =
|
||||||
try_create_int_gauge("system_virt_mem_total_bytes", "Total system virtual memory");
|
try_create_int_gauge("system_virt_mem_total_bytes", "Total system virtual memory");
|
||||||
pub static ref SYSTEM_VIRT_MEM_AVAILABLE: Result<IntGauge> = try_create_int_gauge(
|
pub static ref SYSTEM_VIRT_MEM_AVAILABLE: Result<IntGauge> = try_create_int_gauge(
|
||||||
@ -24,6 +28,10 @@ lazy_static::lazy_static! {
|
|||||||
try_create_int_gauge("system_virt_mem_used_bytes", "Used system virtual memory");
|
try_create_int_gauge("system_virt_mem_used_bytes", "Used system virtual memory");
|
||||||
pub static ref SYSTEM_VIRT_MEM_FREE: Result<IntGauge> =
|
pub static ref SYSTEM_VIRT_MEM_FREE: Result<IntGauge> =
|
||||||
try_create_int_gauge("system_virt_mem_free_bytes", "Free system virtual memory");
|
try_create_int_gauge("system_virt_mem_free_bytes", "Free system virtual memory");
|
||||||
|
pub static ref SYSTEM_VIRT_MEM_CACHED: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("system_virt_mem_cached_bytes", "Used system virtual memory");
|
||||||
|
pub static ref SYSTEM_VIRT_MEM_BUFFERS: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("system_virt_mem_buffer_bytes", "Free system virtual memory");
|
||||||
pub static ref SYSTEM_VIRT_MEM_PERCENTAGE: Result<Gauge> = try_create_float_gauge(
|
pub static ref SYSTEM_VIRT_MEM_PERCENTAGE: Result<Gauge> = try_create_float_gauge(
|
||||||
"system_virt_mem_percentage",
|
"system_virt_mem_percentage",
|
||||||
"Percentage of used virtual memory"
|
"Percentage of used virtual memory"
|
||||||
@ -34,15 +42,62 @@ lazy_static::lazy_static! {
|
|||||||
try_create_float_gauge("system_loadavg_5", "Loadavg over 5 minutes");
|
try_create_float_gauge("system_loadavg_5", "Loadavg over 5 minutes");
|
||||||
pub static ref SYSTEM_LOADAVG_15: Result<Gauge> =
|
pub static ref SYSTEM_LOADAVG_15: Result<Gauge> =
|
||||||
try_create_float_gauge("system_loadavg_15", "Loadavg over 15 minutes");
|
try_create_float_gauge("system_loadavg_15", "Loadavg over 15 minutes");
|
||||||
|
|
||||||
|
pub static ref CPU_CORES: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("cpu_cores", "Number of physical cpu cores");
|
||||||
|
pub static ref CPU_THREADS: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("cpu_threads", "Number of logical cpu cores");
|
||||||
|
|
||||||
|
pub static ref CPU_SYSTEM_SECONDS_TOTAL: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("cpu_system_seconds_total", "Total time spent in kernel mode");
|
||||||
|
pub static ref CPU_USER_SECONDS_TOTAL: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("cpu_user_seconds_total", "Total time spent in user mode");
|
||||||
|
pub static ref CPU_IOWAIT_SECONDS_TOTAL: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("cpu_iowait_seconds_total", "Total time spent waiting for io");
|
||||||
|
pub static ref CPU_IDLE_SECONDS_TOTAL: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("cpu_idle_seconds_total", "Total time spent idle");
|
||||||
|
|
||||||
|
pub static ref DISK_BYTES_TOTAL: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("disk_node_bytes_total", "Total capacity of disk");
|
||||||
|
|
||||||
|
pub static ref DISK_BYTES_FREE: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("disk_node_bytes_free", "Free space in disk");
|
||||||
|
|
||||||
|
pub static ref DISK_READS: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("disk_node_reads_total", "Number of disk reads");
|
||||||
|
|
||||||
|
pub static ref DISK_WRITES: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("disk_node_writes_total", "Number of disk writes");
|
||||||
|
|
||||||
|
pub static ref NETWORK_BYTES_RECEIVED: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("network_node_bytes_total_received", "Total bytes received over all network interfaces");
|
||||||
|
pub static ref NETWORK_BYTES_SENT: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("network_node_bytes_total_transmit", "Total bytes sent over all network interfaces");
|
||||||
|
|
||||||
|
pub static ref BOOT_TIME: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("misc_node_boot_ts_seconds", "Boot time as unix epoch timestamp");
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn scrape_health_metrics() {
|
pub fn scrape_health_metrics() {
|
||||||
|
scrape_process_health_metrics();
|
||||||
|
scrape_system_health_metrics();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn scrape_process_health_metrics() {
|
||||||
// This will silently fail if we are unable to observe the health. This is desired behaviour
|
// This will silently fail if we are unable to observe the health. This is desired behaviour
|
||||||
// since we don't support `Health` for all platforms.
|
// since we don't support `Health` for all platforms.
|
||||||
if let Ok(health) = Health::observe() {
|
if let Ok(health) = ProcessHealth::observe() {
|
||||||
set_gauge(&PROCESS_NUM_THREADS, health.pid_num_threads as i64);
|
set_gauge(&PROCESS_NUM_THREADS, health.pid_num_threads as i64);
|
||||||
set_gauge(&PROCESS_RES_MEM, health.pid_mem_resident_set_size as i64);
|
set_gauge(&PROCESS_RES_MEM, health.pid_mem_resident_set_size as i64);
|
||||||
set_gauge(&PROCESS_VIRT_MEM, health.pid_mem_virtual_memory_size as i64);
|
set_gauge(&PROCESS_VIRT_MEM, health.pid_mem_virtual_memory_size as i64);
|
||||||
|
set_gauge(&PROCESS_SECONDS, health.pid_process_seconds_total as i64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn scrape_system_health_metrics() {
|
||||||
|
// This will silently fail if we are unable to observe the health. This is desired behaviour
|
||||||
|
// since we don't support `Health` for all platforms.
|
||||||
|
if let Ok(health) = SystemHealth::observe() {
|
||||||
set_gauge(&SYSTEM_VIRT_MEM_TOTAL, health.sys_virt_mem_total as i64);
|
set_gauge(&SYSTEM_VIRT_MEM_TOTAL, health.sys_virt_mem_total as i64);
|
||||||
set_gauge(
|
set_gauge(
|
||||||
&SYSTEM_VIRT_MEM_AVAILABLE,
|
&SYSTEM_VIRT_MEM_AVAILABLE,
|
||||||
@ -57,5 +112,34 @@ pub fn scrape_health_metrics() {
|
|||||||
set_float_gauge(&SYSTEM_LOADAVG_1, health.sys_loadavg_1);
|
set_float_gauge(&SYSTEM_LOADAVG_1, health.sys_loadavg_1);
|
||||||
set_float_gauge(&SYSTEM_LOADAVG_5, health.sys_loadavg_5);
|
set_float_gauge(&SYSTEM_LOADAVG_5, health.sys_loadavg_5);
|
||||||
set_float_gauge(&SYSTEM_LOADAVG_15, health.sys_loadavg_15);
|
set_float_gauge(&SYSTEM_LOADAVG_15, health.sys_loadavg_15);
|
||||||
|
|
||||||
|
set_gauge(&CPU_CORES, health.cpu_cores as i64);
|
||||||
|
set_gauge(&CPU_THREADS, health.cpu_threads as i64);
|
||||||
|
|
||||||
|
set_gauge(
|
||||||
|
&CPU_SYSTEM_SECONDS_TOTAL,
|
||||||
|
health.system_seconds_total as i64,
|
||||||
|
);
|
||||||
|
set_gauge(&CPU_USER_SECONDS_TOTAL, health.user_seconds_total as i64);
|
||||||
|
set_gauge(
|
||||||
|
&CPU_IOWAIT_SECONDS_TOTAL,
|
||||||
|
health.iowait_seconds_total as i64,
|
||||||
|
);
|
||||||
|
set_gauge(&CPU_IDLE_SECONDS_TOTAL, health.idle_seconds_total as i64);
|
||||||
|
|
||||||
|
set_gauge(&DISK_BYTES_TOTAL, health.disk_node_bytes_total as i64);
|
||||||
|
|
||||||
|
set_gauge(&DISK_BYTES_FREE, health.disk_node_bytes_free as i64);
|
||||||
|
set_gauge(&DISK_READS, health.disk_node_reads_total as i64);
|
||||||
|
set_gauge(&DISK_WRITES, health.disk_node_writes_total as i64);
|
||||||
|
|
||||||
|
set_gauge(
|
||||||
|
&NETWORK_BYTES_RECEIVED,
|
||||||
|
health.network_node_bytes_total_received as i64,
|
||||||
|
);
|
||||||
|
set_gauge(
|
||||||
|
&NETWORK_BYTES_SENT,
|
||||||
|
health.network_node_bytes_total_transmit as i64,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -64,4 +64,5 @@ scrypt = { version = "0.5.0", default-features = false }
|
|||||||
lighthouse_metrics = { path = "../common/lighthouse_metrics" }
|
lighthouse_metrics = { path = "../common/lighthouse_metrics" }
|
||||||
lazy_static = "1.4.0"
|
lazy_static = "1.4.0"
|
||||||
fallback = { path = "../common/fallback" }
|
fallback = { path = "../common/fallback" }
|
||||||
|
monitoring_api = { path = "../common/monitoring_api" }
|
||||||
sensitive_url = { path = "../common/sensitive_url" }
|
sensitive_url = { path = "../common/sensitive_url" }
|
||||||
|
@ -302,7 +302,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// The count of candidates, regardless of their state.
|
/// The count of candidates, regardless of their state.
|
||||||
pub async fn num_total(&self) -> usize {
|
pub fn num_total(&self) -> usize {
|
||||||
self.candidates.len()
|
self.candidates.len()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -317,6 +317,17 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
|
|||||||
n
|
n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The count of synced and ready fallbacks excluding the primary beacon node candidate.
|
||||||
|
pub async fn num_synced_fallback(&self) -> usize {
|
||||||
|
let mut n = 0;
|
||||||
|
for candidate in self.candidates.iter().skip(1) {
|
||||||
|
if candidate.status(RequireSynced::Yes).await.is_ok() {
|
||||||
|
n += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
n
|
||||||
|
}
|
||||||
|
|
||||||
/// The count of candidates that are online and compatible, but not necessarily synced.
|
/// The count of candidates that are online and compatible, but not necessarily synced.
|
||||||
pub async fn num_available(&self) -> usize {
|
pub async fn num_available(&self) -> usize {
|
||||||
let mut n = 0;
|
let mut n = 0;
|
||||||
|
@ -181,4 +181,19 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
|
|||||||
address of this server (e.g., http://localhost:5064).")
|
address of this server (e.g., http://localhost:5064).")
|
||||||
.takes_value(true),
|
.takes_value(true),
|
||||||
)
|
)
|
||||||
|
/*
|
||||||
|
* Explorer metrics
|
||||||
|
*/
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("monitoring-endpoint")
|
||||||
|
.long("monitoring-endpoint")
|
||||||
|
.value_name("ADDRESS")
|
||||||
|
.help("Enables the monitoring service for sending system metrics to a remote endpoint. \
|
||||||
|
This can be used to monitor your setup on certain services (e.g. beaconcha.in). \
|
||||||
|
This flag sets the endpoint where the beacon node metrics will be sent. \
|
||||||
|
Note: This will send information to a remote sever which may identify and associate your \
|
||||||
|
validators, IP address and other personal information. Always use a HTTPS connection \
|
||||||
|
and never provide an untrusted URL.")
|
||||||
|
.takes_value(true),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
@ -43,6 +43,8 @@ pub struct Config {
|
|||||||
pub http_api: http_api::Config,
|
pub http_api: http_api::Config,
|
||||||
/// Configuration for the HTTP REST API.
|
/// Configuration for the HTTP REST API.
|
||||||
pub http_metrics: http_metrics::Config,
|
pub http_metrics: http_metrics::Config,
|
||||||
|
/// Configuration for sending metrics to a remote explorer endpoint.
|
||||||
|
pub monitoring_api: Option<monitoring_api::Config>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Config {
|
impl Default for Config {
|
||||||
@ -70,6 +72,7 @@ impl Default for Config {
|
|||||||
graffiti_file: None,
|
graffiti_file: None,
|
||||||
http_api: <_>::default(),
|
http_api: <_>::default(),
|
||||||
http_metrics: <_>::default(),
|
http_metrics: <_>::default(),
|
||||||
|
monitoring_api: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -233,6 +236,16 @@ impl Config {
|
|||||||
|
|
||||||
config.http_metrics.allow_origin = Some(allow_origin.to_string());
|
config.http_metrics.allow_origin = Some(allow_origin.to_string());
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
* Explorer metrics
|
||||||
|
*/
|
||||||
|
if let Some(monitoring_endpoint) = cli_args.value_of("monitoring-endpoint") {
|
||||||
|
config.monitoring_api = Some(monitoring_api::Config {
|
||||||
|
db_path: None,
|
||||||
|
freezer_db_path: None,
|
||||||
|
monitoring_endpoint: monitoring_endpoint.to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
Ok(config)
|
Ok(config)
|
||||||
}
|
}
|
||||||
|
@ -108,6 +108,16 @@ lazy_static::lazy_static! {
|
|||||||
"The number of beacon node requests for each endpoint",
|
"The number of beacon node requests for each endpoint",
|
||||||
&["endpoint"]
|
&["endpoint"]
|
||||||
);
|
);
|
||||||
|
|
||||||
|
pub static ref ETH2_FALLBACK_CONFIGURED: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"sync_eth2_fallback_configured",
|
||||||
|
"The number of configured eth2 fallbacks",
|
||||||
|
);
|
||||||
|
|
||||||
|
pub static ref ETH2_FALLBACK_CONNECTED: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"sync_eth2_fallback_connected",
|
||||||
|
"Set to 1 if connected to atleast one synced eth2 fallback node, otherwise set to 0",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn gather_prometheus_metrics<T: EthSpec>(
|
pub fn gather_prometheus_metrics<T: EthSpec>(
|
||||||
@ -126,20 +136,6 @@ pub fn gather_prometheus_metrics<T: EthSpec>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(validator_store) = &shared.validator_store {
|
|
||||||
let initialized_validators_lock = validator_store.initialized_validators();
|
|
||||||
let initialized_validators = initialized_validators_lock.read();
|
|
||||||
|
|
||||||
set_gauge(
|
|
||||||
&ENABLED_VALIDATORS_COUNT,
|
|
||||||
initialized_validators.num_enabled() as i64,
|
|
||||||
);
|
|
||||||
set_gauge(
|
|
||||||
&TOTAL_VALIDATORS_COUNT,
|
|
||||||
initialized_validators.num_total() as i64,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(duties_service) = &shared.duties_service {
|
if let Some(duties_service) = &shared.duties_service {
|
||||||
if let Some(slot) = duties_service.slot_clock.now() {
|
if let Some(slot) = duties_service.slot_clock.now() {
|
||||||
let current_epoch = slot.epoch(T::slots_per_epoch());
|
let current_epoch = slot.epoch(T::slots_per_epoch());
|
||||||
|
@ -14,6 +14,7 @@ use account_utils::{
|
|||||||
ZeroizeString,
|
ZeroizeString,
|
||||||
};
|
};
|
||||||
use eth2_keystore::Keystore;
|
use eth2_keystore::Keystore;
|
||||||
|
use lighthouse_metrics::set_gauge;
|
||||||
use lockfile::{Lockfile, LockfileError};
|
use lockfile::{Lockfile, LockfileError};
|
||||||
use slog::{debug, error, info, warn, Logger};
|
use slog::{debug, error, info, warn, Logger};
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
@ -609,6 +610,16 @@ impl InitializedValidators {
|
|||||||
} else {
|
} else {
|
||||||
debug!(log, "Key cache not modified");
|
debug!(log, "Key cache not modified");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Update the enabled and total validator counts
|
||||||
|
set_gauge(
|
||||||
|
&crate::http_metrics::metrics::ENABLED_VALIDATORS_COUNT,
|
||||||
|
self.num_enabled() as i64,
|
||||||
|
);
|
||||||
|
set_gauge(
|
||||||
|
&crate::http_metrics::metrics::TOTAL_VALIDATORS_COUNT,
|
||||||
|
self.num_total() as i64,
|
||||||
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -17,6 +17,8 @@ pub mod http_api;
|
|||||||
|
|
||||||
pub use cli::cli_app;
|
pub use cli::cli_app;
|
||||||
pub use config::Config;
|
pub use config::Config;
|
||||||
|
use lighthouse_metrics::set_gauge;
|
||||||
|
use monitoring_api::{MonitoringHttpClient, ProcessType};
|
||||||
|
|
||||||
use crate::beacon_node_fallback::{
|
use crate::beacon_node_fallback::{
|
||||||
start_fallback_updater_service, BeaconNodeFallback, CandidateBeaconNode, RequireSynced,
|
start_fallback_updater_service, BeaconNodeFallback, CandidateBeaconNode, RequireSynced,
|
||||||
@ -125,6 +127,17 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
|
|||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Start the explorer client which periodically sends validator process
|
||||||
|
// and system metrics to the configured endpoint.
|
||||||
|
if let Some(monitoring_config) = &config.monitoring_api {
|
||||||
|
let monitoring_client =
|
||||||
|
MonitoringHttpClient::new(monitoring_config, context.log().clone())?;
|
||||||
|
monitoring_client.auto_update(
|
||||||
|
context.executor.clone(),
|
||||||
|
vec![ProcessType::Validator, ProcessType::System],
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
let mut validator_defs = ValidatorDefinitions::open_or_create(&config.validator_dir)
|
let mut validator_defs = ValidatorDefinitions::open_or_create(&config.validator_dir)
|
||||||
.map_err(|e| format!("Unable to open or create validator definitions: {:?}", e))?;
|
.map_err(|e| format!("Unable to open or create validator definitions: {:?}", e))?;
|
||||||
|
|
||||||
@ -225,10 +238,19 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
|
|||||||
})
|
})
|
||||||
.collect::<Result<Vec<BeaconNodeHttpClient>, String>>()?;
|
.collect::<Result<Vec<BeaconNodeHttpClient>, String>>()?;
|
||||||
|
|
||||||
|
let num_nodes = beacon_nodes.len();
|
||||||
let candidates = beacon_nodes
|
let candidates = beacon_nodes
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(CandidateBeaconNode::new)
|
.map(CandidateBeaconNode::new)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
// Set the count for beacon node fallbacks excluding the primary beacon node
|
||||||
|
set_gauge(
|
||||||
|
&http_metrics::metrics::ETH2_FALLBACK_CONFIGURED,
|
||||||
|
num_nodes.saturating_sub(1) as i64,
|
||||||
|
);
|
||||||
|
// Initialize the number of connected, synced fallbacks to 0.
|
||||||
|
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 0);
|
||||||
let mut beacon_nodes: BeaconNodeFallback<_, T> =
|
let mut beacon_nodes: BeaconNodeFallback<_, T> =
|
||||||
BeaconNodeFallback::new(candidates, context.eth2_config.spec.clone(), log.clone());
|
BeaconNodeFallback::new(candidates, context.eth2_config.spec.clone(), log.clone());
|
||||||
|
|
||||||
@ -409,7 +431,7 @@ async fn init_from_beacon_node<E: EthSpec>(
|
|||||||
loop {
|
loop {
|
||||||
beacon_nodes.update_unready_candidates().await;
|
beacon_nodes.update_unready_candidates().await;
|
||||||
let num_available = beacon_nodes.num_available().await;
|
let num_available = beacon_nodes.num_available().await;
|
||||||
let num_total = beacon_nodes.num_total().await;
|
let num_total = beacon_nodes.num_total();
|
||||||
if num_available > 0 {
|
if num_available > 0 {
|
||||||
info!(
|
info!(
|
||||||
context.log(),
|
context.log(),
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
|
use crate::http_metrics;
|
||||||
use crate::{DutiesService, ProductionValidatorClient};
|
use crate::{DutiesService, ProductionValidatorClient};
|
||||||
|
use lighthouse_metrics::set_gauge;
|
||||||
use slog::{error, info, Logger};
|
use slog::{error, info, Logger};
|
||||||
use slot_clock::SlotClock;
|
use slot_clock::SlotClock;
|
||||||
use tokio::time::{sleep, Duration};
|
use tokio::time::{sleep, Duration};
|
||||||
@ -39,7 +41,7 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
|
|||||||
) {
|
) {
|
||||||
let num_available = duties_service.beacon_nodes.num_available().await;
|
let num_available = duties_service.beacon_nodes.num_available().await;
|
||||||
let num_synced = duties_service.beacon_nodes.num_synced().await;
|
let num_synced = duties_service.beacon_nodes.num_synced().await;
|
||||||
let num_total = duties_service.beacon_nodes.num_total().await;
|
let num_total = duties_service.beacon_nodes.num_total();
|
||||||
if num_synced > 0 {
|
if num_synced > 0 {
|
||||||
info!(
|
info!(
|
||||||
log,
|
log,
|
||||||
@ -57,6 +59,12 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
|
|||||||
"synced" => num_synced,
|
"synced" => num_synced,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
let num_synced_fallback = duties_service.beacon_nodes.num_synced_fallback().await;
|
||||||
|
if num_synced_fallback > 0 {
|
||||||
|
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 1);
|
||||||
|
} else {
|
||||||
|
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 0);
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(slot) = duties_service.slot_clock.now() {
|
if let Some(slot) = duties_service.slot_clock.now() {
|
||||||
let epoch = slot.epoch(E::slots_per_epoch());
|
let epoch = slot.epoch(E::slots_per_epoch());
|
||||||
|
Loading…
Reference in New Issue
Block a user