Add /node/health endpoint (#1197)
* Start adding health endpoint * Use psutil more * Add get_health test * Expose health to Prom * Update comments * Add /node/health to docs * Update Prom naming
This commit is contained in:
parent
58a9f979e0
commit
ea4a52984c
@ -3,6 +3,7 @@ use crate::{ApiError, ApiResult};
|
|||||||
use beacon_chain::{BeaconChain, BeaconChainTypes};
|
use beacon_chain::{BeaconChain, BeaconChainTypes};
|
||||||
use hyper::{Body, Request};
|
use hyper::{Body, Request};
|
||||||
use lighthouse_metrics::{Encoder, TextEncoder};
|
use lighthouse_metrics::{Encoder, TextEncoder};
|
||||||
|
use rest_types::Health;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
@ -36,6 +37,38 @@ lazy_static! {
|
|||||||
"http_server_validator_duties_get_request_duration_seconds",
|
"http_server_validator_duties_get_request_duration_seconds",
|
||||||
"Time taken to respond to GET /validator/duties"
|
"Time taken to respond to GET /validator/duties"
|
||||||
);
|
);
|
||||||
|
pub static ref PROCESS_NUM_THREADS: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"process_num_threads",
|
||||||
|
"Number of threads used by the current process"
|
||||||
|
);
|
||||||
|
pub static ref PROCESS_RES_MEM: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"process_resident_memory_bytes",
|
||||||
|
"Resident memory used by the current process"
|
||||||
|
);
|
||||||
|
pub static ref PROCESS_VIRT_MEM: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"process_virtual_memory_bytes",
|
||||||
|
"Virtual memory used by the current process"
|
||||||
|
);
|
||||||
|
pub static ref SYSTEM_VIRT_MEM_TOTAL: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("system_virt_mem_total_bytes", "Total system virtual memory");
|
||||||
|
pub static ref SYSTEM_VIRT_MEM_AVAILABLE: Result<IntGauge> = try_create_int_gauge(
|
||||||
|
"system_virt_mem_available_bytes",
|
||||||
|
"Available system virtual memory"
|
||||||
|
);
|
||||||
|
pub static ref SYSTEM_VIRT_MEM_USED: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("system_virt_mem_used_bytes", "Used system virtual memory");
|
||||||
|
pub static ref SYSTEM_VIRT_MEM_FREE: Result<IntGauge> =
|
||||||
|
try_create_int_gauge("system_virt_mem_free_bytes", "Free system virtual memory");
|
||||||
|
pub static ref SYSTEM_VIRT_MEM_PERCENTAGE: Result<Gauge> = try_create_float_gauge(
|
||||||
|
"system_virt_mem_percentage",
|
||||||
|
"Percentage of used virtual memory"
|
||||||
|
);
|
||||||
|
pub static ref SYSTEM_LOADAVG_1: Result<Gauge> =
|
||||||
|
try_create_float_gauge("system_loadavg_1", "Loadavg over 1 minute");
|
||||||
|
pub static ref SYSTEM_LOADAVG_5: Result<Gauge> =
|
||||||
|
try_create_float_gauge("system_loadavg_5", "Loadavg over 5 minutes");
|
||||||
|
pub static ref SYSTEM_LOADAVG_15: Result<Gauge> =
|
||||||
|
try_create_float_gauge("system_loadavg_15", "Loadavg over 15 minutes");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the full set of Prometheus metrics for the Beacon Node application.
|
/// Returns the full set of Prometheus metrics for the Beacon Node application.
|
||||||
@ -72,6 +105,26 @@ pub fn get_prometheus<T: BeaconChainTypes>(
|
|||||||
store::scrape_for_metrics(&db_path, &freezer_db_path);
|
store::scrape_for_metrics(&db_path, &freezer_db_path);
|
||||||
beacon_chain::scrape_for_metrics(&beacon_chain);
|
beacon_chain::scrape_for_metrics(&beacon_chain);
|
||||||
|
|
||||||
|
if let Ok(health) = Health::observe() {
|
||||||
|
set_gauge(&PROCESS_NUM_THREADS, health.pid_num_threads as i64);
|
||||||
|
set_gauge(&PROCESS_RES_MEM, health.pid_mem_resident_set_size as i64);
|
||||||
|
set_gauge(&PROCESS_VIRT_MEM, health.pid_mem_virtual_memory_size as i64);
|
||||||
|
set_gauge(&SYSTEM_VIRT_MEM_TOTAL, health.sys_virt_mem_total as i64);
|
||||||
|
set_gauge(
|
||||||
|
&SYSTEM_VIRT_MEM_AVAILABLE,
|
||||||
|
health.sys_virt_mem_available as i64,
|
||||||
|
);
|
||||||
|
set_gauge(&SYSTEM_VIRT_MEM_USED, health.sys_virt_mem_used as i64);
|
||||||
|
set_gauge(&SYSTEM_VIRT_MEM_FREE, health.sys_virt_mem_free as i64);
|
||||||
|
set_float_gauge(
|
||||||
|
&SYSTEM_VIRT_MEM_PERCENTAGE,
|
||||||
|
health.sys_virt_mem_percent as f64,
|
||||||
|
);
|
||||||
|
set_float_gauge(&SYSTEM_LOADAVG_1, health.sys_loadavg_1);
|
||||||
|
set_float_gauge(&SYSTEM_LOADAVG_5, health.sys_loadavg_5);
|
||||||
|
set_float_gauge(&SYSTEM_LOADAVG_15, health.sys_loadavg_15);
|
||||||
|
}
|
||||||
|
|
||||||
encoder
|
encoder
|
||||||
.encode(&lighthouse_metrics::gather(), &mut buffer)
|
.encode(&lighthouse_metrics::gather(), &mut buffer)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
use crate::response_builder::ResponseBuilder;
|
use crate::response_builder::ResponseBuilder;
|
||||||
use crate::ApiResult;
|
use crate::{ApiError, ApiResult};
|
||||||
use eth2_libp2p::{types::SyncState, NetworkGlobals};
|
use eth2_libp2p::{types::SyncState, NetworkGlobals};
|
||||||
use hyper::{Body, Request};
|
use hyper::{Body, Request};
|
||||||
use rest_types::{SyncingResponse, SyncingStatus};
|
use rest_types::{Health, SyncingResponse, SyncingStatus};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use types::{EthSpec, Slot};
|
use types::{EthSpec, Slot};
|
||||||
use version;
|
use version;
|
||||||
@ -41,3 +41,9 @@ pub fn syncing<T: EthSpec>(
|
|||||||
sync_status,
|
sync_status,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_health(req: Request<Body>) -> ApiResult {
|
||||||
|
let health = Health::observe().map_err(|e| ApiError::ServerError(e))?;
|
||||||
|
|
||||||
|
ResponseBuilder::new(&req)?.body_no_ssz(&health)
|
||||||
|
}
|
||||||
|
@ -33,6 +33,7 @@ pub async fn route<T: BeaconChainTypes>(
|
|||||||
let log = local_log.clone();
|
let log = local_log.clone();
|
||||||
let request_result = match (req.method(), path.as_ref()) {
|
let request_result = match (req.method(), path.as_ref()) {
|
||||||
// Methods for Client
|
// Methods for Client
|
||||||
|
(&Method::GET, "/node/health") => node::get_health(req),
|
||||||
(&Method::GET, "/node/version") => node::get_version(req),
|
(&Method::GET, "/node/version") => node::get_version(req),
|
||||||
(&Method::GET, "/node/syncing") => {
|
(&Method::GET, "/node/syncing") => {
|
||||||
// inform the current slot, or set to 0
|
// inform the current slot, or set to 0
|
||||||
|
@ -1252,3 +1252,15 @@ mod validator_attestation {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Spins up a test node and checks that the health endpoint answers successfully.
///
/// Only success of the request is asserted; the returned values are not inspected.
#[test]
fn get_health() {
    let mut env = build_env();

    let node = build_node(&mut env, testing_client_config());
    let remote_node = node.remote_node().expect("should produce remote node");

    // Use `expect` (like the line above) so a failure names the step that broke.
    env.runtime()
        .block_on(remote_node.http.node().get_health())
        .expect("should get health from remote node");
}
|
||||||
|
@ -55,3 +55,36 @@ Typical Responses | 200
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## `/node/health`
|
||||||
|
|
||||||
|
Requests information about the health of the beacon node.
|
||||||
|
|
||||||
|
### HTTP Specification
|
||||||
|
|
||||||
|
| Property | Specification |
|
||||||
|
| --- |--- |
|
||||||
|
Path | `/node/health`
|
||||||
|
Method | GET
|
||||||
|
JSON Encoding | Object
|
||||||
|
Query Parameters | None
|
||||||
|
Typical Responses | 200
|
||||||
|
|
||||||
|
### Example Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"pid": 96160,
|
||||||
|
"pid_num_threads": 30,
|
||||||
|
"pid_mem_resident_set_size": 55476224,
|
||||||
|
"pid_mem_virtual_memory_size": 2081382400,
|
||||||
|
"sys_virt_mem_total": 16721076224,
|
||||||
|
"sys_virt_mem_available": 7423197184,
|
||||||
|
"sys_virt_mem_used": 8450183168,
|
||||||
|
"sys_virt_mem_free": 3496345600,
|
||||||
|
"sys_virt_mem_percent": 55.605743,
|
||||||
|
"sys_loadavg_1": 1.56,
|
||||||
|
"sys_loadavg_5": 2.61,
|
||||||
|
"sys_loadavg_15": 2.43
|
||||||
|
}
|
||||||
|
```
|
||||||
|
@ -19,7 +19,7 @@ use url::Url;
|
|||||||
pub use operation_pool::PersistedOperationPool;
|
pub use operation_pool::PersistedOperationPool;
|
||||||
pub use proto_array_fork_choice::core::ProtoArray;
|
pub use proto_array_fork_choice::core::ProtoArray;
|
||||||
pub use rest_types::{
|
pub use rest_types::{
|
||||||
CanonicalHeadResponse, Committee, HeadBeaconBlock, IndividualVotesRequest,
|
CanonicalHeadResponse, Committee, HeadBeaconBlock, Health, IndividualVotesRequest,
|
||||||
IndividualVotesResponse, SyncingResponse, ValidatorDutiesRequest, ValidatorDutyBytes,
|
IndividualVotesResponse, SyncingResponse, ValidatorDutiesRequest, ValidatorDutyBytes,
|
||||||
ValidatorRequest, ValidatorResponse, ValidatorSubscription,
|
ValidatorRequest, ValidatorResponse, ValidatorSubscription,
|
||||||
};
|
};
|
||||||
@ -612,6 +612,12 @@ impl<E: EthSpec> Node<E> {
|
|||||||
client.json_get(url, vec![]).await
|
client.json_get(url, vec![]).await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Fetches the node's `Health` report as JSON from the `health` endpoint.
///
/// Returns an `Error` if the URL cannot be built or the request fails.
pub async fn get_health(&self) -> Result<Health, Error> {
    let http = self.0.clone();
    let endpoint = self.url("health")?;
    http.json_get(endpoint, vec![]).await
}
|
||||||
|
|
||||||
pub async fn syncing_status(&self) -> Result<SyncingResponse, Error> {
|
pub async fn syncing_status(&self) -> Result<SyncingResponse, Error> {
|
||||||
let client = self.0.clone();
|
let client = self.0.clone();
|
||||||
let url = self.url("syncing")?;
|
let url = self.url("syncing")?;
|
||||||
|
@ -14,3 +14,5 @@ state_processing = { path = "../../consensus/state_processing" }
|
|||||||
bls = { path = "../../crypto/bls" }
|
bls = { path = "../../crypto/bls" }
|
||||||
serde = { version = "1.0.110", features = ["derive"] }
|
serde = { version = "1.0.110", features = ["derive"] }
|
||||||
rayon = "1.3.0"
|
rayon = "1.3.0"
|
||||||
|
psutil = "3.1.0"
|
||||||
|
procinfo = "0.4.2"
|
||||||
|
@ -18,4 +18,4 @@ pub use validator::{
|
|||||||
|
|
||||||
pub use consensus::{IndividualVote, IndividualVotesRequest, IndividualVotesResponse};
|
pub use consensus::{IndividualVote, IndividualVotesRequest, IndividualVotesResponse};
|
||||||
|
|
||||||
pub use node::{SyncingResponse, SyncingStatus};
|
pub use node::{Health, SyncingResponse, SyncingStatus};
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
//! Collection of types for the /node HTTP
|
//! Collection of types for the /node HTTP
|
||||||
|
use procinfo::pid;
|
||||||
|
use psutil::process::Process;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use ssz_derive::{Decode, Encode};
|
use ssz_derive::{Decode, Encode};
|
||||||
use types::Slot;
|
use types::Slot;
|
||||||
@ -30,3 +32,65 @@ pub struct SyncingResponse {
|
|||||||
/// The current sync status.
|
/// The current sync status.
|
||||||
pub sync_status: SyncingStatus,
|
pub sync_status: SyncingStatus,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
|
/// Reports on the health of the Lighthouse instance.
|
||||||
|
pub struct Health {
|
||||||
|
/// The pid of this process.
|
||||||
|
pub pid: u32,
|
||||||
|
/// The number of threads used by this pid.
|
||||||
|
pub pid_num_threads: i32,
|
||||||
|
/// The total resident memory used by this pid.
|
||||||
|
pub pid_mem_resident_set_size: u64,
|
||||||
|
/// The total virtual memory used by this pid.
|
||||||
|
pub pid_mem_virtual_memory_size: u64,
|
||||||
|
/// Total virtual memory on the system
|
||||||
|
pub sys_virt_mem_total: u64,
|
||||||
|
/// Total virtual memory available for new processes.
|
||||||
|
pub sys_virt_mem_available: u64,
|
||||||
|
/// Total virtual memory used on the system
|
||||||
|
pub sys_virt_mem_used: u64,
|
||||||
|
/// Total virtual memory not used on the system
|
||||||
|
pub sys_virt_mem_free: u64,
|
||||||
|
/// Percentage of virtual memory used on the system
|
||||||
|
pub sys_virt_mem_percent: f32,
|
||||||
|
/// System load average over 1 minute.
|
||||||
|
pub sys_loadavg_1: f64,
|
||||||
|
/// System load average over 5 minutes.
|
||||||
|
pub sys_loadavg_5: f64,
|
||||||
|
/// System load average over 15 minutes.
|
||||||
|
pub sys_loadavg_15: f64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Health {
|
||||||
|
pub fn observe() -> Result<Self, String> {
|
||||||
|
let process =
|
||||||
|
Process::current().map_err(|e| format!("Unable to get current process: {:?}", e))?;
|
||||||
|
|
||||||
|
let process_mem = process
|
||||||
|
.memory_info()
|
||||||
|
.map_err(|e| format!("Unable to get process memory info: {:?}", e))?;
|
||||||
|
|
||||||
|
let stat = pid::stat_self().map_err(|e| format!("Unable to get stat: {:?}", e))?;
|
||||||
|
|
||||||
|
let vm = psutil::memory::virtual_memory()
|
||||||
|
.map_err(|e| format!("Unable to get virtual memory: {:?}", e))?;
|
||||||
|
let loadavg =
|
||||||
|
psutil::host::loadavg().map_err(|e| format!("Unable to get loadavg: {:?}", e))?;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
pid: process.pid().into(),
|
||||||
|
pid_num_threads: stat.num_threads,
|
||||||
|
pid_mem_resident_set_size: process_mem.rss().into(),
|
||||||
|
pid_mem_virtual_memory_size: process_mem.vms().into(),
|
||||||
|
sys_virt_mem_total: vm.total().into(),
|
||||||
|
sys_virt_mem_available: vm.available().into(),
|
||||||
|
sys_virt_mem_used: vm.used().into(),
|
||||||
|
sys_virt_mem_free: vm.free().into(),
|
||||||
|
sys_virt_mem_percent: vm.percent().into(),
|
||||||
|
sys_loadavg_1: loadavg.one.into(),
|
||||||
|
sys_loadavg_5: loadavg.five.into(),
|
||||||
|
sys_loadavg_15: loadavg.fifteen.into(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user