544e4a700b
This PR changes metrics collection to actually measure the time interval between collections, rather than assume 3 seconds. I did some ad hoc profiling, and on slower hardware (eg, my Raspberry Pi 4) I routinely saw intervals between 3.3 - 3.5 seconds, with some being as high as 4.5 seconds. This will generally cause the CPU gauge readings to be too high, and in some cases can cause impossibly large values for the CPU load metrics (eg. greater than 400 for a 4 core CPU). --------- Co-authored-by: Felix Lange <fjl@twurst.com>
214 lines
7.8 KiB
Go
214 lines
7.8 KiB
Go
// Go port of Coda Hale's Metrics library
|
|
//
|
|
// <https://github.com/rcrowley/go-metrics>
|
|
//
|
|
// Coda Hale's original work: <https://github.com/codahale/metrics>
|
|
package metrics
|
|
|
|
import (
|
|
"os"
|
|
"runtime/metrics"
|
|
"runtime/pprof"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/ethereum/go-ethereum/log"
|
|
)
|
|
|
|
// Enabled is checked by the constructor functions for all of the
|
|
// standard metrics. If it is true, the metric returned is a stub.
|
|
//
|
|
// This global kill-switch helps quantify the observer effect and makes
|
|
// for less cluttered pprof profiles.
|
|
var Enabled = false
|
|
|
|
// EnabledExpensive is a soft-flag meant for external packages to check if costly
|
|
// metrics gathering is allowed or not. The goal is to separate standard metrics
|
|
// for health monitoring and debug metrics that might impact runtime performance.
|
|
var EnabledExpensive = false
|
|
|
|
// enablerFlags is the CLI flag names to use to enable metrics collections.
|
|
var enablerFlags = []string{"metrics"}
|
|
|
|
// expensiveEnablerFlags is the CLI flag names to use to enable metrics collections.
|
|
var expensiveEnablerFlags = []string{"metrics.expensive"}
|
|
|
|
// Init enables or disables the metrics system. Since we need this to run before
|
|
// any other code gets to create meters and timers, we'll actually do an ugly hack
|
|
// and peek into the command line args for the metrics flag.
|
|
func init() {
|
|
for _, arg := range os.Args {
|
|
flag := strings.TrimLeft(arg, "-")
|
|
|
|
for _, enabler := range enablerFlags {
|
|
if !Enabled && flag == enabler {
|
|
log.Info("Enabling metrics collection")
|
|
Enabled = true
|
|
}
|
|
}
|
|
for _, enabler := range expensiveEnablerFlags {
|
|
if !EnabledExpensive && flag == enabler {
|
|
log.Info("Enabling expensive metrics collection")
|
|
EnabledExpensive = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
var threadCreateProfile = pprof.Lookup("threadcreate")
|
|
|
|
type runtimeStats struct {
|
|
GCPauses *metrics.Float64Histogram
|
|
GCAllocBytes uint64
|
|
GCFreedBytes uint64
|
|
|
|
MemTotal uint64
|
|
HeapObjects uint64
|
|
HeapFree uint64
|
|
HeapReleased uint64
|
|
HeapUnused uint64
|
|
|
|
Goroutines uint64
|
|
SchedLatency *metrics.Float64Histogram
|
|
}
|
|
|
|
var runtimeSamples = []metrics.Sample{
|
|
{Name: "/gc/pauses:seconds"}, // histogram
|
|
{Name: "/gc/heap/allocs:bytes"},
|
|
{Name: "/gc/heap/frees:bytes"},
|
|
{Name: "/memory/classes/total:bytes"},
|
|
{Name: "/memory/classes/heap/objects:bytes"},
|
|
{Name: "/memory/classes/heap/free:bytes"},
|
|
{Name: "/memory/classes/heap/released:bytes"},
|
|
{Name: "/memory/classes/heap/unused:bytes"},
|
|
{Name: "/sched/goroutines:goroutines"},
|
|
{Name: "/sched/latencies:seconds"}, // histogram
|
|
}
|
|
|
|
func readRuntimeStats(v *runtimeStats) {
|
|
metrics.Read(runtimeSamples)
|
|
for _, s := range runtimeSamples {
|
|
// Skip invalid/unknown metrics. This is needed because some metrics
|
|
// are unavailable in older Go versions, and attempting to read a 'bad'
|
|
// metric panics.
|
|
if s.Value.Kind() == metrics.KindBad {
|
|
continue
|
|
}
|
|
|
|
switch s.Name {
|
|
case "/gc/pauses:seconds":
|
|
v.GCPauses = s.Value.Float64Histogram()
|
|
case "/gc/heap/allocs:bytes":
|
|
v.GCAllocBytes = s.Value.Uint64()
|
|
case "/gc/heap/frees:bytes":
|
|
v.GCFreedBytes = s.Value.Uint64()
|
|
case "/memory/classes/total:bytes":
|
|
v.MemTotal = s.Value.Uint64()
|
|
case "/memory/classes/heap/objects:bytes":
|
|
v.HeapObjects = s.Value.Uint64()
|
|
case "/memory/classes/heap/free:bytes":
|
|
v.HeapFree = s.Value.Uint64()
|
|
case "/memory/classes/heap/released:bytes":
|
|
v.HeapReleased = s.Value.Uint64()
|
|
case "/memory/classes/heap/unused:bytes":
|
|
v.HeapUnused = s.Value.Uint64()
|
|
case "/sched/goroutines:goroutines":
|
|
v.Goroutines = s.Value.Uint64()
|
|
case "/sched/latencies:seconds":
|
|
v.SchedLatency = s.Value.Float64Histogram()
|
|
}
|
|
}
|
|
}
|
|
|
|
// CollectProcessMetrics periodically collects various metrics about the running process.
|
|
func CollectProcessMetrics(refresh time.Duration) {
|
|
// Short circuit if the metrics system is disabled
|
|
if !Enabled {
|
|
return
|
|
}
|
|
|
|
// Create the various data collectors
|
|
var (
|
|
cpustats = make([]CPUStats, 2)
|
|
diskstats = make([]DiskStats, 2)
|
|
rstats = make([]runtimeStats, 2)
|
|
)
|
|
|
|
// This scale factor is used for the runtime's time metrics. It's useful to convert to
|
|
// ns here because the runtime gives times in float seconds, but runtimeHistogram can
|
|
// only provide integers for the minimum and maximum values.
|
|
const secondsToNs = float64(time.Second)
|
|
|
|
// Define the various metrics to collect
|
|
var (
|
|
cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry)
|
|
cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry)
|
|
cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry)
|
|
cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry)
|
|
cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry)
|
|
cpuSchedLatency = getOrRegisterRuntimeHistogram("system/cpu/schedlatency", secondsToNs, nil)
|
|
memPauses = getOrRegisterRuntimeHistogram("system/memory/pauses", secondsToNs, nil)
|
|
memAllocs = GetOrRegisterMeter("system/memory/allocs", DefaultRegistry)
|
|
memFrees = GetOrRegisterMeter("system/memory/frees", DefaultRegistry)
|
|
memTotal = GetOrRegisterGauge("system/memory/held", DefaultRegistry)
|
|
heapUsed = GetOrRegisterGauge("system/memory/used", DefaultRegistry)
|
|
heapObjects = GetOrRegisterGauge("system/memory/objects", DefaultRegistry)
|
|
diskReads = GetOrRegisterMeter("system/disk/readcount", DefaultRegistry)
|
|
diskReadBytes = GetOrRegisterMeter("system/disk/readdata", DefaultRegistry)
|
|
diskReadBytesCounter = GetOrRegisterCounter("system/disk/readbytes", DefaultRegistry)
|
|
diskWrites = GetOrRegisterMeter("system/disk/writecount", DefaultRegistry)
|
|
diskWriteBytes = GetOrRegisterMeter("system/disk/writedata", DefaultRegistry)
|
|
diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry)
|
|
)
|
|
|
|
var lastCollectTime time.Time
|
|
|
|
// Iterate loading the different stats and updating the meters.
|
|
now, prev := 0, 1
|
|
for ; ; now, prev = prev, now {
|
|
// Gather CPU times.
|
|
ReadCPUStats(&cpustats[now])
|
|
collectTime := time.Now()
|
|
secondsSinceLastCollect := collectTime.Sub(lastCollectTime).Seconds()
|
|
lastCollectTime = collectTime
|
|
if secondsSinceLastCollect > 0 {
|
|
sysLoad := (cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / secondsSinceLastCollect
|
|
sysWait := (cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / secondsSinceLastCollect
|
|
procLoad := (cpustats[now].LocalTime - cpustats[prev].LocalTime) / secondsSinceLastCollect
|
|
// Convert to integer percentage.
|
|
cpuSysLoad.Update(int64(sysLoad * 100))
|
|
cpuSysWait.Update(int64(sysWait * 100))
|
|
cpuProcLoad.Update(int64(procLoad * 100))
|
|
}
|
|
|
|
// Threads
|
|
cpuThreads.Update(int64(threadCreateProfile.Count()))
|
|
|
|
// Go runtime metrics
|
|
readRuntimeStats(&rstats[now])
|
|
|
|
cpuGoroutines.Update(int64(rstats[now].Goroutines))
|
|
cpuSchedLatency.update(rstats[now].SchedLatency)
|
|
memPauses.update(rstats[now].GCPauses)
|
|
|
|
memAllocs.Mark(int64(rstats[now].GCAllocBytes - rstats[prev].GCAllocBytes))
|
|
memFrees.Mark(int64(rstats[now].GCFreedBytes - rstats[prev].GCFreedBytes))
|
|
|
|
memTotal.Update(int64(rstats[now].MemTotal))
|
|
heapUsed.Update(int64(rstats[now].MemTotal - rstats[now].HeapUnused - rstats[now].HeapFree - rstats[now].HeapReleased))
|
|
heapObjects.Update(int64(rstats[now].HeapObjects))
|
|
|
|
// Disk
|
|
if ReadDiskStats(&diskstats[now]) == nil {
|
|
diskReads.Mark(diskstats[now].ReadCount - diskstats[prev].ReadCount)
|
|
diskReadBytes.Mark(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
|
|
diskWrites.Mark(diskstats[now].WriteCount - diskstats[prev].WriteCount)
|
|
diskWriteBytes.Mark(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
|
|
diskReadBytesCounter.Inc(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
|
|
diskWriteBytesCounter.Inc(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
|
|
}
|
|
|
|
time.Sleep(refresh)
|
|
}
|
|
}
|