7dc100714d
This PR adds counter metrics for system-wide CPU usage and for the Geth process's own CPU usage. Currently the only metrics available for these items are gauges. Gauges are fine when the consumer scrapes metrics data at the same interval as Geth produces new values (every 3 seconds), but it is likely that most consumers will not scrape that often; intervals of 10, 15, or maybe even 30 seconds are probably more common. The problem, then, is how the consumer estimates what the CPU was doing in between scrapes. With a counter, it's easy: subtract two successive values and divide by the elapsed time to get an accurate average. With a gauge, you can't do that. A gauge reading is an instantaneous picture of what was happening at that moment, and it tells you nothing about what went on between scrapes, so averaging successive gauge readings is meaningless.
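To make that consumer-side math concrete, here is a minimal sketch (the totals and the 15-second scrape interval below are invented for illustration; they are not Geth output). It shows how two successive samples of a CPU-time counter yield a true average load for the whole interval, which two gauge samples cannot provide:

package main

import "fmt"

func main() {
	// Two successive scrapes of a counter that accumulates CPU seconds,
	// taken 15 seconds apart (hypothetical values).
	prevTotal, currTotal := 120.0, 132.0 // CPU-seconds at t0 and t0+15s
	interval := 15.0                     // seconds between scrapes

	// Counter: the delta divided by the elapsed time is the average load
	// over the interval; here 12s of CPU over 15s of wall time = 80%.
	avgLoad := (currTotal - prevTotal) / interval * 100
	fmt.Printf("average CPU load over the interval: %.0f%%\n", avgLoad)

	// Gauge: each scrape only captures the instantaneous load at that moment,
	// so averaging two point-in-time readings says nothing about the 15
	// seconds in between.
}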
221 lines · 8.2 KiB · Go
// Go port of Coda Hale's Metrics library
//
// <https://github.com/rcrowley/go-metrics>
//
// Coda Hale's original work: <https://github.com/codahale/metrics>
package metrics

import (
	"os"
	"runtime/metrics"
	"runtime/pprof"
	"strings"
	"time"

	"github.com/ethereum/go-ethereum/log"
)

// Enabled is checked by the constructor functions for all of the
// standard metrics. If it is false, the metric returned is a stub.
//
// This global kill-switch helps quantify the observer effect and makes
// for less cluttered pprof profiles.
var Enabled = false

// EnabledExpensive is a soft-flag meant for external packages to check if costly
// metrics gathering is allowed or not. The goal is to separate standard metrics
// for health monitoring and debug metrics that might impact runtime performance.
var EnabledExpensive = false

// enablerFlags are the CLI flag names to use to enable metrics collection.
var enablerFlags = []string{"metrics"}

// expensiveEnablerFlags are the CLI flag names to use to enable expensive metrics collection.
var expensiveEnablerFlags = []string{"metrics.expensive"}

// Init enables or disables the metrics system. Since we need this to run before
// any other code gets to create meters and timers, we'll actually do an ugly hack
// and peek into the command line args for the metrics flag.
func init() {
	for _, arg := range os.Args {
		flag := strings.TrimLeft(arg, "-")

		for _, enabler := range enablerFlags {
			if !Enabled && flag == enabler {
				log.Info("Enabling metrics collection")
				Enabled = true
			}
		}
		for _, enabler := range expensiveEnablerFlags {
			if !EnabledExpensive && flag == enabler {
				log.Info("Enabling expensive metrics collection")
				EnabledExpensive = true
			}
		}
	}
}

var threadCreateProfile = pprof.Lookup("threadcreate")

type runtimeStats struct {
	GCPauses     *metrics.Float64Histogram
	GCAllocBytes uint64
	GCFreedBytes uint64

	MemTotal     uint64
	HeapObjects  uint64
	HeapFree     uint64
	HeapReleased uint64
	HeapUnused   uint64

	Goroutines   uint64
	SchedLatency *metrics.Float64Histogram
}

var runtimeSamples = []metrics.Sample{
	{Name: "/gc/pauses:seconds"}, // histogram
	{Name: "/gc/heap/allocs:bytes"},
	{Name: "/gc/heap/frees:bytes"},
	{Name: "/memory/classes/total:bytes"},
	{Name: "/memory/classes/heap/objects:bytes"},
	{Name: "/memory/classes/heap/free:bytes"},
	{Name: "/memory/classes/heap/released:bytes"},
	{Name: "/memory/classes/heap/unused:bytes"},
	{Name: "/sched/goroutines:goroutines"},
	{Name: "/sched/latencies:seconds"}, // histogram
}

func readRuntimeStats(v *runtimeStats) {
	metrics.Read(runtimeSamples)
	for _, s := range runtimeSamples {
		// Skip invalid/unknown metrics. This is needed because some metrics
		// are unavailable in older Go versions, and attempting to read a 'bad'
		// metric panics.
		if s.Value.Kind() == metrics.KindBad {
			continue
		}

		switch s.Name {
		case "/gc/pauses:seconds":
			v.GCPauses = s.Value.Float64Histogram()
		case "/gc/heap/allocs:bytes":
			v.GCAllocBytes = s.Value.Uint64()
		case "/gc/heap/frees:bytes":
			v.GCFreedBytes = s.Value.Uint64()
		case "/memory/classes/total:bytes":
			v.MemTotal = s.Value.Uint64()
		case "/memory/classes/heap/objects:bytes":
			v.HeapObjects = s.Value.Uint64()
		case "/memory/classes/heap/free:bytes":
			v.HeapFree = s.Value.Uint64()
		case "/memory/classes/heap/released:bytes":
			v.HeapReleased = s.Value.Uint64()
		case "/memory/classes/heap/unused:bytes":
			v.HeapUnused = s.Value.Uint64()
		case "/sched/goroutines:goroutines":
			v.Goroutines = s.Value.Uint64()
		case "/sched/latencies:seconds":
			v.SchedLatency = s.Value.Float64Histogram()
		}
	}
}

// CollectProcessMetrics periodically collects various metrics about the running process.
func CollectProcessMetrics(refresh time.Duration) {
	// Short circuit if the metrics system is disabled
	if !Enabled {
		return
	}

	// Create the various data collectors
	var (
		cpustats  = make([]CPUStats, 2)
		diskstats = make([]DiskStats, 2)
		rstats    = make([]runtimeStats, 2)
	)

	// This scale factor is used for the runtime's time metrics. It's useful to convert to
	// ns here because the runtime gives times in float seconds, but runtimeHistogram can
	// only provide integers for the minimum and maximum values.
	const secondsToNs = float64(time.Second)

	// Define the various metrics to collect
	var (
		cpuSysLoad            = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry)
		cpuSysWait            = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry)
		cpuProcLoad           = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry)
		cpuSysLoadTotal       = GetOrRegisterCounterFloat64("system/cpu/sysload/total", DefaultRegistry)
		cpuSysWaitTotal       = GetOrRegisterCounterFloat64("system/cpu/syswait/total", DefaultRegistry)
		cpuProcLoadTotal      = GetOrRegisterCounterFloat64("system/cpu/procload/total", DefaultRegistry)
		cpuThreads            = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry)
		cpuGoroutines         = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry)
		cpuSchedLatency       = getOrRegisterRuntimeHistogram("system/cpu/schedlatency", secondsToNs, nil)
		memPauses             = getOrRegisterRuntimeHistogram("system/memory/pauses", secondsToNs, nil)
		memAllocs             = GetOrRegisterMeter("system/memory/allocs", DefaultRegistry)
		memFrees              = GetOrRegisterMeter("system/memory/frees", DefaultRegistry)
		memTotal              = GetOrRegisterGauge("system/memory/held", DefaultRegistry)
		heapUsed              = GetOrRegisterGauge("system/memory/used", DefaultRegistry)
		heapObjects           = GetOrRegisterGauge("system/memory/objects", DefaultRegistry)
		diskReads             = GetOrRegisterMeter("system/disk/readcount", DefaultRegistry)
		diskReadBytes         = GetOrRegisterMeter("system/disk/readdata", DefaultRegistry)
		diskReadBytesCounter  = GetOrRegisterCounter("system/disk/readbytes", DefaultRegistry)
		diskWrites            = GetOrRegisterMeter("system/disk/writecount", DefaultRegistry)
		diskWriteBytes        = GetOrRegisterMeter("system/disk/writedata", DefaultRegistry)
		diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry)
	)

	var lastCollectTime time.Time

	// Iterate loading the different stats and updating the meters.
	now, prev := 0, 1
	for ; ; now, prev = prev, now {
		// Gather CPU times.
		ReadCPUStats(&cpustats[now])
		collectTime := time.Now()
		secondsSinceLastCollect := collectTime.Sub(lastCollectTime).Seconds()
		lastCollectTime = collectTime
		if secondsSinceLastCollect > 0 {
			sysLoad := cpustats[now].GlobalTime - cpustats[prev].GlobalTime
			sysWait := cpustats[now].GlobalWait - cpustats[prev].GlobalWait
			procLoad := cpustats[now].LocalTime - cpustats[prev].LocalTime
			// Convert to integer percentage.
			cpuSysLoad.Update(int64(sysLoad / secondsSinceLastCollect * 100))
			cpuSysWait.Update(int64(sysWait / secondsSinceLastCollect * 100))
			cpuProcLoad.Update(int64(procLoad / secondsSinceLastCollect * 100))
			// Accumulate the raw CPU time deltas (in seconds) into the total counters.
			cpuSysLoadTotal.Inc(sysLoad)
			cpuSysWaitTotal.Inc(sysWait)
			cpuProcLoadTotal.Inc(procLoad)
		}

		// Threads
		cpuThreads.Update(int64(threadCreateProfile.Count()))

		// Go runtime metrics
		readRuntimeStats(&rstats[now])

		cpuGoroutines.Update(int64(rstats[now].Goroutines))
		cpuSchedLatency.update(rstats[now].SchedLatency)
		memPauses.update(rstats[now].GCPauses)

		memAllocs.Mark(int64(rstats[now].GCAllocBytes - rstats[prev].GCAllocBytes))
		memFrees.Mark(int64(rstats[now].GCFreedBytes - rstats[prev].GCFreedBytes))

		memTotal.Update(int64(rstats[now].MemTotal))
		heapUsed.Update(int64(rstats[now].MemTotal - rstats[now].HeapUnused - rstats[now].HeapFree - rstats[now].HeapReleased))
		heapObjects.Update(int64(rstats[now].HeapObjects))

		// Disk
		if ReadDiskStats(&diskstats[now]) == nil {
			diskReads.Mark(diskstats[now].ReadCount - diskstats[prev].ReadCount)
			diskReadBytes.Mark(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
			diskWrites.Mark(diskstats[now].WriteCount - diskstats[prev].WriteCount)
			diskWriteBytes.Mark(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
			diskReadBytesCounter.Inc(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
			diskWriteBytesCounter.Inc(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
		}

		time.Sleep(refresh)
	}
}
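For completeness, here is a hedged, in-process sketch of how the new totals could be consumed without an external scraper. It assumes the CounterFloat64 returned by GetOrRegisterCounterFloat64 exposes a Count() float64 accessor and that metrics collection has been enabled; the 15-second polling interval is illustrative only, not something this PR prescribes.

package main

import (
	"fmt"
	"time"

	"github.com/ethereum/go-ethereum/metrics"
)

func main() {
	// Look up the same counter that CollectProcessMetrics registers.
	// (Assumes metrics.Enabled is true; otherwise a no-op stub is returned.)
	procTotal := metrics.GetOrRegisterCounterFloat64("system/cpu/procload/total", metrics.DefaultRegistry)

	prev, prevTime := procTotal.Count(), time.Now()
	for range time.Tick(15 * time.Second) { // illustrative "scrape" interval
		curr, currTime := procTotal.Count(), time.Now()
		// Delta of CPU seconds over elapsed wall-clock seconds, as a percentage.
		avg := (curr - prev) / currTime.Sub(prevTime).Seconds() * 100
		fmt.Printf("average Geth CPU load: %.1f%%\n", avg)
		prev, prevTime = curr, currTime
	}
}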