upgrade to raulk/go-watchdog@v1.0.1
This pulls in the improvements introduced in: - https://github.com/raulk/go-watchdog/releases/tag/v1.0.0 - https://github.com/raulk/go-watchdog/releases/tag/v1.0.1 Lotus tries to initialize the watchdog in the following order of precedence: 1. If a max heap limit has been provided, initialize a heap-driven watchdog. 2. Else, try to initialize a cgroup-driven watchdog. 3. Else, try to initialize a system-driven watchdog. 4. Else, log a warning that the system is flying solo, and return. This PR also enabled automatic heap profile capture when memory usage surpasses 90% of the limit. Profiles are written to <LOTUS_HOME>/heapprof. A single heap profile is captured per episode, with a max of 10 episodes captured during the lifetime of the process. Episode = instance of usage climbing above the 90% threshold.
This commit is contained in:
parent
7a042ab21f
commit
931cfe1ed1
4
go.mod
4
go.mod
@ -14,7 +14,7 @@ require (
|
||||
github.com/buger/goterm v0.0.0-20200322175922-2f3e71b85129
|
||||
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e
|
||||
github.com/cockroachdb/pebble v0.0.0-20201001221639-879f3bfeef07
|
||||
github.com/coreos/go-systemd/v22 v22.0.0
|
||||
github.com/coreos/go-systemd/v22 v22.1.0
|
||||
github.com/detailyang/go-fallocate v0.0.0-20180908115635-432fa640bd2e
|
||||
github.com/dgraph-io/badger/v2 v2.2007.2
|
||||
github.com/docker/go-units v0.4.0
|
||||
@ -124,7 +124,7 @@ require (
|
||||
github.com/polydawn/refmt v0.0.0-20190809202753-05966cbd336a
|
||||
github.com/prometheus/client_golang v1.6.0
|
||||
github.com/raulk/clock v1.1.0
|
||||
github.com/raulk/go-watchdog v0.0.1
|
||||
github.com/raulk/go-watchdog v1.0.1
|
||||
github.com/stretchr/testify v1.6.1
|
||||
github.com/supranational/blst v0.1.1
|
||||
github.com/syndtr/goleveldb v1.0.0
|
||||
|
19
go.sum
19
go.sum
@ -129,6 +129,7 @@ github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5O
|
||||
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
|
||||
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8=
|
||||
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
|
||||
github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs=
|
||||
github.com/clbanning/x2j v0.0.0-20191024224557-825249438eec/go.mod h1:jMjuTZXRI4dUb/I5gc9Hdhagfvm9+RyrPryS/auMzxE=
|
||||
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
|
||||
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
|
||||
@ -145,6 +146,8 @@ github.com/cockroachdb/redact v0.0.0-20200622112456-cd282804bbd3 h1:2+dpIJzYMSbL
|
||||
github.com/cockroachdb/redact v0.0.0-20200622112456-cd282804bbd3/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg=
|
||||
github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd h1:qMd81Ts1T2OTKmB4acZcyKaMtRnY5Y44NuXGX2GFJ1w=
|
||||
github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI=
|
||||
github.com/containerd/cgroups v0.0.0-20201119153540-4cbc285b3327 h1:7grrpcfCtbZLsjtB0DgMuzs1umsJmpzaHMZ6cO6iAWw=
|
||||
github.com/containerd/cgroups v0.0.0-20201119153540-4cbc285b3327/go.mod h1:ZJeTFisyysqgcCdecO57Dj79RfL0LNeGiFUqLYQRYLE=
|
||||
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
|
||||
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
|
||||
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
|
||||
@ -155,8 +158,8 @@ github.com/coreos/go-systemd v0.0.0-20180511133405-39ca1b05acc7/go.mod h1:F5haX7
|
||||
github.com/coreos/go-systemd v0.0.0-20181012123002-c6f51f82210d/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
|
||||
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU=
|
||||
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
|
||||
github.com/coreos/go-systemd/v22 v22.0.0 h1:XJIw/+VlJ+87J+doOxznsAWIdmWuViOVhkQamW5YV28=
|
||||
github.com/coreos/go-systemd/v22 v22.0.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk=
|
||||
github.com/coreos/go-systemd/v22 v22.1.0 h1:kq/SbG2BCKLkDKkjQf5OWwKWUKj1lgs3lFI4PxnR5lg=
|
||||
github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk=
|
||||
github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
|
||||
github.com/cpuguy83/go-md2man v1.0.10 h1:BSKMNlYxDvnunlTymqtgONjNnaRV1sTpcovwwjF22jk=
|
||||
github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE=
|
||||
@ -394,6 +397,8 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw
|
||||
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w=
|
||||
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM=
|
||||
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-github v17.0.0+incompatible/go.mod h1:zLgOLi98H3fifZn+44m+umXrS52loVEgC2AApnigrVQ=
|
||||
github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
@ -1236,6 +1241,8 @@ github.com/onsi/gomega v1.9.0/go.mod h1:Ho0h+IUsWyvy1OpqCwxlQ/21gkhVunqlU8fDGcoT
|
||||
github.com/onsi/gomega v1.10.1 h1:o0+MgICZLuZ7xjH7Vx6zS/zcu93/BEp1VwkIW1mEXCE=
|
||||
github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo=
|
||||
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk=
|
||||
github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0=
|
||||
github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||
github.com/opentracing-contrib/go-grpc v0.0.0-20180928155321-4b5a12d3ff02/go.mod h1:JNdpVEzCpXBgIiv4ds+TzhN1hrtxq6ClLrTlT9OQRSc=
|
||||
github.com/opentracing-contrib/go-grpc v0.0.0-20191001143057-db30781987df h1:vdYtBU6zvL7v+Tr+0xFM/qhahw/EvY8DMMunZHKH6eE=
|
||||
github.com/opentracing-contrib/go-grpc v0.0.0-20191001143057-db30781987df/go.mod h1:DYR5Eij8rJl8h7gblRrOZ8g0kW1umSpKqYIBTgeDtLo=
|
||||
@ -1313,8 +1320,8 @@ github.com/prometheus/procfs v0.1.0 h1:jhMy6QXfi3y2HEzFoyuCj40z4OZIIHHPtFyCMftmv
|
||||
github.com/prometheus/procfs v0.1.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
|
||||
github.com/raulk/clock v1.1.0 h1:dpb29+UKMbLqiU/jqIJptgLR1nn23HLgMY0sTCDza5Y=
|
||||
github.com/raulk/clock v1.1.0/go.mod h1:3MpVxdZ/ODBQDxbN+kzshf5OSZwPjtMDx6BBXBmOeY0=
|
||||
github.com/raulk/go-watchdog v0.0.1 h1:q0ad0fanW8uaLRTvxQ0RfdADBiKa6CL6NMByhB0vpBs=
|
||||
github.com/raulk/go-watchdog v0.0.1/go.mod h1:dIvQcKy0laxuHGda1ms8/2T9wE3ZJRbz9bxEO7c0q1M=
|
||||
github.com/raulk/go-watchdog v1.0.1 h1:qgm3DIJAeb+2byneLrQJ7kvmDLGxN2vy3apXyGaDKN4=
|
||||
github.com/raulk/go-watchdog v1.0.1/go.mod h1:lzSbAl5sh4rtI8tYHU01BWIDzgzqaQLj6RcA1i4mlqI=
|
||||
github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 h1:OdAsTTz6OkFY5QxjkYwrChwuRruF69c169dPK26NUlk=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
@ -1366,6 +1373,8 @@ github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPx
|
||||
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
|
||||
github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=
|
||||
github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
|
||||
github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM=
|
||||
github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
|
||||
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
|
||||
github.com/smartystreets/assertions v1.0.0/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM=
|
||||
github.com/smartystreets/assertions v1.0.1 h1:voD4ITNjPL5jjBfgR/r8fPIIBrliWrWHeiJApdr3r4w=
|
||||
@ -1426,6 +1435,8 @@ github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljT
|
||||
github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA=
|
||||
github.com/urfave/cli v1.22.1 h1:+mkCCcOFKPnCmVYVcURKps1Xe+3zP90gSYGNfRkjoIY=
|
||||
github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
|
||||
github.com/urfave/cli v1.22.2 h1:gsqYFH8bb9ekPA12kRo0hfjngWQjkJPlN9R0N78BoUo=
|
||||
github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
|
||||
github.com/urfave/cli/v2 v2.0.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ=
|
||||
github.com/urfave/cli/v2 v2.2.0 h1:JTTnM6wKzdA0Jqodd966MVj4vWbbquZykeX1sKbe2C4=
|
||||
github.com/urfave/cli/v2 v2.2.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ=
|
||||
|
@ -7,6 +7,7 @@ import (
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/gbrlsnchs/jwt/v3"
|
||||
@ -14,6 +15,7 @@ import (
|
||||
"github.com/libp2p/go-libp2p-core/peer"
|
||||
"github.com/libp2p/go-libp2p-core/peerstore"
|
||||
record "github.com/libp2p/go-libp2p-record"
|
||||
"github.com/raulk/go-watchdog"
|
||||
"go.uber.org/fx"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
@ -28,7 +30,6 @@ import (
|
||||
"github.com/filecoin-project/lotus/node/modules/dtypes"
|
||||
"github.com/filecoin-project/lotus/node/repo"
|
||||
"github.com/filecoin-project/lotus/system"
|
||||
"github.com/raulk/go-watchdog"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -69,45 +70,71 @@ func MemoryConstraints() system.MemoryConstraints {
|
||||
|
||||
// MemoryWatchdog starts the memory watchdog, applying the computed resource
|
||||
// constraints.
|
||||
func MemoryWatchdog(lc fx.Lifecycle, constraints system.MemoryConstraints) {
|
||||
func MemoryWatchdog(lr repo.LockedRepo, lc fx.Lifecycle, constraints system.MemoryConstraints) {
|
||||
if os.Getenv(EnvWatchdogDisabled) == "1" {
|
||||
log.Infof("memory watchdog is disabled via %s", EnvWatchdogDisabled)
|
||||
return
|
||||
}
|
||||
|
||||
cfg := watchdog.MemConfig{
|
||||
Resolution: 5 * time.Second,
|
||||
Policy: &watchdog.WatermarkPolicy{
|
||||
Watermarks: []float64{0.50, 0.60, 0.70, 0.85, 0.90, 0.925, 0.95},
|
||||
EmergencyWatermark: 0.95,
|
||||
},
|
||||
Logger: logWatchdog,
|
||||
// configure heap profile capture so that one is captured per episode where
|
||||
// utilization climbs over 90% of the limit. A maximum of 10 heapdumps
|
||||
// will be captured during life of this process.
|
||||
watchdog.HeapProfileDir = filepath.Join(lr.Path(), "heapprof")
|
||||
watchdog.HeapProfileMaxCaptures = 10
|
||||
watchdog.HeapProfileThreshold = 0.1
|
||||
watchdog.Logger = logWatchdog
|
||||
|
||||
policy := watchdog.NewWatermarkPolicy(0.50, 0.60, 0.70, 0.85, 0.90, 0.925, 0.95)
|
||||
|
||||
// Try to initialize a watchdog in the following order of precedence:
|
||||
// 1. If a max heap limit has been provided, initialize a heap-driven watchdog.
|
||||
// 2. Else, try to initialize a cgroup-driven watchdog.
|
||||
// 3. Else, try to initialize a system-driven watchdog.
|
||||
// 4. Else, log a warning that the system is flying solo, and return.
|
||||
|
||||
addStopHook := func(stopFn func()) {
|
||||
lc.Append(fx.Hook{
|
||||
OnStop: func(ctx context.Context) error {
|
||||
stopFn()
|
||||
return nil
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// if user has set max heap limit, apply it. Otherwise, fall back to total
|
||||
// system memory constraint.
|
||||
// 1. If user has set max heap limit, apply it.
|
||||
if maxHeap := constraints.MaxHeapMem; maxHeap != 0 {
|
||||
log.Infof("memory watchdog will apply max heap constraint: %d bytes", maxHeap)
|
||||
cfg.Limit = maxHeap
|
||||
cfg.Scope = watchdog.ScopeHeap
|
||||
} else {
|
||||
log.Infof("max heap size not provided; memory watchdog will apply total system memory constraint: %d bytes", constraints.TotalSystemMem)
|
||||
cfg.Limit = constraints.TotalSystemMem
|
||||
cfg.Scope = watchdog.ScopeSystem
|
||||
const minGOGC = 10
|
||||
err, stopFn := watchdog.HeapDriven(maxHeap, minGOGC, policy)
|
||||
if err == nil {
|
||||
log.Infof("initialized heap-driven watchdog; max heap: %d bytes", maxHeap)
|
||||
addStopHook(stopFn)
|
||||
return
|
||||
}
|
||||
log.Warnf("failed to initialize heap-driven watchdog; err: %s", err)
|
||||
log.Warnf("trying a cgroup-driven watchdog")
|
||||
}
|
||||
|
||||
err, stop := watchdog.Memory(cfg)
|
||||
if err != nil {
|
||||
log.Warnf("failed to instantiate memory watchdog: %s", err)
|
||||
// 2. cgroup-driven watchdog.
|
||||
err, stopFn := watchdog.CgroupDriven(5*time.Second, policy)
|
||||
if err == nil {
|
||||
log.Infof("initialized cgroup-driven watchdog")
|
||||
addStopHook(stopFn)
|
||||
return
|
||||
}
|
||||
log.Warnf("failed to initialize cgroup-driven watchdog; err: %s", err)
|
||||
log.Warnf("trying a system-driven watchdog")
|
||||
|
||||
// 3. system-driven watchdog.
|
||||
err, stopFn = watchdog.SystemDriven(0, 5*time.Second, policy) // 0 calculates the limit automatically.
|
||||
if err == nil {
|
||||
log.Infof("initialized system-driven watchdog")
|
||||
addStopHook(stopFn)
|
||||
return
|
||||
}
|
||||
|
||||
lc.Append(fx.Hook{
|
||||
OnStop: func(ctx context.Context) error {
|
||||
stop()
|
||||
return nil
|
||||
},
|
||||
})
|
||||
// 4. log the failure
|
||||
log.Warnf("failed to initialize system-driven watchdog; err: %s", err)
|
||||
log.Warnf("system running without a memory watchdog")
|
||||
}
|
||||
|
||||
type JwtPayload struct {
|
||||
@ -166,7 +193,7 @@ func BuiltinBootstrap() (dtypes.BootstrapPeers, error) {
|
||||
|
||||
func DrandBootstrap(ds dtypes.DrandSchedule) (dtypes.DrandBootstrap, error) {
|
||||
// TODO: retry resolving, don't fail if at least one resolve succeeds
|
||||
res := []peer.AddrInfo{}
|
||||
var res []peer.AddrInfo
|
||||
for _, d := range ds {
|
||||
addrs, err := addrutil.ParseAddresses(context.TODO(), d.Config.Relays)
|
||||
if err != nil {
|
||||
|
Loading…
Reference in New Issue
Block a user