From 81b1dd12f8418a51abd145b30e1b4661d057209d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Magiera?=
Date: Tue, 17 Aug 2021 14:39:36 +0200
Subject: [PATCH] Simple alert system; FD limit alerts

---
 journal/alerting/alerts.go   | 152 +++++++++++++++++++++++++++++++++++
 lib/ulimit/ulimit.go         |   6 +-
 lib/ulimit/ulimit_freebsd.go |   2 +-
 lib/ulimit/ulimit_unix.go    |   2 +-
 node/builder.go              |   7 ++
 node/builder_miner.go        |   2 +
 node/modules/alerts.go       |  49 +++++++++++
 7 files changed, 215 insertions(+), 5 deletions(-)
 create mode 100644 journal/alerting/alerts.go
 create mode 100644 node/modules/alerts.go

diff --git a/journal/alerting/alerts.go b/journal/alerting/alerts.go
new file mode 100644
index 000000000..21e9a79b8
--- /dev/null
+++ b/journal/alerting/alerts.go
@@ -0,0 +1,152 @@
+package alerting
+
+import (
+	"encoding/json"
+	"sync"
+	"time"
+
+	"github.com/filecoin-project/lotus/journal"
+	logging "github.com/ipfs/go-log/v2"
+)
+
+var log = logging.Logger("alerting")
+
+// Alerting tracks the state of every registered alert type and passes each
+// raise/resolve event to the journal.
+type Alerting struct {
+	j journal.Journal
+
+	lk     sync.Mutex
+	alerts map[AlertType]Alert
+}
+
+// AlertType identifies an alert by the system and subsystem it belongs to.
+type AlertType struct {
+	System, Subsystem string
+}
+
+// AlertEvent is a single raise or resolve occurrence of an alert.
+type AlertEvent struct {
+	Message json.RawMessage
+	Time    time.Time
+}
+
+// Alert is the current state of one alert type.
+type Alert struct {
+	Type   AlertType
+	Active bool
+
+	LastActive   *AlertEvent // NOTE: pointer for nullability, don't mutate the referenced object!
+	LastResolved *AlertEvent
+
+	journalType journal.EventType
+}
+
+// NewAlertingSystem creates an Alerting instance with no registered alert
+// types that reports its events to j.
+func NewAlertingSystem(j journal.Journal) *Alerting {
+	return &Alerting{
+		j: j,
+
+		alerts: map[AlertType]Alert{},
+	}
+}
+
+// AddAlertType registers the alert type identified by system/subsystem and
+// returns its key. Registering an already known type is a no-op that returns
+// the same key.
+func (a *Alerting) AddAlertType(system, subsystem string) AlertType {
+	a.lk.Lock()
+	defer a.lk.Unlock()
+
+	at := AlertType{
+		System:    system,
+		Subsystem: subsystem,
+	}
+
+	if _, exists := a.alerts[at]; exists {
+		return at
+	}
+
+	et := a.j.RegisterEventType(system, subsystem)
+
+	a.alerts[at] = Alert{
+		Type:        at,
+		Active:      false,
+		journalType: et,
+	}
+
+	return at
+}
+
+// update applies upd to the alert registered under at and stores the result.
+// message is marshaled to JSON before being passed to upd; if that fails, a
+// placeholder object carrying the marshaling error is passed instead. Updates
+// for alert types that were never registered are logged and dropped.
+func (a *Alerting) update(at AlertType, message interface{}, upd func(Alert, json.RawMessage) Alert) {
+	a.lk.Lock()
+	defer a.lk.Unlock()
+
+	alert, ok := a.alerts[at]
+	if !ok {
+		log.Errorw("unknown alert", "type", at, "message", message)
+		// Without a registered entry there is no journal event type to
+		// record under, so don't store a half-initialized Alert.
+		return
+	}
+
+	rawMsg, err := json.Marshal(message)
+	if err != nil {
+		log.Errorw("marshaling alert message failed", "type", at, "error", err)
+		rawMsg, err = json.Marshal(&struct {
+			AlertError string
+		}{
+			AlertError: err.Error(),
+		})
+		if err != nil {
+			log.Errorw("marshaling marshaling error failed", "type", at, "error", err)
+		}
+	}
+
+	a.alerts[at] = upd(alert, rawMsg)
+}
+
+// Raise marks the alert as active, remembers message as its last activation
+// and passes the event to the journal.
+func (a *Alerting) Raise(at AlertType, message interface{}) {
+	log.Errorw("alert raised", "type", at, "message", message)
+
+	a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
+		alert.Active = true
+		alert.LastActive = &AlertEvent{
+			Message: rawMsg,
+			Time:    time.Now(),
+		}
+
+		a.j.RecordEvent(alert.journalType, func() interface{} {
+			return alert.LastActive
+		})
+
+		return alert
+	})
+}
+
+// Resolve marks the alert as inactive, remembers message as its last
+// resolution and passes the event to the journal.
+func (a *Alerting) Resolve(at AlertType, message interface{}) {
+	log.Errorw("alert resolved", "type", at, "message", message)
+
+	a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
+		alert.Active = false
+		alert.LastResolved = &AlertEvent{
+			Message: rawMsg,
+			Time:    time.Now(),
+		}
+
+		a.j.RecordEvent(alert.journalType, func() interface{} {
+			return alert.LastResolved
+		})
+
+		return alert
+	})
+}
diff --git a/lib/ulimit/ulimit.go b/lib/ulimit/ulimit.go
index 16bd4c9c1..7e80fd223 100644
--- a/lib/ulimit/ulimit.go
+++ b/lib/ulimit/ulimit.go
@@ -17,7 +17,7 @@ var (
 	supportsFDManagement = false
 
-	// getlimit returns the soft and hard limits of file descriptors counts
-	getLimit func() (uint64, uint64, error)
+	// GetLimit returns the soft and hard limits of file descriptors counts
+	GetLimit func() (uint64, uint64, error)
 	// set limit sets the soft and hard limits of file descriptors counts
 	setLimit func(uint64, uint64) error
 )
@@ -61,7 +61,7 @@ func ManageFdLimit() (changed bool, newLimit uint64, err error) {
 		targetLimit = userLimit
 	}
 
-	soft, hard, err := getLimit()
+	soft, hard, err := GetLimit()
 	if err != nil {
 		return false, 0, err
 	}
diff --git a/lib/ulimit/ulimit_freebsd.go b/lib/ulimit/ulimit_freebsd.go
index 7e50436f3..aeea77d9d 100644
--- a/lib/ulimit/ulimit_freebsd.go
+++ b/lib/ulimit/ulimit_freebsd.go
@@ -11,7 +11,7 @@ import (
 
 func init() {
 	supportsFDManagement = true
-	getLimit = freebsdGetLimit
+	GetLimit = freebsdGetLimit
 	setLimit = freebsdSetLimit
 }
 
diff --git a/lib/ulimit/ulimit_unix.go b/lib/ulimit/ulimit_unix.go
index a351236dc..e015b2b32 100644
--- a/lib/ulimit/ulimit_unix.go
+++ b/lib/ulimit/ulimit_unix.go
@@ -8,7 +8,7 @@ import (
 
 func init() {
 	supportsFDManagement = true
-	getLimit = unixGetLimit
+	GetLimit = unixGetLimit
 	setLimit = unixSetLimit
 }
 
diff --git a/node/builder.go b/node/builder.go
index f04678bc8..58095da8f 100644
--- a/node/builder.go
+++ b/node/builder.go
@@ -32,6 +32,7 @@ import (
 	"github.com/filecoin-project/lotus/chain/types"
 	"github.com/filecoin-project/lotus/extern/sector-storage/stores"
 	"github.com/filecoin-project/lotus/journal"
+	"github.com/filecoin-project/lotus/journal/alerting"
 	"github.com/filecoin-project/lotus/lib/peermgr"
 	_ "github.com/filecoin-project/lotus/lib/sigs/bls"
 	_ "github.com/filecoin-project/lotus/lib/sigs/secp"
@@ -82,6 +83,9 @@ const (
 	// System processes.
 	InitMemoryWatchdog
 
+	// health checks
+	CheckFDLimit
+
 	// libp2p
 	PstoreAddSelfKeysKey
 	StartListeningKey
@@ -146,6 +150,9 @@ func defaults() []Option {
 		// global system journal.
 		Override(new(journal.DisabledEvents), journal.EnvDisabledEvents),
 		Override(new(journal.Journal), modules.OpenFilesystemJournal),
+		Override(new(*alerting.Alerting), alerting.NewAlertingSystem),
+
+		Override(CheckFDLimit, modules.CheckFdLimit(16<<10)),
 
 		Override(new(system.MemoryConstraints), modules.MemoryConstraints),
 		Override(InitMemoryWatchdog, modules.MemoryWatchdog),
diff --git a/node/builder_miner.go b/node/builder_miner.go
index fd69de678..4c9d2492a 100644
--- a/node/builder_miner.go
+++ b/node/builder_miner.go
@@ -74,6 +74,8 @@ func ConfigStorageMiner(c interface{}) Option {
 	return Options(
 		ConfigCommon(&cfg.Common, enableLibp2pNode),
 
+		Override(CheckFDLimit, modules.CheckFdLimit(100_000)), // recommend at least 100k FD limit to miners
+
 		Override(new(api.MinerSubsystems), modules.ExtractEnabledMinerSubsystems(cfg.Subsystems)),
 		Override(new(stores.LocalStorage), From(new(repo.LockedRepo))),
 		Override(new(*stores.Local), modules.LocalStorage),
diff --git a/node/modules/alerts.go b/node/modules/alerts.go
new file mode 100644
index 000000000..89261e231
--- /dev/null
+++ b/node/modules/alerts.go
@@ -0,0 +1,49 @@
+package modules
+
+import (
+	"github.com/filecoin-project/lotus/journal/alerting"
+	"github.com/filecoin-project/lotus/lib/ulimit"
+)
+
+// CheckFdLimit returns a check that raises the "process"/"fd-limit" alert when
+// the soft file descriptor limit cannot be read or is lower than min. The
+// check does nothing when ulimit.GetLimit is not set.
+func CheckFdLimit(min uint64) func(al *alerting.Alerting) {
+	return func(al *alerting.Alerting) {
+		if ulimit.GetLimit == nil {
+			return
+		}
+
+		alert := al.AddAlertType("process", "fd-limit")
+
+		soft, _, err := ulimit.GetLimit()
+		if err != nil {
+			al.Raise(alert, map[string]string{
+				"message": "failed to get FD limit",
+				"error":   err.Error(),
+			})
+			// soft is not meaningful when GetLimit fails; stop here so the
+			// error isn't overwritten by a bogus "limit is low" alert.
+			return
+		}
+
+		if soft < min {
+			al.Raise(alert, map[string]interface{}{
+				"message":         "soft FD limit is low",
+				"soft_limit":      soft,
+				"recommended_min": min,
+			})
+		}
+	}
+}
+
+// TODO: More things:
+//  * Space in repo dirs (taking into account mounts)
+//  * Miner
+//    * Faulted partitions
+//    * Low balances
+//  * Market provider
+//    * Reachability
+//    * on-chain config
+//  * Low memory (maybe)
+//  * Network / sync issues