Simple alert system; FD limit alerts

This commit is contained in:
Łukasz Magiera 2021-08-17 14:39:36 +02:00
parent 15667db6a9
commit 81b1dd12f8
7 changed files with 185 additions and 4 deletions

129
journal/alerting/alerts.go Normal file
View File

@ -0,0 +1,129 @@
package alerting
import (
"encoding/json"
"sync"
"time"
"github.com/filecoin-project/lotus/journal"
logging "github.com/ipfs/go-log/v2"
)
// log is the package-level logger, created under the "alerting" name.
var log = logging.Logger("alerting")
// Alerting keeps the latest state of every registered alert and holds the
// journal used to register alert event types and record alert events.
type Alerting struct {
// j is the journal that alert types are registered with and that
// raise/resolve events are recorded to.
j journal.Journal
// lk is held while alerts is read or written.
lk sync.Mutex
// alerts maps each registered alert type to its current state.
alerts map[AlertType]Alert
}
// AlertType identifies an alert by its system and subsystem names. It is used
// as the key of the alert map and is passed to the journal when the type is
// registered.
type AlertType struct {
System, Subsystem string
}
// AlertEvent describes one occurrence of an alert being raised or resolved.
type AlertEvent struct {
// Message is the JSON-encoded payload supplied by the caller.
Message json.RawMessage
// Time is when the event was created.
Time time.Time
}
// Alert is the tracked state of a single alert type.
type Alert struct {
// Type is the identifier this alert was registered under.
Type AlertType
// Active is set to true when the alert is raised and to false when it is
// resolved.
Active bool
// LastActive is the most recent raise event; nil until the alert has been
// raised at least once.
LastActive *AlertEvent // NOTE: pointer for nullability, don't mutate the referenced object!
// LastResolved is the most recent resolve event; nil until the alert has
// been resolved at least once.
LastResolved *AlertEvent
// journalType is the journal event type obtained when the alert type was
// registered; raise/resolve events are recorded under it.
journalType journal.EventType
}
// NewAlertingSystem builds an Alerting instance that records its events to j
// and starts with no registered alert types.
func NewAlertingSystem(j journal.Journal) *Alerting {
	sys := new(Alerting)
	sys.j = j
	sys.alerts = make(map[AlertType]Alert)
	return sys
}
// AddAlertType registers the alert identified by system and subsystem and
// returns its AlertType key.
//
// The first registration of a given pair also registers a matching event type
// with the journal and stores an inactive Alert for it. Registering the same
// pair again leaves the existing state untouched and returns the same key.
func (a *Alerting) AddAlertType(system, subsystem string) AlertType {
	key := AlertType{System: system, Subsystem: subsystem}

	a.lk.Lock()
	defer a.lk.Unlock()

	if _, known := a.alerts[key]; !known {
		// New alerts start inactive (zero value of Active).
		a.alerts[key] = Alert{
			Type:        key,
			journalType: a.j.RegisterEventType(system, subsystem),
		}
	}
	return key
}
// update applies upd to the stored state of the alert identified by at and
// saves whatever upd returns back under the same key.
//
// message is marshaled to JSON before being handed to upd. If marshaling
// fails, a small JSON object of the form {"AlertError": "<error text>"} is
// passed instead so the alert is still recorded. The whole operation,
// including the upd callback, runs with a.lk held.
func (a *Alerting) update(at AlertType, message interface{}, upd func(Alert, json.RawMessage) Alert) {
	a.lk.Lock()
	defer a.lk.Unlock()

	alert, ok := a.alerts[at]
	if !ok {
		// NOTE(review): an unregistered type is only logged; the update still
		// proceeds on a zero-value Alert, which is then stored under at.
		log.Errorw("unknown alert", "type", at, "message", message)
	}

	rawMsg, err := json.Marshal(message)
	if err != nil {
		log.Errorw("marshaling alert message failed", "type", at, "error", err)

		// Fall back to a payload that at least carries the marshaling error.
		rawMsg, err = json.Marshal(&struct {
			AlertError string
		}{
			AlertError: err.Error(),
		})
		// Only report a fallback failure when the fallback actually failed.
		if err != nil {
			log.Errorw("marshaling marshaling error failed", "type", at, "error", err)
		}
	}

	a.alerts[at] = upd(alert, rawMsg)
}
// Raise marks the alert identified by at as active, stores a new LastActive
// event carrying message and the current time, and records that event to the
// journal under the alert's journal event type. The raise is also logged at
// error level.
func (a *Alerting) Raise(at AlertType, message interface{}) {
	log.Errorw("alert raised", "type", at, "message", message)

	raise := func(cur Alert, raw json.RawMessage) Alert {
		event := &AlertEvent{
			Message: raw,
			Time:    time.Now(),
		}
		cur.LastActive = event
		cur.Active = true

		a.j.RecordEvent(cur.journalType, func() interface{} {
			return event
		})
		return cur
	}

	a.update(at, message, raise)
}
// Resolve marks the alert identified by at as inactive, stores a new
// LastResolved event carrying message and the current time, and records that
// event to the journal under the alert's journal event type. The resolution
// is also logged at error level.
func (a *Alerting) Resolve(at AlertType, message interface{}) {
	log.Errorw("alert resolved", "type", at, "message", message)

	resolve := func(cur Alert, raw json.RawMessage) Alert {
		event := &AlertEvent{
			Message: raw,
			Time:    time.Now(),
		}
		cur.LastResolved = event
		cur.Active = false

		a.j.RecordEvent(cur.journalType, func() interface{} {
			return event
		})
		return cur
	}

	a.update(at, message, resolve)
}

View File

@ -17,7 +17,7 @@ var (
supportsFDManagement = false
// getlimit returns the soft and hard limits of file descriptors counts
getLimit func() (uint64, uint64, error)
GetLimit func() (uint64, uint64, error)
// set limit sets the soft and hard limits of file descriptors counts
setLimit func(uint64, uint64) error
)
@ -61,7 +61,7 @@ func ManageFdLimit() (changed bool, newLimit uint64, err error) {
targetLimit = userLimit
}
soft, hard, err := getLimit()
soft, hard, err := GetLimit()
if err != nil {
return false, 0, err
}

View File

@ -11,7 +11,7 @@ import (
func init() {
supportsFDManagement = true
getLimit = freebsdGetLimit
GetLimit = freebsdGetLimit
setLimit = freebsdSetLimit
}

View File

@ -8,7 +8,7 @@ import (
func init() {
supportsFDManagement = true
getLimit = unixGetLimit
GetLimit = unixGetLimit
setLimit = unixSetLimit
}

View File

@ -32,6 +32,7 @@ import (
"github.com/filecoin-project/lotus/chain/types"
"github.com/filecoin-project/lotus/extern/sector-storage/stores"
"github.com/filecoin-project/lotus/journal"
"github.com/filecoin-project/lotus/journal/alerting"
"github.com/filecoin-project/lotus/lib/peermgr"
_ "github.com/filecoin-project/lotus/lib/sigs/bls"
_ "github.com/filecoin-project/lotus/lib/sigs/secp"
@ -82,6 +83,9 @@ const (
// System processes.
InitMemoryWatchdog
// health checks
CheckFDLimit
// libp2p
PstoreAddSelfKeysKey
StartListeningKey
@ -146,6 +150,9 @@ func defaults() []Option {
// global system journal.
Override(new(journal.DisabledEvents), journal.EnvDisabledEvents),
Override(new(journal.Journal), modules.OpenFilesystemJournal),
Override(new(*alerting.Alerting), alerting.NewAlertingSystem),
Override(CheckFDLimit, modules.CheckFdLimit(16<<10)),
Override(new(system.MemoryConstraints), modules.MemoryConstraints),
Override(InitMemoryWatchdog, modules.MemoryWatchdog),

View File

@ -74,6 +74,8 @@ func ConfigStorageMiner(c interface{}) Option {
return Options(
ConfigCommon(&cfg.Common, enableLibp2pNode),
Override(CheckFDLimit, modules.CheckFdLimit(100_000)), // recommend at least 100k FD limit to miners
Override(new(api.MinerSubsystems), modules.ExtractEnabledMinerSubsystems(cfg.Subsystems)),
Override(new(stores.LocalStorage), From(new(repo.LockedRepo))),
Override(new(*stores.Local), modules.LocalStorage),

43
node/modules/alerts.go Normal file
View File

@ -0,0 +1,43 @@
package modules
import (
"github.com/filecoin-project/lotus/journal/alerting"
"github.com/filecoin-project/lotus/lib/ulimit"
)
// CheckFdLimit returns a health check that inspects the process file
// descriptor limit and raises the "process"/"fd-limit" alert when something is
// wrong.
//
// min is the recommended minimum soft limit. The returned function:
//   - does nothing when ulimit.GetLimit is nil;
//   - raises an alert carrying the error when the limit cannot be read;
//   - raises an alert carrying the soft limit and the recommended minimum
//     when the soft limit is below min.
func CheckFdLimit(min uint64) func(al *alerting.Alerting) {
	return func(al *alerting.Alerting) {
		if ulimit.GetLimit == nil {
			return
		}

		alert := al.AddAlertType("process", "fd-limit")

		soft, _, err := ulimit.GetLimit()
		if err != nil {
			al.Raise(alert, map[string]string{
				"message": "failed to get FD limit",
				"error":   err.Error(),
			})
			// soft is meaningless on error; stop here so the failure alert is
			// not immediately overwritten by a bogus "limit is low" alert.
			return
		}

		if soft < min {
			al.Raise(alert, map[string]interface{}{
				"message":         "soft FD limit is low",
				"soft_limit":      soft,
				"recommended_min": min,
			})
		}
	}
}
// TODO: More things:
// * Space in repo dirs (taking into account mounts)
// * Miner
// * Faulted partitions
// * Low balances
// * Market provider
// * Reachability
// * on-chain config
// * Low memory (maybe)
// * Network / sync issues