Simple alert system; FD limit alerts
This commit is contained in:
parent
15667db6a9
commit
81b1dd12f8
129
journal/alerting/alerts.go
Normal file
129
journal/alerting/alerts.go
Normal file
@ -0,0 +1,129 @@
|
||||
package alerting
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/filecoin-project/lotus/journal"
|
||||
logging "github.com/ipfs/go-log/v2"
|
||||
)
|
||||
|
||||
// log is the package-level logger, created under the name "alerting".
var log = logging.Logger("alerting")
|
||||
|
||||
type Alerting struct {
|
||||
j journal.Journal
|
||||
|
||||
lk sync.Mutex
|
||||
alerts map[AlertType]Alert
|
||||
}
|
||||
|
||||
// AlertType identifies an alert by the system and subsystem it belongs to.
// The type is comparable, so it can be used directly as a map key.
type AlertType struct {
	// System is the name of the system the alert belongs to.
	System string
	// Subsystem is the name of the subsystem within System.
	Subsystem string
}
|
||||
|
||||
// AlertEvent is a single raise or resolve occurrence of an alert.
type AlertEvent struct {
	// Message is the JSON-encoded payload supplied when the event occurred.
	Message json.RawMessage
	// Time is the moment the event was created.
	Time time.Time
}
|
||||
|
||||
type Alert struct {
|
||||
Type AlertType
|
||||
Active bool
|
||||
|
||||
LastActive *AlertEvent // NOTE: pointer for nullability, don't mutate the referenced object!
|
||||
LastResolved *AlertEvent
|
||||
|
||||
journalType journal.EventType
|
||||
}
|
||||
|
||||
func NewAlertingSystem(j journal.Journal) *Alerting {
|
||||
return &Alerting{
|
||||
j: j,
|
||||
|
||||
alerts: map[AlertType]Alert{},
|
||||
}
|
||||
}
|
||||
|
||||
func (a *Alerting) AddAlertType(system, subsystem string) AlertType {
|
||||
a.lk.Lock()
|
||||
defer a.lk.Unlock()
|
||||
|
||||
at := AlertType{
|
||||
System: system,
|
||||
Subsystem: subsystem,
|
||||
}
|
||||
|
||||
if _, exists := a.alerts[at]; exists {
|
||||
return at
|
||||
}
|
||||
|
||||
et := a.j.RegisterEventType(system, subsystem)
|
||||
|
||||
a.alerts[at] = Alert{
|
||||
Type: at,
|
||||
Active: false,
|
||||
journalType: et,
|
||||
}
|
||||
|
||||
return at
|
||||
}
|
||||
|
||||
func (a *Alerting) update(at AlertType, message interface{}, upd func(Alert, json.RawMessage) Alert) {
|
||||
a.lk.Lock()
|
||||
defer a.lk.Unlock()
|
||||
|
||||
alert, ok := a.alerts[at]
|
||||
if !ok {
|
||||
log.Errorw("unknown alert", "type", at, "message", message)
|
||||
}
|
||||
|
||||
rawMsg, err := json.Marshal(message)
|
||||
if err != nil {
|
||||
log.Errorw("marshaling alert message failed", "type", at, "error", err)
|
||||
rawMsg, err = json.Marshal(&struct {
|
||||
AlertError string
|
||||
}{
|
||||
AlertError: err.Error(),
|
||||
})
|
||||
log.Errorw("marshaling marshaling error failed", "type", at, "error", err)
|
||||
}
|
||||
|
||||
a.alerts[at] = upd(alert, rawMsg)
|
||||
}
|
||||
|
||||
func (a *Alerting) Raise(at AlertType, message interface{}) {
|
||||
log.Errorw("alert raised", "type", at, "message", message)
|
||||
|
||||
a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
|
||||
alert.Active = true
|
||||
alert.LastActive = &AlertEvent{
|
||||
Message: rawMsg,
|
||||
Time: time.Now(),
|
||||
}
|
||||
|
||||
a.j.RecordEvent(alert.journalType, func() interface{} {
|
||||
return alert.LastActive
|
||||
})
|
||||
|
||||
return alert
|
||||
})
|
||||
}
|
||||
|
||||
func (a *Alerting) Resolve(at AlertType, message interface{}) {
|
||||
log.Errorw("alert resolved", "type", at, "message", message)
|
||||
|
||||
a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
|
||||
alert.Active = false
|
||||
alert.LastResolved = &AlertEvent{
|
||||
Message: rawMsg,
|
||||
Time: time.Now(),
|
||||
}
|
||||
|
||||
a.j.RecordEvent(alert.journalType, func() interface{} {
|
||||
return alert.LastResolved
|
||||
})
|
||||
|
||||
return alert
|
||||
})
|
||||
}
|
@ -17,7 +17,7 @@ var (
|
||||
supportsFDManagement = false
|
||||
|
||||
// getlimit returns the soft and hard limits of file descriptors counts
|
||||
getLimit func() (uint64, uint64, error)
|
||||
GetLimit func() (uint64, uint64, error)
|
||||
// set limit sets the soft and hard limits of file descriptors counts
|
||||
setLimit func(uint64, uint64) error
|
||||
)
|
||||
@ -61,7 +61,7 @@ func ManageFdLimit() (changed bool, newLimit uint64, err error) {
|
||||
targetLimit = userLimit
|
||||
}
|
||||
|
||||
soft, hard, err := getLimit()
|
||||
soft, hard, err := GetLimit()
|
||||
if err != nil {
|
||||
return false, 0, err
|
||||
}
|
||||
|
@ -11,7 +11,7 @@ import (
|
||||
|
||||
func init() {
|
||||
supportsFDManagement = true
|
||||
getLimit = freebsdGetLimit
|
||||
GetLimit = freebsdGetLimit
|
||||
setLimit = freebsdSetLimit
|
||||
}
|
||||
|
||||
|
@ -8,7 +8,7 @@ import (
|
||||
|
||||
func init() {
|
||||
supportsFDManagement = true
|
||||
getLimit = unixGetLimit
|
||||
GetLimit = unixGetLimit
|
||||
setLimit = unixSetLimit
|
||||
}
|
||||
|
||||
|
@ -32,6 +32,7 @@ import (
|
||||
"github.com/filecoin-project/lotus/chain/types"
|
||||
"github.com/filecoin-project/lotus/extern/sector-storage/stores"
|
||||
"github.com/filecoin-project/lotus/journal"
|
||||
"github.com/filecoin-project/lotus/journal/alerting"
|
||||
"github.com/filecoin-project/lotus/lib/peermgr"
|
||||
_ "github.com/filecoin-project/lotus/lib/sigs/bls"
|
||||
_ "github.com/filecoin-project/lotus/lib/sigs/secp"
|
||||
@ -82,6 +83,9 @@ const (
|
||||
// System processes.
|
||||
InitMemoryWatchdog
|
||||
|
||||
// health checks
|
||||
CheckFDLimit
|
||||
|
||||
// libp2p
|
||||
PstoreAddSelfKeysKey
|
||||
StartListeningKey
|
||||
@ -146,6 +150,9 @@ func defaults() []Option {
|
||||
// global system journal.
|
||||
Override(new(journal.DisabledEvents), journal.EnvDisabledEvents),
|
||||
Override(new(journal.Journal), modules.OpenFilesystemJournal),
|
||||
Override(new(*alerting.Alerting), alerting.NewAlertingSystem),
|
||||
|
||||
Override(CheckFDLimit, modules.CheckFdLimit(16<<10)),
|
||||
|
||||
Override(new(system.MemoryConstraints), modules.MemoryConstraints),
|
||||
Override(InitMemoryWatchdog, modules.MemoryWatchdog),
|
||||
|
@ -74,6 +74,8 @@ func ConfigStorageMiner(c interface{}) Option {
|
||||
return Options(
|
||||
ConfigCommon(&cfg.Common, enableLibp2pNode),
|
||||
|
||||
Override(CheckFDLimit, modules.CheckFdLimit(100_000)), // recommend at least 100k FD limit to miners
|
||||
|
||||
Override(new(api.MinerSubsystems), modules.ExtractEnabledMinerSubsystems(cfg.Subsystems)),
|
||||
Override(new(stores.LocalStorage), From(new(repo.LockedRepo))),
|
||||
Override(new(*stores.Local), modules.LocalStorage),
|
||||
|
43
node/modules/alerts.go
Normal file
43
node/modules/alerts.go
Normal file
@ -0,0 +1,43 @@
|
||||
package modules
|
||||
|
||||
import (
|
||||
"github.com/filecoin-project/lotus/journal/alerting"
|
||||
"github.com/filecoin-project/lotus/lib/ulimit"
|
||||
)
|
||||
|
||||
func CheckFdLimit(min uint64) func(al *alerting.Alerting) {
|
||||
return func(al *alerting.Alerting) {
|
||||
if ulimit.GetLimit == nil {
|
||||
return
|
||||
}
|
||||
|
||||
alert := al.AddAlertType("process", "fd-limit")
|
||||
|
||||
soft, _, err := ulimit.GetLimit()
|
||||
if err != nil {
|
||||
al.Raise(alert, map[string]string{
|
||||
"message": "failed to get FD limit",
|
||||
"error": err.Error(),
|
||||
})
|
||||
}
|
||||
|
||||
if soft < min {
|
||||
al.Raise(alert, map[string]interface{}{
|
||||
"message": "soft FD limit is low",
|
||||
"soft_limit": soft,
|
||||
"recommended_min": min,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: More things:
|
||||
// * Space in repo dirs (taking into account mounts)
|
||||
// * Miner
|
||||
// * Faulted partitions
|
||||
// * Low balances
|
||||
// * Market provider
|
||||
// * Reachability
|
||||
// * on-chain config
|
||||
// * Low memory (maybe)
|
||||
// * Network / sync issues
|
Loading…
Reference in New Issue
Block a user