Simple alert system; FD limit alerts
This commit is contained in:
parent
15667db6a9
commit
81b1dd12f8
129
journal/alerting/alerts.go
Normal file
129
journal/alerting/alerts.go
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
package alerting
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/filecoin-project/lotus/journal"
|
||||||
|
logging "github.com/ipfs/go-log/v2"
|
||||||
|
)
|
||||||
|
|
||||||
|
var log = logging.Logger("alerting")
|
||||||
|
|
||||||
|
type Alerting struct {
|
||||||
|
j journal.Journal
|
||||||
|
|
||||||
|
lk sync.Mutex
|
||||||
|
alerts map[AlertType]Alert
|
||||||
|
}
|
||||||
|
|
||||||
|
// AlertType identifies a kind of alert by the system and subsystem it
// belongs to. It is comparable and is used as the key of the alert map.
type AlertType struct {
	System    string
	Subsystem string
}
|
||||||
|
|
||||||
|
// AlertEvent is a single raise or resolve occurrence of an alert.
type AlertEvent struct {
	// Message is the JSON-encoded payload that was supplied with the event.
	Message json.RawMessage
	// Time is when the event was created.
	Time time.Time
}
|
||||||
|
|
||||||
|
type Alert struct {
|
||||||
|
Type AlertType
|
||||||
|
Active bool
|
||||||
|
|
||||||
|
LastActive *AlertEvent // NOTE: pointer for nullability, don't mutate the referenced object!
|
||||||
|
LastResolved *AlertEvent
|
||||||
|
|
||||||
|
journalType journal.EventType
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewAlertingSystem(j journal.Journal) *Alerting {
|
||||||
|
return &Alerting{
|
||||||
|
j: j,
|
||||||
|
|
||||||
|
alerts: map[AlertType]Alert{},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *Alerting) AddAlertType(system, subsystem string) AlertType {
|
||||||
|
a.lk.Lock()
|
||||||
|
defer a.lk.Unlock()
|
||||||
|
|
||||||
|
at := AlertType{
|
||||||
|
System: system,
|
||||||
|
Subsystem: subsystem,
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, exists := a.alerts[at]; exists {
|
||||||
|
return at
|
||||||
|
}
|
||||||
|
|
||||||
|
et := a.j.RegisterEventType(system, subsystem)
|
||||||
|
|
||||||
|
a.alerts[at] = Alert{
|
||||||
|
Type: at,
|
||||||
|
Active: false,
|
||||||
|
journalType: et,
|
||||||
|
}
|
||||||
|
|
||||||
|
return at
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *Alerting) update(at AlertType, message interface{}, upd func(Alert, json.RawMessage) Alert) {
|
||||||
|
a.lk.Lock()
|
||||||
|
defer a.lk.Unlock()
|
||||||
|
|
||||||
|
alert, ok := a.alerts[at]
|
||||||
|
if !ok {
|
||||||
|
log.Errorw("unknown alert", "type", at, "message", message)
|
||||||
|
}
|
||||||
|
|
||||||
|
rawMsg, err := json.Marshal(message)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorw("marshaling alert message failed", "type", at, "error", err)
|
||||||
|
rawMsg, err = json.Marshal(&struct {
|
||||||
|
AlertError string
|
||||||
|
}{
|
||||||
|
AlertError: err.Error(),
|
||||||
|
})
|
||||||
|
log.Errorw("marshaling marshaling error failed", "type", at, "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
a.alerts[at] = upd(alert, rawMsg)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *Alerting) Raise(at AlertType, message interface{}) {
|
||||||
|
log.Errorw("alert raised", "type", at, "message", message)
|
||||||
|
|
||||||
|
a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
|
||||||
|
alert.Active = true
|
||||||
|
alert.LastActive = &AlertEvent{
|
||||||
|
Message: rawMsg,
|
||||||
|
Time: time.Now(),
|
||||||
|
}
|
||||||
|
|
||||||
|
a.j.RecordEvent(alert.journalType, func() interface{} {
|
||||||
|
return alert.LastActive
|
||||||
|
})
|
||||||
|
|
||||||
|
return alert
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *Alerting) Resolve(at AlertType, message interface{}) {
|
||||||
|
log.Errorw("alert resolved", "type", at, "message", message)
|
||||||
|
|
||||||
|
a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
|
||||||
|
alert.Active = false
|
||||||
|
alert.LastResolved = &AlertEvent{
|
||||||
|
Message: rawMsg,
|
||||||
|
Time: time.Now(),
|
||||||
|
}
|
||||||
|
|
||||||
|
a.j.RecordEvent(alert.journalType, func() interface{} {
|
||||||
|
return alert.LastResolved
|
||||||
|
})
|
||||||
|
|
||||||
|
return alert
|
||||||
|
})
|
||||||
|
}
|
@ -17,7 +17,7 @@ var (
|
|||||||
supportsFDManagement = false
|
supportsFDManagement = false
|
||||||
|
|
||||||
// getlimit returns the soft and hard limits of file descriptors counts
|
// getlimit returns the soft and hard limits of file descriptors counts
|
||||||
getLimit func() (uint64, uint64, error)
|
GetLimit func() (uint64, uint64, error)
|
||||||
// set limit sets the soft and hard limits of file descriptors counts
|
// set limit sets the soft and hard limits of file descriptors counts
|
||||||
setLimit func(uint64, uint64) error
|
setLimit func(uint64, uint64) error
|
||||||
)
|
)
|
||||||
@ -61,7 +61,7 @@ func ManageFdLimit() (changed bool, newLimit uint64, err error) {
|
|||||||
targetLimit = userLimit
|
targetLimit = userLimit
|
||||||
}
|
}
|
||||||
|
|
||||||
soft, hard, err := getLimit()
|
soft, hard, err := GetLimit()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, 0, err
|
return false, 0, err
|
||||||
}
|
}
|
||||||
|
@ -11,7 +11,7 @@ import (
|
|||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
supportsFDManagement = true
|
supportsFDManagement = true
|
||||||
getLimit = freebsdGetLimit
|
GetLimit = freebsdGetLimit
|
||||||
setLimit = freebsdSetLimit
|
setLimit = freebsdSetLimit
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -8,7 +8,7 @@ import (
|
|||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
supportsFDManagement = true
|
supportsFDManagement = true
|
||||||
getLimit = unixGetLimit
|
GetLimit = unixGetLimit
|
||||||
setLimit = unixSetLimit
|
setLimit = unixSetLimit
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,6 +32,7 @@ import (
|
|||||||
"github.com/filecoin-project/lotus/chain/types"
|
"github.com/filecoin-project/lotus/chain/types"
|
||||||
"github.com/filecoin-project/lotus/extern/sector-storage/stores"
|
"github.com/filecoin-project/lotus/extern/sector-storage/stores"
|
||||||
"github.com/filecoin-project/lotus/journal"
|
"github.com/filecoin-project/lotus/journal"
|
||||||
|
"github.com/filecoin-project/lotus/journal/alerting"
|
||||||
"github.com/filecoin-project/lotus/lib/peermgr"
|
"github.com/filecoin-project/lotus/lib/peermgr"
|
||||||
_ "github.com/filecoin-project/lotus/lib/sigs/bls"
|
_ "github.com/filecoin-project/lotus/lib/sigs/bls"
|
||||||
_ "github.com/filecoin-project/lotus/lib/sigs/secp"
|
_ "github.com/filecoin-project/lotus/lib/sigs/secp"
|
||||||
@ -82,6 +83,9 @@ const (
|
|||||||
// System processes.
|
// System processes.
|
||||||
InitMemoryWatchdog
|
InitMemoryWatchdog
|
||||||
|
|
||||||
|
// health checks
|
||||||
|
CheckFDLimit
|
||||||
|
|
||||||
// libp2p
|
// libp2p
|
||||||
PstoreAddSelfKeysKey
|
PstoreAddSelfKeysKey
|
||||||
StartListeningKey
|
StartListeningKey
|
||||||
@ -146,6 +150,9 @@ func defaults() []Option {
|
|||||||
// global system journal.
|
// global system journal.
|
||||||
Override(new(journal.DisabledEvents), journal.EnvDisabledEvents),
|
Override(new(journal.DisabledEvents), journal.EnvDisabledEvents),
|
||||||
Override(new(journal.Journal), modules.OpenFilesystemJournal),
|
Override(new(journal.Journal), modules.OpenFilesystemJournal),
|
||||||
|
Override(new(*alerting.Alerting), alerting.NewAlertingSystem),
|
||||||
|
|
||||||
|
Override(CheckFDLimit, modules.CheckFdLimit(16<<10)),
|
||||||
|
|
||||||
Override(new(system.MemoryConstraints), modules.MemoryConstraints),
|
Override(new(system.MemoryConstraints), modules.MemoryConstraints),
|
||||||
Override(InitMemoryWatchdog, modules.MemoryWatchdog),
|
Override(InitMemoryWatchdog, modules.MemoryWatchdog),
|
||||||
|
@ -74,6 +74,8 @@ func ConfigStorageMiner(c interface{}) Option {
|
|||||||
return Options(
|
return Options(
|
||||||
ConfigCommon(&cfg.Common, enableLibp2pNode),
|
ConfigCommon(&cfg.Common, enableLibp2pNode),
|
||||||
|
|
||||||
|
Override(CheckFDLimit, modules.CheckFdLimit(100_000)), // recommend at least 100k FD limit to miners
|
||||||
|
|
||||||
Override(new(api.MinerSubsystems), modules.ExtractEnabledMinerSubsystems(cfg.Subsystems)),
|
Override(new(api.MinerSubsystems), modules.ExtractEnabledMinerSubsystems(cfg.Subsystems)),
|
||||||
Override(new(stores.LocalStorage), From(new(repo.LockedRepo))),
|
Override(new(stores.LocalStorage), From(new(repo.LockedRepo))),
|
||||||
Override(new(*stores.Local), modules.LocalStorage),
|
Override(new(*stores.Local), modules.LocalStorage),
|
||||||
|
43
node/modules/alerts.go
Normal file
43
node/modules/alerts.go
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
package modules
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/filecoin-project/lotus/journal/alerting"
|
||||||
|
"github.com/filecoin-project/lotus/lib/ulimit"
|
||||||
|
)
|
||||||
|
|
||||||
|
func CheckFdLimit(min uint64) func(al *alerting.Alerting) {
|
||||||
|
return func(al *alerting.Alerting) {
|
||||||
|
if ulimit.GetLimit == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
alert := al.AddAlertType("process", "fd-limit")
|
||||||
|
|
||||||
|
soft, _, err := ulimit.GetLimit()
|
||||||
|
if err != nil {
|
||||||
|
al.Raise(alert, map[string]string{
|
||||||
|
"message": "failed to get FD limit",
|
||||||
|
"error": err.Error(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if soft < min {
|
||||||
|
al.Raise(alert, map[string]interface{}{
|
||||||
|
"message": "soft FD limit is low",
|
||||||
|
"soft_limit": soft,
|
||||||
|
"recommended_min": min,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: More things:
|
||||||
|
// * Space in repo dirs (taking into account mounts)
|
||||||
|
// * Miner
|
||||||
|
// * Faulted partitions
|
||||||
|
// * Low balances
|
||||||
|
// * Market provider
|
||||||
|
// * Reachability
|
||||||
|
// * on-chain config
|
||||||
|
// * Low memory (maybe)
|
||||||
|
// * Network / sync issues
|
Loading…
Reference in New Issue
Block a user