lotus/journal/alerting/alerts.go

170 lines
3.6 KiB
Go
Raw Normal View History

2021-08-17 12:39:36 +00:00
package alerting
import (
"encoding/json"
2021-08-17 12:51:54 +00:00
"sort"
2021-08-17 12:39:36 +00:00
"sync"
"time"
logging "github.com/ipfs/go-log/v2"
2022-06-14 15:00:51 +00:00
"github.com/filecoin-project/lotus/journal"
2021-08-17 12:39:36 +00:00
)
// log is the package-level logger, created under the "alerting" name.
var log = logging.Logger("alerting")
2021-08-18 12:46:06 +00:00
2021-08-18 12:41:13 +00:00
// Alerting provides a simple stateful alert system. Consumers can register alerts,
// which can be raised and resolved.
//
// When an alert is raised or resolved, a related journal entry is recorded.
2021-08-17 12:39:36 +00:00
type Alerting struct {
	// j is the journal used to register alert event types and to record
	// raise/resolve events.
	j journal.Journal

	// lk guards alerts.
	lk sync.Mutex
	// alerts holds the current state of each alert, keyed by its type.
	alerts map[AlertType]Alert
}
2021-08-18 12:41:13 +00:00
// AlertType is a unique alert identifier
2021-08-17 12:39:36 +00:00
type AlertType struct {
	// System is the first part of the identifier; it is also the system name
	// used when the alert's journal event type is registered.
	System string
	// Subsystem is the second part of the identifier; it is also the event
	// name used when the alert's journal event type is registered.
	Subsystem string
}
2021-08-18 12:41:13 +00:00
// AlertEvent contains information about a single alert state transition.
type AlertEvent struct {
	// Type is the kind of transition: either 'raised' or 'resolved'.
	Type string
	// Message is the JSON-encoded payload supplied with the transition.
	Message json.RawMessage
	// Time is the moment the transition was recorded.
	Time time.Time
}
// Alert is the tracked state of a single alert type.
type Alert struct {
	// Type identifies the alert this state belongs to.
	Type AlertType
	// Active is set when the alert is raised and cleared when it is resolved.
	Active bool

	// LastActive is the most recent 'raised' event, or nil if there is none.
	// NOTE: pointer for nullability, don't mutate the referenced object!
	LastActive *AlertEvent
	// LastResolved is the most recent 'resolved' event, or nil if there is none.
	LastResolved *AlertEvent

	// journalType is the journal event type the alert's events are recorded under.
	journalType journal.EventType
}
// NewAlertingSystem creates an Alerting instance with no registered alert
// types. The journal j is used for registering event types and recording
// alert events.
func NewAlertingSystem(j journal.Journal) *Alerting {
	sys := new(Alerting)
	sys.j = j
	sys.alerts = make(map[AlertType]Alert)
	return sys
}
// AddAlertType registers the alert identified by system and subsystem and
// returns its AlertType.
//
// The first registration of a given pair registers a journal event type under
// the same names and stores the alert as inactive. Registering the same pair
// again leaves the existing state untouched and only returns the identifier.
func (a *Alerting) AddAlertType(system, subsystem string) AlertType {
	id := AlertType{System: system, Subsystem: subsystem}

	a.lk.Lock()
	defer a.lk.Unlock()

	if _, known := a.alerts[id]; !known {
		a.alerts[id] = Alert{
			Type:        id,
			Active:      false,
			journalType: a.j.RegisterEventType(system, subsystem),
		}
	}
	return id
}
func (a *Alerting) update(at AlertType, message interface{}, upd func(Alert, json.RawMessage) Alert) {
a.lk.Lock()
defer a.lk.Unlock()
alert, ok := a.alerts[at]
if !ok {
log.Errorw("unknown alert", "type", at, "message", message)
}
rawMsg, err := json.Marshal(message)
if err != nil {
log.Errorw("marshaling alert message failed", "type", at, "error", err)
rawMsg, err = json.Marshal(&struct {
AlertError string
}{
AlertError: err.Error(),
})
log.Errorw("marshaling error failed", "type", at, "error", err)
2021-08-17 12:39:36 +00:00
}
a.alerts[at] = upd(alert, rawMsg)
}
2021-08-18 12:41:13 +00:00
// Raise marks the alert condition as active and records related event in the journal
2021-08-17 12:39:36 +00:00
func (a *Alerting) Raise(at AlertType, message interface{}) {
log.Errorw("alert raised", "type", at, "message", message)
a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
alert.Active = true
alert.LastActive = &AlertEvent{
2021-08-18 12:46:06 +00:00
Type: "raised",
2021-08-17 12:39:36 +00:00
Message: rawMsg,
Time: time.Now(),
}
a.j.RecordEvent(alert.journalType, func() interface{} {
return alert.LastActive
})
return alert
})
}
2021-08-18 12:41:13 +00:00
// Resolve marks the alert condition as resolved and records related event in the journal
2021-08-17 12:39:36 +00:00
func (a *Alerting) Resolve(at AlertType, message interface{}) {
log.Errorw("alert resolved", "type", at, "message", message)
a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
alert.Active = false
alert.LastResolved = &AlertEvent{
2021-08-18 12:46:06 +00:00
Type: "resolved",
2021-08-17 12:39:36 +00:00
Message: rawMsg,
Time: time.Now(),
}
a.j.RecordEvent(alert.journalType, func() interface{} {
2021-08-18 12:46:06 +00:00
return alert.LastResolved
2021-08-17 12:39:36 +00:00
})
return alert
})
}
2021-08-17 12:51:54 +00:00
2021-08-18 12:41:13 +00:00
// GetAlerts returns all registered (active and inactive) alerts, sorted by system and then subsystem
2021-08-17 12:51:54 +00:00
func (a *Alerting) GetAlerts() []Alert {
	a.lk.Lock()
	defer a.lk.Unlock()

	// Snapshot the map values; map iteration order is unspecified, so the
	// snapshot is ordered explicitly below.
	list := make([]Alert, 0, len(a.alerts))
	for _, entry := range a.alerts {
		list = append(list, entry)
	}

	// Order by system first, then by subsystem within the same system.
	sort.Slice(list, func(x, y int) bool {
		l, r := list[x].Type, list[y].Type
		if l.System == r.System {
			return l.Subsystem < r.Subsystem
		}
		return l.System < r.System
	})

	return list
}
// IsRaised reports whether the alert identified by at is currently active.
// Alert types with no stored state are reported as not raised.
func (a *Alerting) IsRaised(at AlertType) bool {
	a.lk.Lock()
	defer a.lk.Unlock()

	// A missing key yields the zero Alert, whose Active field is false.
	state := a.alerts[at]
	return state.Active
}