2021-08-17 12:39:36 +00:00
|
|
|
package alerting
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/json"
|
2021-08-17 12:51:54 +00:00
|
|
|
"sort"
|
2021-08-17 12:39:36 +00:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
logging "github.com/ipfs/go-log/v2"
|
2022-06-14 15:00:51 +00:00
|
|
|
|
|
|
|
"github.com/filecoin-project/lotus/journal"
|
2021-08-17 12:39:36 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// log is the package-level logger, created under the name "alerting".
var log = logging.Logger("alerting")
|
2021-08-18 12:46:06 +00:00
|
|
|
|
2021-08-18 12:41:13 +00:00
|
|
|
// Alerting provides simple stateful alert system. Consumers can register alerts,
|
|
|
|
// which can be raised and resolved.
|
|
|
|
//
|
|
|
|
// When an alert is raised or resolved, a related journal entry is recorded.
|
2021-08-17 12:39:36 +00:00
|
|
|
type Alerting struct {
|
|
|
|
j journal.Journal
|
|
|
|
|
|
|
|
lk sync.Mutex
|
|
|
|
alerts map[AlertType]Alert
|
|
|
|
}
|
|
|
|
|
2021-08-18 12:41:13 +00:00
|
|
|
// AlertType uniquely identifies an alert by the system and subsystem it
// belongs to. The type is comparable, so it can be used as a map key.
type AlertType struct {
	System    string
	Subsystem string
}
|
|
|
|
|
2021-08-18 12:41:13 +00:00
|
|
|
// AlertEvent describes a single alert state transition.
type AlertEvent struct {
	// Type is the kind of transition: either 'raised' or 'resolved'.
	Type string
	// Message is the JSON-encoded payload attached to the transition.
	Message json.RawMessage
	// Time is the moment the transition was recorded.
	Time time.Time
}
|
|
|
|
|
|
|
|
type Alert struct {
|
|
|
|
Type AlertType
|
|
|
|
Active bool
|
|
|
|
|
|
|
|
LastActive *AlertEvent // NOTE: pointer for nullability, don't mutate the referenced object!
|
|
|
|
LastResolved *AlertEvent
|
|
|
|
|
|
|
|
journalType journal.EventType
|
|
|
|
}
|
|
|
|
|
|
|
|
func NewAlertingSystem(j journal.Journal) *Alerting {
|
|
|
|
return &Alerting{
|
|
|
|
j: j,
|
|
|
|
|
|
|
|
alerts: map[AlertType]Alert{},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (a *Alerting) AddAlertType(system, subsystem string) AlertType {
|
|
|
|
a.lk.Lock()
|
|
|
|
defer a.lk.Unlock()
|
|
|
|
|
|
|
|
at := AlertType{
|
|
|
|
System: system,
|
|
|
|
Subsystem: subsystem,
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, exists := a.alerts[at]; exists {
|
|
|
|
return at
|
|
|
|
}
|
|
|
|
|
|
|
|
et := a.j.RegisterEventType(system, subsystem)
|
|
|
|
|
|
|
|
a.alerts[at] = Alert{
|
|
|
|
Type: at,
|
|
|
|
Active: false,
|
|
|
|
journalType: et,
|
|
|
|
}
|
|
|
|
|
|
|
|
return at
|
|
|
|
}
|
|
|
|
|
|
|
|
func (a *Alerting) update(at AlertType, message interface{}, upd func(Alert, json.RawMessage) Alert) {
|
|
|
|
a.lk.Lock()
|
|
|
|
defer a.lk.Unlock()
|
|
|
|
|
|
|
|
alert, ok := a.alerts[at]
|
|
|
|
if !ok {
|
|
|
|
log.Errorw("unknown alert", "type", at, "message", message)
|
|
|
|
}
|
|
|
|
|
|
|
|
rawMsg, err := json.Marshal(message)
|
|
|
|
if err != nil {
|
|
|
|
log.Errorw("marshaling alert message failed", "type", at, "error", err)
|
|
|
|
rawMsg, err = json.Marshal(&struct {
|
|
|
|
AlertError string
|
|
|
|
}{
|
|
|
|
AlertError: err.Error(),
|
|
|
|
})
|
2024-04-30 03:49:46 +00:00
|
|
|
log.Errorw("marshaling error failed", "type", at, "error", err)
|
2021-08-17 12:39:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
a.alerts[at] = upd(alert, rawMsg)
|
|
|
|
}
|
|
|
|
|
2021-08-18 12:41:13 +00:00
|
|
|
// Raise marks the alert condition as active and records related event in the journal
|
2021-08-17 12:39:36 +00:00
|
|
|
func (a *Alerting) Raise(at AlertType, message interface{}) {
|
|
|
|
log.Errorw("alert raised", "type", at, "message", message)
|
|
|
|
|
|
|
|
a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
|
|
|
|
alert.Active = true
|
|
|
|
alert.LastActive = &AlertEvent{
|
2021-08-18 12:46:06 +00:00
|
|
|
Type: "raised",
|
2021-08-17 12:39:36 +00:00
|
|
|
Message: rawMsg,
|
|
|
|
Time: time.Now(),
|
|
|
|
}
|
|
|
|
|
|
|
|
a.j.RecordEvent(alert.journalType, func() interface{} {
|
|
|
|
return alert.LastActive
|
|
|
|
})
|
|
|
|
|
|
|
|
return alert
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2021-08-18 12:41:13 +00:00
|
|
|
// Resolve marks the alert condition as resolved and records related event in the journal
|
2021-08-17 12:39:36 +00:00
|
|
|
func (a *Alerting) Resolve(at AlertType, message interface{}) {
|
|
|
|
log.Errorw("alert resolved", "type", at, "message", message)
|
|
|
|
|
|
|
|
a.update(at, message, func(alert Alert, rawMsg json.RawMessage) Alert {
|
|
|
|
alert.Active = false
|
|
|
|
alert.LastResolved = &AlertEvent{
|
2021-08-18 12:46:06 +00:00
|
|
|
Type: "resolved",
|
2021-08-17 12:39:36 +00:00
|
|
|
Message: rawMsg,
|
|
|
|
Time: time.Now(),
|
|
|
|
}
|
|
|
|
|
|
|
|
a.j.RecordEvent(alert.journalType, func() interface{} {
|
2021-08-18 12:46:06 +00:00
|
|
|
return alert.LastResolved
|
2021-08-17 12:39:36 +00:00
|
|
|
})
|
|
|
|
|
|
|
|
return alert
|
|
|
|
})
|
|
|
|
}
|
2021-08-17 12:51:54 +00:00
|
|
|
|
2021-08-18 12:41:13 +00:00
|
|
|
// GetAlerts returns all registered (active and inactive) alerts
|
2021-08-17 12:51:54 +00:00
|
|
|
func (a *Alerting) GetAlerts() []Alert {
|
|
|
|
a.lk.Lock()
|
|
|
|
defer a.lk.Unlock()
|
|
|
|
|
|
|
|
out := make([]Alert, 0, len(a.alerts))
|
|
|
|
for _, alert := range a.alerts {
|
|
|
|
out = append(out, alert)
|
|
|
|
}
|
|
|
|
sort.Slice(out, func(i, j int) bool {
|
|
|
|
if out[i].Type.System != out[j].Type.System {
|
|
|
|
return out[i].Type.System < out[j].Type.System
|
|
|
|
}
|
|
|
|
|
|
|
|
return out[i].Type.Subsystem < out[j].Type.Subsystem
|
|
|
|
})
|
|
|
|
|
|
|
|
return out
|
|
|
|
}
|
2022-07-12 11:55:18 +00:00
|
|
|
|
|
|
|
func (a *Alerting) IsRaised(at AlertType) bool {
|
|
|
|
a.lk.Lock()
|
|
|
|
defer a.lk.Unlock()
|
|
|
|
|
|
|
|
return a.alerts[at].Active
|
|
|
|
}
|