lotus/node/modules/alerts.go

142 lines
3.5 KiB
Go

package modules
import (
"net"
"os"
"strconv"
"syscall"
"github.com/filecoin-project/lotus/journal/alerting"
"github.com/filecoin-project/lotus/lib/ulimit"
)
// CheckFdLimit returns an alerting check that reads the process's soft
// file-descriptor limit via ulimit.GetLimit and raises the
// "process"/"fd-limit" alert when the limit cannot be read or when it is
// lower than min.
//
// If ulimit.GetLimit reports ulimit.ErrUnsupported, the check only logs a
// warning and returns before the alert type is registered.
func CheckFdLimit(min uint64) func(al *alerting.Alerting) {
	return func(al *alerting.Alerting) {
		soft, _, err := ulimit.GetLimit()
		if err == ulimit.ErrUnsupported {
			log.Warn("FD limit monitoring not available")
			return
		}

		alert := al.AddAlertType("process", "fd-limit")
		if err != nil {
			al.Raise(alert, map[string]string{
				"message": "failed to get FD limit",
				"error":   err.Error(),
			})
			// soft is not meaningful when the lookup failed. Stop here so
			// that a second, misleading "soft FD limit is low" alert is not
			// raised from an unreliable value.
			return
		}

		if soft < min {
			al.Raise(alert, map[string]interface{}{
				"message":         "soft FD limit is low",
				"soft_limit":      soft,
				"recommended_min": min,
			})
		}
	}
}
// CheckUDPBufferSize returns an alerting check that dials a throwaway UDP
// connection to "localhost:0", reads the socket's SO_RCVBUF option, and
// raises the "process"/"udp-buffer-size" alert when that value is below
// wanted or when any step of the probe fails.
func CheckUDPBufferSize(wanted int) func(al *alerting.Alerting) {
	return func(al *alerting.Alerting) {
		// report calls AddAlertType only at the moment an alert is actually
		// raised, matching the lazy registration of each failure branch.
		report := func(payload interface{}) {
			al.Raise(al.AddAlertType("process", "udp-buffer-size"), payload)
		}

		c, dialErr := net.Dial("udp", "localhost:0")
		if dialErr != nil {
			report(map[string]string{
				"message": "Failed to create UDP connection",
				"error":   dialErr.Error(),
			})
			return
		}
		defer func() {
			if closeErr := c.Close(); closeErr != nil {
				log.Warnf("Failed to close connection: %s", closeErr)
			}
		}()

		udp, isUDP := c.(*net.UDPConn)
		if !isUDP {
			report(map[string]string{
				"message": "Failed to cast connection to UDPConn",
			})
			return
		}

		// The socket option is read through the descriptor of the *os.File
		// returned by UDPConn.File, which is closed separately below.
		f, fileErr := udp.File()
		if fileErr != nil {
			report(map[string]string{
				"message": "Failed to get file descriptor from UDPConn",
				"error":   fileErr.Error(),
			})
			return
		}
		defer func() {
			if closeErr := f.Close(); closeErr != nil {
				log.Warnf("Failed to close file: %s", closeErr)
			}
		}()

		fd := int(f.Fd())
		current, optErr := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
		if optErr != nil {
			report(map[string]string{
				"message": "Failed to get UDP buffer size",
				"error":   optErr.Error(),
			})
			return
		}

		if current >= wanted {
			return
		}
		report(map[string]interface{}{
			"message":      "UDP buffer size is low",
			"current_size": current,
			"wanted_size":  wanted,
			"help":         "See https://github.com/quic-go/quic-go/wiki/UDP-Buffer-Sizes for details.",
		})
	}
}
// CheckFvmConcurrency returns an alerting check that inspects the
// LOTUS_FVM_CONCURRENCY environment variable. It does nothing when the
// variable is unset, and raises the "process"/"fvm-concurrency" alert when
// the value is not an integer or is greater than 24.
func CheckFvmConcurrency() func(al *alerting.Alerting) {
	return func(al *alerting.Alerting) {
		raw, isSet := os.LookupEnv("LOTUS_FVM_CONCURRENCY")
		if !isSet {
			return
		}

		concurrency, parseErr := strconv.Atoi(raw)
		switch {
		case parseErr != nil:
			al.Raise(al.AddAlertType("process", "fvm-concurrency"), map[string]string{
				"message": "LOTUS_FVM_CONCURRENCY is not an integer",
				"error":   parseErr.Error(),
			})
		case concurrency > 24:
			// High values are flagged; see the alert message for the reason.
			al.Raise(al.AddAlertType("process", "fvm-concurrency"), map[string]interface{}{
				"message":     "LOTUS_FVM_CONCURRENCY is set to a high value that can cause chain sync panics on network migrations/upgrades",
				"set_value":   concurrency,
				"recommended": "24 or less during network upgrades",
			})
		}
	}
}
// TODO: More things:
// * Space in repo dirs (taking into account mounts)
// * Miner
// * Faulted partitions
// * Low balances
// * Market provider
// * Reachability
// * on-chain config
// * Low memory (maybe)
// * Network / sync issues