2020-01-07 14:03:58 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"os"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/filecoin-project/lotus/api"
|
|
|
|
"github.com/filecoin-project/lotus/build"
|
2020-01-11 02:13:11 +00:00
|
|
|
lcli "github.com/filecoin-project/lotus/cli"
|
2020-01-07 14:03:58 +00:00
|
|
|
cid "github.com/ipfs/go-cid"
|
|
|
|
logging "github.com/ipfs/go-log"
|
|
|
|
"gopkg.in/urfave/cli.v2"
|
|
|
|
)
|
|
|
|
|
|
|
|
// CidWindow is a sequence of CID slices. In this file each entry holds the
// CIDs of one sampled chain head, appended in sampling order.
type CidWindow [][]cid.Cid
|
|
|
|
|
2020-01-11 02:13:11 +00:00
|
|
|
// log is the package-wide logger, registered under the "lotus-health" name.
var log = logging.Logger("lotus-health")
|
2020-01-07 14:03:58 +00:00
|
|
|
|
|
|
|
func main() {
|
|
|
|
logging.SetLogLevel("*", "INFO")
|
|
|
|
|
|
|
|
log.Info("Starting health agent")
|
|
|
|
|
|
|
|
local := []*cli.Command{
|
|
|
|
watchHeadCmd,
|
|
|
|
}
|
|
|
|
|
|
|
|
app := &cli.App{
|
|
|
|
Name: "lotus-health",
|
|
|
|
Usage: "Tools for monitoring lotus daemon health",
|
|
|
|
Version: build.UserVersion,
|
|
|
|
Commands: local,
|
2020-01-11 02:13:11 +00:00
|
|
|
Flags: []cli.Flag{
|
|
|
|
&cli.StringFlag{
|
|
|
|
Name: "repo",
|
|
|
|
EnvVars: []string{"LOTUS_PATH"},
|
|
|
|
Value: "~/.lotus", // TODO: Consider XDG_DATA_HOME
|
|
|
|
},
|
|
|
|
},
|
2020-01-07 14:03:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if err := app.Run(os.Args); err != nil {
|
|
|
|
log.Warn(err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
var watchHeadCmd = &cli.Command{
|
|
|
|
Name: "watch-head",
|
|
|
|
Flags: []cli.Flag{
|
|
|
|
&cli.IntFlag{
|
|
|
|
Name: "threshold",
|
|
|
|
Value: 3,
|
|
|
|
Usage: "number of times head remains unchanged before failing health check",
|
|
|
|
},
|
|
|
|
&cli.IntFlag{
|
|
|
|
Name: "interval",
|
2020-01-11 02:13:11 +00:00
|
|
|
Value: build.BlockDelay,
|
2020-01-07 14:03:58 +00:00
|
|
|
Usage: "interval in seconds between chain head checks",
|
|
|
|
},
|
|
|
|
&cli.StringFlag{
|
|
|
|
Name: "systemd-unit",
|
|
|
|
Value: "lotus-daemon.service",
|
|
|
|
Usage: "systemd unit name to restart on health check failure",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
Action: func(c *cli.Context) error {
|
|
|
|
threshold := c.Int("threshold")
|
2020-01-11 02:13:11 +00:00
|
|
|
interval := time.Duration(c.Int("interval")) * time.Second
|
2020-01-07 14:03:58 +00:00
|
|
|
name := c.String("systemd-unit")
|
|
|
|
|
|
|
|
var headCheckWindow CidWindow
|
|
|
|
|
2020-01-11 02:13:11 +00:00
|
|
|
api, closer, err := lcli.GetFullNodeAPI(c)
|
2020-01-07 14:03:58 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer closer()
|
2020-01-11 02:13:11 +00:00
|
|
|
ctx := lcli.ReqContext(c)
|
2020-01-07 14:03:58 +00:00
|
|
|
|
|
|
|
if err := WaitForSyncComplete(ctx, api); err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
ch := make(chan CidWindow, 1)
|
|
|
|
aCh := make(chan interface{}, 1)
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
for {
|
|
|
|
headCheckWindow, err = updateWindow(ctx, api, headCheckWindow, threshold, ch)
|
|
|
|
if err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
2020-01-11 02:13:11 +00:00
|
|
|
time.Sleep(interval)
|
2020-01-07 14:03:58 +00:00
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
result, err := alertHandler(name, aCh)
|
|
|
|
if err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
|
|
|
if result != "done" {
|
|
|
|
log.Fatal("systemd unit failed to restart:", result)
|
|
|
|
}
|
|
|
|
log.Info("restarting health agent")
|
|
|
|
// Exit health agent and let supervisor restart health agent
|
|
|
|
// Restarting lotus systemd unit kills api connection
|
|
|
|
os.Exit(130)
|
|
|
|
}()
|
|
|
|
|
|
|
|
for {
|
2020-01-21 01:24:33 +00:00
|
|
|
ok := checkWindow(ch, threshold)
|
2020-01-07 14:03:58 +00:00
|
|
|
if !ok {
|
|
|
|
log.Warn("chain head has not updated. Restarting systemd service")
|
|
|
|
aCh <- nil
|
|
|
|
break
|
|
|
|
}
|
|
|
|
log.Info("chain head is healthy")
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* reads channel of slices of Cids
|
|
|
|
* compares slices of Cids when len is greater or equal to `t` - threshold
|
|
|
|
* if all slices are equal, head has not updated and returns false
|
|
|
|
*/
|
|
|
|
func checkWindow(ch chan CidWindow, t int) bool {
|
|
|
|
select {
|
|
|
|
case window := <-ch:
|
|
|
|
var dup int
|
|
|
|
windowLen := len(window)
|
|
|
|
if windowLen >= t {
|
|
|
|
cidWindow:
|
|
|
|
for i, cids := range window {
|
|
|
|
next := windowLen - 1 - i
|
|
|
|
// if array length is different, head is changing
|
|
|
|
if next >= 1 && len(window[next]) != len(window[next-1]) {
|
|
|
|
break cidWindow
|
|
|
|
}
|
|
|
|
// if cids are different, head is changing
|
|
|
|
for j := range cids {
|
|
|
|
if next >= 1 && window[next][j] != window[next-1][j] {
|
|
|
|
break cidWindow
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if i < (t - 1) {
|
|
|
|
dup++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if dup == (t - 1) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2020-01-21 01:24:33 +00:00
|
|
|
* get chain head from API
|
|
|
|
* returns a slice of slices of Cids
|
|
|
|
* len of slice <= `t` - threshold
|
2020-01-07 14:03:58 +00:00
|
|
|
*/
|
|
|
|
func updateWindow(ctx context.Context, a api.FullNode, w CidWindow, t int, ch chan CidWindow) (CidWindow, error) {
|
|
|
|
head, err := a.ChainHead(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
window := appendCIDsToWindow(w, head.Cids(), t)
|
|
|
|
ch <- window
|
|
|
|
return window, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* appends slice of Cids to window slice
|
|
|
|
* keeps a fixed window slice size, dropping older slices
|
|
|
|
* returns new window
|
|
|
|
*/
|
|
|
|
func appendCIDsToWindow(w CidWindow, c []cid.Cid, t int) CidWindow {
|
|
|
|
offset := len(w) - t + 1
|
|
|
|
if offset >= 0 {
|
|
|
|
return append(w[offset:], c)
|
|
|
|
}
|
|
|
|
return append(w, c)
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* wait for node to sync
|
|
|
|
*/
|
|
|
|
func WaitForSyncComplete(ctx context.Context, napi api.FullNode) error {
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return ctx.Err()
|
|
|
|
case <-time.After(3 * time.Second):
|
|
|
|
head, err := napi.ChainHead(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if time.Now().Unix()-int64(head.MinTimestamp()) < build.BlockDelay {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|