From 444d0b1b8a8faa5035046574c6e2e6a92c3cfca9 Mon Sep 17 00:00:00 2001 From: Cory Schwartz Date: Fri, 20 May 2022 18:38:17 -0700 Subject: [PATCH 1/6] add healthz and livez endpoints --- cmd/lotus-gateway/main.go | 2 +- gateway/handler.go | 11 ++++-- node/health.go | 81 +++++++++++++++++++++++++++++++++++++++ node/rpc.go | 2 + 4 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 node/health.go diff --git a/cmd/lotus-gateway/main.go b/cmd/lotus-gateway/main.go index cfda02d86..c49e8d532 100644 --- a/cmd/lotus-gateway/main.go +++ b/cmd/lotus-gateway/main.go @@ -174,7 +174,7 @@ var runCmd = &cli.Command{ } gwapi := gateway.NewNode(api, lookbackCap, waitLookback) - h, err := gateway.Handler(gwapi, serverOptions...) + h, err := gateway.Handler(gwapi, api, serverOptions...) if err != nil { return xerrors.Errorf("failed to set up gateway HTTP handler") } diff --git a/gateway/handler.go b/gateway/handler.go index f8da5a5e1..3e88ba214 100644 --- a/gateway/handler.go +++ b/gateway/handler.go @@ -5,16 +5,17 @@ import ( "contrib.go.opencensus.io/exporter/prometheus" "github.com/filecoin-project/go-jsonrpc" - "github.com/filecoin-project/lotus/api" + lapi "github.com/filecoin-project/lotus/api" "github.com/filecoin-project/lotus/api/v0api" "github.com/filecoin-project/lotus/api/v1api" "github.com/filecoin-project/lotus/metrics/proxy" + "github.com/filecoin-project/lotus/node" "github.com/gorilla/mux" promclient "github.com/prometheus/client_golang/prometheus" ) // Handler returns a gateway http.Handler, to be mounted as-is on the server. 
-func Handler(a api.Gateway, opts ...jsonrpc.ServerOption) (http.Handler, error) { +func Handler(gwapi lapi.Gateway, api lapi.FullNode, opts ...jsonrpc.ServerOption) (http.Handler, error) { m := mux.NewRouter() serveRpc := func(path string, hnd interface{}) { @@ -23,10 +24,10 @@ func Handler(a api.Gateway, opts ...jsonrpc.ServerOption) (http.Handler, error) m.Handle(path, rpcServer) } - ma := proxy.MetricedGatewayAPI(a) + ma := proxy.MetricedGatewayAPI(gwapi) serveRpc("/rpc/v1", ma) - serveRpc("/rpc/v0", api.Wrap(new(v1api.FullNodeStruct), new(v0api.WrapperV1Full), ma)) + serveRpc("/rpc/v0", lapi.Wrap(new(v1api.FullNodeStruct), new(v0api.WrapperV1Full), ma)) registry := promclient.DefaultRegisterer.(*promclient.Registry) exporter, err := prometheus.NewExporter(prometheus.Options{ @@ -37,6 +38,8 @@ func Handler(a api.Gateway, opts ...jsonrpc.ServerOption) (http.Handler, error) return nil, err } m.Handle("/debug/metrics", exporter) + m.Handle("/health/livez", node.NewLiveHandler(api)) + m.Handle("/health/readyz", node.NewReadyHandler(api)) m.PathPrefix("/").Handler(http.DefaultServeMux) /*ah := &auth.Handler{ diff --git a/node/health.go b/node/health.go new file mode 100644 index 000000000..a92997a61 --- /dev/null +++ b/node/health.go @@ -0,0 +1,81 @@ +package node + +import ( + "context" + "net/http" + "time" + + lapi "github.com/filecoin-project/lotus/api" + "github.com/libp2p/go-libp2p-core/network" +) + +type HealthHandler struct { + healthy bool +} + +func (h *HealthHandler) SetHealthy(healthy bool) { + h.healthy = healthy +} + +func (h *HealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if !h.healthy { + w.WriteHeader(http.StatusServiceUnavailable) + return + } + w.WriteHeader(http.StatusOK) +} + +// The backend is considered alive so long as there have been recent +// head changes. Being alive doesn't mean we are up to date, just moving. 
+func NewLiveHandler(api lapi.FullNode) *HealthHandler { + ctx := context.Background() + h := HealthHandler{} + go func() { + const reset = 5 + var countdown = 0 + minutely := time.NewTicker(time.Minute) + headCh, err := api.ChainNotify(ctx) + if err != nil { + //TODO + } + for { + select { + case <-minutely.C: + countdown = countdown - 1 + if countdown == 0 { + h.SetHealthy(false) + } + case <-headCh: + countdown = reset + h.SetHealthy(true) + } + } + }() + return &h +} + +// Check if we are ready to handle traffic. +// 1. sync workers are caught up. +// 2 +func NewReadyHandler(api lapi.FullNode) *HealthHandler { + ctx := context.Background() + h := HealthHandler{} + go func() { + const heightTolerance = uint64(5) + var nethealth, synchealth bool + minutely := time.NewTicker(time.Minute) + for { + select { + case <-minutely.C: + netstat, err := api.NetAutoNatStatus(ctx) + nethealth = err == nil && netstat.Reachability != network.ReachabilityUnknown + + nodestat, err := api.NodeStatus(ctx, false) + synchealth = err == nil && nodestat.SyncStatus.Behind < heightTolerance + + h.SetHealthy(nethealth && synchealth) + } + } + }() + return &h +} diff --git a/node/rpc.go b/node/rpc.go index 6a3e55115..161bea3d3 100644 --- a/node/rpc.go +++ b/node/rpc.go @@ -114,6 +114,8 @@ func FullNodeHandler(a v1api.FullNode, permissioned bool, opts ...jsonrpc.Server m.Handle("/debug/pprof-set/mutex", handleFractionOpt("MutexProfileFraction", func(x int) { runtime.SetMutexProfileFraction(x) })) + m.Handle("/health/livez", NewLiveHandler(a)) + m.Handle("/health/readyz", NewReadyHandler(a)) m.PathPrefix("/").Handler(http.DefaultServeMux) // pprof return m, nil From 45d15cb8ee83886471e2e74c9cb4fdbc30618a7b Mon Sep 17 00:00:00 2001 From: Cory Schwartz Date: Mon, 23 May 2022 09:29:11 -0700 Subject: [PATCH 2/6] handle error during liveness check --- node/health.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/node/health.go b/node/health.go index a92997a61..99c3e03ae 
100644 --- a/node/health.go +++ b/node/health.go @@ -6,9 +6,12 @@ import ( "time" lapi "github.com/filecoin-project/lotus/api" + logging "github.com/ipfs/go-log/v2" "github.com/libp2p/go-libp2p-core/network" ) +var healthlog = logging.Logger("healthcheck") + type HealthHandler struct { healthy bool } @@ -36,7 +39,9 @@ func NewLiveHandler(api lapi.FullNode) *HealthHandler { minutely := time.NewTicker(time.Minute) headCh, err := api.ChainNotify(ctx) if err != nil { - //TODO + healthlog.Warnf("failed to instantiate chain notify channel; liveliness cannot be determined. %s", err) + h.SetHealthy(false) + return } for { select { From b4852038a1dc453229395baa96d4889700d816be Mon Sep 17 00:00:00 2001 From: Cory Schwartz Date: Mon, 23 May 2022 11:11:45 -0700 Subject: [PATCH 3/6] Update node/health.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Ɓukasz Magiera --- node/health.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/node/health.go b/node/health.go index a92997a61..9e9f3bb80 100644 --- a/node/health.go +++ b/node/health.go @@ -10,15 +10,19 @@ import ( ) type HealthHandler struct { - healthy bool + healthy int32 } func (h *HealthHandler) SetHealthy(healthy bool) { - h.healthy = healthy + h := int32(0) + if healthy { + h = 1 + } + atomic.StoreInt32(&h.healthy, h) } func (h *HealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { - if !h.healthy { + if atomic.LoadInt32(&h.healthy) != 1 { w.WriteHeader(http.StatusServiceUnavailable) return } From 7d55ab07342f450aa9652ae738bd7966177a0030 Mon Sep 17 00:00:00 2001 From: Cory Schwartz Date: Mon, 23 May 2022 12:04:13 -0700 Subject: [PATCH 4/6] make atomic --- node/health.go | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/node/health.go b/node/health.go index 3a7f759e3..a61d76164 100644 --- a/node/health.go +++ b/node/health.go @@ -3,6 +3,7 @@ package node import ( "context" 
"net/http" + "sync/atomic" "time" lapi "github.com/filecoin-project/lotus/api" @@ -17,11 +18,11 @@ type HealthHandler struct { } func (h *HealthHandler) SetHealthy(healthy bool) { - h := int32(0) + var hi32 int32 if healthy { - h = 1 + hi32 = 1 } - atomic.StoreInt32(&h.healthy, h) + atomic.StoreInt32(&h.healthy, hi32) } func (h *HealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { @@ -32,30 +33,30 @@ func (h *HealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) } -// The backend is considered alive so long as there have been recent -// head changes. Being alive doesn't mean we are up to date, just moving. +// Check that the node is still working. That is, that it's still processing the chain. +// If there have been no recent changes, consider the node to be dead. func NewLiveHandler(api lapi.FullNode) *HealthHandler { ctx := context.Background() h := HealthHandler{} go func() { - const reset = 5 - var countdown = 0 + const reset int32 = 5 + var countdown int32 = 0 minutely := time.NewTicker(time.Minute) headCh, err := api.ChainNotify(ctx) if err != nil { - healthlog.Warnf("failed to instantiate chain notify channel; liveliness cannot be determined. %s", err) + healthlog.Warnf("failed to instantiate chain notify channel; liveness cannot be determined. %s", err) h.SetHealthy(false) return } for { select { case <-minutely.C: - countdown = countdown - 1 - if countdown == 0 { + atomic.AddInt32(&countdown, -1) + if countdown <= 0 { h.SetHealthy(false) } case <-headCh: - countdown = reset + atomic.StoreInt32(&countdown, reset) h.SetHealthy(true) } } @@ -64,8 +65,8 @@ func NewLiveHandler(api lapi.FullNode) *HealthHandler { } // Check if we are ready to handle traffic. -// 1. sync workers are caught up. -// 2 +// 1. sync workers are reasonably up to date. +// 2. 
libp2p is serviceable func NewReadyHandler(api lapi.FullNode) *HealthHandler { ctx := context.Background() h := HealthHandler{} From 2b0d0ce224f8db40ab23008f82b8a81d11517a9e Mon Sep 17 00:00:00 2001 From: Cory Schwartz Date: Mon, 23 May 2022 14:59:57 -0700 Subject: [PATCH 5/6] fix itests --- itests/gateway_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itests/gateway_test.go b/itests/gateway_test.go index 249112b1f..593ec17e9 100644 --- a/itests/gateway_test.go +++ b/itests/gateway_test.go @@ -291,7 +291,7 @@ func startNodes( // Create a gateway server in front of the full node gwapi := gateway.NewNode(full, lookbackCap, stateWaitLookbackLimit) - handler, err := gateway.Handler(gwapi) + handler, err := gateway.Handler(gwapi, full) require.NoError(t, err) l, err := net.Listen("tcp", "127.0.0.1:0") From d2299dfbf8cf9a7e59de5b84aad80d4854692a88 Mon Sep 17 00:00:00 2001 From: Cory Schwartz Date: Tue, 24 May 2022 10:24:37 -0700 Subject: [PATCH 6/6] backoff/reconnect loop --- node/health.go | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/node/health.go b/node/health.go index a61d76164..7291e7bc8 100644 --- a/node/health.go +++ b/node/health.go @@ -39,23 +39,49 @@ func NewLiveHandler(api lapi.FullNode) *HealthHandler { ctx := context.Background() h := HealthHandler{} go func() { - const reset int32 = 5 - var countdown int32 = 0 + const ( + reset int32 = 5 + maxbackoff time.Duration = time.Minute + minbackoff time.Duration = time.Second + ) + var ( + countdown int32 + headCh <-chan []*lapi.HeadChange + backoff time.Duration = minbackoff + err error + ) minutely := time.NewTicker(time.Minute) - headCh, err := api.ChainNotify(ctx) - if err != nil { - healthlog.Warnf("failed to instantiate chain notify channel; liveness cannot be determined. 
%s", err) - h.SetHealthy(false) - return - } for { + if headCh == nil { + healthlog.Infof("waiting %v before starting ChainNotify channel", backoff) + <-time.After(backoff) + headCh, err = api.ChainNotify(ctx) + if err != nil { + healthlog.Warnf("failed to instantiate ChainNotify channel; cannot determine liveness. %s", err) + h.SetHealthy(false) + nextbackoff := 2 * backoff + if nextbackoff > maxbackoff { + nextbackoff = maxbackoff + } + backoff = nextbackoff + continue + } else { + healthlog.Infof("started ChainNotify channel") + backoff = minbackoff + } + } select { case <-minutely.C: atomic.AddInt32(&countdown, -1) if countdown <= 0 { h.SetHealthy(false) } - case <-headCh: + case _, ok := <-headCh: + if !ok { // channel is closed, enter reconnect loop. + h.SetHealthy(false) + headCh = nil + continue + } atomic.StoreInt32(&countdown, reset) h.SetHealthy(true) }