Merge pull request #8692 from filecoin-project/feat/health-endpoints
feat: networking: add healthz and livez endpoints
This commit is contained in:
commit
56bde5adf9
@ -174,7 +174,7 @@ var runCmd = &cli.Command{
|
|||||||
}
|
}
|
||||||
|
|
||||||
gwapi := gateway.NewNode(api, lookbackCap, waitLookback)
|
gwapi := gateway.NewNode(api, lookbackCap, waitLookback)
|
||||||
h, err := gateway.Handler(gwapi, serverOptions...)
|
h, err := gateway.Handler(gwapi, api, serverOptions...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return xerrors.Errorf("failed to set up gateway HTTP handler")
|
return xerrors.Errorf("failed to set up gateway HTTP handler")
|
||||||
}
|
}
|
||||||
|
@ -5,16 +5,17 @@ import (
|
|||||||
|
|
||||||
"contrib.go.opencensus.io/exporter/prometheus"
|
"contrib.go.opencensus.io/exporter/prometheus"
|
||||||
"github.com/filecoin-project/go-jsonrpc"
|
"github.com/filecoin-project/go-jsonrpc"
|
||||||
"github.com/filecoin-project/lotus/api"
|
lapi "github.com/filecoin-project/lotus/api"
|
||||||
"github.com/filecoin-project/lotus/api/v0api"
|
"github.com/filecoin-project/lotus/api/v0api"
|
||||||
"github.com/filecoin-project/lotus/api/v1api"
|
"github.com/filecoin-project/lotus/api/v1api"
|
||||||
"github.com/filecoin-project/lotus/metrics/proxy"
|
"github.com/filecoin-project/lotus/metrics/proxy"
|
||||||
|
"github.com/filecoin-project/lotus/node"
|
||||||
"github.com/gorilla/mux"
|
"github.com/gorilla/mux"
|
||||||
promclient "github.com/prometheus/client_golang/prometheus"
|
promclient "github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Handler returns a gateway http.Handler, to be mounted as-is on the server.
|
// Handler returns a gateway http.Handler, to be mounted as-is on the server.
|
||||||
func Handler(a api.Gateway, opts ...jsonrpc.ServerOption) (http.Handler, error) {
|
func Handler(gwapi lapi.Gateway, api lapi.FullNode, opts ...jsonrpc.ServerOption) (http.Handler, error) {
|
||||||
m := mux.NewRouter()
|
m := mux.NewRouter()
|
||||||
|
|
||||||
serveRpc := func(path string, hnd interface{}) {
|
serveRpc := func(path string, hnd interface{}) {
|
||||||
@ -25,10 +26,10 @@ func Handler(a api.Gateway, opts ...jsonrpc.ServerOption) (http.Handler, error)
|
|||||||
m.Handle(path, rpcServer)
|
m.Handle(path, rpcServer)
|
||||||
}
|
}
|
||||||
|
|
||||||
ma := proxy.MetricedGatewayAPI(a)
|
ma := proxy.MetricedGatewayAPI(gwapi)
|
||||||
|
|
||||||
serveRpc("/rpc/v1", ma)
|
serveRpc("/rpc/v1", ma)
|
||||||
serveRpc("/rpc/v0", api.Wrap(new(v1api.FullNodeStruct), new(v0api.WrapperV1Full), ma))
|
serveRpc("/rpc/v0", lapi.Wrap(new(v1api.FullNodeStruct), new(v0api.WrapperV1Full), ma))
|
||||||
|
|
||||||
registry := promclient.DefaultRegisterer.(*promclient.Registry)
|
registry := promclient.DefaultRegisterer.(*promclient.Registry)
|
||||||
exporter, err := prometheus.NewExporter(prometheus.Options{
|
exporter, err := prometheus.NewExporter(prometheus.Options{
|
||||||
@ -39,6 +40,8 @@ func Handler(a api.Gateway, opts ...jsonrpc.ServerOption) (http.Handler, error)
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
m.Handle("/debug/metrics", exporter)
|
m.Handle("/debug/metrics", exporter)
|
||||||
|
m.Handle("/health/livez", node.NewLiveHandler(api))
|
||||||
|
m.Handle("/health/readyz", node.NewReadyHandler(api))
|
||||||
m.PathPrefix("/").Handler(http.DefaultServeMux)
|
m.PathPrefix("/").Handler(http.DefaultServeMux)
|
||||||
|
|
||||||
/*ah := &auth.Handler{
|
/*ah := &auth.Handler{
|
||||||
|
@ -291,7 +291,7 @@ func startNodes(
|
|||||||
|
|
||||||
// Create a gateway server in front of the full node
|
// Create a gateway server in front of the full node
|
||||||
gwapi := gateway.NewNode(full, lookbackCap, stateWaitLookbackLimit)
|
gwapi := gateway.NewNode(full, lookbackCap, stateWaitLookbackLimit)
|
||||||
handler, err := gateway.Handler(gwapi)
|
handler, err := gateway.Handler(gwapi, full)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
l, err := net.Listen("tcp", "127.0.0.1:0")
|
l, err := net.Listen("tcp", "127.0.0.1:0")
|
||||||
|
117
node/health.go
Normal file
117
node/health.go
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
package node
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"net/http"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
lapi "github.com/filecoin-project/lotus/api"
|
||||||
|
logging "github.com/ipfs/go-log/v2"
|
||||||
|
"github.com/libp2p/go-libp2p-core/network"
|
||||||
|
)
|
||||||
|
|
||||||
|
var healthlog = logging.Logger("healthcheck")
|
||||||
|
|
||||||
|
// HealthHandler is an http.Handler that exposes a single boolean health state
// as an HTTP status code: 200 OK while healthy, 503 Service Unavailable
// otherwise. The zero value is usable and reports unhealthy until SetHealthy
// is called with true.
type HealthHandler struct {
	// healthy is 1 when healthy and 0 otherwise. It is only read and written
	// through sync/atomic so the flag can be updated while requests are served.
	healthy int32
}

// Compile-time check that *HealthHandler can be mounted on an HTTP router.
var _ http.Handler = (*HealthHandler)(nil)

// SetHealthy records the state that subsequent ServeHTTP calls report.
func (h *HealthHandler) SetHealthy(healthy bool) {
	var state int32
	if healthy {
		state = 1
	}
	atomic.StoreInt32(&h.healthy, state)
}

// ServeHTTP answers with a bare status code and no body: 200 OK when the
// handler is currently marked healthy, 503 Service Unavailable otherwise.
// The request itself is not inspected.
func (h *HealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	status := http.StatusServiceUnavailable
	if atomic.LoadInt32(&h.healthy) == 1 {
		status = http.StatusOK
	}
	w.WriteHeader(status)
}
|
||||||
|
|
||||||
|
// Check that the node is still working. That is, that it's still processing the chain.
|
||||||
|
// If there have been no recent changes, consider the node to be dead.
|
||||||
|
func NewLiveHandler(api lapi.FullNode) *HealthHandler {
|
||||||
|
ctx := context.Background()
|
||||||
|
h := HealthHandler{}
|
||||||
|
go func() {
|
||||||
|
const (
|
||||||
|
reset int32 = 5
|
||||||
|
maxbackoff time.Duration = time.Minute
|
||||||
|
minbackoff time.Duration = time.Second
|
||||||
|
)
|
||||||
|
var (
|
||||||
|
countdown int32
|
||||||
|
headCh <-chan []*lapi.HeadChange
|
||||||
|
backoff time.Duration = minbackoff
|
||||||
|
err error
|
||||||
|
)
|
||||||
|
minutely := time.NewTicker(time.Minute)
|
||||||
|
for {
|
||||||
|
if headCh == nil {
|
||||||
|
healthlog.Infof("waiting %v before starting ChainNotify channel", backoff)
|
||||||
|
<-time.After(backoff)
|
||||||
|
headCh, err = api.ChainNotify(ctx)
|
||||||
|
if err != nil {
|
||||||
|
healthlog.Warnf("failed to instantiate ChainNotify channel; cannot determine liveness. %s", err)
|
||||||
|
h.SetHealthy(false)
|
||||||
|
nextbackoff := 2 * backoff
|
||||||
|
if nextbackoff > maxbackoff {
|
||||||
|
nextbackoff = maxbackoff
|
||||||
|
}
|
||||||
|
backoff = nextbackoff
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
healthlog.Infof("started ChainNotify channel")
|
||||||
|
backoff = minbackoff
|
||||||
|
}
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-minutely.C:
|
||||||
|
atomic.AddInt32(&countdown, -1)
|
||||||
|
if countdown <= 0 {
|
||||||
|
h.SetHealthy(false)
|
||||||
|
}
|
||||||
|
case _, ok := <-headCh:
|
||||||
|
if !ok { // channel is closed, enter reconnect loop.
|
||||||
|
h.SetHealthy(false)
|
||||||
|
headCh = nil
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
atomic.StoreInt32(&countdown, reset)
|
||||||
|
h.SetHealthy(true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return &h
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we are ready to handle traffic.
|
||||||
|
// 1. sync workers are reasonably up to date.
|
||||||
|
// 2. libp2p is servicable
|
||||||
|
func NewReadyHandler(api lapi.FullNode) *HealthHandler {
|
||||||
|
ctx := context.Background()
|
||||||
|
h := HealthHandler{}
|
||||||
|
go func() {
|
||||||
|
const heightTolerance = uint64(5)
|
||||||
|
var nethealth, synchealth bool
|
||||||
|
minutely := time.NewTicker(time.Minute)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-minutely.C:
|
||||||
|
netstat, err := api.NetAutoNatStatus(ctx)
|
||||||
|
nethealth = err == nil && netstat.Reachability != network.ReachabilityUnknown
|
||||||
|
|
||||||
|
nodestat, err := api.NodeStatus(ctx, false)
|
||||||
|
synchealth = err == nil && nodestat.SyncStatus.Behind < heightTolerance
|
||||||
|
|
||||||
|
h.SetHealthy(nethealth && synchealth)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return &h
|
||||||
|
}
|
@ -115,6 +115,8 @@ func FullNodeHandler(a v1api.FullNode, permissioned bool, opts ...jsonrpc.Server
|
|||||||
m.Handle("/debug/pprof-set/mutex", handleFractionOpt("MutexProfileFraction", func(x int) {
|
m.Handle("/debug/pprof-set/mutex", handleFractionOpt("MutexProfileFraction", func(x int) {
|
||||||
runtime.SetMutexProfileFraction(x)
|
runtime.SetMutexProfileFraction(x)
|
||||||
}))
|
}))
|
||||||
|
m.Handle("/health/livez", NewLiveHandler(a))
|
||||||
|
m.Handle("/health/readyz", NewReadyHandler(a))
|
||||||
m.PathPrefix("/").Handler(http.DefaultServeMux) // pprof
|
m.PathPrefix("/").Handler(http.DefaultServeMux) // pprof
|
||||||
|
|
||||||
return m, nil
|
return m, nil
|
||||||
|
Loading…
Reference in New Issue
Block a user