Merge pull request #1294 from natewalck/more-opencensus-metrics

More opencensus metrics
This commit is contained in:
Whyrusleeping 2020-03-03 12:02:13 -08:00 committed by GitHub
commit 41bf668189
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 166 additions and 19 deletions

View File

@ -13,6 +13,8 @@ import (
"github.com/filecoin-project/lotus/build"
"github.com/filecoin-project/lotus/chain/state"
"github.com/filecoin-project/lotus/chain/vm"
"github.com/filecoin-project/lotus/metrics"
"go.opencensus.io/stats"
"go.opencensus.io/trace"
"go.uber.org/multierr"
@ -100,7 +102,15 @@ func NewChainStore(bs bstore.Blockstore, ds dstore.Batching, vmcalls *types.VMSy
return nil
}
cs.headChangeNotifs = append(cs.headChangeNotifs, hcnf)
// hcmetric is a head-change callback that publishes chain height as a metric.
// It records metrics.ChainNodeHeight once for every tipset in app and never
// fails; rev is not used.
hcmetric := func(rev, app []*types.TipSet) error {
	// The callback receives no context, so measurements are recorded
	// against a fresh background context.
	bg := context.Background()
	for i := range app {
		height := int64(app[i].Height())
		stats.Record(bg, metrics.ChainNodeHeight.M(height))
	}
	return nil
}
cs.headChangeNotifs = append(cs.headChangeNotifs, hcnf, hcmetric)
return cs
}

View File

@ -2,6 +2,7 @@ package sub
import (
"context"
"fmt"
"time"
lru "github.com/hashicorp/golang-lru"
@ -10,11 +11,14 @@ import (
connmgr "github.com/libp2p/go-libp2p-core/connmgr"
peer "github.com/libp2p/go-libp2p-peer"
pubsub "github.com/libp2p/go-libp2p-pubsub"
"go.opencensus.io/stats"
"go.opencensus.io/tag"
"github.com/filecoin-project/lotus/build"
"github.com/filecoin-project/lotus/chain"
"github.com/filecoin-project/lotus/chain/messagepool"
"github.com/filecoin-project/lotus/chain/types"
"github.com/filecoin-project/lotus/metrics"
)
var log = logging.Logger("sub")
@ -107,15 +111,25 @@ func (bv *BlockValidator) flagPeer(p peer.ID) {
}
func (bv *BlockValidator) Validate(ctx context.Context, pid peer.ID, msg *pubsub.Message) bool {
stats.Record(ctx, metrics.BlockReceived.M(1))
ctx, _ = tag.New(
ctx,
tag.Insert(metrics.PeerID, pid.String()),
tag.Insert(metrics.ReceivedFrom, msg.ReceivedFrom.String()),
)
blk, err := types.DecodeBlockMsg(msg.GetData())
if err != nil {
log.Error("got invalid block over pubsub: ", err)
ctx, _ = tag.New(ctx, tag.Insert(metrics.FailureType, "invalid"))
stats.Record(ctx, metrics.BlockValidationFailure.M(1))
bv.flagPeer(pid)
return false
}
if len(blk.BlsMessages)+len(blk.SecpkMessages) > build.BlockMessageLimit {
log.Warnf("received block with too many messages over pubsub")
ctx, _ = tag.New(ctx, tag.Insert(metrics.FailureType, "too_many_messages"))
stats.Record(ctx, metrics.BlockValidationFailure.M(1))
bv.flagPeer(pid)
return false
}
@ -127,6 +141,7 @@ func (bv *BlockValidator) Validate(ctx context.Context, pid peer.ID, msg *pubsub
}
msg.ValidatorData = blk
stats.Record(ctx, metrics.BlockValidationSuccess.M(1))
return true
}
@ -162,17 +177,29 @@ func NewMessageValidator(mp *messagepool.MessagePool) *MessageValidator {
}
// Validate checks a signed message received over pubsub and reports whether
// it was accepted.
//
// The message payload is decoded and then added to the validator's message
// pool; true is returned only if both steps succeed. Metrics recorded:
//   - metrics.MessageReceived for every call, before any validation;
//   - metrics.MessageValidationFailure with failure_type "decode" or "add";
//   - metrics.MessageValidationSuccess when the pool accepts the message.
func (mv *MessageValidator) Validate(ctx context.Context, pid peer.ID, msg *pubsub.Message) bool {
	// Count the message first so that undecodable payloads are included.
	stats.Record(ctx, metrics.MessageReceived.M(1))
	ctx, _ = tag.New(ctx, tag.Insert(metrics.PeerID, pid.String()))

	signed, decodeErr := types.DecodeSignedMessage(msg.Message.GetData())
	if decodeErr != nil {
		log.Warnf("failed to decode incoming message: %s", decodeErr)
		ctx, _ = tag.New(ctx, tag.Insert(metrics.FailureType, "decode"))
		stats.Record(ctx, metrics.MessageValidationFailure.M(1))
		return false
	}

	addErr := mv.mpool.Add(signed)
	if addErr == nil {
		stats.Record(ctx, metrics.MessageValidationSuccess.M(1))
		return true
	}

	// The pool rejected the message: log it and tag the failure with the
	// message's sender, recipient and nonce.
	log.Warnf("failed to add message from network to message pool (From: %s, To: %s, Nonce: %d, Value: %s): %s", signed.Message.From, signed.Message.To, signed.Message.Nonce, types.FIL(signed.Message.Value), addErr)
	failureTags := []tag.Mutator{
		tag.Insert(metrics.MessageFrom, signed.Message.From.String()),
		tag.Insert(metrics.MessageTo, signed.Message.To.String()),
		tag.Insert(metrics.MessageNonce, fmt.Sprint(signed.Message.Nonce)),
		tag.Insert(metrics.FailureType, "add"),
	}
	ctx, _ = tag.New(ctx, failureTags...)
	stats.Record(ctx, metrics.MessageValidationFailure.M(1))
	return false
}

View File

@ -23,6 +23,7 @@ import (
"github.com/libp2p/go-libp2p-core/peer"
cbg "github.com/whyrusleeping/cbor-gen"
"github.com/whyrusleeping/pubsub"
"go.opencensus.io/stats"
"go.opencensus.io/trace"
"golang.org/x/xerrors"
@ -38,6 +39,7 @@ import (
"github.com/filecoin-project/lotus/chain/store"
"github.com/filecoin-project/lotus/chain/types"
"github.com/filecoin-project/lotus/lib/sigs"
"github.com/filecoin-project/lotus/metrics"
)
var log = logging.Logger("chain")
@ -1038,6 +1040,7 @@ func (syncer *Syncer) syncMessagesAndCheckState(ctx context.Context, headers []*
return xerrors.Errorf("message processing failed: %w", err)
}
stats.Record(ctx, metrics.ChainNodeWorkerHeight.M(int64(fts.TipSet().Height())))
ss.SetHeight(fts.TipSet().Height())
return nil

View File

@ -24,6 +24,7 @@ import (
"github.com/filecoin-project/lotus/chain/stmgr"
"github.com/filecoin-project/lotus/chain/store"
"github.com/filecoin-project/lotus/chain/vm"
"github.com/filecoin-project/lotus/metrics"
"github.com/filecoin-project/lotus/node"
"github.com/filecoin-project/lotus/node/modules"
"github.com/filecoin-project/lotus/node/modules/testing"
@ -36,12 +37,6 @@ const (
preSealedSectorsFlag = "genesis-presealed-sectors"
)
var (
lotusInfo = stats.Int64("info", "Arbitrary counter to tag lotus info to", stats.UnitDimensionless)
version, _ = tag.NewKey("version")
commit, _ = tag.NewKey("commit")
)
// DaemonCmd is the `go-lotus daemon` command
var DaemonCmd = &cli.Command{
Name: "daemon",
@ -99,7 +94,7 @@ var DaemonCmd = &cli.Command{
defer pprof.StopCPUProfile()
}
ctx, _ := tag.New(context.Background(), tag.Insert(version, build.BuildVersion), tag.Insert(commit, build.CurrentCommit))
ctx, _ := tag.New(context.Background(), tag.Insert(metrics.Version, build.BuildVersion), tag.Insert(metrics.Commit, build.CurrentCommit))
{
dir, err := homedir.Expand(cctx.String("repo"))
if err != nil {
@ -180,21 +175,15 @@ var DaemonCmd = &cli.Command{
return xerrors.Errorf("initializing node: %w", err)
}
// We are using this metric to tag info about lotus even though
// it doesn't contain any actual metrics
// Register all metric views
if err = view.Register(
&view.View{
Name: "info",
Description: "Lotus node information",
Measure: lotusInfo,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{version, commit},
},
metrics.DefaultViews...,
); err != nil {
log.Fatalf("Cannot register the view: %v", err)
}
// Set the metric to one so it is published to the exporter
stats.Record(ctx, lotusInfo.M(1))
stats.Record(ctx, metrics.LotusInfo.M(1))
endpoint, err := r.APIEndpoint()
if err != nil {

View File

@ -9,6 +9,9 @@ import (
"io"
"reflect"
"github.com/filecoin-project/lotus/metrics"
"go.opencensus.io/stats"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
"go.opencensus.io/trace/propagation"
"golang.org/x/xerrors"
@ -151,18 +154,22 @@ func (handlers) getSpan(ctx context.Context, req request) (context.Context, *tra
}
func (h handlers) handle(ctx context.Context, req request, w func(func(io.Writer)), rpcError rpcErrFunc, done func(keepCtx bool), chOut chanOut) {
// Not sure if we need to sanitize the incoming req.Method or not.
ctx, span := h.getSpan(ctx, req)
ctx, _ = tag.New(ctx, tag.Insert(metrics.RPCMethod, req.Method))
defer span.End()
handler, ok := h[req.Method]
if !ok {
rpcError(w, &req, rpcMethodNotFound, fmt.Errorf("method '%s' not found", req.Method))
stats.Record(ctx, metrics.RPCInvalidMethod.M(1))
done(false)
return
}
if len(req.Params) != handler.nParams {
rpcError(w, &req, rpcInvalidParams, fmt.Errorf("wrong param count"))
stats.Record(ctx, metrics.RPCRequestError.M(1))
done(false)
return
}
@ -172,6 +179,7 @@ func (h handlers) handle(ctx context.Context, req request, w func(func(io.Writer
if chOut == nil && outCh {
rpcError(w, &req, rpcMethodNotFound, fmt.Errorf("method '%s' not supported in this mode (no out channel support)", req.Method))
stats.Record(ctx, metrics.RPCRequestError.M(1))
return
}
@ -185,6 +193,7 @@ func (h handlers) handle(ctx context.Context, req request, w func(func(io.Writer
rp := reflect.New(handler.paramReceivers[i])
if err := json.NewDecoder(bytes.NewReader(req.Params[i].data)).Decode(rp.Interface()); err != nil {
rpcError(w, &req, rpcParseError, xerrors.Errorf("unmarshaling params for '%s': %w", handler.handlerFunc, err))
stats.Record(ctx, metrics.RPCRequestError.M(1))
return
}
@ -196,6 +205,7 @@ func (h handlers) handle(ctx context.Context, req request, w func(func(io.Writer
callResult, err := doCall(req.Method, handler.handlerFunc, callParams)
if err != nil {
rpcError(w, &req, 0, xerrors.Errorf("fatal error calling '%s': %w", req.Method, err))
stats.Record(ctx, metrics.RPCRequestError.M(1))
return
}
if req.ID == nil {
@ -213,6 +223,7 @@ func (h handlers) handle(ctx context.Context, req request, w func(func(io.Writer
err := callResult[handler.errOut].Interface()
if err != nil {
log.Warnf("error in RPC call to '%s': %+v", req.Method, err)
stats.Record(ctx, metrics.RPCResponseError.M(1))
resp.Error = &respError{
Code: 1,
Message: err.(error).Error(),
@ -234,6 +245,7 @@ func (h handlers) handle(ctx context.Context, req request, w func(func(io.Writer
}
log.Warnf("failed to setup channel in RPC call to '%s': %+v", req.Method, err)
stats.Record(ctx, metrics.RPCResponseError.M(1))
resp.Error = &respError{
Code: 1,
Message: err.(error).Error(),
@ -243,6 +255,7 @@ func (h handlers) handle(ctx context.Context, req request, w func(func(io.Writer
w(func(w io.Writer) {
if err := json.NewEncoder(w).Encode(resp); err != nil {
log.Error(err)
stats.Record(ctx, metrics.RPCResponseError.M(1))
return
}
})

102
metrics/metrics.go Normal file
View File

@ -0,0 +1,102 @@
package metrics
import (
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)
// Global Tags
//
// Tag keys attached to the measures declared in this package:
//   - Version, Commit: label the LotusInfo view.
//   - RPCMethod: labels the RPC views.
//   - PeerID, ReceivedFrom, FailureType: label block validation failures.
//   - FailureType, MessageFrom, MessageTo, MessageNonce: label message
//     validation failures.
//
// The keys are built with tag.MustNewKey, so an invalid key name panics
// during package initialisation rather than being silently discarded as it
// would be with an ignored tag.NewKey error.
var (
	Version      = tag.MustNewKey("version")
	Commit       = tag.MustNewKey("commit")
	RPCMethod    = tag.MustNewKey("method")
	PeerID       = tag.MustNewKey("peer_id")
	FailureType  = tag.MustNewKey("failure_type")
	MessageFrom  = tag.MustNewKey("message_from")
	MessageTo    = tag.MustNewKey("message_to")
	MessageNonce = tag.MustNewKey("message_nonce")
	ReceivedFrom = tag.MustNewKey("received_from")
)
// Measures
//
// Every measure is an int64 with a dimensionless unit. Each one is exposed
// through the view of the same Measure in DefaultViews.
var (
// LotusInfo carries no meaningful value of its own; it exists so that the
// version and commit tags have a measure to be attached to.
LotusInfo = stats.Int64("info", "Arbitrary counter to tag lotus info to", stats.UnitDimensionless)
// Chain height gauges.
ChainNodeHeight = stats.Int64("chain/node_height", "Current Height of the node", stats.UnitDimensionless)
ChainNodeWorkerHeight = stats.Int64("chain/node_worker_height", "Current Height of workers on the node", stats.UnitDimensionless)
// Message validation event counters.
MessageReceived = stats.Int64("message/received", "Counter for total received messages", stats.UnitDimensionless)
MessageValidationFailure = stats.Int64("message/failure", "Counter for message validation failures", stats.UnitDimensionless)
MessageValidationSuccess = stats.Int64("message/success", "Counter for message validation successes", stats.UnitDimensionless)
// Block validation event counters.
BlockReceived = stats.Int64("block/received", "Counter for total received blocks", stats.UnitDimensionless)
BlockValidationFailure = stats.Int64("block/failure", "Counter for block validation failures", stats.UnitDimensionless)
BlockValidationSuccess = stats.Int64("block/success", "Counter for block validation successes", stats.UnitDimensionless)
// Peer gauge.
PeerCount = stats.Int64("peer/count", "Current number of FIL peers", stats.UnitDimensionless)
// RPC error counters.
RPCInvalidMethod = stats.Int64("rpc/invalid_method", "Total number of invalid RPC methods called", stats.UnitDimensionless)
RPCRequestError = stats.Int64("rpc/request_error", "Total number of request errors handled", stats.UnitDimensionless)
RPCResponseError = stats.Int64("rpc/response_error", "Total number of responses errors handled", stats.UnitDimensionless)
)
// DefaultViews is the list of OpenCensus views for metric gathering purposes.
// It contains one view for each measure declared in this package and is meant
// to be passed to view.Register as DefaultViews... .
//
// Only the "info" view sets Name and Description explicitly; the other views
// leave them empty and rely on OpenCensus defaulting them from the Measure.
var DefaultViews = []*view.View{
	{
		Name:        "info",
		Description: "Lotus node information",
		Measure:     LotusInfo,
		Aggregation: view.LastValue(),
		TagKeys:     []tag.Key{Version, Commit},
	},
	{
		Measure:     ChainNodeHeight,
		Aggregation: view.LastValue(),
	},
	{
		Measure:     ChainNodeWorkerHeight,
		Aggregation: view.LastValue(),
	},
	{
		Measure:     BlockReceived,
		Aggregation: view.Count(),
	},
	// NOTE(review): PeerID and ReceivedFrom take a distinct value per remote
	// peer, so this view keeps one row per (failure type, peer, sender)
	// combination. Confirm that this cardinality is acceptable.
	{
		Measure:     BlockValidationFailure,
		Aggregation: view.Count(),
		TagKeys:     []tag.Key{FailureType, PeerID, ReceivedFrom},
	},
	{
		Measure:     BlockValidationSuccess,
		Aggregation: view.Count(),
	},
	{
		Measure:     MessageReceived,
		Aggregation: view.Count(),
	},
	// NOTE(review): MessageFrom, MessageTo and MessageNonce are taken from the
	// rejected message itself, so the number of distinct tag combinations is
	// effectively unbounded and driven by network input. Consider dropping
	// these keys from the view if the row count becomes a problem.
	{
		Measure:     MessageValidationFailure,
		Aggregation: view.Count(),
		TagKeys:     []tag.Key{FailureType, MessageFrom, MessageTo, MessageNonce},
	},
	{
		Measure:     MessageValidationSuccess,
		Aggregation: view.Count(),
	},
	{
		Measure:     PeerCount,
		Aggregation: view.LastValue(),
	},
	// All RPC related metrics should at the very least tag the RPCMethod
	{
		Measure:     RPCInvalidMethod,
		Aggregation: view.Count(),
		TagKeys:     []tag.Key{RPCMethod},
	},
	{
		Measure:     RPCRequestError,
		Aggregation: view.Count(),
		TagKeys:     []tag.Key{RPCMethod},
	},
	{
		Measure:     RPCResponseError,
		Aggregation: view.Count(),
		TagKeys:     []tag.Key{RPCMethod},
	},
}

View File

@ -5,7 +5,9 @@ import (
"sync"
"time"
"github.com/filecoin-project/lotus/metrics"
"github.com/filecoin-project/lotus/node/modules/dtypes"
"go.opencensus.io/stats"
"go.uber.org/fx"
host "github.com/libp2p/go-libp2p-core/host"
@ -115,6 +117,7 @@ func (pmgr *PeerMgr) Run(ctx context.Context) {
} else if pcount > pmgr.maxFilPeers {
log.Debug("peer count about threshold: %d > %d", pcount, pmgr.maxFilPeers)
}
stats.Record(ctx, metrics.PeerCount.M(int64(pmgr.getPeerCount())))
}
}
}