2020-02-26 02:42:34 +00:00
package metrics
import (
2020-10-21 08:10:27 +00:00
"context"
2020-07-24 05:47:41 +00:00
"time"
2020-02-26 02:42:34 +00:00
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
2020-05-20 17:43:22 +00:00
rpcmetrics "github.com/filecoin-project/go-jsonrpc/metrics"
2021-02-28 22:48:36 +00:00
"github.com/filecoin-project/lotus/blockstore"
2020-02-26 02:42:34 +00:00
)
2020-07-24 05:47:41 +00:00
// Distribution
2021-02-28 22:48:36 +00:00
// defaultMillisecondsDistribution is the histogram aggregation shared by the
// general-purpose duration views in this file. Bucket boundaries are given in
// milliseconds and run from 0.01 up to 100000.
var defaultMillisecondsDistribution = view.Distribution(
	// below 1ms
	0.01, 0.05, 0.1, 0.3, 0.6, 0.8,
	// 1ms - 10ms
	1, 2, 3, 4, 5, 6, 8, 10,
	// 13ms - 100ms
	13, 16, 20, 25, 30, 40, 50, 65, 80, 100,
	// 130ms - 1s
	130, 160, 200, 250, 300, 400, 500, 650, 800, 1000,
	// 2s - 100s
	2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000, 100000,
)
2021-02-21 10:03:00 +00:00
// workMillisecondsDistribution is the histogram aggregation used for worker
// call durations. Bucket boundaries are in milliseconds, starting at 250ms and
// ending at 10000 minutes.
var workMillisecondsDistribution = func() *view.Aggregation {
	const msPerMinute = 60_000

	// Boundaries up to one minute, written directly in milliseconds
	// (the start of the short sealing task range).
	bounds := []float64{250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000}

	// Remaining boundaries, written in minutes and scaled to milliseconds below.
	minutes := []float64{
		2, 5, 10, 15, 30, // short sealing tasks
		40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 100, 120, // PC2 / C2 range
		130, 140, 150, 160, 180, 200, 220, 260, 300, // PC1 range
		350, 400, 600, 800, 1000, 1300, 1800, 4000, 10000, // intel PC1 range
	}
	for _, m := range minutes {
		bounds = append(bounds, m*msPerMinute)
	}

	return view.Distribution(bounds...)
}()
2020-07-24 05:47:41 +00:00
2020-02-26 02:42:34 +00:00
// Global Tags
var (
2021-02-21 10:03:00 +00:00
// common
Version , _ = tag . NewKey ( "version" )
Commit , _ = tag . NewKey ( "commit" )
NodeType , _ = tag . NewKey ( "node_type" )
PeerID , _ = tag . NewKey ( "peer_id" )
MinerID , _ = tag . NewKey ( "miner_id" )
FailureType , _ = tag . NewKey ( "failure_type" )
// chain
2020-08-28 06:11:24 +00:00
Local , _ = tag . NewKey ( "local" )
2020-03-02 00:26:09 +00:00
MessageFrom , _ = tag . NewKey ( "message_from" )
MessageTo , _ = tag . NewKey ( "message_to" )
MessageNonce , _ = tag . NewKey ( "message_nonce" )
2020-03-02 00:57:16 +00:00
ReceivedFrom , _ = tag . NewKey ( "received_from" )
2020-10-21 08:10:27 +00:00
Endpoint , _ = tag . NewKey ( "endpoint" )
APIInterface , _ = tag . NewKey ( "api" ) // to distinguish between gateway api and full node api endpoint calls
2021-02-21 10:03:00 +00:00
// miner
TaskType , _ = tag . NewKey ( "task_type" )
WorkerHostname , _ = tag . NewKey ( "worker_hostname" )
2020-02-26 02:42:34 +00:00
)
// Measures
var (
2021-02-21 10:03:00 +00:00
// common
LotusInfo = stats . Int64 ( "info" , "Arbitrary counter to tag lotus info to" , stats . UnitDimensionless )
PeerCount = stats . Int64 ( "peer/count" , "Current number of FIL peers" , stats . UnitDimensionless )
APIRequestDuration = stats . Float64 ( "api/request_duration_ms" , "Duration of API requests" , stats . UnitMilliseconds )
// chain
2020-06-04 22:18:14 +00:00
ChainNodeHeight = stats . Int64 ( "chain/node_height" , "Current Height of the node" , stats . UnitDimensionless )
2020-09-06 04:32:05 +00:00
ChainNodeHeightExpected = stats . Int64 ( "chain/node_height_expected" , "Expected Height of the node" , stats . UnitDimensionless )
2020-06-04 22:18:14 +00:00
ChainNodeWorkerHeight = stats . Int64 ( "chain/node_worker_height" , "Current Height of workers on the node" , stats . UnitDimensionless )
2020-08-28 06:11:24 +00:00
MessagePublished = stats . Int64 ( "message/published" , "Counter for total locally published messages" , stats . UnitDimensionless )
2020-06-04 22:18:14 +00:00
MessageReceived = stats . Int64 ( "message/received" , "Counter for total received messages" , stats . UnitDimensionless )
MessageValidationFailure = stats . Int64 ( "message/failure" , "Counter for message validation failures" , stats . UnitDimensionless )
MessageValidationSuccess = stats . Int64 ( "message/success" , "Counter for message validation successes" , stats . UnitDimensionless )
2020-08-17 06:04:22 +00:00
BlockPublished = stats . Int64 ( "block/published" , "Counter for total locally published blocks" , stats . UnitDimensionless )
2020-06-04 22:18:14 +00:00
BlockReceived = stats . Int64 ( "block/received" , "Counter for total received blocks" , stats . UnitDimensionless )
BlockValidationFailure = stats . Int64 ( "block/failure" , "Counter for block validation failures" , stats . UnitDimensionless )
BlockValidationSuccess = stats . Int64 ( "block/success" , "Counter for block validation successes" , stats . UnitDimensionless )
BlockValidationDurationMilliseconds = stats . Float64 ( "block/validation_ms" , "Duration for Block Validation in ms" , stats . UnitMilliseconds )
2020-12-10 14:48:37 +00:00
BlockDelay = stats . Int64 ( "block/delay" , "Delay of accepted blocks, where delay is >5s" , stats . UnitMilliseconds )
2020-08-17 07:46:20 +00:00
PubsubPublishMessage = stats . Int64 ( "pubsub/published" , "Counter for total published messages" , stats . UnitDimensionless )
PubsubDeliverMessage = stats . Int64 ( "pubsub/delivered" , "Counter for total delivered messages" , stats . UnitDimensionless )
PubsubRejectMessage = stats . Int64 ( "pubsub/rejected" , "Counter for total rejected messages" , stats . UnitDimensionless )
PubsubDuplicateMessage = stats . Int64 ( "pubsub/duplicate" , "Counter for total duplicate messages" , stats . UnitDimensionless )
2020-08-20 20:14:32 +00:00
PubsubRecvRPC = stats . Int64 ( "pubsub/recv_rpc" , "Counter for total received RPCs" , stats . UnitDimensionless )
PubsubSendRPC = stats . Int64 ( "pubsub/send_rpc" , "Counter for total sent RPCs" , stats . UnitDimensionless )
PubsubDropRPC = stats . Int64 ( "pubsub/drop_rpc" , "Counter for total dropped RPCs" , stats . UnitDimensionless )
2020-11-11 16:05:08 +00:00
VMFlushCopyDuration = stats . Float64 ( "vm/flush_copy_ms" , "Time spent in VM Flush Copy" , stats . UnitMilliseconds )
VMFlushCopyCount = stats . Int64 ( "vm/flush_copy_count" , "Number of copied objects" , stats . UnitDimensionless )
2021-02-21 10:03:00 +00:00
// miner
WorkerCallsStarted = stats . Int64 ( "sealing/worker_calls_started" , "Counter of started worker tasks" , stats . UnitDimensionless )
WorkerCallsReturnedCount = stats . Int64 ( "sealing/worker_calls_returned_count" , "Counter of returned worker tasks" , stats . UnitDimensionless )
WorkerCallsReturnedDuration = stats . Float64 ( "sealing/worker_calls_returned_ms" , "Counter of returned worker tasks" , stats . UnitMilliseconds )
WorkerUntrackedCallsReturned = stats . Int64 ( "sealing/worker_untracked_calls_returned" , "Counter of returned untracked worker tasks" , stats . UnitDimensionless )
2020-02-26 02:42:34 +00:00
)
2020-03-05 09:47:20 +00:00
var (
InfoView = & view . View {
2020-02-26 02:42:34 +00:00
Name : "info" ,
Description : "Lotus node information" ,
Measure : LotusInfo ,
Aggregation : view . LastValue ( ) ,
TagKeys : [ ] tag . Key { Version , Commit } ,
2020-03-05 09:47:20 +00:00
}
ChainNodeHeightView = & view . View {
2020-03-02 00:26:09 +00:00
Measure : ChainNodeHeight ,
2020-02-26 02:42:34 +00:00
Aggregation : view . LastValue ( ) ,
2020-03-05 09:47:20 +00:00
}
2020-09-06 04:32:05 +00:00
ChainNodeHeightExpectedView = & view . View {
Measure : ChainNodeHeightExpected ,
Aggregation : view . LastValue ( ) ,
}
2020-03-05 09:47:20 +00:00
ChainNodeWorkerHeightView = & view . View {
2020-03-02 00:26:09 +00:00
Measure : ChainNodeWorkerHeight ,
Aggregation : view . LastValue ( ) ,
2020-03-05 09:47:20 +00:00
}
BlockReceivedView = & view . View {
2020-03-02 00:57:16 +00:00
Measure : BlockReceived ,
2020-03-02 00:26:09 +00:00
Aggregation : view . Count ( ) ,
2020-03-05 09:47:20 +00:00
}
BlockValidationFailureView = & view . View {
2020-03-02 00:57:16 +00:00
Measure : BlockValidationFailure ,
Aggregation : view . Count ( ) ,
2020-03-05 09:47:20 +00:00
TagKeys : [ ] tag . Key { FailureType } ,
}
BlockValidationSuccessView = & view . View {
2020-03-02 00:57:16 +00:00
Measure : BlockValidationSuccess ,
Aggregation : view . Count ( ) ,
2020-03-05 09:47:20 +00:00
}
2020-06-04 22:18:14 +00:00
BlockValidationDurationView = & view . View {
Measure : BlockValidationDurationMilliseconds ,
2020-07-24 05:47:41 +00:00
Aggregation : defaultMillisecondsDistribution ,
2020-06-04 22:18:14 +00:00
}
2020-12-10 14:48:37 +00:00
BlockDelayView = & view . View {
Measure : BlockDelay ,
TagKeys : [ ] tag . Key { MinerID } ,
Aggregation : func ( ) * view . Aggregation {
var bounds [ ] float64
for i := 5 ; i < 29 ; i ++ { // 5-29s, step 1s
bounds = append ( bounds , float64 ( i * 1000 ) )
}
for i := 30 ; i < 60 ; i += 2 { // 30-58s, step 2s
bounds = append ( bounds , float64 ( i * 1000 ) )
}
for i := 60 ; i <= 300 ; i += 10 { // 60-300s, step 10s
bounds = append ( bounds , float64 ( i * 1000 ) )
}
bounds = append ( bounds , 600 * 1000 ) // final cutoff at 10m
return view . Distribution ( bounds ... )
} ( ) ,
}
2020-08-28 06:25:50 +00:00
MessagePublishedView = & view . View {
2020-08-28 09:51:51 +00:00
Measure : MessagePublished ,
2020-08-28 06:25:50 +00:00
Aggregation : view . Count ( ) ,
}
2020-03-05 09:47:20 +00:00
MessageReceivedView = & view . View {
2020-03-02 00:57:16 +00:00
Measure : MessageReceived ,
Aggregation : view . Count ( ) ,
2020-03-05 09:47:20 +00:00
}
MessageValidationFailureView = & view . View {
2020-03-02 00:57:16 +00:00
Measure : MessageValidationFailure ,
Aggregation : view . Count ( ) ,
2020-08-28 06:11:24 +00:00
TagKeys : [ ] tag . Key { FailureType , Local } ,
2020-03-05 09:47:20 +00:00
}
MessageValidationSuccessView = & view . View {
2020-03-02 00:57:16 +00:00
Measure : MessageValidationSuccess ,
2020-03-02 00:26:09 +00:00
Aggregation : view . Count ( ) ,
2020-03-05 09:47:20 +00:00
}
PeerCountView = & view . View {
2020-03-02 00:26:09 +00:00
Measure : PeerCount ,
2020-02-26 02:42:34 +00:00
Aggregation : view . LastValue ( ) ,
2020-03-05 09:47:20 +00:00
}
2020-08-28 06:25:50 +00:00
PubsubPublishMessageView = & view . View {
2020-08-28 09:51:51 +00:00
Measure : PubsubPublishMessage ,
2020-08-28 06:25:50 +00:00
Aggregation : view . Count ( ) ,
}
PubsubDeliverMessageView = & view . View {
2020-08-28 09:51:51 +00:00
Measure : PubsubDeliverMessage ,
2020-08-28 06:25:50 +00:00
Aggregation : view . Count ( ) ,
}
PubsubRejectMessageView = & view . View {
2020-08-28 09:51:51 +00:00
Measure : PubsubRejectMessage ,
2020-08-28 06:25:50 +00:00
Aggregation : view . Count ( ) ,
}
PubsubDuplicateMessageView = & view . View {
2020-08-28 09:51:51 +00:00
Measure : PubsubDuplicateMessage ,
2020-08-28 06:25:50 +00:00
Aggregation : view . Count ( ) ,
}
PubsubRecvRPCView = & view . View {
2020-08-28 09:51:51 +00:00
Measure : PubsubRecvRPC ,
2020-08-28 06:25:50 +00:00
Aggregation : view . Count ( ) ,
}
PubsubSendRPCView = & view . View {
2020-08-28 09:51:51 +00:00
Measure : PubsubSendRPC ,
2020-08-28 06:25:50 +00:00
Aggregation : view . Count ( ) ,
}
PubsubDropRPCView = & view . View {
2020-08-28 09:51:51 +00:00
Measure : PubsubDropRPC ,
2020-08-28 06:25:50 +00:00
Aggregation : view . Count ( ) ,
}
2020-10-21 08:10:27 +00:00
APIRequestDurationView = & view . View {
Measure : APIRequestDuration ,
Aggregation : defaultMillisecondsDistribution ,
TagKeys : [ ] tag . Key { APIInterface , Endpoint } ,
}
2020-11-11 16:05:08 +00:00
VMFlushCopyDurationView = & view . View {
Measure : VMFlushCopyDuration ,
Aggregation : view . Sum ( ) ,
}
VMFlushCopyCountView = & view . View {
Measure : VMFlushCopyCount ,
Aggregation : view . Sum ( ) ,
}
2021-02-21 10:03:00 +00:00
// miner
WorkerCallsStartedView = & view . View {
Measure : WorkerCallsStarted ,
Aggregation : view . Count ( ) ,
TagKeys : [ ] tag . Key { TaskType , WorkerHostname } ,
}
WorkerCallsReturnedCountView = & view . View {
Measure : WorkerCallsReturnedCount ,
Aggregation : view . Count ( ) ,
TagKeys : [ ] tag . Key { TaskType , WorkerHostname } ,
}
WorkerUntrackedCallsReturnedView = & view . View {
Measure : WorkerUntrackedCallsReturned ,
Aggregation : view . Count ( ) ,
}
WorkerCallsReturnedDurationView = & view . View {
Measure : WorkerCallsReturnedDuration ,
Aggregation : workMillisecondsDistribution ,
TagKeys : [ ] tag . Key { TaskType , WorkerHostname } ,
}
2020-03-05 09:47:20 +00:00
)
// DefaultViews is an array of OpenCensus views for metric gathering purposes
2021-02-28 22:48:36 +00:00
var DefaultViews = func ( ) [ ] * view . View {
views := [ ] * view . View {
InfoView ,
PeerCountView ,
APIRequestDurationView ,
}
views = append ( views , blockstore . DefaultViews ... )
views = append ( views , rpcmetrics . DefaultViews ... )
return views
} ( )
2020-07-24 05:47:41 +00:00
2021-02-21 10:03:00 +00:00
// ChainNodeViews lists the chain-specific views followed by every view in
// DefaultViews.
var ChainNodeViews = func() []*view.View {
	chainOnly := []*view.View{
		ChainNodeHeightView,
		ChainNodeHeightExpectedView,
		ChainNodeWorkerHeightView,
		BlockReceivedView,
		BlockValidationFailureView,
		BlockValidationSuccessView,
		BlockValidationDurationView,
		BlockDelayView,
		MessagePublishedView,
		MessageReceivedView,
		MessageValidationFailureView,
		MessageValidationSuccessView,
		PubsubPublishMessageView,
		PubsubDeliverMessageView,
		PubsubRejectMessageView,
		PubsubDuplicateMessageView,
		PubsubRecvRPCView,
		PubsubSendRPCView,
		PubsubDropRPCView,
		VMFlushCopyCountView,
		VMFlushCopyDurationView,
	}
	return append(chainOnly, DefaultViews...)
}()
// MinerNodeViews lists the miner-specific worker views followed by every view
// in DefaultViews.
var MinerNodeViews = func() []*view.View {
	minerOnly := []*view.View{
		WorkerCallsStartedView,
		WorkerCallsReturnedCountView,
		WorkerUntrackedCallsReturnedView,
		WorkerCallsReturnedDurationView,
	}
	return append(minerOnly, DefaultViews...)
}()
2020-07-24 05:47:41 +00:00
// SinceInMilliseconds returns the time elapsed since the provided startTime,
// expressed in milliseconds as a float64. The result is negative when
// startTime lies in the future.
func SinceInMilliseconds(startTime time.Time) float64 {
	elapsed := time.Since(startTime)
	// A Duration counts nanoseconds; dividing by one millisecond's worth of
	// nanoseconds keeps the fractional part.
	return float64(elapsed) / float64(time.Millisecond)
}
2020-10-21 08:10:27 +00:00
// Timer is a function stopwatch, calling it starts the timer,
// calling the returned function will record the duration.
func Timer ( ctx context . Context , m * stats . Float64Measure ) func ( ) {
start := time . Now ( )
return func ( ) {
stats . Record ( ctx , m . M ( SinceInMilliseconds ( start ) ) )
}
2020-10-21 08:39:57 +00:00
}