feat: curio: wdPost and wnPost alerts (#12029)
* post alerts * check missed posts * fix tasks * fix typo
This commit is contained in:
parent
4088706697
commit
803acdfd16
@ -1,9 +1,12 @@
|
|||||||
package alertmanager
|
package alertmanager
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"database/sql"
|
"database/sql"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/BurntSushi/toml"
|
"github.com/BurntSushi/toml"
|
||||||
"github.com/dustin/go-humanize"
|
"github.com/dustin/go-humanize"
|
||||||
@ -13,6 +16,8 @@ import (
|
|||||||
"github.com/filecoin-project/go-state-types/abi"
|
"github.com/filecoin-project/go-state-types/abi"
|
||||||
"github.com/filecoin-project/go-state-types/big"
|
"github.com/filecoin-project/go-state-types/big"
|
||||||
|
|
||||||
|
"github.com/filecoin-project/lotus/build"
|
||||||
|
"github.com/filecoin-project/lotus/chain/actors/builtin/miner"
|
||||||
"github.com/filecoin-project/lotus/node/config"
|
"github.com/filecoin-project/lotus/node/config"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -77,7 +82,7 @@ func taskFailureCheck(al *alerts) {
|
|||||||
type taskFailure struct {
|
type taskFailure struct {
|
||||||
Machine string `db:"completed_by_host_and_port"`
|
Machine string `db:"completed_by_host_and_port"`
|
||||||
Name string `db:"name"`
|
Name string `db:"name"`
|
||||||
Failures int `db:"failed_tasks_count"`
|
Failures int `db:"failed_count"`
|
||||||
}
|
}
|
||||||
|
|
||||||
var taskFailures []taskFailure
|
var taskFailures []taskFailure
|
||||||
@ -152,6 +157,7 @@ func taskFailureCheck(al *alerts) {
|
|||||||
// If any sectors are unaccounted for, it calculates the total missing space and adds an alert to the alert map.
|
// If any sectors are unaccounted for, it calculates the total missing space and adds an alert to the alert map.
|
||||||
func permanentStorageCheck(al *alerts) {
|
func permanentStorageCheck(al *alerts) {
|
||||||
Name := "PermanentStorageSpace"
|
Name := "PermanentStorageSpace"
|
||||||
|
al.alertMap[Name] = &alertOut{}
|
||||||
// Get all storage path for permanent storages
|
// Get all storage path for permanent storages
|
||||||
type storage struct {
|
type storage struct {
|
||||||
ID string `db:"storage_id"`
|
ID string `db:"storage_id"`
|
||||||
@ -330,3 +336,238 @@ func (al *alerts) getAddresses() ([]string, []string, error) {
|
|||||||
}
|
}
|
||||||
return uniqueAddrs, miners, nil
|
return uniqueAddrs, miners, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func wdPostCheck(al *alerts) {
|
||||||
|
Name := "WindowPost"
|
||||||
|
al.alertMap[Name] = &alertOut{}
|
||||||
|
head, err := al.api.ChainHead(al.ctx)
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
from := head.Height() - abi.ChainEpoch(math.Ceil(AlertMangerInterval.Seconds()/float64(build.BlockDelaySecs))) - 1
|
||||||
|
if from < 0 {
|
||||||
|
from = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("ALERTMANAGER: FROM: %d", from)
|
||||||
|
|
||||||
|
_, miners, err := al.getAddresses()
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
h := head
|
||||||
|
|
||||||
|
type partSent struct {
|
||||||
|
sent bool
|
||||||
|
parts int
|
||||||
|
}
|
||||||
|
|
||||||
|
msgCheck := make(map[address.Address]map[uint64]*partSent)
|
||||||
|
|
||||||
|
for h.Height() >= from {
|
||||||
|
for _, minerStr := range miners {
|
||||||
|
maddr, err := address.NewFromString(minerStr)
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
deadlineInfo, err := al.api.StateMinerProvingDeadline(al.ctx, maddr, h.Key())
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("getting miner deadline: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
partitions, err := al.api.StateMinerPartitions(al.ctx, maddr, deadlineInfo.Index, h.Key())
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("getting miner partitions: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if _, ok := msgCheck[maddr]; !ok {
|
||||||
|
msgCheck[maddr] = make(map[uint64]*partSent)
|
||||||
|
}
|
||||||
|
if _, ok := msgCheck[maddr][deadlineInfo.Index]; !ok {
|
||||||
|
msgCheck[maddr][deadlineInfo.Index] = &partSent{
|
||||||
|
sent: false,
|
||||||
|
parts: len(partitions),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
h, err = al.api.ChainGetTipSet(al.ctx, h.Parents())
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for maddr, deadlines := range msgCheck {
|
||||||
|
for deadlineIndex, ps := range deadlines {
|
||||||
|
log.Infof("ALERTMANAGER: Address: %s, DEADLINE: %d, Partitions: %d", maddr.String(), deadlineIndex, ps.parts)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var wdDetails []struct {
|
||||||
|
Miner int64 `db:"sp_id"`
|
||||||
|
Deadline int64 `db:"deadline"`
|
||||||
|
Partition int64 `db:"partition"`
|
||||||
|
Epoch abi.ChainEpoch `db:"submit_at_epoch"`
|
||||||
|
Proof []byte `db:"proof_params"`
|
||||||
|
}
|
||||||
|
|
||||||
|
err = al.db.Select(al.ctx, &wdDetails, `
|
||||||
|
SELECT sp_id, submit_at_epoch, proof_params, partition, deadline
|
||||||
|
FROM wdpost_proofs
|
||||||
|
WHERE submit_at_epoch > $1;`, from)
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("getting windowPost details from database: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(wdDetails) < 1 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, detail := range wdDetails {
|
||||||
|
addr, err := address.NewIDAddress(uint64(detail.Miner))
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("getting miner address: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if _, ok := msgCheck[addr][uint64(detail.Deadline)]; !ok {
|
||||||
|
al.alertMap[Name].alertString += fmt.Sprintf("unknown WindowPost jobs for miner %s deadline %d partition %d found. ", addr.String(), detail.Deadline, detail.Partition)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
msgCheck[addr][uint64(detail.Deadline)].sent = true
|
||||||
|
|
||||||
|
var postOut miner.SubmitWindowedPoStParams
|
||||||
|
err = postOut.UnmarshalCBOR(bytes.NewReader(detail.Proof))
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("unmarshaling windowPost proof params: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range postOut.Partitions {
|
||||||
|
c, err := postOut.Partitions[i].Skipped.Count()
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("getting skipped sector count: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if c > 0 {
|
||||||
|
al.alertMap[Name].alertString += fmt.Sprintf("Skipped %d sectors in deadline %d partition %d. ", c, postOut.Deadline, postOut.Partitions[i].Index)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for maddr, deadlines := range msgCheck {
|
||||||
|
for deadlineIndex, ps := range deadlines {
|
||||||
|
if !ps.sent {
|
||||||
|
al.alertMap[Name].alertString += fmt.Sprintf("No WindowPost jobs found for miner %s deadline %d. ", maddr.String(), deadlineIndex)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func wnPostCheck(al *alerts) {
|
||||||
|
Name := "WinningPost"
|
||||||
|
al.alertMap[Name] = &alertOut{}
|
||||||
|
head, err := al.api.ChainHead(al.ctx)
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
from := head.Height() - abi.ChainEpoch(math.Ceil(AlertMangerInterval.Seconds()/float64(build.BlockDelaySecs))) - 1
|
||||||
|
if from < 0 {
|
||||||
|
from = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
var wnDetails []struct {
|
||||||
|
Miner int64 `db:"sp_id"`
|
||||||
|
Block string `db:"mined_cid"`
|
||||||
|
Epoch abi.ChainEpoch `db:"epoch"`
|
||||||
|
}
|
||||||
|
|
||||||
|
err = al.db.Select(al.ctx, &wnDetails, `
|
||||||
|
SELECT sp_id, mined_cid, epoch
|
||||||
|
FROM mining_tasks
|
||||||
|
WHERE epoch > $1 AND won = TRUE
|
||||||
|
ORDER BY epoch;`, from)
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("getting winningPost details from database: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var count []int64
|
||||||
|
err = al.db.Select(al.ctx, &count, `
|
||||||
|
SELECT COUNT(*)
|
||||||
|
FROM mining_tasks
|
||||||
|
WHERE epoch > $1;`, from)
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("getting winningPost count details from database: %w", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if count[0] == 0 {
|
||||||
|
al.alertMap[Name].alertString += "No winningPost tasks found in the last " + humanize.Time(time.Now().Add(-AlertMangerInterval))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
epochs := int64(math.Ceil(AlertMangerInterval.Seconds() / float64(build.BlockDelaySecs)))
|
||||||
|
if (head.Height() - abi.ChainEpoch(epochs)) < 0 {
|
||||||
|
epochs = int64(head.Height())
|
||||||
|
}
|
||||||
|
|
||||||
|
if epochs != count[0]+1 && epochs != count[0]-1 && epochs != count[0] {
|
||||||
|
al.alertMap[Name].alertString += fmt.Sprintf("Expected %d WinningPost task and found %d in DB ", epochs, count[0])
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(wnDetails) < 1 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
to := wnDetails[len(wnDetails)-1].Epoch
|
||||||
|
|
||||||
|
epochMap := make(map[abi.ChainEpoch]string)
|
||||||
|
|
||||||
|
for head.Height() >= to {
|
||||||
|
epochMap[head.Height()] = head.String()
|
||||||
|
head, err = al.api.ChainGetTipSet(al.ctx, head.Parents())
|
||||||
|
if err != nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("getting tipset: %w", err)
|
||||||
|
}
|
||||||
|
if head == nil {
|
||||||
|
al.alertMap[Name].err = xerrors.Errorf("tipset is nil")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if head.Height() == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
winMap := make(map[abi.ChainEpoch]struct {
|
||||||
|
won bool
|
||||||
|
cid string
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, wn := range wnDetails {
|
||||||
|
if strings.Contains(epochMap[wn.Epoch], wn.Block) {
|
||||||
|
winMap[wn.Epoch] = struct {
|
||||||
|
won bool
|
||||||
|
cid string
|
||||||
|
}{won: true, cid: wn.Block}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
winMap[wn.Epoch] = struct {
|
||||||
|
won bool
|
||||||
|
cid string
|
||||||
|
}{won: false, cid: wn.Block}
|
||||||
|
}
|
||||||
|
|
||||||
|
for epoch, st := range winMap {
|
||||||
|
if !st.won {
|
||||||
|
al.alertMap[Name].alertString += fmt.Sprintf("Epoch %d: does not contain our block %s", epoch, st.cid)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -16,6 +16,7 @@ import (
|
|||||||
"golang.org/x/xerrors"
|
"golang.org/x/xerrors"
|
||||||
|
|
||||||
"github.com/filecoin-project/go-address"
|
"github.com/filecoin-project/go-address"
|
||||||
|
"github.com/filecoin-project/go-state-types/dline"
|
||||||
|
|
||||||
"github.com/filecoin-project/lotus/api"
|
"github.com/filecoin-project/lotus/api"
|
||||||
"github.com/filecoin-project/lotus/chain/types"
|
"github.com/filecoin-project/lotus/chain/types"
|
||||||
@ -32,7 +33,11 @@ var log = logging.Logger("curio/alertmanager")
|
|||||||
|
|
||||||
// AlertAPI lists the node API methods the alert checks call through al.api.
type AlertAPI interface {
|
type AlertAPI interface {
|
||||||
ctladdr.NodeApi
|
ctladdr.NodeApi
|
||||||
|
// ChainHead is called by wdPostCheck and wnPostCheck to get the tipset they start from.
ChainHead(context.Context) (*types.TipSet, error)
|
||||||
|
// ChainGetTipSet loads a tipset by key; the checks pass a tipset's Parents() to walk backwards.
ChainGetTipSet(context.Context, types.TipSetKey) (*types.TipSet, error)
|
||||||
StateMinerInfo(ctx context.Context, actor address.Address, tsk types.TipSetKey) (api.MinerInfo, error)
|
StateMinerInfo(ctx context.Context, actor address.Address, tsk types.TipSetKey) (api.MinerInfo, error)
|
||||||
|
// StateMinerProvingDeadline is used by wdPostCheck to read a miner's deadline info (its Index) at a tipset.
StateMinerProvingDeadline(context.Context, address.Address, types.TipSetKey) (*dline.Info, error)
|
||||||
|
// StateMinerPartitions is used by wdPostCheck to list the partitions of a miner's deadline index at a tipset.
StateMinerPartitions(context.Context, address.Address, uint64, types.TipSetKey) ([]api.Partition, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type AlertTask struct {
|
type AlertTask struct {
|
||||||
@ -70,6 +75,8 @@ var alertFuncs = []alertFunc{
|
|||||||
balanceCheck,
|
balanceCheck,
|
||||||
taskFailureCheck,
|
taskFailureCheck,
|
||||||
permanentStorageCheck,
|
permanentStorageCheck,
|
||||||
|
wdPostCheck,
|
||||||
|
wnPostCheck,
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewAlertTask(api AlertAPI, db *harmonydb.DB, alertingCfg config.CurioAlerting) *AlertTask {
|
func NewAlertTask(api AlertAPI, db *harmonydb.DB, alertingCfg config.CurioAlerting) *AlertTask {
|
||||||
|
Loading…
Reference in New Issue
Block a user