package alertmanager

import (
	"database/sql"
	"fmt"
	"strings"

	"github.com/BurntSushi/toml"
	"github.com/dustin/go-humanize"
	"golang.org/x/xerrors"

	"github.com/filecoin-project/go-address"
	"github.com/filecoin-project/go-state-types/abi"
	"github.com/filecoin-project/go-state-types/big"

	"github.com/filecoin-project/lotus/node/config"
)

// balanceCheck retrieves the machine details from the database and performs balance checks on unique addresses.
// It populates the alert map with any alerts for low wallet balances or missing wallets, and with any errors
// encountered along the way. The alert map key is "Balance Check".
// It queries the database for the configuration of each layer and decodes it with toml.Decode,
// then iterates over the addresses in each configuration to curate a list of unique addresses.
// An alert is raised for every address that is not found in the chain node and for every address
// whose balance is below MinimumWalletBalance.
func balanceCheck(al *alerts) {
	Name := "Balance Check"
	al.alertMap[Name] = &alertOut{}

	var ret string

	uniqueAddrs, _, err := al.getAddresses()
	if err != nil {
		al.alertMap[Name].err = err
		return
	}

	for _, addrStr := range uniqueAddrs {
		addr, err := address.NewFromString(addrStr)
		if err != nil {
			al.alertMap[Name].err = xerrors.Errorf("failed to parse address: %w", err)
			return
		}

		has, err := al.api.WalletHas(al.ctx, addr)
		if err != nil {
			al.alertMap[Name].err = err
			return
		}

		if !has {
			ret += fmt.Sprintf("Wallet %s was not found in chain node. ", addrStr)
			continue // no point checking the balance of a missing wallet
		}

		balance, err := al.api.WalletBalance(al.ctx, addr)
		if err != nil {
			al.alertMap[Name].err = err
			return
		}

		if abi.TokenAmount(al.cfg.MinimumWalletBalance).GreaterThanEqual(balance) {
			ret += fmt.Sprintf("Balance for wallet %s is below the configured MinimumWalletBalance. ", addrStr)
		}
	}

	if ret != "" {
		al.alertMap[Name].alertString = ret
	}
}

// taskFailureCheck retrieves the task failure counts from the database for the last alerting interval.
// It alerts on any sealing-task failure and on any task or machine with more than 5 failures.
func taskFailureCheck(al *alerts) {
	Name := "TaskFailures"
	al.alertMap[Name] = &alertOut{}

	type taskFailure struct {
		Machine  string `db:"completed_by_host_and_port"`
		Name     string `db:"name"`
		Failures int    `db:"failed_tasks_count"`
	}

	var taskFailures []taskFailure

	err := al.db.Select(al.ctx, &taskFailures, `
				SELECT completed_by_host_and_port, name, COUNT(*) AS failed_tasks_count
				FROM harmony_task_history
				WHERE result = FALSE
				  AND work_end >= NOW() - $1::interval
				GROUP BY completed_by_host_and_port, name
				ORDER BY completed_by_host_and_port, name;`, fmt.Sprintf("%f Minutes", AlertMangerInterval.Minutes()))
	if err != nil {
		al.alertMap[Name].err = xerrors.Errorf("getting failed task count: %w", err)
		return
	}

	if len(taskFailures) == 0 {
		return
	}

	// Aggregate failures per task name and per machine.
	mmap := make(map[string]int)
	tmap := make(map[string]int)
	for _, tf := range taskFailures {
		tmap[tf.Name] += tf.Failures
		mmap[tf.Machine] += tf.Failures
	}

	sealingTasks := []string{"SDR", "TreeD", "TreeRC", "PreCommitSubmit", "PoRep", "Finalize", "MoveStorage", "CommitSubmit", "WdPost", "ParkPiece"}
	contains := func(s []string, e string) bool {
		for _, a := range s {
			if a == e {
				return true
			}
		}
		return false
	}

	// Alert on any sealing pipeline failure. Other tasks must have more than 5 failures to raise an alert.
	for name, count := range tmap {
		if contains(sealingTasks, name) || count > 5 {
			al.alertMap[Name].alertString += fmt.Sprintf("Task: %s, Failures: %d. ", name, count)
		}
	}

	// Alert if a machine has failed more than 5 tasks.
	for machine, count := range mmap {
		if count > 5 {
			al.alertMap[Name].alertString += fmt.Sprintf("Machine: %s, Failures: %d. ", machine, count)
		}
	}
}
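// Note on the interval parameter above: the $1::interval placeholder is bound to a
// string rendered by fmt.Sprintf("%f Minutes", ...). Assuming (hypothetically) that
// AlertMangerInterval is 10 minutes, the bound value is "10.000000 Minutes", which
// Postgres accepts as an interval literal.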
// permanentStorageCheck retrieves the storage details from the database and checks whether there is
// sufficient permanent storage space for the sectors currently being sealed.
// It queries the database for the available space on all storage paths that can store data,
// and for the sectors in the sealing pipeline that have not yet been moved to storage.
// For each sector it estimates the space required for sealing based on the sector size,
// greedily reserves that space against the available paths, and records sectors that cannot be placed.
// If any sectors are unaccounted for, the total missing space is reported in the alert map.
func permanentStorageCheck(al *alerts) {
	Name := "PermanentStorageSpace"
	al.alertMap[Name] = &alertOut{}

	// Get all storage paths that can store data (permanent storage).
	type storage struct {
		ID        string `db:"storage_id"`
		Available int64  `db:"available"`
	}

	var storages []storage

	err := al.db.Select(al.ctx, &storages, `
				SELECT storage_id, available
				FROM storage_path
				WHERE can_store = TRUE;`)
	if err != nil {
		al.alertMap[Name].err = xerrors.Errorf("getting storage details: %w", err)
		return
	}

	type sector struct {
		Miner  abi.ActorID             `db:"sp_id"`
		Number abi.SectorNumber        `db:"sector_number"`
		Proof  abi.RegisteredSealProof `db:"reg_seal_proof"`
	}

	var sectors []sector

	err = al.db.Select(al.ctx, &sectors, `
				SELECT sp_id, sector_number, reg_seal_proof
				FROM sectors_sdr_pipeline
				WHERE after_move_storage = FALSE;`)
	if err != nil {
		al.alertMap[Name].err = xerrors.Errorf("getting sectors being sealed: %w", err)
		return
	}

	type sm struct {
		s    sector
		size int64
	}

	sectorMap := make(map[sm]bool)
	for _, sec := range sectors {
		sec := sec
		var space int64
		sectorSize, err := sec.Proof.SectorSize()
		if err != nil {
			space = int64(64<<30)*2 + int64(200<<20) // proof size unknown; assume a 64 GiB sector
		} else {
			space = int64(sectorSize)*2 + int64(200<<20) // sealed + unsealed copies plus cache
		}

		key := sm{s: sec, size: space}
		sectorMap[key] = false

		// Reserve space on the first storage path with enough room for this sector.
		for i := range storages {
			if storages[i].Available >= space {
				storages[i].Available -= space
				sectorMap[key] = true
				break
			}
		}
	}

	missingSpace := big.NewInt(0)
	for sec, accounted := range sectorMap {
		if !accounted {
			missingSpace = big.Add(missingSpace, big.NewInt(sec.size))
		}
	}

	if missingSpace.GreaterThan(big.NewInt(0)) {
		al.alertMap[Name].alertString = fmt.Sprintf("Insufficient storage space for sealing sectors. Additional %s required.", humanize.Bytes(missingSpace.Uint64()))
	}
}
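// Worked example (illustrative, not executed): for a hypothetical 32 GiB sector,
// the estimate above reserves 2*34359738368 + 209715200 = 68,929,191,936 bytes
// (~68.9 GB) of permanent storage: one sealed copy, one unsealed copy, and
// ~200 MiB of cache files.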
// getAddresses retrieves machine details from the database, collects the configuration layers in use,
// and decodes each layer to extract wallet and miner addresses.
// It uses addrMap to deduplicate addresses across layers and returns two slices:
// the unique control addresses and the unique miner addresses.
func (al *alerts) getAddresses() ([]string, []string, error) {
	// machineDetail represents the structure of data received from the SQL query.
	type machineDetail struct {
		ID          int
		HostAndPort string
		Layers      string
	}
	var machineDetails []machineDetail

	// Get all layers in use.
	err := al.db.Select(al.ctx, &machineDetails, `
				SELECT m.id, m.host_and_port, d.layers
				FROM harmony_machines m
				LEFT JOIN harmony_machine_details d ON m.id = d.machine_id;`)
	if err != nil {
		return nil, nil, xerrors.Errorf("getting config layers for all machines: %w", err)
	}

	// Get the unique layers in use.
	layerMap := make(map[string]bool)
	var uniqueLayers []string
	for _, machine := range machineDetails {
		// Split the Layers field into individual layers.
		for _, layer := range strings.Split(machine.Layers, ",") {
			layer = strings.TrimSpace(layer)
			if _, exists := layerMap[layer]; !exists && layer != "" {
				layerMap[layer] = true
				uniqueLayers = append(uniqueLayers, layer)
			}
		}
	}

	addrMap := make(map[string]bool)
	var uniqueAddrs []string
	var miners []string

	// addUnique appends each non-empty address to dest unless it has been seen before.
	addUnique := func(addrs []string, dest *[]string) {
		for _, a := range addrs {
			if a == "" {
				continue
			}
			if _, ok := addrMap[a]; !ok {
				addrMap[a] = true
				*dest = append(*dest, a)
			}
		}
	}

	// Get all unique addresses from each layer's configuration.
	for _, layer := range uniqueLayers {
		text := ""
		cfg := config.DefaultCurioConfig()
		err := al.db.QueryRow(al.ctx, `SELECT config FROM harmony_config WHERE title=$1`, layer).Scan(&text)
		if err != nil {
			if strings.Contains(err.Error(), sql.ErrNoRows.Error()) {
				return nil, nil, xerrors.Errorf("missing layer '%s'", layer)
			}
			return nil, nil, xerrors.Errorf("could not read layer '%s': %w", layer, err)
		}

		_, err = toml.Decode(text, cfg)
		if err != nil {
			return nil, nil, xerrors.Errorf("could not read layer '%s', bad toml: %w", layer, err)
		}

		for i := range cfg.Addresses {
			addUnique(cfg.Addresses[i].PreCommitControl, &uniqueAddrs)
			addUnique(cfg.Addresses[i].CommitControl, &uniqueAddrs)
			addUnique(cfg.Addresses[i].TerminateControl, &uniqueAddrs)
			addUnique(cfg.Addresses[i].MinerAddresses, &miners)
		}
	}
	return uniqueAddrs, miners, nil
}
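// Illustrative layer config (hypothetical addresses) showing the [[Addresses]]
// blocks that getAddresses decodes; the three control lists feed the unique
// address slice and MinerAddresses feeds the miners slice:
//
//	[[Addresses]]
//	PreCommitControl = ["f3exampleprecommitaddress"]
//	CommitControl = ["f3examplecommitaddress"]
//	TerminateControl = []
//	MinerAddresses = ["f01234"]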