Fix windowed PoSt scheduler

Łukasz Magiera 2020-07-14 19:10:31 +02:00
parent f53bd6bdba
commit af87b9aa98
7 changed files with 208 additions and 253 deletions

@@ -253,7 +253,9 @@ type FullNode interface {
// StateMinerInfo returns info about the indicated miner
StateMinerInfo(context.Context, address.Address, types.TipSetKey) (MinerInfo, error)
// StateMinerDeadlines returns all the proving deadlines for the given miner
StateMinerDeadlines(context.Context, address.Address, types.TipSetKey) (*miner.Deadlines, error)
StateMinerDeadlines(context.Context, address.Address, types.TipSetKey) ([]*miner.Deadline, error)
// StateMinerPartitions loads miner partitions for the specified miner/deadline
StateMinerPartitions(context.Context, address.Address, uint64, types.TipSetKey) ([]*miner.Partition, error)
// StateMinerFaults returns a bitfield indicating the faulty sectors of the given miner
StateMinerFaults(context.Context, address.Address, types.TipSetKey) (*abi.BitField, error)
// StateAllMinerFaults returns all non-expired Faults that occur within lookback epochs of the given tipset
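As a usage sketch only (not part of this commit), a caller holding a FullNode client could walk the reworked per-deadline API roughly as below. The helper name listPartitions, the node variable, and the assumption that deadline indices match positions in the returned slice are all illustrative.

import (
	"context"
	"fmt"

	"github.com/filecoin-project/go-address"
	"golang.org/x/xerrors"

	"github.com/filecoin-project/lotus/api"
	"github.com/filecoin-project/lotus/chain/types"
)

// listPartitions is a hypothetical helper, shown only to illustrate the new
// API surface: StateMinerDeadlines now returns a slice of deadlines, and
// StateMinerPartitions loads the partitions of one deadline by index.
func listPartitions(ctx context.Context, node api.FullNode, maddr address.Address, tsk types.TipSetKey) error {
	deadlines, err := node.StateMinerDeadlines(ctx, maddr, tsk)
	if err != nil {
		return xerrors.Errorf("getting deadlines: %w", err)
	}
	for dlIdx := range deadlines {
		partitions, err := node.StateMinerPartitions(ctx, maddr, uint64(dlIdx), tsk)
		if err != nil {
			return xerrors.Errorf("getting partitions for deadline %d: %w", dlIdx, err)
		}
		fmt.Printf("deadline %d: %d partitions\n", dlIdx, len(partitions))
	}
	return nil
}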

@@ -131,7 +131,8 @@ type FullNodeStruct struct {
StateMinerProvingDeadline func(context.Context, address.Address, types.TipSetKey) (*miner.DeadlineInfo, error) `perm:"read"`
StateMinerPower func(context.Context, address.Address, types.TipSetKey) (*api.MinerPower, error) `perm:"read"`
StateMinerInfo func(context.Context, address.Address, types.TipSetKey) (api.MinerInfo, error) `perm:"read"`
StateMinerDeadlines func(context.Context, address.Address, types.TipSetKey) (*miner.Deadlines, error) `perm:"read"`
StateMinerDeadlines func(context.Context, address.Address, types.TipSetKey) ([]*miner.Deadline, error) `perm:"read"`
StateMinerPartitions func(context.Context, address.Address, uint64, types.TipSetKey) ([]*miner.Partition, error)
StateMinerFaults func(context.Context, address.Address, types.TipSetKey) (*abi.BitField, error) `perm:"read"`
StateAllMinerFaults func(context.Context, abi.ChainEpoch, types.TipSetKey) ([]*api.Fault, error) `perm:"read"`
StateMinerRecoveries func(context.Context, address.Address, types.TipSetKey) (*abi.BitField, error) `perm:"read"`
@@ -593,10 +594,14 @@ func (c *FullNodeStruct) StateMinerInfo(ctx context.Context, actor address.Addre
return c.Internal.StateMinerInfo(ctx, actor, tsk)
}
func (c *FullNodeStruct) StateMinerDeadlines(ctx context.Context, m address.Address, tsk types.TipSetKey) (*miner.Deadlines, error) {
func (c *FullNodeStruct) StateMinerDeadlines(ctx context.Context, m address.Address, tsk types.TipSetKey) ([]*miner.Deadline, error) {
return c.Internal.StateMinerDeadlines(ctx, m, tsk)
}
func (c *FullNodeStruct) StateMinerPartitions(ctx context.Context, m address.Address, dlIdx uint64, tsk types.TipSetKey) ([]*miner.Partition, error) {
return c.Internal.StateMinerPartitions(ctx, m, dlIdx, tsk)
}
func (c *FullNodeStruct) StateMinerFaults(ctx context.Context, actor address.Address, tsk types.TipSetKey) (*abi.BitField, error) {
return c.Internal.StateMinerFaults(ctx, actor, tsk)
}

@@ -323,36 +323,6 @@ func GetMinerSlashed(ctx context.Context, sm *StateManager, ts *types.TipSet, ma
return false, nil
}
func GetMinerDeadlines(ctx context.Context, sm *StateManager, ts *types.TipSet, maddr address.Address) (*miner.Deadlines, error) {
var mas miner.State
_, err := sm.LoadActorState(ctx, maddr, &mas, ts)
if err != nil {
return nil, xerrors.Errorf("(get ssize) failed to load miner actor state: %w", err)
}
return mas.LoadDeadlines(sm.cs.Store(ctx))
}
/*func GetMinerFaults(ctx context.Context, sm *StateManager, ts *types.TipSet, maddr address.Address) (*abi.BitField, error) {
var mas miner.State
_, err := sm.LoadActorState(ctx, maddr, &mas, ts)
if err != nil {
return nil, xerrors.Errorf("(get faults) failed to load miner actor state: %w", err)
}
return mas.Faults, nil
}
func GetMinerRecoveries(ctx context.Context, sm *StateManager, ts *types.TipSet, maddr address.Address) (*abi.BitField, error) {
var mas miner.State
_, err := sm.LoadActorState(ctx, maddr, &mas, ts)
if err != nil {
return nil, xerrors.Errorf("(get recoveries) failed to load miner actor state: %w", err)
}
return mas.Recoveries, nil
}*/
func GetStorageDeal(ctx context.Context, sm *StateManager, dealID abi.DealID, ts *types.TipSet) (*api.MarketDeal, error) {
var state market.State
if _, err := sm.LoadActorState(ctx, builtin.StorageMarketActorAddr, &state, ts); err != nil {

@@ -88,12 +88,30 @@ func (a *StateAPI) StateMinerInfo(ctx context.Context, actor address.Address, ts
return api.NewApiMinerInfo(mi), nil
}
func (a *StateAPI) StateMinerDeadlines(ctx context.Context, m address.Address, tsk types.TipSetKey) (*miner.Deadlines, error) {
ts, err := a.Chain.GetTipSetFromKey(tsk)
if err != nil {
return nil, xerrors.Errorf("loading tipset %s: %w", tsk, err)
func (a *StateAPI) StateMinerDeadlines(ctx context.Context, m address.Address, tsk types.TipSetKey) ([]*miner.Deadline, error) {
var out []*miner.Deadline
return out, a.StateManager.WithParentStateTsk(tsk,
a.StateManager.WithActor(m,
a.StateManager.WithActorState(ctx,
a.StateManager.WithDeadlines(
a.StateManager.WithEachDeadline(
func(store adt.Store, idx uint64, deadline *miner.Deadline) error {
out = append(out, deadline)
return nil
})))))
}
return stmgr.GetMinerDeadlines(ctx, a.StateManager, ts, m)
func (a *StateAPI) StateMinerPartitions(ctx context.Context, m address.Address, dlIdx uint64, tsk types.TipSetKey) ([]*miner.Partition, error) {
var out []*miner.Partition
return out, a.StateManager.WithParentStateTsk(tsk,
a.StateManager.WithActor(m,
a.StateManager.WithActorState(ctx,
a.StateManager.WithDeadlines(
a.StateManager.WithDeadline(dlIdx,
a.StateManager.WithEachPartition(func(store adt.Store, partIdx uint64, partition *miner.Partition) error {
out = append(out, partition)
return nil
}))))))
}
func (a *StateAPI) StateMinerProvingDeadline(ctx context.Context, addr address.Address, tsk types.TipSetKey) (*miner.DeadlineInfo, error) {

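The nested With* calls above are continuation-passing loaders on the StateManager: each helper resolves one layer (parent tipset state, the miner actor, its state, the deadlines, then a single deadline or each partition) and passes the result to the callback it wraps. Below is a simplified, hypothetical sketch of that shape, not the actual stmgr signatures.

import "github.com/filecoin-project/specs-actors/actors/builtin/miner"

// onDeadline is the innermost callback: it receives one deadline and its index.
type onDeadline func(idx uint64, dl *miner.Deadline) error

// withEachDeadline lifts a per-deadline callback into a callback over the whole
// deadline list, so it can be plugged into an outer loader such as WithDeadlines.
func withEachDeadline(cb onDeadline) func(dls []*miner.Deadline) error {
	return func(dls []*miner.Deadline) error {
		for i, dl := range dls {
			if err := cb(uint64(i), dl); err != nil {
				return err
			}
		}
		return nil
	}
}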
@@ -72,7 +72,7 @@ func (s SealingAPIAdapter) StateMinerWorkerAddress(ctx context.Context, maddr ad
return mi.Worker, nil
}
func (s SealingAPIAdapter) StateMinerDeadlines(ctx context.Context, maddr address.Address, tok sealing.TipSetToken) (*miner.Deadlines, error) {
func (s SealingAPIAdapter) StateMinerDeadlines(ctx context.Context, maddr address.Address, tok sealing.TipSetToken) ([]*miner.Deadline, error) {
tsk, err := types.TipSetKeyFromBytes(tok)
if err != nil {
return nil, xerrors.Errorf("failed to unmarshal TipSetToken to TipSetKey: %w", err)
@@ -184,7 +184,6 @@ func (s SealingAPIAdapter) StateSectorGetInfo(ctx context.Context, maddr address
return s.delegate.StateSectorGetInfo(ctx, maddr, sectorNumber, tsk)
}
func (s SealingAPIAdapter) StateSectorPartition(ctx context.Context, maddr address.Address, sectorNumber abi.SectorNumber, tok sealing.TipSetToken) (*sealing.SectorLocation, error) {
tsk, err := types.TipSetKeyFromBytes(tok)
if err != nil {

@@ -48,7 +48,8 @@ type Miner struct {
type storageMinerApi interface {
// Call a read only method on actors (no interaction with the chain required)
StateCall(context.Context, *types.Message, types.TipSetKey) (*api.InvocResult, error)
StateMinerDeadlines(ctx context.Context, maddr address.Address, tok types.TipSetKey) (*miner.Deadlines, error)
StateMinerDeadlines(ctx context.Context, maddr address.Address, tok types.TipSetKey) ([]*miner.Deadline, error)
StateMinerPartitions(context.Context, address.Address, uint64, types.TipSetKey) ([]*miner.Partition, error)
StateMinerSectors(context.Context, address.Address, *abi.BitField, bool, types.TipSetKey) ([]*api.ChainSectorInfo, error)
StateSectorPreCommitInfo(context.Context, address.Address, abi.SectorNumber, types.TipSetKey) (miner.SectorPreCommitOnChainInfo, error)
StateSectorGetInfo(context.Context, address.Address, abi.SectorNumber, types.TipSetKey) (*miner.SectorOnChainInfo, error)

@@ -4,6 +4,7 @@ import (
"bytes"
"context"
"errors"
"sync"
"time"
"github.com/filecoin-project/go-bitfield"
@@ -107,63 +108,62 @@ func (s *WindowPoStScheduler) checkSectors(ctx context.Context, check *abi.BitFi
return &sbf, nil
}
func (s *WindowPoStScheduler) checkNextRecoveries(ctx context.Context, deadline uint64, deadlineSectors *abi.BitField, ts *types.TipSet) error {
faults, err := s.api.StateMinerFaults(ctx, s.actor, ts.Key())
if err != nil {
return xerrors.Errorf("getting on-chain faults: %w", err)
func (s *WindowPoStScheduler) checkNextRecoveries(ctx context.Context, dlIdx uint64, partitions []*miner.Partition) error {
ctx, span := trace.StartSpan(ctx, "storage.checkNextRecoveries")
defer span.End()
params := &miner.DeclareFaultsRecoveredParams{
Recoveries: []miner.RecoveryDeclaration{},
}
fc, err := faults.Count()
if err != nil {
return xerrors.Errorf("counting faulty sectors: %w", err)
}
faulty := uint64(0)
if fc == 0 {
return nil
}
recov, err := s.api.StateMinerRecoveries(ctx, s.actor, ts.Key())
if err != nil {
return xerrors.Errorf("getting on-chain recoveries: %w", err)
}
unrecovered, err := bitfield.SubtractBitField(faults, recov)
for partIdx, partition := range partitions {
unrecovered, err := bitfield.SubtractBitField(partition.Faults, partition.Recoveries)
if err != nil {
return xerrors.Errorf("subtracting recovered set from fault set: %w", err)
}
unrecovered, err = bitfield.IntersectBitField(unrecovered, deadlineSectors)
if err != nil {
return xerrors.Errorf("intersect unrecovered set with deadlineSectors: %w", err)
}
uc, err := unrecovered.Count()
if err != nil {
return xerrors.Errorf("counting unrecovered sectors: %w", err)
}
if uc == 0 {
return nil
continue
}
sbf, err := s.checkSectors(ctx, unrecovered)
faulty += uc
recovered, err := s.checkSectors(ctx, unrecovered)
if err != nil {
return xerrors.Errorf("checking unrecovered sectors: %w", err)
}
// if all sectors failed to recover, don't declare recoveries
sbfCount, err := sbf.Count()
recoveredCount, err := recovered.Count()
if err != nil {
return xerrors.Errorf("counting recovered sectors: %w", err)
}
if sbfCount == 0 {
log.Warnw("No recoveries to declare", "deadline", deadline, "faulty", uc)
return nil
if recoveredCount == 0 {
continue
}
params := &miner.DeclareFaultsRecoveredParams{
Recoveries: []miner.RecoveryDeclaration{{Deadline: deadline, Sectors: sbf}},
params.Recoveries = append(params.Recoveries, miner.RecoveryDeclaration{
Deadline: dlIdx,
Partition: uint64(partIdx),
Sectors: recovered,
})
}
if len(params.Recoveries) == 0 {
if faulty != 0 {
log.Warnw("No recoveries to declare", "deadline", dlIdx, "faulty", faulty)
}
return nil
}
enc, aerr := actors.SerializeParams(params)
@@ -200,22 +200,23 @@ func (s *WindowPoStScheduler) checkNextRecoveries(ctx context.Context, deadline
return nil
}
func (s *WindowPoStScheduler) checkNextFaults(ctx context.Context, deadline uint64, deadlineSectors *abi.BitField, ts *types.TipSet) error {
dc, err := deadlineSectors.Count()
if err != nil {
return xerrors.Errorf("counting deadline sectors: %w", err)
}
if dc == 0 {
// nothing can become faulty
return nil
func (s *WindowPoStScheduler) checkNextFaults(ctx context.Context, dlIdx uint64, partitions []*miner.Partition) error {
ctx, span := trace.StartSpan(ctx, "storage.checkNextFaults")
defer span.End()
params := &miner.DeclareFaultsParams{
Faults: []miner.FaultDeclaration{},
}
toCheck, err := s.getSectorsToProve(ctx, deadlineSectors, true, ts)
bad := uint64(0)
for partIdx, partition := range partitions {
toCheck, err := partition.ActiveSectors()
if err != nil {
return xerrors.Errorf("getting next sectors to prove: %w", err)
return xerrors.Errorf("getting active sectors: %w", err)
}
good, err := s.checkSectors(ctx, deadlineSectors)
good, err := s.checkSectors(ctx, toCheck)
if err != nil {
return xerrors.Errorf("checking sectors: %w", err)
}
@@ -231,19 +232,23 @@ func (s *WindowPoStScheduler) checkNextFaults(ctx context.Context, deadline uint
}
if c == 0 {
continue
}
bad += c
params.Faults = append(params.Faults, miner.FaultDeclaration{
Deadline: dlIdx,
Partition: uint64(partIdx),
Sectors: faulty,
})
}
if len(params.Faults) == 0 {
return nil
}
log.Errorw("DETECTED FAULTY SECTORS, declaring faults", "count", c)
params := &miner.DeclareFaultsParams{
Faults: []miner.FaultDeclaration{
{
Deadline: deadline,
Sectors: faulty,
},
},
}
log.Errorw("DETECTED FAULTY SECTORS, declaring faults", "count", bad)
enc, aerr := actors.SerializeParams(params)
if aerr != nil {
@@ -279,76 +284,37 @@ func (s *WindowPoStScheduler) checkNextFaults(ctx context.Context, deadline uint
return nil
}
// the input sectors must match with the miner actor
func (s *WindowPoStScheduler) getSectorsToProve(ctx context.Context, deadlineSectors *abi.BitField, ignoreRecoveries bool, ts *types.TipSet) (*abi.BitField, error) {
stateFaults, err := s.api.StateMinerFaults(ctx, s.actor, ts.Key())
if err != nil {
return nil, xerrors.Errorf("getting on-chain faults: %w", err)
}
faults, err := bitfield.IntersectBitField(deadlineSectors, stateFaults)
if err != nil {
return nil, xerrors.Errorf("failed to intersect proof sectors with faults: %w", err)
}
recoveries, err := s.api.StateMinerRecoveries(ctx, s.actor, ts.Key())
if err != nil {
return nil, xerrors.Errorf("getting on-chain recoveries: %w", err)
}
if !ignoreRecoveries {
expectedRecoveries, err := bitfield.IntersectBitField(faults, recoveries)
if err != nil {
return nil, xerrors.Errorf("failed to intersect recoveries with faults: %w", err)
}
faults, err = bitfield.SubtractBitField(faults, expectedRecoveries)
if err != nil {
return nil, xerrors.Errorf("failed to subtract recoveries from faults: %w", err)
}
}
nonFaults, err := bitfield.SubtractBitField(deadlineSectors, faults)
if err != nil {
return nil, xerrors.Errorf("failed to diff bitfields: %w", err)
}
empty, err := nonFaults.IsEmpty()
if err != nil {
return nil, xerrors.Errorf("failed to check if bitfield was empty: %w", err)
}
if empty {
return nil, xerrors.Errorf("no non-faulty sectors in partitions: %w", err)
}
return nonFaults, nil
}
func (s *WindowPoStScheduler) runPost(ctx context.Context, di miner.DeadlineInfo, ts *types.TipSet) (*miner.SubmitWindowedPoStParams, error) {
ctx, span := trace.StartSpan(ctx, "storage.runPost")
defer span.End()
deadlines, err := s.api.StateMinerDeadlines(ctx, s.actor, ts.Key())
if err != nil {
return nil, xerrors.Errorf("getting miner deadlines: %w", err)
}
var declWait sync.WaitGroup
defer declWait.Wait()
declWait.Add(1)
go func(){
defer declWait.Done()
{
// check faults / recoveries for the *next* deadline. It's already too
// late to declare them for this deadline
declDeadline := (di.Index + 1) % miner.WPoStPeriodDeadlines
if err := s.checkNextRecoveries(ctx, declDeadline, deadlines.Due[declDeadline], ts); err != nil {
partitions, err := s.api.StateMinerPartitions(ctx, s.actor, declDeadline, ts.Key())
if err != nil {
log.Errorf("getting partitions: %v", err)
return
}
if err := s.checkNextRecoveries(ctx, declDeadline, partitions); err != nil {
// TODO: This is potentially quite bad, but not even trying to post when this fails is objectively worse
log.Errorf("checking sector recoveries: %v", err)
}
if err := s.checkNextFaults(ctx, declDeadline, deadlines.Due[declDeadline], ts); err != nil {
if err := s.checkNextFaults(ctx, declDeadline, partitions); err != nil {
// TODO: This is also potentially really bad, but we try to post anyways
log.Errorf("checking sector faults: %v", err)
}
}
}()
buf := new(bytes.Buffer)
if err := s.actor.MarshalCBOR(buf); err != nil {
@@ -359,88 +325,87 @@ func (s *WindowPoStScheduler) runPost(ctx context.Context, di miner.DeadlineInfo
return nil, xerrors.Errorf("failed to get chain randomness for windowPost (ts=%d; deadline=%d): %w", ts.Height(), di, err)
}
firstPartition, _, err := miner.PartitionsForDeadline(deadlines, s.partitionSectors, di.Index)
partitions, err := s.api.StateMinerPartitions(ctx, s.actor, di.Index, ts.Key())
if err != nil {
return nil, xerrors.Errorf("getting partitions for deadline: %w", err)
return nil, xerrors.Errorf("getting partitions: %w", err)
}
partitionCount, _, err := miner.DeadlineCount(deadlines, s.partitionSectors, di.Index)
params := &miner.SubmitWindowedPoStParams{
Deadline: di.Index,
Partitions: make([]miner.PoStPartition, len(partitions)),
Proofs: nil,
}
var sinfos []abi.SectorInfo
sidToPart := map[abi.SectorNumber]uint64{}
skipCount := uint64(0)
for partIdx, partition := range partitions {
// TODO: Can do this in parallel
toProve, err := partition.ActiveSectors()
if err != nil {
return nil, xerrors.Errorf("getting deadline partition count: %w", err)
return nil, xerrors.Errorf("getting active sectors: %w", err)
}
dc, err := deadlines.Due[di.Index].Count()
toProve, err = bitfield.MergeBitFields(toProve, partition.Recoveries)
if err != nil {
return nil, xerrors.Errorf("get deadline count: %w", err)
return nil, xerrors.Errorf("adding recoveries to set of sectors to prove: %w", err)
}
log.Infof("di: %+v", di)
log.Infof("dc: %+v", dc)
log.Infof("fp: %+v", firstPartition)
log.Infof("pc: %+v", partitionCount)
log.Infof("ts: %+v (%d)", ts.Key(), ts.Height())
if partitionCount == 0 {
return nil, errNoPartitions
}
partitions := make([]uint64, partitionCount)
for i := range partitions {
partitions[i] = firstPartition + uint64(i)
}
nps, err := s.getSectorsToProve(ctx, deadlines.Due[di.Index], false, ts)
if err != nil {
return nil, xerrors.Errorf("get need prove sectors: %w", err)
}
good, err := s.checkSectors(ctx, nps)
good, err := s.checkSectors(ctx, toProve)
if err != nil {
return nil, xerrors.Errorf("checking sectors to skip: %w", err)
}
skipped, err := bitfield.SubtractBitField(nps, good)
skipped, err := bitfield.SubtractBitField(toProve, good)
if err != nil {
return nil, xerrors.Errorf("nps - good: %w", err)
return nil, xerrors.Errorf("toProve - good: %w", err)
}
skipCount, err := skipped.Count()
sc, err := skipped.Count()
if err != nil {
return nil, xerrors.Errorf("getting skipped sector count: %w", err)
}
ssi, err := s.sortedSectorInfo(ctx, good, ts)
skipCount += sc
ssi, err := s.sectorInfo(ctx, good, ts)
if err != nil {
return nil, xerrors.Errorf("getting sorted sector info: %w", err)
}
sinfos = append(sinfos, ssi...)
for _, si := range ssi {
sidToPart[si.SectorNumber] = uint64(partIdx)
}
if len(ssi) == 0 {
log.Warn("attempted to run windowPost without any sectors...")
return nil, xerrors.Errorf("no sectors to run windowPost on")
}
params.Partitions[partIdx] = miner.PoStPartition{
Index: uint64(partIdx),
Skipped: skipped,
}
}
log.Infow("running windowPost",
"chain-random", rand,
"deadline", di,
"height", ts.Height(),
"skipped", skipCount)
var snums []abi.SectorNumber
for _, si := range ssi {
snums = append(snums, si.SectorNumber)
}
tsStart := time.Now()
log.Infow("generating windowPost",
"sectors", len(ssi))
log.Infow("generating windowPost", "sectors", len(sinfos))
mid, err := address.IDFromAddress(s.actor)
if err != nil {
return nil, err
}
postOut, postSkipped, err := s.prover.GenerateWindowPoSt(ctx, abi.ActorID(mid), ssi, abi.PoStRandomness(rand))
postOut, postSkipped, err := s.prover.GenerateWindowPoSt(ctx, abi.ActorID(mid), sinfos, abi.PoStRandomness(rand))
if err != nil {
return nil, xerrors.Errorf("running post failed: %w", err)
}
@@ -450,21 +415,16 @@ func (s *WindowPoStScheduler) runPost(ctx context.Context, di miner.DeadlineInfo
}
for _, sector := range postSkipped {
skipped.Set(uint64(sector.Number))
params.Partitions[sidToPart[sector.Number]].Skipped.Set(uint64(sector.Number))
}
elapsed := time.Since(tsStart)
log.Infow("submitting window PoSt", "elapsed", elapsed)
return &miner.SubmitWindowedPoStParams{
Deadline: di.Index,
Partitions: partitions,
Proofs: postOut,
Skipped: *skipped,
}, nil
return params, nil
}
func (s *WindowPoStScheduler) sortedSectorInfo(ctx context.Context, deadlineSectors *abi.BitField, ts *types.TipSet) ([]abi.SectorInfo, error) {
func (s *WindowPoStScheduler) sectorInfo(ctx context.Context, deadlineSectors *abi.BitField, ts *types.TipSet) ([]abi.SectorInfo, error) {
sset, err := s.api.StateMinerSectors(ctx, s.actor, deadlineSectors, false, ts.Key())
if err != nil {
return nil, err