diff --git a/itests/wdpost_check_config_test.go b/itests/wdpost_config_test.go
similarity index 61%
rename from itests/wdpost_check_config_test.go
rename to itests/wdpost_config_test.go
index cc10af04f..61b64387d 100644
--- a/itests/wdpost_check_config_test.go
+++ b/itests/wdpost_config_test.go
@@ -186,3 +186,117 @@ func TestWindowPostNoPreChecks(t *testing.T) {
 	sectors = p.MinerPower.RawBytePower.Uint64() / uint64(ssz)
 	require.Equal(t, nSectors+kit.DefaultPresealsPerBootstrapMiner-2+1, int(sectors)) // -2 not recovered sectors + 1 just pledged
 }
+
+func TestWindowPostMaxSectorsRecoveryConfig(t *testing.T) {
+	oldVal := wdpost.RecoveringSectorLimit
+	defer func() {
+		wdpost.RecoveringSectorLimit = oldVal
+	}()
+	wdpost.RecoveringSectorLimit = 1
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	client, miner, ens := kit.EnsembleMinimal(t,
+		kit.LatestActorsAt(-1),
+		kit.MockProofs())
+	ens.InterconnectAll().BeginMining(2 * time.Millisecond)
+
+	nSectors := 10
+
+	miner.PledgeSectors(ctx, nSectors, 0, nil)
+
+	maddr, err := miner.ActorAddress(ctx)
+	require.NoError(t, err)
+	di, err := client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK)
+	require.NoError(t, err)
+
+	mid, err := address.IDFromAddress(maddr)
+	require.NoError(t, err)
+
+	t.Log("Running one proving period")
+	waitUntil := di.Open + di.WPoStProvingPeriod
+	t.Logf("End for head.Height > %d", waitUntil)
+
+	ts := client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil))
+	t.Logf("Now head.Height = %d", ts.Height())
+
+	p, err := client.StateMinerPower(ctx, maddr, types.EmptyTSK)
+	require.NoError(t, err)
+
+	ssz, err := miner.ActorSectorSize(ctx, maddr)
+	require.NoError(t, err)
+
+	require.Equal(t, p.MinerPower, p.TotalPower)
+	require.Equal(t, p.MinerPower.RawBytePower, types.NewInt(uint64(ssz)*uint64(nSectors+kit.DefaultPresealsPerBootstrapMiner)))
+
+	t.Log("Drop some sectors")
+
+	// Drop 2 sectors from deadline 2 partition 0 (full partition / deadline)
+	parts, err := client.StateMinerPartitions(ctx, maddr, 2, types.EmptyTSK)
+	require.NoError(t, err)
+	require.Greater(t, len(parts), 0)
+
+	secs := parts[0].AllSectors
+	n, err := secs.Count()
+	require.NoError(t, err)
+	require.Equal(t, uint64(2), n)
+
+	// Drop the partition
+	err = secs.ForEach(func(sid uint64) error {
+		return miner.StorageMiner.(*impl.StorageMinerAPI).IStorageMgr.(*mock.SectorMgr).MarkFailed(storiface.SectorRef{
+			ID: abi.SectorID{
+				Miner:  abi.ActorID(mid),
+				Number: abi.SectorNumber(sid),
+			},
+		}, true)
+	})
+	require.NoError(t, err)
+
+	di, err = client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK)
+	require.NoError(t, err)
+
+	t.Log("Go through another PP, wait for sectors to become faulty")
+	waitUntil = di.Open + di.WPoStProvingPeriod
+	t.Logf("End for head.Height > %d", waitUntil)
+
+	ts = client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil))
+	t.Logf("Now head.Height = %d", ts.Height())
+
+	p, err = client.StateMinerPower(ctx, maddr, types.EmptyTSK)
+	require.NoError(t, err)
+
+	require.Equal(t, p.MinerPower, p.TotalPower)
+
+	sectors := p.MinerPower.RawBytePower.Uint64() / uint64(ssz)
+	require.Equal(t, nSectors+kit.DefaultPresealsPerBootstrapMiner-2, int(sectors)) // -2 just removed sectors
+
+	t.Log("Make the sectors recoverable")
+
+	err = secs.ForEach(func(sid uint64) error {
+		return miner.StorageMiner.(*impl.StorageMinerAPI).IStorageMgr.(*mock.SectorMgr).MarkFailed(storiface.SectorRef{
+			ID: abi.SectorID{
+				Miner:  abi.ActorID(mid),
+				Number: abi.SectorNumber(sid),
+			},
+		}, false)
+	})
+	require.NoError(t, err)
+
+	di, err = client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK)
+	require.NoError(t, err)
+
+	waitUntil = di.Open + di.WPoStProvingPeriod + 200
+	t.Logf("End for head.Height > %d", waitUntil)
+
+	ts = client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil))
+	t.Logf("Now head.Height = %d", ts.Height())
+
+	p, err = client.StateMinerPower(ctx, maddr, types.EmptyTSK)
+	require.NoError(t, err)
+
+	require.Equal(t, p.MinerPower, p.TotalPower)
+
+	sectors = p.MinerPower.RawBytePower.Uint64() / uint64(ssz)
+	require.Equal(t, nSectors+kit.DefaultPresealsPerBootstrapMiner-1, int(sectors)) // -1 not recovered sector
+}
diff --git a/storage/wdpost/wdpost_run_faults.go b/storage/wdpost/wdpost_run_faults.go
index 32f0d96a6..22186b551 100644
--- a/storage/wdpost/wdpost_run_faults.go
+++ b/storage/wdpost/wdpost_run_faults.go
@@ -2,6 +2,7 @@ package wdpost
 
 import (
 	"context"
+	"math"
 	"os"
 	"strconv"
 
@@ -21,12 +22,12 @@ import (
 	"github.com/filecoin-project/lotus/chain/types"
 )
 
-var recoveringSectorLimit int64 = 0
+var RecoveringSectorLimit uint64 = 0
 
 func init() {
 	if rcl := os.Getenv("LOTUS_RECOVERING_SECTOR_LIMIT"); rcl != "" {
 		var err error
-		recoveringSectorLimit, err = strconv.ParseInt(rcl, 10, 64)
+		RecoveringSectorLimit, err = strconv.ParseUint(rcl, 10, 64)
 		if err != nil {
 			log.Errorw("parsing LOTUS_RECOVERING_SECTOR_LIMIT", "error", err)
 		}
@@ -92,6 +93,31 @@ func (s *WindowPoStScheduler) declareRecoveries(ctx context.Context, dlIdx uint6
 			continue
 		}
 
+		// rules to follow if we have indicated that we don't want to recover more than X sectors in a deadline
+		if RecoveringSectorLimit > 0 {
+			// something weird happened, break because we can't recover any more
+			if RecoveringSectorLimit < totalSectorsToRecover {
+				log.Warnf("accepted more recoveries (%d) than RecoveringSectorLimit (%d)", totalSectorsToRecover, RecoveringSectorLimit)
+				break
+			}
+
+			maxNewRecoverable := RecoveringSectorLimit - totalSectorsToRecover
+
+			// we need to trim the recover bitfield
+			if recoveredCount > maxNewRecoverable {
+				recoverySlice, err := recovered.All(math.MaxUint64)
+				if err != nil {
+					log.Errorw("failed to slice recovery bitfield, breaking out of recovery loop", err)
+					break
+				}
+
+				log.Warnf("only adding %d sectors to respect RecoveringSectorLimit %d", maxNewRecoverable, RecoveringSectorLimit)
+
+				recovered = bitfield.NewFromSet(recoverySlice[:maxNewRecoverable])
+				recoveredCount = maxNewRecoverable
+			}
+		}
+
 		// respect user config if set
 		if s.maxPartitionsPerRecoveryMessage > 0 &&
 			len(batchedRecoveryDecls[len(batchedRecoveryDecls)-1]) >= s.maxPartitionsPerRecoveryMessage {
@@ -106,9 +132,9 @@ func (s *WindowPoStScheduler) declareRecoveries(ctx context.Context, dlIdx uint6
 
 		totalSectorsToRecover += recoveredCount
 
-		if recoveringSectorLimit > 0 && int64(totalSectorsToRecover) >= recoveringSectorLimit {
+		if RecoveringSectorLimit > 0 && totalSectorsToRecover >= RecoveringSectorLimit {
 			log.Errorf("reached recovering sector limit %d, only marking %d sectors for recovery now",
-				recoveringSectorLimit,
+				RecoveringSectorLimit,
 				totalSectorsToRecover)
 			break
 		}
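For reference, below is a minimal standalone sketch (not part of the diff) of the bitfield-trimming step the new declareRecoveries code performs: expand the recovery bitfield into a sorted slice of sector numbers, keep only the first maxNewRecoverable entries, and rebuild the bitfield. It assumes the github.com/filecoin-project/go-bitfield package (NewFromSet, Count, All); the sector numbers and the limit value are made up for illustration.

// trim_sketch.go: hypothetical illustration of the trimming logic only.
package main

import (
	"fmt"
	"math"

	"github.com/filecoin-project/go-bitfield"
)

func main() {
	// Pretend these five sector numbers were detected as recoverable in one partition.
	recovered := bitfield.NewFromSet([]uint64{10, 11, 12, 13, 14})

	// Assume RecoveringSectorLimit leaves room for only two more sectors.
	maxNewRecoverable := uint64(2)

	recoveredCount, err := recovered.Count()
	if err != nil {
		panic(err)
	}

	if recoveredCount > maxNewRecoverable {
		// All() expands the bitfield into a sorted slice of set sector numbers.
		recoverySlice, err := recovered.All(math.MaxUint64)
		if err != nil {
			panic(err)
		}
		// Keep only the first maxNewRecoverable sectors and rebuild the bitfield.
		recovered = bitfield.NewFromSet(recoverySlice[:maxNewRecoverable])
	}

	trimmed, _ := recovered.All(math.MaxUint64)
	fmt.Println(trimmed) // prints: [10 11]
}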