Merge pull request #9111 from filecoin-project/asr/fix-rec-setcor-limit

fix: post: restrict recoveries per deadline
This commit is contained in:
Aayush Rajasekaran 2022-08-02 15:44:33 -04:00 committed by GitHub
commit c0d2f249bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 144 additions and 4 deletions

View File

@ -186,3 +186,117 @@ func TestWindowPostNoPreChecks(t *testing.T) {
sectors = p.MinerPower.RawBytePower.Uint64() / uint64(ssz)
require.Equal(t, nSectors+kit.DefaultPresealsPerBootstrapMiner-2+1, int(sectors)) // -2 not recovered sectors + 1 just pledged
}
func TestWindowPostMaxSectorsRecoveryConfig(t *testing.T) {
oldVal := wdpost.RecoveringSectorLimit
defer func() {
wdpost.RecoveringSectorLimit = oldVal
}()
wdpost.RecoveringSectorLimit = 1
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
client, miner, ens := kit.EnsembleMinimal(t,
kit.LatestActorsAt(-1),
kit.MockProofs())
ens.InterconnectAll().BeginMining(2 * time.Millisecond)
nSectors := 10
miner.PledgeSectors(ctx, nSectors, 0, nil)
maddr, err := miner.ActorAddress(ctx)
require.NoError(t, err)
di, err := client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK)
require.NoError(t, err)
mid, err := address.IDFromAddress(maddr)
require.NoError(t, err)
t.Log("Running one proving period")
waitUntil := di.Open + di.WPoStProvingPeriod
t.Logf("End for head.Height > %d", waitUntil)
ts := client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil))
t.Logf("Now head.Height = %d", ts.Height())
p, err := client.StateMinerPower(ctx, maddr, types.EmptyTSK)
require.NoError(t, err)
ssz, err := miner.ActorSectorSize(ctx, maddr)
require.NoError(t, err)
require.Equal(t, p.MinerPower, p.TotalPower)
require.Equal(t, p.MinerPower.RawBytePower, types.NewInt(uint64(ssz)*uint64(nSectors+kit.DefaultPresealsPerBootstrapMiner)))
t.Log("Drop some sectors")
// Drop 2 sectors from deadline 2 partition 0 (full partition / deadline)
parts, err := client.StateMinerPartitions(ctx, maddr, 2, types.EmptyTSK)
require.NoError(t, err)
require.Greater(t, len(parts), 0)
secs := parts[0].AllSectors
n, err := secs.Count()
require.NoError(t, err)
require.Equal(t, uint64(2), n)
// Drop the partition
err = secs.ForEach(func(sid uint64) error {
return miner.StorageMiner.(*impl.StorageMinerAPI).IStorageMgr.(*mock.SectorMgr).MarkFailed(storiface.SectorRef{
ID: abi.SectorID{
Miner: abi.ActorID(mid),
Number: abi.SectorNumber(sid),
},
}, true)
})
require.NoError(t, err)
di, err = client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK)
require.NoError(t, err)
t.Log("Go through another PP, wait for sectors to become faulty")
waitUntil = di.Open + di.WPoStProvingPeriod
t.Logf("End for head.Height > %d", waitUntil)
ts = client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil))
t.Logf("Now head.Height = %d", ts.Height())
p, err = client.StateMinerPower(ctx, maddr, types.EmptyTSK)
require.NoError(t, err)
require.Equal(t, p.MinerPower, p.TotalPower)
sectors := p.MinerPower.RawBytePower.Uint64() / uint64(ssz)
require.Equal(t, nSectors+kit.DefaultPresealsPerBootstrapMiner-2, int(sectors)) // -2 just removed sectors
t.Log("Make the sectors recoverable")
err = secs.ForEach(func(sid uint64) error {
return miner.StorageMiner.(*impl.StorageMinerAPI).IStorageMgr.(*mock.SectorMgr).MarkFailed(storiface.SectorRef{
ID: abi.SectorID{
Miner: abi.ActorID(mid),
Number: abi.SectorNumber(sid),
},
}, false)
})
require.NoError(t, err)
di, err = client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK)
require.NoError(t, err)
waitUntil = di.Open + di.WPoStProvingPeriod + 200
t.Logf("End for head.Height > %d", waitUntil)
ts = client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil))
t.Logf("Now head.Height = %d", ts.Height())
p, err = client.StateMinerPower(ctx, maddr, types.EmptyTSK)
require.NoError(t, err)
require.Equal(t, p.MinerPower, p.TotalPower)
sectors = p.MinerPower.RawBytePower.Uint64() / uint64(ssz)
require.Equal(t, nSectors+kit.DefaultPresealsPerBootstrapMiner-1, int(sectors)) // -1 not recovered sector
}

View File

@ -2,6 +2,7 @@ package wdpost
import (
"context"
"math"
"os"
"strconv"
@ -21,12 +22,12 @@ import (
"github.com/filecoin-project/lotus/chain/types"
)
var recoveringSectorLimit int64 = 0
var RecoveringSectorLimit uint64 = 0
func init() {
if rcl := os.Getenv("LOTUS_RECOVERING_SECTOR_LIMIT"); rcl != "" {
var err error
recoveringSectorLimit, err = strconv.ParseInt(rcl, 10, 64)
RecoveringSectorLimit, err = strconv.ParseUint(rcl, 10, 64)
if err != nil {
log.Errorw("parsing LOTUS_RECOVERING_SECTOR_LIMIT", "error", err)
}
@ -92,6 +93,31 @@ func (s *WindowPoStScheduler) declareRecoveries(ctx context.Context, dlIdx uint6
continue
}
// rules to follow if we have indicated that we don't want to recover more than X sectors in a deadline
if RecoveringSectorLimit > 0 {
// something weird happened, break because we can't recover any more
if RecoveringSectorLimit < totalSectorsToRecover {
log.Warnf("accepted more recoveries (%d) than RecoveringSectorLimit (%d)", totalSectorsToRecover, RecoveringSectorLimit)
break
}
maxNewRecoverable := RecoveringSectorLimit - totalSectorsToRecover
// we need to trim the recover bitfield
if recoveredCount > maxNewRecoverable {
recoverySlice, err := recovered.All(math.MaxUint64)
if err != nil {
log.Errorw("failed to slice recovery bitfield, breaking out of recovery loop", err)
break
}
log.Warnf("only adding %d sectors to respect RecoveringSectorLimit %d", maxNewRecoverable, RecoveringSectorLimit)
recovered = bitfield.NewFromSet(recoverySlice[:maxNewRecoverable])
recoveredCount = maxNewRecoverable
}
}
// respect user config if set
if s.maxPartitionsPerRecoveryMessage > 0 &&
len(batchedRecoveryDecls[len(batchedRecoveryDecls)-1]) >= s.maxPartitionsPerRecoveryMessage {
@ -106,9 +132,9 @@ func (s *WindowPoStScheduler) declareRecoveries(ctx context.Context, dlIdx uint6
totalSectorsToRecover += recoveredCount
if recoveringSectorLimit > 0 && int64(totalSectorsToRecover) >= recoveringSectorLimit {
if RecoveringSectorLimit > 0 && totalSectorsToRecover >= RecoveringSectorLimit {
log.Errorf("reached recovering sector limit %d, only marking %d sectors for recovery now",
recoveringSectorLimit,
RecoveringSectorLimit,
totalSectorsToRecover)
break
}