feat: wdpost: Configurable pre-check timeouts

This commit is contained in:
Łukasz Magiera 2022-11-17 18:25:30 +01:00 committed by Shrenuj Bansal
parent c117611267
commit 87c25e211c
6 changed files with 87 additions and 7 deletions

View File

@ -325,6 +325,29 @@
# env var: LOTUS_PROVING_PARALLELCHECKLIMIT # env var: LOTUS_PROVING_PARALLELCHECKLIMIT
#ParallelCheckLimit = 128 #ParallelCheckLimit = 128
# Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
#
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because reading
# the test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
# blocked (e.g. in case of disconnected NFS mount)
#
# type: Duration
# env var: LOTUS_PROVING_SINGLECHECKTIMEOUT
#SingleCheckTimeout = "10m0s"
# Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
# the partition which didn't get checked on time will be skipped
#
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because reading
# the test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
# blocked or slow
#
# type: Duration
# env var: LOTUS_PROVING_PARTITIONCHECKTIMEOUT
#PartitionCheckTimeout = "20m0s"
# Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. # Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
# #
# WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need # WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need

View File

@ -142,6 +142,8 @@ func DefaultStorageMiner() *StorageMiner {
Proving: ProvingConfig{ Proving: ProvingConfig{
ParallelCheckLimit: 128, ParallelCheckLimit: 128,
PartitionCheckTimeout: Duration(20 * time.Minute),
SingleCheckTimeout: Duration(10 * time.Minute),
}, },
Storage: SealerConfig{ Storage: SealerConfig{

View File

@ -644,6 +644,29 @@ to late submission.
After changing this option, confirm that the new value works in your setup by invoking After changing this option, confirm that the new value works in your setup by invoking
'lotus-miner proving compute window-post 0'`, 'lotus-miner proving compute window-post 0'`,
},
{
Name: "SingleCheckTimeout",
Type: "Duration",
Comment: `Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
test challenge took longer than this timeout
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
blocked (e.g. in case of disconnected NFS mount)`,
},
{
Name: "PartitionCheckTimeout",
Type: "Duration",
Comment: `Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
the partition which didn't get checked on time will be skipped
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
test challenge took longer than this timeout
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
blocked or slow`,
}, },
{ {
Name: "DisableBuiltinWindowPoSt", Name: "DisableBuiltinWindowPoSt",

View File

@ -230,6 +230,23 @@ type ProvingConfig struct {
// 'lotus-miner proving compute window-post 0' // 'lotus-miner proving compute window-post 0'
ParallelCheckLimit int ParallelCheckLimit int
// Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
//
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because reading
// the test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
// blocked (e.g. in case of disconnected NFS mount)
SingleCheckTimeout Duration
// Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
// the partition which didn't get checked on time will be skipped
//
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because reading
// the test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
// blocked or slow
PartitionCheckTimeout Duration
// Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. // Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
// //
// WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need // WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need

View File

@ -5,7 +5,6 @@ import (
"crypto/rand" "crypto/rand"
"fmt" "fmt"
"sync" "sync"
"time"
"golang.org/x/xerrors" "golang.org/x/xerrors"
@ -15,8 +14,6 @@ import (
"github.com/filecoin-project/lotus/storage/sealer/storiface" "github.com/filecoin-project/lotus/storage/sealer/storiface"
) )
var PostCheckTimeout = 160 * time.Second
// FaultTracker TODO: Track things more actively // FaultTracker TODO: Track things more actively
type FaultTracker interface { type FaultTracker interface {
CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error)
@ -50,6 +47,12 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
badLk.Unlock() badLk.Unlock()
} }
if m.partitionCheckTimeout > 0 {
var cancel2 context.CancelFunc
ctx, cancel2 = context.WithTimeout(ctx, m.partitionCheckTimeout)
defer cancel2()
}
var wg sync.WaitGroup var wg sync.WaitGroup
wg.Add(len(sectors)) wg.Add(len(sectors))
@ -57,7 +60,9 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
select { select {
case throttle <- struct{}{}: case throttle <- struct{}{}:
case <-ctx.Done(): case <-ctx.Done():
return nil, ctx.Err() addBad(sector.ID, fmt.Sprintf("waiting for check worker: %s", ctx.Err()))
wg.Done()
continue
} }
go func(sector storiface.SectorRef) { go func(sector storiface.SectorRef) {
@ -107,8 +112,13 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
return return
} }
vctx, cancel2 := context.WithTimeout(ctx, PostCheckTimeout) vctx := ctx
if m.singleCheckTimeout > 0 {
var cancel2 context.CancelFunc
vctx, cancel2 = context.WithTimeout(ctx, m.singleCheckTimeout)
defer cancel2() defer cancel2()
}
_, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{ _, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{
SealProof: sector.ProofType, SealProof: sector.ProofType,

View File

@ -7,6 +7,7 @@ import (
"net/http" "net/http"
"sort" "sort"
"sync" "sync"
"time"
"github.com/google/uuid" "github.com/google/uuid"
"github.com/hashicorp/go-multierror" "github.com/hashicorp/go-multierror"
@ -72,6 +73,8 @@ type Manager struct {
work *statestore.StateStore work *statestore.StateStore
parallelCheckLimit int parallelCheckLimit int
singleCheckTimeout time.Duration
partitionCheckTimeout time.Duration
disableBuiltinWindowPoSt bool disableBuiltinWindowPoSt bool
disableBuiltinWinningPoSt bool disableBuiltinWinningPoSt bool
disallowRemoteFinalize bool disallowRemoteFinalize bool
@ -121,6 +124,8 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
localProver: prover, localProver: prover,
parallelCheckLimit: pc.ParallelCheckLimit, parallelCheckLimit: pc.ParallelCheckLimit,
singleCheckTimeout: time.Duration(pc.SingleCheckTimeout),
partitionCheckTimeout: time.Duration(pc.PartitionCheckTimeout),
disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt, disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt,
disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt, disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt,
disallowRemoteFinalize: sc.DisallowRemoteFinalize, disallowRemoteFinalize: sc.DisallowRemoteFinalize,