feat: wdpost: Configurable pre-check timeouts
This commit is contained in:
parent
c117611267
commit
87c25e211c
@ -325,6 +325,29 @@
|
||||
# env var: LOTUS_PROVING_PARALLELCHECKLIMIT
|
||||
#ParallelCheckLimit = 128
|
||||
|
||||
# Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
|
||||
#
|
||||
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the
|
||||
# test challenge took longer than this timeout
|
||||
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
|
||||
# blocked (e.g. in case of disconnected NFS mount)
|
||||
#
|
||||
# type: Duration
|
||||
# env var: LOTUS_PROVING_SINGLECHECKTIMEOUT
|
||||
#SingleCheckTimeout = "10m0s"
|
||||
|
||||
# Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
|
||||
# the partition which didn't get checked on time will be skipped
|
||||
#
|
||||
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the
|
||||
# test challenge took longer than this timeout
|
||||
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
|
||||
# blocked or slow
|
||||
#
|
||||
# type: Duration
|
||||
# env var: LOTUS_PROVING_PARTITIONCHECKTIMEOUT
|
||||
#PartitionCheckTimeout = "20m0s"
|
||||
|
||||
# Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
|
||||
#
|
||||
# WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need
|
||||
|
@ -142,6 +142,8 @@ func DefaultStorageMiner() *StorageMiner {
|
||||
|
||||
Proving: ProvingConfig{
|
||||
ParallelCheckLimit: 128,
|
||||
PartitionCheckTimeout: Duration(20 * time.Minute),
|
||||
SingleCheckTimeout: Duration(10 * time.Minute),
|
||||
},
|
||||
|
||||
Storage: SealerConfig{
|
||||
|
@ -644,6 +644,29 @@ to late submission.
|
||||
|
||||
After changing this option, confirm that the new value works in your setup by invoking
|
||||
'lotus-miner proving compute window-post 0'`,
|
||||
},
|
||||
{
|
||||
Name: "SingleCheckTimeout",
|
||||
Type: "Duration",
|
||||
|
||||
Comment: `Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
|
||||
|
||||
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
|
||||
test challenge took longer than this timeout
|
||||
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
|
||||
blocked (e.g. in case of disconnected NFS mount)`,
|
||||
},
|
||||
{
|
||||
Name: "PartitionCheckTimeout",
|
||||
Type: "Duration",
|
||||
|
||||
Comment: `Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
|
||||
the partition which didn't get checked on time will be skipped
|
||||
|
||||
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
|
||||
test challenge took longer than this timeout
|
||||
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
|
||||
blocked or slow`,
|
||||
},
|
||||
{
|
||||
Name: "DisableBuiltinWindowPoSt",
|
||||
|
@ -230,6 +230,23 @@ type ProvingConfig struct {
|
||||
// 'lotus-miner proving compute window-post 0'
|
||||
ParallelCheckLimit int
|
||||
|
||||
// Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
|
||||
//
|
||||
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the
|
||||
// test challenge took longer than this timeout
|
||||
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
|
||||
// blocked (e.g. in case of disconnected NFS mount)
|
||||
SingleCheckTimeout Duration
|
||||
|
||||
// Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
|
||||
// the partition which didn't get checked on time will be skipped
|
||||
//
|
||||
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the
|
||||
// test challenge took longer than this timeout
|
||||
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
|
||||
// blocked or slow
|
||||
PartitionCheckTimeout Duration
|
||||
|
||||
// Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
|
||||
//
|
||||
// WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need
|
||||
|
@ -5,7 +5,6 @@ import (
|
||||
"crypto/rand"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
@ -15,8 +14,6 @@ import (
|
||||
"github.com/filecoin-project/lotus/storage/sealer/storiface"
|
||||
)
|
||||
|
||||
var PostCheckTimeout = 160 * time.Second
|
||||
|
||||
// FaultTracker TODO: Track things more actively
|
||||
type FaultTracker interface {
|
||||
CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error)
|
||||
@ -50,6 +47,12 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
|
||||
badLk.Unlock()
|
||||
}
|
||||
|
||||
if m.partitionCheckTimeout > 0 {
|
||||
var cancel2 context.CancelFunc
|
||||
ctx, cancel2 = context.WithTimeout(ctx, m.partitionCheckTimeout)
|
||||
defer cancel2()
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(sectors))
|
||||
|
||||
@ -57,7 +60,9 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
|
||||
select {
|
||||
case throttle <- struct{}{}:
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
addBad(sector.ID, fmt.Sprintf("waiting for check worker: %s", ctx.Err()))
|
||||
wg.Done()
|
||||
continue
|
||||
}
|
||||
|
||||
go func(sector storiface.SectorRef) {
|
||||
@ -107,8 +112,13 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
|
||||
return
|
||||
}
|
||||
|
||||
vctx, cancel2 := context.WithTimeout(ctx, PostCheckTimeout)
|
||||
vctx := ctx
|
||||
|
||||
if m.singleCheckTimeout > 0 {
|
||||
var cancel2 context.CancelFunc
|
||||
vctx, cancel2 = context.WithTimeout(ctx, m.singleCheckTimeout)
|
||||
defer cancel2()
|
||||
}
|
||||
|
||||
_, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{
|
||||
SealProof: sector.ProofType,
|
||||
|
@ -7,6 +7,7 @@ import (
|
||||
"net/http"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/hashicorp/go-multierror"
|
||||
@ -72,6 +73,8 @@ type Manager struct {
|
||||
work *statestore.StateStore
|
||||
|
||||
parallelCheckLimit int
|
||||
singleCheckTimeout time.Duration
|
||||
partitionCheckTimeout time.Duration
|
||||
disableBuiltinWindowPoSt bool
|
||||
disableBuiltinWinningPoSt bool
|
||||
disallowRemoteFinalize bool
|
||||
@ -121,6 +124,8 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
|
||||
localProver: prover,
|
||||
|
||||
parallelCheckLimit: pc.ParallelCheckLimit,
|
||||
singleCheckTimeout: time.Duration(pc.SingleCheckTimeout),
|
||||
partitionCheckTimeout: time.Duration(pc.PartitionCheckTimeout),
|
||||
disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt,
|
||||
disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt,
|
||||
disallowRemoteFinalize: sc.DisallowRemoteFinalize,
|
||||
|
Loading…
Reference in New Issue
Block a user