feat: wdpost: Configurable pre-check timeouts

This commit is contained in:
Łukasz Magiera 2022-11-17 18:25:30 +01:00 committed by Shrenuj Bansal
parent c117611267
commit 87c25e211c
6 changed files with 87 additions and 7 deletions

View File

@ -325,6 +325,29 @@
# env var: LOTUS_PROVING_PARALLELCHECKLIMIT
#ParallelCheckLimit = 128
# Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
#
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, merely because
# reading the test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
# blocked (e.g. in case of disconnected NFS mount)
#
# type: Duration
# env var: LOTUS_PROVING_SINGLECHECKTIMEOUT
#SingleCheckTimeout = "10m0s"
# Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
# the partition which didn't get checked on time will be skipped
#
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, merely because
# reading the test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
# blocked or slow
#
# type: Duration
# env var: LOTUS_PROVING_PARTITIONCHECKTIMEOUT
#PartitionCheckTimeout = "20m0s"
# Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
#
# WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need

View File

@ -142,6 +142,8 @@ func DefaultStorageMiner() *StorageMiner {
Proving: ProvingConfig{
ParallelCheckLimit: 128,
PartitionCheckTimeout: Duration(20 * time.Minute),
SingleCheckTimeout: Duration(10 * time.Minute),
},
Storage: SealerConfig{

View File

@ -644,6 +644,29 @@ to late submission.
After changing this option, confirm that the new value works in your setup by invoking
'lotus-miner proving compute window-post 0'`,
},
{
Name: "SingleCheckTimeout",
Type: "Duration",
Comment: `Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
test challenge took longer than this timeout
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
blocked (e.g. in case of disconnected NFS mount)`,
},
{
Name: "PartitionCheckTimeout",
Type: "Duration",
Comment: `Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
the partition which didn't get checked on time will be skipped
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
test challenge took longer than this timeout
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
blocked or slow`,
},
{
Name: "DisableBuiltinWindowPoSt",

View File

@ -230,6 +230,23 @@ type ProvingConfig struct {
// 'lotus-miner proving compute window-post 0'
ParallelCheckLimit int
// Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
//
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, merely because
// reading the test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
// blocked (e.g. in case of disconnected NFS mount)
SingleCheckTimeout Duration
// Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
// the partition which didn't get checked on time will be skipped
//
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, merely because
// reading the test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
// blocked or slow
PartitionCheckTimeout Duration
// Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
//
// WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need

View File

@ -5,7 +5,6 @@ import (
"crypto/rand"
"fmt"
"sync"
"time"
"golang.org/x/xerrors"
@ -15,8 +14,6 @@ import (
"github.com/filecoin-project/lotus/storage/sealer/storiface"
)
var PostCheckTimeout = 160 * time.Second
// FaultTracker TODO: Track things more actively
type FaultTracker interface {
CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error)
@ -50,6 +47,12 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
badLk.Unlock()
}
if m.partitionCheckTimeout > 0 {
var cancel2 context.CancelFunc
ctx, cancel2 = context.WithTimeout(ctx, m.partitionCheckTimeout)
defer cancel2()
}
var wg sync.WaitGroup
wg.Add(len(sectors))
@ -57,7 +60,9 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
select {
case throttle <- struct{}{}:
case <-ctx.Done():
return nil, ctx.Err()
addBad(sector.ID, fmt.Sprintf("waiting for check worker: %s", ctx.Err()))
wg.Done()
continue
}
go func(sector storiface.SectorRef) {
@ -107,8 +112,13 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
return
}
vctx, cancel2 := context.WithTimeout(ctx, PostCheckTimeout)
vctx := ctx
if m.singleCheckTimeout > 0 {
var cancel2 context.CancelFunc
vctx, cancel2 = context.WithTimeout(ctx, m.singleCheckTimeout)
defer cancel2()
}
_, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{
SealProof: sector.ProofType,

View File

@ -7,6 +7,7 @@ import (
"net/http"
"sort"
"sync"
"time"
"github.com/google/uuid"
"github.com/hashicorp/go-multierror"
@ -72,6 +73,8 @@ type Manager struct {
work *statestore.StateStore
parallelCheckLimit int
singleCheckTimeout time.Duration
partitionCheckTimeout time.Duration
disableBuiltinWindowPoSt bool
disableBuiltinWinningPoSt bool
disallowRemoteFinalize bool
@ -121,6 +124,8 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
localProver: prover,
parallelCheckLimit: pc.ParallelCheckLimit,
singleCheckTimeout: time.Duration(pc.SingleCheckTimeout),
partitionCheckTimeout: time.Duration(pc.PartitionCheckTimeout),
disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt,
disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt,
disallowRemoteFinalize: sc.DisallowRemoteFinalize,