feat: wdpost: Configurable pre-check timeouts
This commit is contained in:
parent
c117611267
commit
87c25e211c
@ -325,6 +325,29 @@
|
|||||||
# env var: LOTUS_PROVING_PARALLELCHECKLIMIT
|
# env var: LOTUS_PROVING_PARALLELCHECKLIMIT
|
||||||
#ParallelCheckLimit = 128
|
#ParallelCheckLimit = 128
|
||||||
|
|
||||||
|
# Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
|
||||||
|
#
|
||||||
|
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just because reading the
|
||||||
|
# test challenge took longer than this timeout
|
||||||
|
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
|
||||||
|
# blocked (e.g. in case of disconnected NFS mount)
|
||||||
|
#
|
||||||
|
# type: Duration
|
||||||
|
# env var: LOTUS_PROVING_SINGLECHECKTIMEOUT
|
||||||
|
#SingleCheckTimeout = "10m0s"
|
||||||
|
|
||||||
|
# Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
|
||||||
|
# the partition which didn't get checked on time will be skipped
|
||||||
|
#
|
||||||
|
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just because reading the
|
||||||
|
# test challenge took longer than this timeout
|
||||||
|
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
|
||||||
|
# blocked or slow
|
||||||
|
#
|
||||||
|
# type: Duration
|
||||||
|
# env var: LOTUS_PROVING_PARTITIONCHECKTIMEOUT
|
||||||
|
#PartitionCheckTimeout = "20m0s"
|
||||||
|
|
||||||
# Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
|
# Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
|
||||||
#
|
#
|
||||||
# WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need
|
# WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need
|
||||||
|
@ -141,7 +141,9 @@ func DefaultStorageMiner() *StorageMiner {
|
|||||||
},
|
},
|
||||||
|
|
||||||
Proving: ProvingConfig{
|
Proving: ProvingConfig{
|
||||||
ParallelCheckLimit: 128,
|
ParallelCheckLimit: 128,
|
||||||
|
PartitionCheckTimeout: Duration(20 * time.Minute),
|
||||||
|
SingleCheckTimeout: Duration(10 * time.Minute),
|
||||||
},
|
},
|
||||||
|
|
||||||
Storage: SealerConfig{
|
Storage: SealerConfig{
|
||||||
|
@ -644,6 +644,29 @@ to late submission.
|
|||||||
|
|
||||||
After changing this option, confirm that the new value works in your setup by invoking
|
After changing this option, confirm that the new value works in your setup by invoking
|
||||||
'lotus-miner proving compute window-post 0'`,
|
'lotus-miner proving compute window-post 0'`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "SingleCheckTimeout",
|
||||||
|
Type: "Duration",
|
||||||
|
|
||||||
|
Comment: `Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
|
||||||
|
|
||||||
|
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
|
||||||
|
test challenge took longer than this timeout
|
||||||
|
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
|
||||||
|
blocked (e.g. in case of disconnected NFS mount)`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "PartitionCheckTimeout",
|
||||||
|
Type: "Duration",
|
||||||
|
|
||||||
|
Comment: `Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
|
||||||
|
the partition which didn't get checked on time will be skipped
|
||||||
|
|
||||||
|
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
|
||||||
|
test challenge took longer than this timeout
|
||||||
|
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
|
||||||
|
blocked or slow`,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Name: "DisableBuiltinWindowPoSt",
|
Name: "DisableBuiltinWindowPoSt",
|
||||||
|
@ -230,6 +230,23 @@ type ProvingConfig struct {
|
|||||||
// 'lotus-miner proving compute window-post 0'
|
// 'lotus-miner proving compute window-post 0'
|
||||||
ParallelCheckLimit int
|
ParallelCheckLimit int
|
||||||
|
|
||||||
|
// Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
|
||||||
|
//
|
||||||
|
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just because reading the
|
||||||
|
// test challenge took longer than this timeout
|
||||||
|
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
|
||||||
|
// blocked (e.g. in case of disconnected NFS mount)
|
||||||
|
SingleCheckTimeout Duration
|
||||||
|
|
||||||
|
// Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
|
||||||
|
// the partition which didn't get checked on time will be skipped
|
||||||
|
//
|
||||||
|
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just because reading the
|
||||||
|
// test challenge took longer than this timeout
|
||||||
|
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
|
||||||
|
// blocked or slow
|
||||||
|
PartitionCheckTimeout Duration
|
||||||
|
|
||||||
// Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
|
// Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
|
||||||
//
|
//
|
||||||
// WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need
|
// WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need
|
||||||
|
@ -5,7 +5,6 @@ import (
|
|||||||
"crypto/rand"
|
"crypto/rand"
|
||||||
"fmt"
|
"fmt"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
|
||||||
|
|
||||||
"golang.org/x/xerrors"
|
"golang.org/x/xerrors"
|
||||||
|
|
||||||
@ -15,8 +14,6 @@ import (
|
|||||||
"github.com/filecoin-project/lotus/storage/sealer/storiface"
|
"github.com/filecoin-project/lotus/storage/sealer/storiface"
|
||||||
)
|
)
|
||||||
|
|
||||||
var PostCheckTimeout = 160 * time.Second
|
|
||||||
|
|
||||||
// FaultTracker TODO: Track things more actively
|
// FaultTracker TODO: Track things more actively
|
||||||
type FaultTracker interface {
|
type FaultTracker interface {
|
||||||
CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error)
|
CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error)
|
||||||
@ -50,6 +47,12 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
|
|||||||
badLk.Unlock()
|
badLk.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if m.partitionCheckTimeout > 0 {
|
||||||
|
var cancel2 context.CancelFunc
|
||||||
|
ctx, cancel2 = context.WithTimeout(ctx, m.partitionCheckTimeout)
|
||||||
|
defer cancel2()
|
||||||
|
}
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
wg.Add(len(sectors))
|
wg.Add(len(sectors))
|
||||||
|
|
||||||
@ -57,7 +60,9 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
|
|||||||
select {
|
select {
|
||||||
case throttle <- struct{}{}:
|
case throttle <- struct{}{}:
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return nil, ctx.Err()
|
addBad(sector.ID, fmt.Sprintf("waiting for check worker: %s", ctx.Err()))
|
||||||
|
wg.Done()
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
go func(sector storiface.SectorRef) {
|
go func(sector storiface.SectorRef) {
|
||||||
@ -107,8 +112,13 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
vctx, cancel2 := context.WithTimeout(ctx, PostCheckTimeout)
|
vctx := ctx
|
||||||
defer cancel2()
|
|
||||||
|
if m.singleCheckTimeout > 0 {
|
||||||
|
var cancel2 context.CancelFunc
|
||||||
|
vctx, cancel2 = context.WithTimeout(ctx, m.singleCheckTimeout)
|
||||||
|
defer cancel2()
|
||||||
|
}
|
||||||
|
|
||||||
_, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{
|
_, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{
|
||||||
SealProof: sector.ProofType,
|
SealProof: sector.ProofType,
|
||||||
|
@ -7,6 +7,7 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"sort"
|
"sort"
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
"github.com/hashicorp/go-multierror"
|
"github.com/hashicorp/go-multierror"
|
||||||
@ -72,6 +73,8 @@ type Manager struct {
|
|||||||
work *statestore.StateStore
|
work *statestore.StateStore
|
||||||
|
|
||||||
parallelCheckLimit int
|
parallelCheckLimit int
|
||||||
|
singleCheckTimeout time.Duration
|
||||||
|
partitionCheckTimeout time.Duration
|
||||||
disableBuiltinWindowPoSt bool
|
disableBuiltinWindowPoSt bool
|
||||||
disableBuiltinWinningPoSt bool
|
disableBuiltinWinningPoSt bool
|
||||||
disallowRemoteFinalize bool
|
disallowRemoteFinalize bool
|
||||||
@ -121,6 +124,8 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
|
|||||||
localProver: prover,
|
localProver: prover,
|
||||||
|
|
||||||
parallelCheckLimit: pc.ParallelCheckLimit,
|
parallelCheckLimit: pc.ParallelCheckLimit,
|
||||||
|
singleCheckTimeout: time.Duration(pc.SingleCheckTimeout),
|
||||||
|
partitionCheckTimeout: time.Duration(pc.PartitionCheckTimeout),
|
||||||
disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt,
|
disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt,
|
||||||
disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt,
|
disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt,
|
||||||
disallowRemoteFinalize: sc.DisallowRemoteFinalize,
|
disallowRemoteFinalize: sc.DisallowRemoteFinalize,
|
||||||
|
Loading…
Reference in New Issue
Block a user