Merge pull request #9702 from filecoin-project/1.19.0_backport_20221122

backport: configurable pre check timeouts and zondax hid update
This commit is contained in:
Aayush Rajasekaran 2022-11-22 11:19:16 -05:00 committed by GitHub
commit 0bf439ff94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 90 additions and 10 deletions

View File

@ -325,6 +325,29 @@
# env var: LOTUS_PROVING_PARALLELCHECKLIMIT
#ParallelCheckLimit = 128
# Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
#
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because reading the
# test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
# blocked (e.g. in case of disconnected NFS mount)
#
# type: Duration
# env var: LOTUS_PROVING_SINGLECHECKTIMEOUT
#SingleCheckTimeout = "10m0s"
# Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
# the partition which didn't get checked on time will be skipped
#
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because reading the
# test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
# blocked or slow
#
# type: Duration
# env var: LOTUS_PROVING_PARTITIONCHECKTIMEOUT
#PartitionCheckTimeout = "20m0s"
# Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
#
# WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need

2
go.mod
View File

@ -323,7 +323,7 @@ require (
github.com/whyrusleeping/go-keyspace v0.0.0-20160322163242-5b898ac5add1 // indirect
github.com/whyrusleeping/timecache v0.0.0-20160911033111-cfcb2f1abfee // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266 // indirect
github.com/zondax/hid v0.9.1 // indirect
github.com/zondax/ledger-go v0.12.1 // indirect
go.opentelemetry.io/otel/metric v0.25.0 // indirect
go.opentelemetry.io/otel/sdk/export/metric v0.25.0 // indirect

4
go.sum
View File

@ -1831,8 +1831,8 @@ github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
github.com/zondax/hid v0.9.0/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM=
github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266 h1:O9XLFXGkVswDFmH9LaYpqu+r/AAFWqr0DL6V00KEVFg=
github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM=
github.com/zondax/hid v0.9.1 h1:gQe66rtmyZ8VeGFcOpbuH3r7erYtNEAezCAYu8LdkJo=
github.com/zondax/hid v0.9.1/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM=
github.com/zondax/ledger-go v0.12.1 h1:hYRcyznPRJp+5mzF2sazTLP2nGvGjYDD2VzhHhFomLU=
github.com/zondax/ledger-go v0.12.1/go.mod h1:KatxXrVDzgWwbssUWsF5+cOJHXPvzQ09YSlzGNuhOEo=
go.dedis.ch/fixbuf v1.0.3 h1:hGcV9Cd/znUxlusJ64eAlExS+5cJDIyTyEG+otu5wQs=

View File

@ -141,7 +141,9 @@ func DefaultStorageMiner() *StorageMiner {
},
Proving: ProvingConfig{
ParallelCheckLimit: 128,
ParallelCheckLimit: 128,
PartitionCheckTimeout: Duration(20 * time.Minute),
SingleCheckTimeout: Duration(10 * time.Minute),
},
Storage: SealerConfig{

View File

@ -644,6 +644,29 @@ to late submission.
After changing this option, confirm that the new value works in your setup by invoking
'lotus-miner proving compute window-post 0'`,
},
{
Name: "SingleCheckTimeout",
Type: "Duration",
Comment: `Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
test challenge took longer than this timeout
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
blocked (e.g. in case of disconnected NFS mount)`,
},
{
Name: "PartitionCheckTimeout",
Type: "Duration",
Comment: `Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
the partition which didn't get checked on time will be skipped
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
test challenge took longer than this timeout
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
blocked or slow`,
},
{
Name: "DisableBuiltinWindowPoSt",

View File

@ -230,6 +230,23 @@ type ProvingConfig struct {
// 'lotus-miner proving compute window-post 0'
ParallelCheckLimit int
// Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
//
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because reading the
// test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
// blocked (e.g. in case of disconnected NFS mount)
SingleCheckTimeout Duration
// Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
// the partition which didn't get checked on time will be skipped
//
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because reading the
// test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
// blocked or slow
PartitionCheckTimeout Duration
// Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
//
// WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need

View File

@ -5,7 +5,6 @@ import (
"crypto/rand"
"fmt"
"sync"
"time"
"golang.org/x/xerrors"
@ -15,8 +14,6 @@ import (
"github.com/filecoin-project/lotus/storage/sealer/storiface"
)
var PostCheckTimeout = 160 * time.Second
// FaultTracker TODO: Track things more actively
type FaultTracker interface {
CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error)
@ -50,6 +47,12 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
badLk.Unlock()
}
if m.partitionCheckTimeout > 0 {
var cancel2 context.CancelFunc
ctx, cancel2 = context.WithTimeout(ctx, m.partitionCheckTimeout)
defer cancel2()
}
var wg sync.WaitGroup
wg.Add(len(sectors))
@ -57,7 +60,9 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
select {
case throttle <- struct{}{}:
case <-ctx.Done():
return nil, ctx.Err()
addBad(sector.ID, fmt.Sprintf("waiting for check worker: %s", ctx.Err()))
wg.Done()
continue
}
go func(sector storiface.SectorRef) {
@ -107,8 +112,13 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
return
}
vctx, cancel2 := context.WithTimeout(ctx, PostCheckTimeout)
defer cancel2()
vctx := ctx
if m.singleCheckTimeout > 0 {
var cancel2 context.CancelFunc
vctx, cancel2 = context.WithTimeout(ctx, m.singleCheckTimeout)
defer cancel2()
}
_, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{
SealProof: sector.ProofType,

View File

@ -7,6 +7,7 @@ import (
"net/http"
"sort"
"sync"
"time"
"github.com/google/uuid"
"github.com/hashicorp/go-multierror"
@ -72,6 +73,8 @@ type Manager struct {
work *statestore.StateStore
parallelCheckLimit int
singleCheckTimeout time.Duration
partitionCheckTimeout time.Duration
disableBuiltinWindowPoSt bool
disableBuiltinWinningPoSt bool
disallowRemoteFinalize bool
@ -121,6 +124,8 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
localProver: prover,
parallelCheckLimit: pc.ParallelCheckLimit,
singleCheckTimeout: time.Duration(pc.SingleCheckTimeout),
partitionCheckTimeout: time.Duration(pc.PartitionCheckTimeout),
disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt,
disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt,
disallowRemoteFinalize: sc.DisallowRemoteFinalize,