Merge pull request #9702 from filecoin-project/1.19.0_backport_20221122

backport: configurable pre check timeouts and zondax hid update
This commit is contained in:
Aayush Rajasekaran 2022-11-22 11:19:16 -05:00 committed by GitHub
commit 0bf439ff94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 90 additions and 10 deletions

View File

@ -325,6 +325,29 @@
# env var: LOTUS_PROVING_PARALLELCHECKLIMIT # env var: LOTUS_PROVING_PARALLELCHECKLIMIT
#ParallelCheckLimit = 128 #ParallelCheckLimit = 128
# Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
#
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because
# reading the test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing the PoSt deadline in case IO operations related to this sector are
# blocked (e.g. in case of disconnected NFS mount)
#
# type: Duration
# env var: LOTUS_PROVING_SINGLECHECKTIMEOUT
#SingleCheckTimeout = "10m0s"
# Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
# the partition which didn't get checked on time will be skipped
#
# WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because
# reading the test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing the PoSt deadline in case IO operations related to this partition are
# blocked or slow
#
# type: Duration
# env var: LOTUS_PROVING_PARTITIONCHECKTIMEOUT
#PartitionCheckTimeout = "20m0s"
# Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. # Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
# #
# WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need # WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need

2
go.mod
View File

@ -323,7 +323,7 @@ require (
github.com/whyrusleeping/go-keyspace v0.0.0-20160322163242-5b898ac5add1 // indirect github.com/whyrusleeping/go-keyspace v0.0.0-20160322163242-5b898ac5add1 // indirect
github.com/whyrusleeping/timecache v0.0.0-20160911033111-cfcb2f1abfee // indirect github.com/whyrusleeping/timecache v0.0.0-20160911033111-cfcb2f1abfee // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266 // indirect github.com/zondax/hid v0.9.1 // indirect
github.com/zondax/ledger-go v0.12.1 // indirect github.com/zondax/ledger-go v0.12.1 // indirect
go.opentelemetry.io/otel/metric v0.25.0 // indirect go.opentelemetry.io/otel/metric v0.25.0 // indirect
go.opentelemetry.io/otel/sdk/export/metric v0.25.0 // indirect go.opentelemetry.io/otel/sdk/export/metric v0.25.0 // indirect

4
go.sum
View File

@ -1831,8 +1831,8 @@ github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
github.com/zondax/hid v0.9.0/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM= github.com/zondax/hid v0.9.0/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM=
github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266 h1:O9XLFXGkVswDFmH9LaYpqu+r/AAFWqr0DL6V00KEVFg= github.com/zondax/hid v0.9.1 h1:gQe66rtmyZ8VeGFcOpbuH3r7erYtNEAezCAYu8LdkJo=
github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM= github.com/zondax/hid v0.9.1/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM=
github.com/zondax/ledger-go v0.12.1 h1:hYRcyznPRJp+5mzF2sazTLP2nGvGjYDD2VzhHhFomLU= github.com/zondax/ledger-go v0.12.1 h1:hYRcyznPRJp+5mzF2sazTLP2nGvGjYDD2VzhHhFomLU=
github.com/zondax/ledger-go v0.12.1/go.mod h1:KatxXrVDzgWwbssUWsF5+cOJHXPvzQ09YSlzGNuhOEo= github.com/zondax/ledger-go v0.12.1/go.mod h1:KatxXrVDzgWwbssUWsF5+cOJHXPvzQ09YSlzGNuhOEo=
go.dedis.ch/fixbuf v1.0.3 h1:hGcV9Cd/znUxlusJ64eAlExS+5cJDIyTyEG+otu5wQs= go.dedis.ch/fixbuf v1.0.3 h1:hGcV9Cd/znUxlusJ64eAlExS+5cJDIyTyEG+otu5wQs=

View File

@ -142,6 +142,8 @@ func DefaultStorageMiner() *StorageMiner {
Proving: ProvingConfig{ Proving: ProvingConfig{
ParallelCheckLimit: 128, ParallelCheckLimit: 128,
PartitionCheckTimeout: Duration(20 * time.Minute),
SingleCheckTimeout: Duration(10 * time.Minute),
}, },
Storage: SealerConfig{ Storage: SealerConfig{

View File

@ -644,6 +644,29 @@ to late submission.
After changing this option, confirm that the new value works in your setup by invoking After changing this option, confirm that the new value works in your setup by invoking
'lotus-miner proving compute window-post 0'`, 'lotus-miner proving compute window-post 0'`,
},
{
Name: "SingleCheckTimeout",
Type: "Duration",
Comment: `Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
test challenge took longer than this timeout
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
blocked (e.g. in case of disconnected NFS mount)`,
},
{
Name: "PartitionCheckTimeout",
Type: "Duration",
Comment: `Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
the partition which didn't get checked on time will be skipped
WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
test challenge took longer than this timeout
WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
blocked or slow`,
}, },
{ {
Name: "DisableBuiltinWindowPoSt", Name: "DisableBuiltinWindowPoSt",

View File

@ -230,6 +230,23 @@ type ProvingConfig struct {
// 'lotus-miner proving compute window-post 0' // 'lotus-miner proving compute window-post 0'
ParallelCheckLimit int ParallelCheckLimit int
// Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
//
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because
// reading the test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing the PoSt deadline in case IO operations related to this sector are
// blocked (e.g. in case of disconnected NFS mount)
SingleCheckTimeout Duration
// Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
// the partition which didn't get checked on time will be skipped
//
// WARNING: Setting this value too low risks sectors being skipped even though they are accessible, simply because
// reading the test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing the PoSt deadline in case IO operations related to this partition are
// blocked or slow
PartitionCheckTimeout Duration
// Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. // Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
// //
// WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need // WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need

View File

@ -5,7 +5,6 @@ import (
"crypto/rand" "crypto/rand"
"fmt" "fmt"
"sync" "sync"
"time"
"golang.org/x/xerrors" "golang.org/x/xerrors"
@ -15,8 +14,6 @@ import (
"github.com/filecoin-project/lotus/storage/sealer/storiface" "github.com/filecoin-project/lotus/storage/sealer/storiface"
) )
var PostCheckTimeout = 160 * time.Second
// FaultTracker TODO: Track things more actively // FaultTracker TODO: Track things more actively
type FaultTracker interface { type FaultTracker interface {
CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error)
@ -50,6 +47,12 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
badLk.Unlock() badLk.Unlock()
} }
if m.partitionCheckTimeout > 0 {
var cancel2 context.CancelFunc
ctx, cancel2 = context.WithTimeout(ctx, m.partitionCheckTimeout)
defer cancel2()
}
var wg sync.WaitGroup var wg sync.WaitGroup
wg.Add(len(sectors)) wg.Add(len(sectors))
@ -57,7 +60,9 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
select { select {
case throttle <- struct{}{}: case throttle <- struct{}{}:
case <-ctx.Done(): case <-ctx.Done():
return nil, ctx.Err() addBad(sector.ID, fmt.Sprintf("waiting for check worker: %s", ctx.Err()))
wg.Done()
continue
} }
go func(sector storiface.SectorRef) { go func(sector storiface.SectorRef) {
@ -107,8 +112,13 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
return return
} }
vctx, cancel2 := context.WithTimeout(ctx, PostCheckTimeout) vctx := ctx
if m.singleCheckTimeout > 0 {
var cancel2 context.CancelFunc
vctx, cancel2 = context.WithTimeout(ctx, m.singleCheckTimeout)
defer cancel2() defer cancel2()
}
_, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{ _, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{
SealProof: sector.ProofType, SealProof: sector.ProofType,

View File

@ -7,6 +7,7 @@ import (
"net/http" "net/http"
"sort" "sort"
"sync" "sync"
"time"
"github.com/google/uuid" "github.com/google/uuid"
"github.com/hashicorp/go-multierror" "github.com/hashicorp/go-multierror"
@ -72,6 +73,8 @@ type Manager struct {
work *statestore.StateStore work *statestore.StateStore
parallelCheckLimit int parallelCheckLimit int
singleCheckTimeout time.Duration
partitionCheckTimeout time.Duration
disableBuiltinWindowPoSt bool disableBuiltinWindowPoSt bool
disableBuiltinWinningPoSt bool disableBuiltinWinningPoSt bool
disallowRemoteFinalize bool disallowRemoteFinalize bool
@ -121,6 +124,8 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
localProver: prover, localProver: prover,
parallelCheckLimit: pc.ParallelCheckLimit, parallelCheckLimit: pc.ParallelCheckLimit,
singleCheckTimeout: time.Duration(pc.SingleCheckTimeout),
partitionCheckTimeout: time.Duration(pc.PartitionCheckTimeout),
disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt, disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt,
disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt, disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt,
disallowRemoteFinalize: sc.DisallowRemoteFinalize, disallowRemoteFinalize: sc.DisallowRemoteFinalize,