From af29d2d5be3d1f2ca3aad33f601eff6c0b206d6e Mon Sep 17 00:00:00 2001 From: Ian Davis Date: Wed, 16 Nov 2022 18:06:03 +0000 Subject: [PATCH 1/3] Update Zondax/hid to 0.9.1 I updated us to an intermediate version based on a git sha. That version removed a deprecation warning, but caused issues on older versions of MacOs (Big Sur and lower). This release of Zondax/hid fixes things in a way that both removes the deprecation warning and works on older MacOS versions. --- go.mod | 2 +- go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 27416c31d..8fb3ed31f 100644 --- a/go.mod +++ b/go.mod @@ -323,7 +323,7 @@ require ( github.com/whyrusleeping/go-keyspace v0.0.0-20160322163242-5b898ac5add1 // indirect github.com/whyrusleeping/timecache v0.0.0-20160911033111-cfcb2f1abfee // indirect github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect - github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266 // indirect + github.com/zondax/hid v0.9.1 // indirect github.com/zondax/ledger-go v0.12.1 // indirect go.opentelemetry.io/otel/metric v0.25.0 // indirect go.opentelemetry.io/otel/sdk/export/metric v0.25.0 // indirect diff --git a/go.sum b/go.sum index 5095fe820..57756b043 100644 --- a/go.sum +++ b/go.sum @@ -1833,6 +1833,8 @@ github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1 github.com/zondax/hid v0.9.0/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM= github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266 h1:O9XLFXGkVswDFmH9LaYpqu+r/AAFWqr0DL6V00KEVFg= github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM= +github.com/zondax/hid v0.9.1 h1:gQe66rtmyZ8VeGFcOpbuH3r7erYtNEAezCAYu8LdkJo= +github.com/zondax/hid v0.9.1/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM= github.com/zondax/ledger-go v0.12.1 h1:hYRcyznPRJp+5mzF2sazTLP2nGvGjYDD2VzhHhFomLU= github.com/zondax/ledger-go v0.12.1/go.mod 
h1:KatxXrVDzgWwbssUWsF5+cOJHXPvzQ09YSlzGNuhOEo= go.dedis.ch/fixbuf v1.0.3 h1:hGcV9Cd/znUxlusJ64eAlExS+5cJDIyTyEG+otu5wQs= From c117611267a7f573353ad03e90f483107fda5d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Magiera?= Date: Thu, 17 Nov 2022 18:37:48 +0100 Subject: [PATCH 2/3] mod tidy --- go.sum | 2 -- 1 file changed, 2 deletions(-) diff --git a/go.sum b/go.sum index 57756b043..a68d81741 100644 --- a/go.sum +++ b/go.sum @@ -1831,8 +1831,6 @@ github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/zondax/hid v0.9.0/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM= -github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266 h1:O9XLFXGkVswDFmH9LaYpqu+r/AAFWqr0DL6V00KEVFg= -github.com/zondax/hid v0.9.1-0.20220302062450-5552068d2266/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM= github.com/zondax/hid v0.9.1 h1:gQe66rtmyZ8VeGFcOpbuH3r7erYtNEAezCAYu8LdkJo= github.com/zondax/hid v0.9.1/go.mod h1:l5wttcP0jwtdLjqjMMWFVEE7d1zO0jvSPA9OPZxWpEM= github.com/zondax/ledger-go v0.12.1 h1:hYRcyznPRJp+5mzF2sazTLP2nGvGjYDD2VzhHhFomLU= From 87c25e211c4fb6ade20e0efbbce01ed7cf8f40b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Magiera?= Date: Thu, 17 Nov 2022 18:25:30 +0100 Subject: [PATCH 3/3] feat: wdpost: Configurable pre-check timeouts --- .../en/default-lotus-miner-config.toml | 23 +++++++++++++++++++ node/config/def.go | 4 +++- node/config/doc_gen.go | 23 +++++++++++++++++++ node/config/types.go | 17 ++++++++++++++ storage/sealer/faults.go | 22 +++++++++++++----- storage/sealer/manager.go | 5 ++++ 6 files changed, 87 insertions(+), 7 deletions(-) diff --git a/documentation/en/default-lotus-miner-config.toml b/documentation/en/default-lotus-miner-config.toml index c18ed86fe..939bac0cc 100644 --- 
a/documentation/en/default-lotus-miner-config.toml +++ b/documentation/en/default-lotus-miner-config.toml @@ -325,6 +325,29 @@ # env var: LOTUS_PROVING_PARALLELCHECKLIMIT #ParallelCheckLimit = 128 + # Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped + # + # WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the + # test challenge took longer than this timeout + # WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are + # blocked (e.g. in case of disconnected NFS mount) + # + # type: Duration + # env var: LOTUS_PROVING_SINGLECHECKTIMEOUT + #SingleCheckTimeout = "10m0s" + + # Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in + # the partition which didn't get checked on time will be skipped + # + # WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the + # test challenge took longer than this timeout + # WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are + # blocked or slow + # + # type: Duration + # env var: LOTUS_PROVING_PARTITIONCHECKTIMEOUT + #PartitionCheckTimeout = "20m0s" + # Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. 
# # WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need diff --git a/node/config/def.go b/node/config/def.go index 75f6bf28d..dc26f1661 100644 --- a/node/config/def.go +++ b/node/config/def.go @@ -141,7 +141,9 @@ func DefaultStorageMiner() *StorageMiner { }, Proving: ProvingConfig{ - ParallelCheckLimit: 128, + ParallelCheckLimit: 128, + PartitionCheckTimeout: Duration(20 * time.Minute), + SingleCheckTimeout: Duration(10 * time.Minute), }, Storage: SealerConfig{ diff --git a/node/config/doc_gen.go b/node/config/doc_gen.go index 0cd80ea43..ecf533137 100644 --- a/node/config/doc_gen.go +++ b/node/config/doc_gen.go @@ -644,6 +644,29 @@ to late submission. After changing this option, confirm that the new value works in your setup by invoking 'lotus-miner proving compute window-post 0'`, + }, + { + Name: "SingleCheckTimeout", + Type: "Duration", + + Comment: `Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped + +WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the +test challenge took longer than this timeout +WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are +blocked (e.g. in case of disconnected NFS mount)`, + }, + { + Name: "PartitionCheckTimeout", + Type: "Duration", + + Comment: `Maximum amount of time a proving pre-check can take for an entire partition. 
If the check times out, sectors in +the partition which didn't get checked on time will be skipped + +WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the +test challenge took longer than this timeout +WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are +blocked or slow`, }, { Name: "DisableBuiltinWindowPoSt", diff --git a/node/config/types.go b/node/config/types.go index 41e2854ea..90d878b7e 100644 --- a/node/config/types.go +++ b/node/config/types.go @@ -230,6 +230,23 @@ type ProvingConfig struct { // 'lotus-miner proving compute window-post 0' ParallelCheckLimit int + // Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped + // + // WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the + // test challenge took longer than this timeout + // WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are + // blocked (e.g. in case of disconnected NFS mount) + SingleCheckTimeout Duration + + // Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in + // the partition which didn't get checked on time will be skipped + // + // WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the + // test challenge took longer than this timeout + // WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are + // blocked or slow + PartitionCheckTimeout Duration + // Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. 
// // WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need diff --git a/storage/sealer/faults.go b/storage/sealer/faults.go index e05bbb7b8..db7b75bec 100644 --- a/storage/sealer/faults.go +++ b/storage/sealer/faults.go @@ -5,7 +5,6 @@ import ( "crypto/rand" "fmt" "sync" - "time" "golang.org/x/xerrors" @@ -15,8 +14,6 @@ import ( "github.com/filecoin-project/lotus/storage/sealer/storiface" ) -var PostCheckTimeout = 160 * time.Second - // FaultTracker TODO: Track things more actively type FaultTracker interface { CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error) @@ -50,6 +47,12 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, badLk.Unlock() } + if m.partitionCheckTimeout > 0 { + var cancel2 context.CancelFunc + ctx, cancel2 = context.WithTimeout(ctx, m.partitionCheckTimeout) + defer cancel2() + } + var wg sync.WaitGroup wg.Add(len(sectors)) @@ -57,7 +60,9 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, select { case throttle <- struct{}{}: case <-ctx.Done(): - return nil, ctx.Err() + addBad(sector.ID, fmt.Sprintf("waiting for check worker: %s", ctx.Err())) + wg.Done() + continue } go func(sector storiface.SectorRef) { @@ -107,8 +112,13 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, return } - vctx, cancel2 := context.WithTimeout(ctx, PostCheckTimeout) - defer cancel2() + vctx := ctx + + if m.singleCheckTimeout > 0 { + var cancel2 context.CancelFunc + vctx, cancel2 = context.WithTimeout(ctx, m.singleCheckTimeout) + defer cancel2() + } _, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{ SealProof: sector.ProofType, diff --git a/storage/sealer/manager.go b/storage/sealer/manager.go index cb1db5866..b0f506539 100644 --- a/storage/sealer/manager.go +++ 
b/storage/sealer/manager.go @@ -7,6 +7,7 @@ import ( "net/http" "sort" "sync" + "time" "github.com/google/uuid" "github.com/hashicorp/go-multierror" @@ -72,6 +73,8 @@ type Manager struct { work *statestore.StateStore parallelCheckLimit int + singleCheckTimeout time.Duration + partitionCheckTimeout time.Duration disableBuiltinWindowPoSt bool disableBuiltinWinningPoSt bool disallowRemoteFinalize bool @@ -121,6 +124,8 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc localProver: prover, parallelCheckLimit: pc.ParallelCheckLimit, + singleCheckTimeout: time.Duration(pc.SingleCheckTimeout), + partitionCheckTimeout: time.Duration(pc.PartitionCheckTimeout), disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt, disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt, disallowRemoteFinalize: sc.DisallowRemoteFinalize,