diff --git a/documentation/en/default-lotus-miner-config.toml b/documentation/en/default-lotus-miner-config.toml index c18ed86fe..939bac0cc 100644 --- a/documentation/en/default-lotus-miner-config.toml +++ b/documentation/en/default-lotus-miner-config.toml @@ -325,6 +325,29 @@ # env var: LOTUS_PROVING_PARALLELCHECKLIMIT #ParallelCheckLimit = 128 + # Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped + # + # WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the + # test challenge took longer than this timeout + # WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are + # blocked (e.g. in case of disconnected NFS mount) + # + # type: Duration + # env var: LOTUS_PROVING_SINGLECHECKTIMEOUT + #SingleCheckTimeout = "10m0s" + + # Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in + # the partition which didn't get checked on time will be skipped + # + # WARNING: Setting this value too low risks sectors being skipped even though they are accessible, just reading the + # test challenge took longer than this timeout + # WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are + # blocked or slow + # + # type: Duration + # env var: LOTUS_PROVING_PARTITIONCHECKTIMEOUT + #PartitionCheckTimeout = "20m0s" + # Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. 
# # WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need diff --git a/node/config/def.go b/node/config/def.go index 75f6bf28d..dc26f1661 100644 --- a/node/config/def.go +++ b/node/config/def.go @@ -141,7 +141,9 @@ func DefaultStorageMiner() *StorageMiner { }, Proving: ProvingConfig{ - ParallelCheckLimit: 128, + ParallelCheckLimit: 128, + PartitionCheckTimeout: Duration(20 * time.Minute), + SingleCheckTimeout: Duration(10 * time.Minute), }, Storage: SealerConfig{ diff --git a/node/config/doc_gen.go b/node/config/doc_gen.go index 0cd80ea43..ecf533137 100644 --- a/node/config/doc_gen.go +++ b/node/config/doc_gen.go @@ -644,6 +644,29 @@ to late submission. After changing this option, confirm that the new value works in your setup by invoking 'lotus-miner proving compute window-post 0'`, + }, + { + Name: "SingleCheckTimeout", + Type: "Duration", + + Comment: `Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped + +WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the +test challenge took longer than this timeout +WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are +blocked (e.g. in case of disconnected NFS mount)`, + }, + { + Name: "PartitionCheckTimeout", + Type: "Duration", + + Comment: `Maximum amount of time a proving pre-check can take for an entire partition. 
If the check times out, sectors in +the partition which didn't get checked on time will be skipped + +WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the +test challenge took longer than this timeout +WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are +blocked or slow`, }, { Name: "DisableBuiltinWindowPoSt", diff --git a/node/config/types.go b/node/config/types.go index 41e2854ea..90d878b7e 100644 --- a/node/config/types.go +++ b/node/config/types.go @@ -230,6 +230,23 @@ type ProvingConfig struct { // 'lotus-miner proving compute window-post 0' ParallelCheckLimit int + // Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped + // + // WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the + // test challenge took longer than this timeout + // WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are + // blocked (e.g. in case of disconnected NFS mount) + SingleCheckTimeout Duration + + // Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in + // the partition which didn't get checked on time will be skipped + // + // WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the + // test challenge took longer than this timeout + // WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are + // blocked or slow + PartitionCheckTimeout Duration + // Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. 
// // WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need diff --git a/storage/sealer/faults.go b/storage/sealer/faults.go index e05bbb7b8..db7b75bec 100644 --- a/storage/sealer/faults.go +++ b/storage/sealer/faults.go @@ -5,7 +5,6 @@ import ( "crypto/rand" "fmt" "sync" - "time" "golang.org/x/xerrors" @@ -15,8 +14,6 @@ import ( "github.com/filecoin-project/lotus/storage/sealer/storiface" ) -var PostCheckTimeout = 160 * time.Second - // FaultTracker TODO: Track things more actively type FaultTracker interface { CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error) @@ -50,6 +47,12 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, badLk.Unlock() } + if m.partitionCheckTimeout > 0 { + var cancel2 context.CancelFunc + ctx, cancel2 = context.WithTimeout(ctx, m.partitionCheckTimeout) + defer cancel2() + } + var wg sync.WaitGroup wg.Add(len(sectors)) @@ -57,7 +60,9 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, select { case throttle <- struct{}{}: case <-ctx.Done(): - return nil, ctx.Err() + addBad(sector.ID, fmt.Sprintf("waiting for check worker: %s", ctx.Err())) + wg.Done() + continue } go func(sector storiface.SectorRef) { @@ -107,8 +112,13 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, return } - vctx, cancel2 := context.WithTimeout(ctx, PostCheckTimeout) - defer cancel2() + vctx := ctx + + if m.singleCheckTimeout > 0 { + var cancel2 context.CancelFunc + vctx, cancel2 = context.WithTimeout(ctx, m.singleCheckTimeout) + defer cancel2() + } _, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{ SealProof: sector.ProofType, diff --git a/storage/sealer/manager.go b/storage/sealer/manager.go index cb1db5866..b0f506539 100644 --- a/storage/sealer/manager.go +++ 
b/storage/sealer/manager.go @@ -7,6 +7,7 @@ import ( "net/http" "sort" "sync" + "time" "github.com/google/uuid" "github.com/hashicorp/go-multierror" @@ -72,6 +73,8 @@ type Manager struct { work *statestore.StateStore parallelCheckLimit int + singleCheckTimeout time.Duration + partitionCheckTimeout time.Duration disableBuiltinWindowPoSt bool disableBuiltinWinningPoSt bool disallowRemoteFinalize bool @@ -121,6 +124,8 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc localProver: prover, parallelCheckLimit: pc.ParallelCheckLimit, + singleCheckTimeout: time.Duration(pc.SingleCheckTimeout), + partitionCheckTimeout: time.Duration(pc.PartitionCheckTimeout), disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt, disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt, disallowRemoteFinalize: sc.DisallowRemoteFinalize,