Merge pull request #8710 from filecoin-project/feat/stor-fin-move-selector

feat: sched: Finalize* move selectors
This commit is contained in:
Łukasz Magiera 2022-05-26 21:20:48 +02:00 committed by GitHub
commit 7836e20801
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 241 additions and 37 deletions

View File

@ -542,6 +542,22 @@
# env var: LOTUS_STORAGE_ASSIGNER
#Assigner = "utilization"
# DisallowRemoteFinalize when set to true will force all Finalize tasks to
# run on workers with local access to both long-term storage and the sealing
# path containing the sector.
# --
# WARNING: Only set this if all workers have access to long-term storage
# paths. If this flag is enabled, and there are workers without long-term
# storage access, sectors will not be moved from them, and Finalize tasks
# will appear to be stuck.
# --
# If you see stuck Finalize tasks after enabling this setting, check
# 'lotus-miner sealing sched-diag' and 'lotus-miner storage find [sector num]'
#
# type: bool
# env var: LOTUS_STORAGE_DISALLOWREMOTEFINALIZE
#DisallowRemoteFinalize = false
# ResourceFiltering instructs the system which resource filtering strategy
# to use when evaluating tasks against this worker. An empty value defaults
# to "hardware".

View File

@ -72,6 +72,7 @@ type Manager struct {
work *statestore.StateStore
parallelCheckLimit int
disallowRemoteFinalize bool
callToWork map[storiface.CallID]WorkID
// used when we get an early return and there's no callToWork mapping
@ -123,6 +124,8 @@ type Config struct {
// PoSt config
ParallelCheckLimit int
DisallowRemoteFinalize bool
Assigner string
}
@ -156,6 +159,7 @@ func New(ctx context.Context, lstor *stores.Local, stor stores.Store, ls stores.
localProver: prover,
parallelCheckLimit: sc.ParallelCheckLimit,
disallowRemoteFinalize: sc.DisallowRemoteFinalize,
work: mss,
callToWork: map[storiface.CallID]WorkID{},
@ -592,6 +596,7 @@ func (m *Manager) FinalizeSector(ctx context.Context, sector storage.SectorRef,
return xerrors.Errorf("acquiring sector lock: %w", err)
}
// first check if the unsealed file exists anywhere; If it doesn't ignore it
unsealed := storiface.FTUnsealed
{
unsealedStores, err := m.index.StorageFindSector(ctx, sector.ID, storiface.FTUnsealed, 0, false)
@ -604,6 +609,8 @@ func (m *Manager) FinalizeSector(ctx context.Context, sector storage.SectorRef,
}
}
// Make sure that the sealed file is still in sealing storage; In case it already
// isn't, we want to do finalize in long-term storage
pathType := storiface.PathStorage
{
sealedStores, err := m.index.StorageFindSector(ctx, sector.ID, storiface.FTSealed, 0, false)
@ -619,6 +626,8 @@ func (m *Manager) FinalizeSector(ctx context.Context, sector storage.SectorRef,
}
}
// do the cache trimming wherever the likely still very large cache lives.
// we really don't want to move it.
selector := newExistingSelector(m.index, sector.ID, storiface.FTCache, false)
err := m.sched.Schedule(ctx, sector, sealtasks.TTFinalize, selector,
@ -631,7 +640,10 @@ func (m *Manager) FinalizeSector(ctx context.Context, sector storage.SectorRef,
return err
}
fetchSel := newAllocSelector(m.index, storiface.FTCache|storiface.FTSealed, storiface.PathStorage)
// get a selector for moving stuff into long-term storage
fetchSel := newMoveSelector(m.index, sector.ID, storiface.FTCache|storiface.FTSealed, storiface.PathStorage, !m.disallowRemoteFinalize)
// only move the unsealed file if it still exists and needs moving
moveUnsealed := unsealed
{
if len(keepUnsealed) == 0 {
@ -639,6 +651,7 @@ func (m *Manager) FinalizeSector(ctx context.Context, sector storage.SectorRef,
}
}
// move stuff to long-term storage
err = m.sched.Schedule(ctx, sector, sealtasks.TTFetch, fetchSel,
m.schedFetch(sector, storiface.FTCache|storiface.FTSealed|moveUnsealed, storiface.PathStorage, storiface.AcquireMove),
func(ctx context.Context, w Worker) error {
@ -660,6 +673,7 @@ func (m *Manager) FinalizeReplicaUpdate(ctx context.Context, sector storage.Sect
return xerrors.Errorf("acquiring sector lock: %w", err)
}
// first check if the unsealed file exists anywhere; If it doesn't ignore it
moveUnsealed := storiface.FTUnsealed
{
unsealedStores, err := m.index.StorageFindSector(ctx, sector.ID, storiface.FTUnsealed, 0, false)
@ -672,6 +686,8 @@ func (m *Manager) FinalizeReplicaUpdate(ctx context.Context, sector storage.Sect
}
}
// Make sure that the update file is still in sealing storage; In case it already
// isn't, we want to do finalize in long-term storage
pathType := storiface.PathStorage
{
sealedStores, err := m.index.StorageFindSector(ctx, sector.ID, storiface.FTUpdate, 0, false)
@ -687,7 +703,9 @@ func (m *Manager) FinalizeReplicaUpdate(ctx context.Context, sector storage.Sect
}
}
selector := newExistingSelector(m.index, sector.ID, storiface.FTCache|storiface.FTUpdateCache, false)
// do the cache trimming wherever the likely still large cache lives.
// we really don't want to move it.
selector := newExistingSelector(m.index, sector.ID, storiface.FTUpdateCache, false)
err := m.sched.Schedule(ctx, sector, sealtasks.TTFinalizeReplicaUpdate, selector,
m.schedFetch(sector, storiface.FTCache|storiface.FTUpdateCache|moveUnsealed, pathType, storiface.AcquireMove),
@ -700,7 +718,8 @@ func (m *Manager) FinalizeReplicaUpdate(ctx context.Context, sector storage.Sect
}
move := func(types storiface.SectorFileType) error {
fetchSel := newAllocSelector(m.index, types, storiface.PathStorage)
// get a selector for moving stuff into long-term storage
fetchSel := newMoveSelector(m.index, sector.ID, types, storiface.PathStorage, !m.disallowRemoteFinalize)
{
if len(keepUnsealed) == 0 {
moveUnsealed = storiface.FTNone

View File

@ -44,7 +44,9 @@ const mib = 1 << 20
type WorkerAction func(ctx context.Context, w Worker) error
type WorkerSelector interface {
Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, a *WorkerHandle) (bool, error) // true if worker is acceptable for performing a task
// Ok is true if worker is acceptable for performing a task.
// If any worker is preferred for a task, other workers won't be considered for that task.
Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, a *WorkerHandle) (ok, preferred bool, err error)
Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error) // true if a is preferred over b
}

View File

@ -61,8 +61,10 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
}()
task := (*sh.SchedQueue)[sqi]
task.IndexHeap = sqi
var havePreferred bool
for wnd, windowRequest := range sh.OpenWindows {
worker, ok := sh.Workers[windowRequest.Worker]
if !ok {
@ -84,7 +86,7 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
}
rpcCtx, cancel := context.WithTimeout(task.Ctx, SelectorTimeout)
ok, err := task.Sel.Ok(rpcCtx, task.TaskType, task.Sector.ProofType, worker)
ok, preferred, err := task.Sel.Ok(rpcCtx, task.TaskType, task.Sector.ProofType, worker)
cancel()
if err != nil {
log.Errorf("trySched(1) req.Sel.Ok error: %+v", err)
@ -95,6 +97,17 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
continue
}
if havePreferred && !preferred {
// we have a way better worker for this task
continue
}
if preferred && !havePreferred {
// all workers we considered previously are much worse choice
acceptableWindows[sqi] = acceptableWindows[sqi][:0]
havePreferred = true
}
acceptableWindows[sqi] = append(acceptableWindows[sqi], wnd)
}

View File

@ -584,9 +584,9 @@ func TestSched(t *testing.T) {
type slowishSelector bool
func (s slowishSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, a *WorkerHandle) (bool, error) {
func (s slowishSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, a *WorkerHandle) (bool, bool, error) {
time.Sleep(200 * time.Microsecond)
return bool(s), nil
return bool(s), false, nil
}
func (s slowishSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {

View File

@ -26,18 +26,18 @@ func newAllocSelector(index stores.SectorIndex, alloc storiface.SectorFileType,
}
}
func (s *allocSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, error) {
func (s *allocSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, bool, error) {
tasks, err := whnd.TaskTypes(ctx)
if err != nil {
return false, xerrors.Errorf("getting supported worker task types: %w", err)
return false, false, xerrors.Errorf("getting supported worker task types: %w", err)
}
if _, supported := tasks[task]; !supported {
return false, nil
return false, false, nil
}
paths, err := whnd.workerRpc.Paths(ctx)
if err != nil {
return false, xerrors.Errorf("getting worker paths: %w", err)
return false, false, xerrors.Errorf("getting worker paths: %w", err)
}
have := map[storiface.ID]struct{}{}
@ -47,21 +47,21 @@ func (s *allocSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi
ssize, err := spt.SectorSize()
if err != nil {
return false, xerrors.Errorf("getting sector size: %w", err)
return false, false, xerrors.Errorf("getting sector size: %w", err)
}
best, err := s.index.StorageBestAlloc(ctx, s.alloc, ssize, s.ptype)
if err != nil {
return false, xerrors.Errorf("finding best alloc storage: %w", err)
return false, false, xerrors.Errorf("finding best alloc storage: %w", err)
}
for _, info := range best {
if _, ok := have[info.ID]; ok {
return true, nil
return true, false, nil
}
}
return false, nil
return false, false, nil
}
func (s *allocSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {

View File

@ -28,18 +28,18 @@ func newExistingSelector(index stores.SectorIndex, sector abi.SectorID, alloc st
}
}
func (s *existingSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, error) {
func (s *existingSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, bool, error) {
tasks, err := whnd.TaskTypes(ctx)
if err != nil {
return false, xerrors.Errorf("getting supported worker task types: %w", err)
return false, false, xerrors.Errorf("getting supported worker task types: %w", err)
}
if _, supported := tasks[task]; !supported {
return false, nil
return false, false, nil
}
paths, err := whnd.workerRpc.Paths(ctx)
if err != nil {
return false, xerrors.Errorf("getting worker paths: %w", err)
return false, false, xerrors.Errorf("getting worker paths: %w", err)
}
have := map[storiface.ID]struct{}{}
@ -49,21 +49,21 @@ func (s *existingSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt
ssize, err := spt.SectorSize()
if err != nil {
return false, xerrors.Errorf("getting sector size: %w", err)
return false, false, xerrors.Errorf("getting sector size: %w", err)
}
best, err := s.index.StorageFindSector(ctx, s.sector, s.alloc, ssize, s.allowFetch)
if err != nil {
return false, xerrors.Errorf("finding best storage: %w", err)
return false, false, xerrors.Errorf("finding best storage: %w", err)
}
for _, info := range best {
if _, ok := have[info.ID]; ok {
return true, nil
return true, false, nil
}
}
return false, nil
return false, false, nil
}
func (s *existingSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {

98
extern/sector-storage/selector_move.go vendored Normal file
View File

@ -0,0 +1,98 @@
package sectorstorage
import (
"context"
"golang.org/x/xerrors"
"github.com/filecoin-project/go-state-types/abi"
"github.com/filecoin-project/lotus/extern/sector-storage/sealtasks"
"github.com/filecoin-project/lotus/extern/sector-storage/stores"
"github.com/filecoin-project/lotus/extern/sector-storage/storiface"
)
type moveSelector struct {
index stores.SectorIndex
sector abi.SectorID
alloc storiface.SectorFileType
destPtype storiface.PathType
allowRemote bool
}
func newMoveSelector(index stores.SectorIndex, sector abi.SectorID, alloc storiface.SectorFileType, destPtype storiface.PathType, allowRemote bool) *moveSelector {
return &moveSelector{
index: index,
sector: sector,
alloc: alloc,
destPtype: destPtype,
allowRemote: allowRemote,
}
}
func (s *moveSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, bool, error) {
tasks, err := whnd.TaskTypes(ctx)
if err != nil {
return false, false, xerrors.Errorf("getting supported worker task types: %w", err)
}
if _, supported := tasks[task]; !supported {
return false, false, nil
}
paths, err := whnd.workerRpc.Paths(ctx)
if err != nil {
return false, false, xerrors.Errorf("getting worker paths: %w", err)
}
workerPaths := map[storiface.ID]int{}
for _, path := range paths {
workerPaths[path.ID] = 0
}
ssize, err := spt.SectorSize()
if err != nil {
return false, false, xerrors.Errorf("getting sector size: %w", err)
}
// note: allowFetch is always false here, because we want to find workers with
// the sector available locally
preferred, err := s.index.StorageFindSector(ctx, s.sector, s.alloc, ssize, false)
if err != nil {
return false, false, xerrors.Errorf("finding preferred storage: %w", err)
}
for _, info := range preferred {
if _, ok := workerPaths[info.ID]; ok {
workerPaths[info.ID]++
}
}
best, err := s.index.StorageBestAlloc(ctx, s.alloc, ssize, s.destPtype)
if err != nil {
return false, false, xerrors.Errorf("finding best dest storage: %w", err)
}
var ok bool
for _, info := range best {
if n, has := workerPaths[info.ID]; has {
ok = true
// if the worker has a local path with the sector already in it
// prefer that worker; This usually meant that the move operation is
// either a no-op because the sector is already in the correct path,
// or the move a local move.
if n > 0 {
return true, true, nil
}
}
}
return ok && s.allowRemote, false, nil
}
func (s *moveSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {
return a.Utilization() < b.Utilization(), nil
}
var _ WorkerSelector = &moveSelector{}

View File

@ -19,14 +19,14 @@ func newTaskSelector() *taskSelector {
return &taskSelector{}
}
func (s *taskSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, error) {
func (s *taskSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, bool, error) {
tasks, err := whnd.TaskTypes(ctx)
if err != nil {
return false, xerrors.Errorf("getting supported worker task types: %w", err)
return false, false, xerrors.Errorf("getting supported worker task types: %w", err)
}
_, supported := tasks[task]
return supported, nil
return supported, false, nil
}
func (s *taskSelector) Cmp(ctx context.Context, _ sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {

View File

@ -571,6 +571,7 @@ func (n *Ensemble) Start() *Ensemble {
noLocal := m.options.minerNoLocalSealing
assigner := m.options.minerAssigner
disallowRemoteFinalize := m.options.disallowRemoteFinalize
var mineBlock = make(chan lotusminer.MineReq)
opts := []node.Option{
@ -597,6 +598,7 @@ func (n *Ensemble) Start() *Ensemble {
}
scfg.Storage.Assigner = assigner
scfg.Storage.DisallowRemoteFinalize = disallowRemoteFinalize
scfg.Storage.ResourceFiltering = sectorstorage.ResourceFilteringDisabled
return scfg.StorageManager()
}),

View File

@ -42,6 +42,7 @@ type nodeOpts struct {
maxStagingDealsBytes int64
minerNoLocalSealing bool // use worker
minerAssigner string
disallowRemoteFinalize bool
workerTasks []sealtasks.TaskType
workerStorageOpt func(stores.Store) stores.Store
@ -105,6 +106,13 @@ func WithAssigner(a string) NodeOpt {
}
}
func WithDisallowRemoteFinalize(d bool) NodeOpt {
return func(opts *nodeOpts) error {
opts.disallowRemoteFinalize = d
return nil
}
}
func DisableLibp2p() NodeOpt {
return func(opts *nodeOpts) error {
opts.disableLibp2p = true

View File

@ -57,6 +57,22 @@ func TestWorkerPledgeSpread(t *testing.T) {
miner.PledgeSectors(ctx, 1, 0, nil)
}
func TestWorkerPledgeLocalFin(t *testing.T) {
ctx := context.Background()
_, miner, worker, ens := kit.EnsembleWorker(t, kit.WithAllSubsystems(), kit.ThroughRPC(),
kit.WithTaskTypes([]sealtasks.TaskType{sealtasks.TTFetch, sealtasks.TTCommit1, sealtasks.TTFinalize, sealtasks.TTAddPiece, sealtasks.TTPreCommit1, sealtasks.TTPreCommit2, sealtasks.TTCommit2, sealtasks.TTUnseal}),
kit.WithDisallowRemoteFinalize(true),
) // no mock proofs
ens.InterconnectAll().BeginMining(50 * time.Millisecond)
e, err := worker.Enabled(ctx)
require.NoError(t, err)
require.True(t, e)
miner.PledgeSectors(ctx, 1, 0, nil)
}
func TestWorkerDataCid(t *testing.T) {
ctx := context.Background()
_, miner, worker, _ := kit.EnsembleWorker(t, kit.WithAllSubsystems(), kit.ThroughRPC(), kit.WithNoLocalSealing(true),

View File

@ -763,6 +763,22 @@ This parameter is ONLY applicable if the retrieval pricing policy strategy has b
Comment: `Assigner specifies the worker assigner to use when scheduling tasks.
"utilization" (default) - assign tasks to workers with lowest utilization.
"spread" - assign tasks to as many distinct workers as possible.`,
},
{
Name: "DisallowRemoteFinalize",
Type: "bool",
Comment: `DisallowRemoteFinalize when set to true will force all Finalize tasks to
run on workers with local access to both long-term storage and the sealing
path containing the sector.
--
WARNING: Only set this if all workers have access to long-term storage
paths. If this flag is enabled, and there are workers without long-term
storage access, sectors will not be moved from them, and Finalize tasks
will appear to be stuck.
--
If you see stuck Finalize tasks after enabling this setting, check
'lotus-miner sealing sched-diag' and 'lotus-miner storage find [sector num]'`,
},
{
Name: "ResourceFiltering",

View File

@ -63,6 +63,7 @@ func (c *StorageMiner) StorageManager() sectorstorage.Config {
AllowProveReplicaUpdate2: c.Storage.AllowProveReplicaUpdate2,
AllowRegenSectorKey: c.Storage.AllowRegenSectorKey,
ResourceFiltering: c.Storage.ResourceFiltering,
DisallowRemoteFinalize: c.Storage.DisallowRemoteFinalize,
Assigner: c.Storage.Assigner,

View File

@ -335,6 +335,19 @@ type SealerConfig struct {
// "spread" - assign tasks to as many distinct workers as possible.
Assigner string
// DisallowRemoteFinalize when set to true will force all Finalize tasks to
// run on workers with local access to both long-term storage and the sealing
// path containing the sector.
// --
// WARNING: Only set this if all workers have access to long-term storage
// paths. If this flag is enabled, and there are workers without long-term
// storage access, sectors will not be moved from them, and Finalize tasks
// will appear to be stuck.
// --
// If you see stuck Finalize tasks after enabling this setting, check
// 'lotus-miner sealing sched-diag' and 'lotus-miner storage find [sector num]'
DisallowRemoteFinalize bool
// ResourceFiltering instructs the system which resource filtering strategy
// to use when evaluating tasks against this worker. An empty value defaults
// to "hardware".