move scheduling logic to a separate struct
This commit is contained in:
parent
5bea676ce3
commit
8c4dc60e75
221
manager.go
221
manager.go
@ -1,12 +1,10 @@
|
||||
package sectorstorage
|
||||
|
||||
import (
|
||||
"container/list"
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"net/http"
|
||||
"sync"
|
||||
|
||||
"github.com/ipfs/go-cid"
|
||||
logging "github.com/ipfs/go-log/v2"
|
||||
@ -62,18 +60,9 @@ type Manager struct {
|
||||
remoteHnd *stores.FetchHandler
|
||||
index stores.SectorIndex
|
||||
|
||||
sched *scheduler
|
||||
|
||||
storage.Prover
|
||||
|
||||
workersLk sync.Mutex
|
||||
nextWorker WorkerID
|
||||
workers map[WorkerID]*workerHandle
|
||||
|
||||
newWorkers chan *workerHandle
|
||||
schedule chan *workerRequest
|
||||
workerFree chan WorkerID
|
||||
closing chan struct{}
|
||||
|
||||
schedQueue *list.List // List[*workerRequest]
|
||||
}
|
||||
|
||||
type SealerConfig struct {
|
||||
@ -107,20 +96,12 @@ func New(ctx context.Context, ls stores.LocalStorage, si stores.SectorIndex, cfg
|
||||
remoteHnd: &stores.FetchHandler{Local: lstor},
|
||||
index: si,
|
||||
|
||||
nextWorker: 0,
|
||||
workers: map[WorkerID]*workerHandle{},
|
||||
|
||||
newWorkers: make(chan *workerHandle),
|
||||
schedule: make(chan *workerRequest),
|
||||
workerFree: make(chan WorkerID),
|
||||
closing: make(chan struct{}),
|
||||
|
||||
schedQueue: list.New(),
|
||||
sched: newScheduler(cfg.SealProofType),
|
||||
|
||||
Prover: prover,
|
||||
}
|
||||
|
||||
go m.runSched()
|
||||
go m.sched.runSched()
|
||||
|
||||
localTasks := []sealtasks.TaskType{
|
||||
sealtasks.TTAddPiece, sealtasks.TTCommit1, sealtasks.TTFinalize, sealtasks.TTFetch,
|
||||
@ -170,7 +151,7 @@ func (m *Manager) AddWorker(ctx context.Context, w Worker) error {
|
||||
return xerrors.Errorf("getting worker info: %w", err)
|
||||
}
|
||||
|
||||
m.newWorkers <- &workerHandle{
|
||||
m.sched.newWorkers <- &workerHandle{
|
||||
w: w,
|
||||
info: info,
|
||||
}
|
||||
@ -190,7 +171,7 @@ func (m *Manager) ReadPieceFromSealedSector(context.Context, abi.SectorID, ffiwr
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (m *Manager) getWorkersByPaths(task sealtasks.TaskType, inPaths []stores.StorageInfo) ([]WorkerID, map[WorkerID]stores.StorageInfo) {
|
||||
/*func (m *Manager) getWorkersByPaths(task sealtasks.TaskType, inPaths []stores.StorageInfo) ([]WorkerID, map[WorkerID]stores.StorageInfo) {
|
||||
m.workersLk.Lock()
|
||||
defer m.workersLk.Unlock()
|
||||
|
||||
@ -249,8 +230,8 @@ func (m *Manager) getWorker(ctx context.Context, taskType sealtasks.TaskType, ac
|
||||
taskType: taskType,
|
||||
accept: accept,
|
||||
|
||||
cancel: ctx.Done(),
|
||||
ret: ret,
|
||||
ctx: ctx.Done(),
|
||||
ret: ret,
|
||||
}:
|
||||
case <-m.closing:
|
||||
return nil, nil, xerrors.New("closing")
|
||||
@ -266,6 +247,16 @@ func (m *Manager) getWorker(ctx context.Context, taskType sealtasks.TaskType, ac
|
||||
case <-ctx.Done():
|
||||
return nil, nil, ctx.Err()
|
||||
}
|
||||
}*/
|
||||
|
||||
func schedNop(context.Context, Worker) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func schedFetch(sector abi.SectorID, ft stores.SectorFileType, sealing bool) func(context.Context, Worker) error {
|
||||
return func(ctx context.Context, worker Worker) error {
|
||||
return worker.Fetch(ctx, sector, ft, sealing)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *Manager) NewSector(ctx context.Context, sector abi.SectorID) error {
|
||||
@ -274,151 +265,114 @@ func (m *Manager) NewSector(ctx context.Context, sector abi.SectorID) error {
|
||||
}
|
||||
|
||||
func (m *Manager) AddPiece(ctx context.Context, sector abi.SectorID, existingPieces []abi.UnpaddedPieceSize, sz abi.UnpaddedPieceSize, r io.Reader) (abi.PieceInfo, error) {
|
||||
// TODO: consider multiple paths vs workers when initially allocating
|
||||
|
||||
var best []stores.StorageInfo
|
||||
var selector WorkerSelector
|
||||
var err error
|
||||
if len(existingPieces) == 0 { // new
|
||||
best, err = m.index.StorageBestAlloc(ctx, stores.FTUnsealed, true)
|
||||
selector, err = newAllocSelector(ctx, m.index, stores.FTUnsealed)
|
||||
} else { // append to existing
|
||||
best, err = m.index.StorageFindSector(ctx, sector, stores.FTUnsealed, false)
|
||||
selector, err = newExistingSelector(ctx, m.index, sector, stores.FTUnsealed, false)
|
||||
}
|
||||
if err != nil {
|
||||
return abi.PieceInfo{}, xerrors.Errorf("finding sector path: %w", err)
|
||||
return abi.PieceInfo{}, xerrors.Errorf("creating path selector: %w", err)
|
||||
}
|
||||
|
||||
log.Debugf("find workers for %v", best)
|
||||
candidateWorkers, _ := m.getWorkersByPaths(sealtasks.TTAddPiece, best)
|
||||
var out abi.PieceInfo
|
||||
err = m.sched.Schedule(ctx, sealtasks.TTAddPiece, selector, schedNop, func(ctx context.Context, w Worker) error {
|
||||
p, err := w.AddPiece(ctx, sector, existingPieces, sz, r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
out = p
|
||||
return nil
|
||||
})
|
||||
|
||||
if len(candidateWorkers) == 0 {
|
||||
return abi.PieceInfo{}, ErrNoWorkers
|
||||
}
|
||||
|
||||
worker, done, err := m.getWorker(ctx, sealtasks.TTAddPiece, candidateWorkers)
|
||||
if err != nil {
|
||||
return abi.PieceInfo{}, xerrors.Errorf("scheduling worker: %w", err)
|
||||
}
|
||||
defer done()
|
||||
|
||||
// TODO: select(candidateWorkers, ...)
|
||||
// TODO: remove the sectorbuilder abstraction, pass path directly
|
||||
return worker.AddPiece(ctx, sector, existingPieces, sz, r)
|
||||
return out, err
|
||||
}
|
||||
|
||||
func (m *Manager) SealPreCommit1(ctx context.Context, sector abi.SectorID, ticket abi.SealRandomness, pieces []abi.PieceInfo) (out storage.PreCommit1Out, err error) {
|
||||
// TODO: also consider where the unsealed data sits
|
||||
|
||||
best, err := m.index.StorageBestAlloc(ctx, stores.FTCache|stores.FTSealed, true)
|
||||
selector, err := newAllocSelector(ctx, m.index, stores.FTCache|stores.FTSealed)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("finding path for sector sealing: %w", err)
|
||||
return nil, xerrors.Errorf("creating path selector: %w", err)
|
||||
}
|
||||
|
||||
candidateWorkers, _ := m.getWorkersByPaths(sealtasks.TTPreCommit1, best)
|
||||
if len(candidateWorkers) == 0 {
|
||||
return nil, ErrNoWorkers
|
||||
}
|
||||
err = m.sched.Schedule(ctx, sealtasks.TTPreCommit1, selector, schedFetch(sector, stores.FTUnsealed, true), func(ctx context.Context, w Worker) error {
|
||||
p, err := w.SealPreCommit1(ctx, sector, ticket, pieces)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
out = p
|
||||
return nil
|
||||
})
|
||||
|
||||
worker, done, err := m.getWorker(ctx, sealtasks.TTPreCommit1, candidateWorkers)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("scheduling worker: %w", err)
|
||||
}
|
||||
defer done()
|
||||
|
||||
// TODO: select(candidateWorkers, ...)
|
||||
// TODO: remove the sectorbuilder abstraction, pass path directly
|
||||
return worker.SealPreCommit1(ctx, sector, ticket, pieces)
|
||||
return out, err
|
||||
}
|
||||
|
||||
func (m *Manager) SealPreCommit2(ctx context.Context, sector abi.SectorID, phase1Out storage.PreCommit1Out) (cids storage.SectorCids, err error) {
|
||||
// TODO: allow workers to fetch the sectors
|
||||
|
||||
best, err := m.index.StorageFindSector(ctx, sector, stores.FTCache|stores.FTSealed, true)
|
||||
func (m *Manager) SealPreCommit2(ctx context.Context, sector abi.SectorID, phase1Out storage.PreCommit1Out) (out storage.SectorCids, err error) {
|
||||
selector, err := newExistingSelector(ctx, m.index, sector, stores.FTCache|stores.FTSealed, true)
|
||||
if err != nil {
|
||||
return storage.SectorCids{}, xerrors.Errorf("finding path for sector sealing: %w", err)
|
||||
return storage.SectorCids{}, xerrors.Errorf("creating path selector: %w", err)
|
||||
}
|
||||
|
||||
candidateWorkers, _ := m.getWorkersByPaths(sealtasks.TTPreCommit2, best)
|
||||
if len(candidateWorkers) == 0 {
|
||||
return storage.SectorCids{}, ErrNoWorkers
|
||||
}
|
||||
|
||||
worker, done, err := m.getWorker(ctx, sealtasks.TTPreCommit2, candidateWorkers)
|
||||
if err != nil {
|
||||
return storage.SectorCids{}, xerrors.Errorf("scheduling worker: %w", err)
|
||||
}
|
||||
defer done()
|
||||
|
||||
// TODO: select(candidateWorkers, ...)
|
||||
// TODO: remove the sectorbuilder abstraction, pass path directly
|
||||
return worker.SealPreCommit2(ctx, sector, phase1Out)
|
||||
err = m.sched.Schedule(ctx, sealtasks.TTPreCommit2, selector, schedFetch(sector, stores.FTCache|stores.FTSealed, true), func(ctx context.Context, w Worker) error {
|
||||
p, err := w.SealPreCommit2(ctx, sector, phase1Out)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
out = p
|
||||
return nil
|
||||
})
|
||||
return out, err
|
||||
}
|
||||
|
||||
func (m *Manager) SealCommit1(ctx context.Context, sector abi.SectorID, ticket abi.SealRandomness, seed abi.InteractiveSealRandomness, pieces []abi.PieceInfo, cids storage.SectorCids) (output storage.Commit1Out, err error) {
|
||||
best, err := m.index.StorageFindSector(ctx, sector, stores.FTCache|stores.FTSealed, true)
|
||||
func (m *Manager) SealCommit1(ctx context.Context, sector abi.SectorID, ticket abi.SealRandomness, seed abi.InteractiveSealRandomness, pieces []abi.PieceInfo, cids storage.SectorCids) (out storage.Commit1Out, err error) {
|
||||
selector, err := newExistingSelector(ctx, m.index, sector, stores.FTCache|stores.FTSealed, false)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("finding path for sector sealing: %w", err)
|
||||
}
|
||||
|
||||
candidateWorkers, _ := m.getWorkersByPaths(sealtasks.TTCommit1, best)
|
||||
if len(candidateWorkers) == 0 {
|
||||
return nil, ErrNoWorkers
|
||||
return storage.Commit1Out{}, xerrors.Errorf("creating path selector: %w", err)
|
||||
}
|
||||
|
||||
// TODO: Try very hard to execute on worker with access to the sectors
|
||||
worker, done, err := m.getWorker(ctx, sealtasks.TTCommit1, candidateWorkers)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("scheduling worker: %w", err)
|
||||
}
|
||||
defer done()
|
||||
|
||||
// TODO: select(candidateWorkers, ...)
|
||||
// TODO: remove the sectorbuilder abstraction, pass path directly
|
||||
return worker.SealCommit1(ctx, sector, ticket, seed, pieces, cids)
|
||||
// (except, don't.. for now at least - we are using this step to bring data
|
||||
// into 'provable' storage. Optimally we'd do that in commit2, in parallel
|
||||
// with snark compute)
|
||||
err = m.sched.Schedule(ctx, sealtasks.TTCommit1, selector, schedFetch(sector, stores.FTCache|stores.FTSealed, true), func(ctx context.Context, w Worker) error {
|
||||
p, err := w.SealCommit1(ctx, sector, ticket, seed, pieces, cids)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
out = p
|
||||
return nil
|
||||
})
|
||||
return out, err
|
||||
}
|
||||
|
||||
func (m *Manager) SealCommit2(ctx context.Context, sector abi.SectorID, phase1Out storage.Commit1Out) (proof storage.Proof, err error) {
|
||||
var candidateWorkers []WorkerID
|
||||
func (m *Manager) SealCommit2(ctx context.Context, sector abi.SectorID, phase1Out storage.Commit1Out) (out storage.Proof, err error) {
|
||||
selector := newTaskSelector()
|
||||
|
||||
m.workersLk.Lock()
|
||||
for id, worker := range m.workers {
|
||||
tt, err := worker.w.TaskTypes(ctx)
|
||||
err = m.sched.Schedule(ctx, sealtasks.TTCommit2, selector, schedNop, func(ctx context.Context, w Worker) error {
|
||||
p, err := w.SealCommit2(ctx, sector, phase1Out)
|
||||
if err != nil {
|
||||
log.Errorf("error getting supported worker task types: %+v", err)
|
||||
continue
|
||||
return err
|
||||
}
|
||||
if _, ok := tt[sealtasks.TTCommit2]; !ok {
|
||||
continue
|
||||
}
|
||||
candidateWorkers = append(candidateWorkers, id)
|
||||
}
|
||||
m.workersLk.Unlock()
|
||||
if len(candidateWorkers) == 0 {
|
||||
return nil, ErrNoWorkers
|
||||
}
|
||||
out = p
|
||||
return nil
|
||||
})
|
||||
|
||||
worker, done, err := m.getWorker(ctx, sealtasks.TTCommit2, candidateWorkers)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("scheduling worker: %w", err)
|
||||
}
|
||||
defer done()
|
||||
|
||||
return worker.SealCommit2(ctx, sector, phase1Out)
|
||||
return out, err
|
||||
}
|
||||
|
||||
func (m *Manager) FinalizeSector(ctx context.Context, sector abi.SectorID) error {
|
||||
best, err := m.index.StorageFindSector(ctx, sector, stores.FTCache|stores.FTSealed|stores.FTUnsealed, true)
|
||||
selector, err := newExistingSelector(ctx, m.index, sector, stores.FTCache|stores.FTSealed|stores.FTUnsealed, false)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("finding sealed sector: %w", err)
|
||||
return xerrors.Errorf("creating path selector: %w", err)
|
||||
}
|
||||
|
||||
candidateWorkers, _ := m.getWorkersByPaths(sealtasks.TTFinalize, best)
|
||||
if len(candidateWorkers) == 0 {
|
||||
return ErrNoWorkers
|
||||
}
|
||||
|
||||
// TODO: Remove sector from sealing stores
|
||||
// TODO: Move the sector to long-term storage
|
||||
return m.workers[candidateWorkers[0]].w.FinalizeSector(ctx, sector)
|
||||
return m.sched.Schedule(ctx, sealtasks.TTFinalize, selector,
|
||||
schedFetch(sector, stores.FTCache|stores.FTSealed|stores.FTUnsealed, false),
|
||||
func(ctx context.Context, w Worker) error {
|
||||
return w.FinalizeSector(ctx, sector)
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Manager) StorageLocal(ctx context.Context) (map[stores.ID]string, error) {
|
||||
@ -440,8 +394,7 @@ func (m *Manager) FsStat(ctx context.Context, id stores.ID) (stores.FsStat, erro
|
||||
}
|
||||
|
||||
func (m *Manager) Close() error {
|
||||
close(m.closing)
|
||||
return nil
|
||||
return m.sched.Close()
|
||||
}
|
||||
|
||||
var _ SectorManager = &Manager{}
|
||||
|
94
resources.go
94
resources.go
@ -23,12 +23,16 @@ type Resources struct {
|
||||
MinMemory uint64 // What Must be in RAM for decent perf
|
||||
MaxMemory uint64 // Memory required (swap + ram)
|
||||
|
||||
MultiThread bool
|
||||
CanGPU bool
|
||||
Threads int // -1 = multithread
|
||||
CanGPU bool
|
||||
|
||||
BaseMinMemory uint64 // What Must be in RAM for decent perf (shared between threads)
|
||||
}
|
||||
|
||||
func (r Resources) MultiThread() bool {
|
||||
return r.Threads == -1
|
||||
}
|
||||
|
||||
const MaxCachingOverhead = 32 << 30
|
||||
|
||||
var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
@ -37,7 +41,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 32 << 30,
|
||||
MinMemory: 32 << 30,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 1,
|
||||
|
||||
BaseMinMemory: 1 << 30,
|
||||
},
|
||||
@ -45,7 +49,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 1 << 30,
|
||||
MinMemory: 1 << 30,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 1,
|
||||
|
||||
BaseMinMemory: 1 << 30,
|
||||
},
|
||||
@ -53,7 +57,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 2 << 10,
|
||||
MinMemory: 2 << 10,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 1,
|
||||
|
||||
BaseMinMemory: 2 << 10,
|
||||
},
|
||||
@ -61,7 +65,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 8 << 20,
|
||||
MinMemory: 8 << 20,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 1,
|
||||
|
||||
BaseMinMemory: 8 << 20,
|
||||
},
|
||||
@ -71,7 +75,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 64 << 30,
|
||||
MinMemory: 32 << 30,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 1,
|
||||
|
||||
BaseMinMemory: 30 << 30,
|
||||
},
|
||||
@ -79,7 +83,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 3 << 29, // 1.5G
|
||||
MinMemory: 1 << 30,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 1,
|
||||
|
||||
BaseMinMemory: 1 << 30,
|
||||
},
|
||||
@ -87,7 +91,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 2 << 10,
|
||||
MinMemory: 2 << 10,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 1,
|
||||
|
||||
BaseMinMemory: 2 << 10,
|
||||
},
|
||||
@ -95,7 +99,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 8 << 20,
|
||||
MinMemory: 8 << 20,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 1,
|
||||
|
||||
BaseMinMemory: 8 << 20,
|
||||
},
|
||||
@ -105,7 +109,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 96 << 30,
|
||||
MinMemory: 64 << 30,
|
||||
|
||||
MultiThread: true,
|
||||
Threads: -1,
|
||||
|
||||
BaseMinMemory: 30 << 30,
|
||||
},
|
||||
@ -113,7 +117,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 3 << 29, // 1.5G
|
||||
MinMemory: 1 << 30,
|
||||
|
||||
MultiThread: true,
|
||||
Threads: -1,
|
||||
|
||||
BaseMinMemory: 1 << 30,
|
||||
},
|
||||
@ -121,7 +125,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 2 << 10,
|
||||
MinMemory: 2 << 10,
|
||||
|
||||
MultiThread: true,
|
||||
Threads: -1,
|
||||
|
||||
BaseMinMemory: 2 << 10,
|
||||
},
|
||||
@ -129,7 +133,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 8 << 20,
|
||||
MinMemory: 8 << 20,
|
||||
|
||||
MultiThread: true,
|
||||
Threads: -1,
|
||||
|
||||
BaseMinMemory: 8 << 20,
|
||||
},
|
||||
@ -139,7 +143,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 1 << 30,
|
||||
MinMemory: 1 << 30,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 0,
|
||||
|
||||
BaseMinMemory: 1 << 30,
|
||||
},
|
||||
@ -147,7 +151,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 1 << 30,
|
||||
MinMemory: 1 << 30,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 0,
|
||||
|
||||
BaseMinMemory: 1 << 30,
|
||||
},
|
||||
@ -155,7 +159,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 2 << 10,
|
||||
MinMemory: 2 << 10,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 0,
|
||||
|
||||
BaseMinMemory: 2 << 10,
|
||||
},
|
||||
@ -163,7 +167,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 8 << 20,
|
||||
MinMemory: 8 << 20,
|
||||
|
||||
MultiThread: false,
|
||||
Threads: 0,
|
||||
|
||||
BaseMinMemory: 8 << 20,
|
||||
},
|
||||
@ -173,8 +177,8 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 110 << 30,
|
||||
MinMemory: 60 << 30,
|
||||
|
||||
MultiThread: true,
|
||||
CanGPU: true,
|
||||
Threads: -1,
|
||||
CanGPU: true,
|
||||
|
||||
BaseMinMemory: 64 << 30, // params
|
||||
},
|
||||
@ -182,8 +186,8 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 3 << 29, // 1.5G
|
||||
MinMemory: 1 << 30,
|
||||
|
||||
MultiThread: false, // This is fine
|
||||
CanGPU: true,
|
||||
Threads: 1, // This is fine
|
||||
CanGPU: true,
|
||||
|
||||
BaseMinMemory: 10 << 30,
|
||||
},
|
||||
@ -191,8 +195,8 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 2 << 10,
|
||||
MinMemory: 2 << 10,
|
||||
|
||||
MultiThread: false,
|
||||
CanGPU: true,
|
||||
Threads: 1,
|
||||
CanGPU: true,
|
||||
|
||||
BaseMinMemory: 2 << 10,
|
||||
},
|
||||
@ -200,10 +204,48 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredProof]Resources{
|
||||
MaxMemory: 8 << 20,
|
||||
MinMemory: 8 << 20,
|
||||
|
||||
MultiThread: false,
|
||||
CanGPU: true,
|
||||
Threads: 1,
|
||||
CanGPU: true,
|
||||
|
||||
BaseMinMemory: 8 << 20,
|
||||
},
|
||||
},
|
||||
sealtasks.TTFetch: {
|
||||
abi.RegisteredProof_StackedDRG32GiBSeal: Resources{
|
||||
MaxMemory: 1 << 20,
|
||||
MinMemory: 1 << 20,
|
||||
|
||||
Threads: 0,
|
||||
CanGPU: false,
|
||||
|
||||
BaseMinMemory: 0,
|
||||
},
|
||||
abi.RegisteredProof_StackedDRG512MiBSeal: Resources{
|
||||
MaxMemory: 1 << 20,
|
||||
MinMemory: 1 << 20,
|
||||
|
||||
Threads: 0,
|
||||
CanGPU: false,
|
||||
|
||||
BaseMinMemory: 0,
|
||||
},
|
||||
abi.RegisteredProof_StackedDRG2KiBSeal: Resources{
|
||||
MaxMemory: 1 << 20,
|
||||
MinMemory: 1 << 20,
|
||||
|
||||
Threads: 0,
|
||||
CanGPU: false,
|
||||
|
||||
BaseMinMemory: 0,
|
||||
},
|
||||
abi.RegisteredProof_StackedDRG8MiBSeal: Resources{
|
||||
MaxMemory: 1 << 20,
|
||||
MinMemory: 1 << 20,
|
||||
|
||||
Threads: 0,
|
||||
CanGPU: false,
|
||||
|
||||
BaseMinMemory: 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
280
sched.go
280
sched.go
@ -1,6 +1,12 @@
|
||||
package sectorstorage
|
||||
|
||||
import (
|
||||
"container/list"
|
||||
"context"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/hashicorp/go-multierror"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"github.com/filecoin-project/specs-actors/actors/abi"
|
||||
@ -11,29 +17,95 @@ import (
|
||||
|
||||
const mib = 1 << 20
|
||||
|
||||
type WorkerAction func(ctx context.Context, w Worker) error
|
||||
|
||||
type WorkerSelector interface {
|
||||
Ok(ctx context.Context, task sealtasks.TaskType, a *workerHandle) (bool, error) // true if worker is acceptable for performing a task
|
||||
|
||||
Cmp(ctx context.Context, task sealtasks.TaskType, a, b *workerHandle) (bool, error) // true if a is preferred over b
|
||||
}
|
||||
|
||||
type scheduler struct {
|
||||
spt abi.RegisteredProof
|
||||
|
||||
workersLk sync.Mutex
|
||||
nextWorker WorkerID
|
||||
workers map[WorkerID]*workerHandle
|
||||
|
||||
newWorkers chan *workerHandle
|
||||
schedule chan *workerRequest
|
||||
workerFree chan WorkerID
|
||||
closing chan struct{}
|
||||
|
||||
schedQueue *list.List // List[*workerRequest]
|
||||
}
|
||||
|
||||
func newScheduler(spt abi.RegisteredProof) *scheduler {
|
||||
return &scheduler{
|
||||
spt: spt,
|
||||
|
||||
nextWorker: 0,
|
||||
workers: map[WorkerID]*workerHandle{},
|
||||
|
||||
newWorkers: make(chan *workerHandle),
|
||||
schedule: make(chan *workerRequest),
|
||||
workerFree: make(chan WorkerID),
|
||||
closing: make(chan struct{}),
|
||||
|
||||
schedQueue: list.New(),
|
||||
}
|
||||
}
|
||||
|
||||
func (sh *scheduler) Schedule(ctx context.Context, taskType sealtasks.TaskType, sel WorkerSelector, prepare WorkerAction, work WorkerAction) error {
|
||||
ret := make(chan workerResponse)
|
||||
|
||||
select {
|
||||
case sh.schedule <- &workerRequest{
|
||||
taskType: taskType,
|
||||
sel: sel,
|
||||
|
||||
prepare: prepare,
|
||||
work: work,
|
||||
|
||||
ret: ret,
|
||||
ctx: ctx,
|
||||
}:
|
||||
case <-sh.closing:
|
||||
return xerrors.New("closing")
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
select {
|
||||
case resp := <-ret:
|
||||
return resp.err
|
||||
case <-sh.closing:
|
||||
return xerrors.New("closing")
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
|
||||
type workerRequest struct {
|
||||
taskType sealtasks.TaskType
|
||||
accept []WorkerID // ordered by preference
|
||||
sel WorkerSelector
|
||||
|
||||
ret chan<- workerResponse
|
||||
cancel <-chan struct{}
|
||||
prepare WorkerAction
|
||||
work WorkerAction
|
||||
|
||||
ret chan<- workerResponse
|
||||
ctx context.Context
|
||||
}
|
||||
|
||||
type workerResponse struct {
|
||||
err error
|
||||
|
||||
worker Worker
|
||||
done func()
|
||||
}
|
||||
|
||||
func (r *workerRequest) respond(resp workerResponse) {
|
||||
func (r *workerRequest) respond(err error) {
|
||||
select {
|
||||
case r.ret <- resp:
|
||||
case <-r.cancel:
|
||||
case r.ret <- workerResponse{err: err}:
|
||||
case <-r.ctx.Done():
|
||||
log.Warnf("request got cancelled before we could respond")
|
||||
if resp.done != nil {
|
||||
resp.done()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -48,60 +120,56 @@ type workerHandle struct {
|
||||
cpuUse uint64 // 0 - free; 1+ - singlecore things
|
||||
}
|
||||
|
||||
func (m *Manager) runSched() {
|
||||
func (sh *scheduler) runSched() {
|
||||
for {
|
||||
select {
|
||||
case w := <-m.newWorkers:
|
||||
m.schedNewWorker(w)
|
||||
case req := <-m.schedule:
|
||||
resp, err := m.maybeSchedRequest(req)
|
||||
case w := <-sh.newWorkers:
|
||||
sh.schedNewWorker(w)
|
||||
case req := <-sh.schedule:
|
||||
scheduled, err := sh.maybeSchedRequest(req)
|
||||
if err != nil {
|
||||
req.respond(workerResponse{err: err})
|
||||
req.respond(err)
|
||||
continue
|
||||
}
|
||||
if scheduled {
|
||||
continue
|
||||
}
|
||||
|
||||
if resp != nil {
|
||||
req.respond(*resp)
|
||||
continue
|
||||
}
|
||||
|
||||
m.schedQueue.PushBack(req)
|
||||
case wid := <-m.workerFree:
|
||||
m.onWorkerFreed(wid)
|
||||
case <-m.closing:
|
||||
m.schedClose()
|
||||
sh.schedQueue.PushBack(req)
|
||||
case wid := <-sh.workerFree:
|
||||
sh.onWorkerFreed(wid)
|
||||
case <-sh.closing:
|
||||
sh.schedClose()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *Manager) onWorkerFreed(wid WorkerID) {
|
||||
for e := m.schedQueue.Front(); e != nil; e = e.Next() {
|
||||
func (sh *scheduler) onWorkerFreed(wid WorkerID) {
|
||||
for e := sh.schedQueue.Front(); e != nil; e = e.Next() {
|
||||
req := e.Value.(*workerRequest)
|
||||
var ok bool
|
||||
for _, id := range req.accept {
|
||||
if id == wid {
|
||||
ok = true
|
||||
break
|
||||
}
|
||||
|
||||
ok, err := req.sel.Ok(req.ctx, req.taskType, sh.workers[wid])
|
||||
if err != nil {
|
||||
log.Errorf("onWorkerFreed req.sel.Ok error: %+v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
resp, err := m.maybeSchedRequest(req)
|
||||
scheduled, err := sh.maybeSchedRequest(req)
|
||||
if err != nil {
|
||||
req.respond(workerResponse{err: err})
|
||||
req.respond(err)
|
||||
continue
|
||||
}
|
||||
|
||||
if resp != nil {
|
||||
req.respond(*resp)
|
||||
|
||||
if scheduled {
|
||||
pe := e.Prev()
|
||||
m.schedQueue.Remove(e)
|
||||
sh.schedQueue.Remove(e)
|
||||
if pe == nil {
|
||||
pe = m.schedQueue.Front()
|
||||
pe = sh.schedQueue.Front()
|
||||
}
|
||||
if pe == nil {
|
||||
break
|
||||
@ -112,44 +180,68 @@ func (m *Manager) onWorkerFreed(wid WorkerID) {
|
||||
}
|
||||
}
|
||||
|
||||
func (m *Manager) maybeSchedRequest(req *workerRequest) (*workerResponse, error) {
|
||||
m.workersLk.Lock()
|
||||
defer m.workersLk.Unlock()
|
||||
func (sh *scheduler) maybeSchedRequest(req *workerRequest) (bool, error) {
|
||||
sh.workersLk.Lock()
|
||||
defer sh.workersLk.Unlock()
|
||||
|
||||
tried := 0
|
||||
var acceptable []WorkerID
|
||||
|
||||
for wid, worker := range sh.workers {
|
||||
ok, err := req.sel.Ok(req.ctx, req.taskType, worker)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
for i := len(req.accept) - 1; i >= 0; i-- {
|
||||
id := req.accept[i]
|
||||
w, ok := m.workers[id]
|
||||
if !ok {
|
||||
log.Warnf("requested worker %d is not in scheduler", id)
|
||||
continue
|
||||
}
|
||||
tried++
|
||||
|
||||
canDo, err := m.canHandleRequest(id, w, req)
|
||||
canDo, err := sh.canHandleRequest(wid, worker, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return false, err
|
||||
}
|
||||
|
||||
if !canDo {
|
||||
continue
|
||||
}
|
||||
|
||||
return m.makeResponse(id, w, req), nil
|
||||
acceptable = append(acceptable, wid)
|
||||
}
|
||||
|
||||
if len(acceptable) > 0 {
|
||||
{
|
||||
var serr error
|
||||
|
||||
sort.SliceStable(acceptable, func(i, j int) bool {
|
||||
r, err := req.sel.Cmp(req.ctx, req.taskType, sh.workers[acceptable[i]], sh.workers[acceptable[j]])
|
||||
if err != nil {
|
||||
serr = multierror.Append(serr, err)
|
||||
}
|
||||
return r
|
||||
})
|
||||
|
||||
if serr != nil {
|
||||
return false, xerrors.Errorf("error(s) selecting best worker: %w", serr)
|
||||
}
|
||||
}
|
||||
|
||||
return true, sh.assignWorker(acceptable[0], sh.workers[acceptable[0]], req)
|
||||
}
|
||||
|
||||
if tried == 0 {
|
||||
return nil, xerrors.New("maybeSchedRequest didn't find any good workers")
|
||||
return false, xerrors.New("maybeSchedRequest didn't find any good workers")
|
||||
}
|
||||
|
||||
return nil, nil // put in waiting queue
|
||||
return false, nil // put in waiting queue
|
||||
}
|
||||
|
||||
func (m *Manager) makeResponse(wid WorkerID, w *workerHandle, req *workerRequest) *workerResponse {
|
||||
needRes := ResourceTable[req.taskType][m.scfg.SealProofType]
|
||||
func (sh *scheduler) assignWorker(wid WorkerID, w *workerHandle, req *workerRequest) error {
|
||||
needRes := ResourceTable[req.taskType][sh.spt]
|
||||
|
||||
w.gpuUsed = needRes.CanGPU
|
||||
if needRes.MultiThread {
|
||||
if needRes.MultiThread() {
|
||||
w.cpuUse += w.info.Resources.CPUs
|
||||
} else {
|
||||
w.cpuUse++
|
||||
@ -158,17 +250,17 @@ func (m *Manager) makeResponse(wid WorkerID, w *workerHandle, req *workerRequest
|
||||
w.memUsedMin += needRes.MinMemory
|
||||
w.memUsedMax += needRes.MaxMemory
|
||||
|
||||
return &workerResponse{
|
||||
err: nil,
|
||||
worker: w.w,
|
||||
done: func() {
|
||||
m.workersLk.Lock()
|
||||
go func() {
|
||||
var err error
|
||||
|
||||
defer func() {
|
||||
sh.workersLk.Lock()
|
||||
|
||||
if needRes.CanGPU {
|
||||
w.gpuUsed = false
|
||||
}
|
||||
|
||||
if needRes.MultiThread {
|
||||
if needRes.MultiThread() {
|
||||
w.cpuUse -= w.info.Resources.CPUs
|
||||
} else {
|
||||
w.cpuUse--
|
||||
@ -177,20 +269,35 @@ func (m *Manager) makeResponse(wid WorkerID, w *workerHandle, req *workerRequest
|
||||
w.memUsedMin -= needRes.MinMemory
|
||||
w.memUsedMax -= needRes.MaxMemory
|
||||
|
||||
m.workersLk.Unlock()
|
||||
sh.workersLk.Unlock()
|
||||
|
||||
select {
|
||||
case m.workerFree <- wid:
|
||||
case <-m.closing:
|
||||
case sh.workerFree <- wid:
|
||||
case <-sh.closing:
|
||||
}
|
||||
},
|
||||
}
|
||||
}()
|
||||
|
||||
err = req.prepare(req.ctx, w.w)
|
||||
if err == nil {
|
||||
err = req.work(req.ctx, w.w)
|
||||
}
|
||||
|
||||
select {
|
||||
case req.ret <- workerResponse{err: err}:
|
||||
case <-req.ctx.Done():
|
||||
log.Warnf("request got cancelled before we could respond")
|
||||
case <-sh.closing:
|
||||
log.Warnf("scheduler closed while sending response")
|
||||
}
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Manager) canHandleRequest(wid WorkerID, w *workerHandle, req *workerRequest) (bool, error) {
|
||||
needRes, ok := ResourceTable[req.taskType][m.scfg.SealProofType]
|
||||
func (sh *scheduler) canHandleRequest(wid WorkerID, w *workerHandle, req *workerRequest) (bool, error) {
|
||||
needRes, ok := ResourceTable[req.taskType][sh.spt]
|
||||
if !ok {
|
||||
return false, xerrors.Errorf("canHandleRequest: missing ResourceTable entry for %s/%d", req.taskType, m.scfg.SealProofType)
|
||||
return false, xerrors.Errorf("canHandleRequest: missing ResourceTable entry for %s/%d", req.taskType, sh.spt)
|
||||
}
|
||||
|
||||
res := w.info.Resources
|
||||
@ -203,7 +310,7 @@ func (m *Manager) canHandleRequest(wid WorkerID, w *workerHandle, req *workerReq
|
||||
}
|
||||
|
||||
maxNeedMem := res.MemReserved + w.memUsedMax + needRes.MaxMemory + needRes.BaseMinMemory
|
||||
if m.scfg.SealProofType == abi.RegisteredProof_StackedDRG32GiBSeal {
|
||||
if sh.spt == abi.RegisteredProof_StackedDRG32GiBSeal {
|
||||
maxNeedMem += MaxCachingOverhead
|
||||
}
|
||||
if maxNeedMem > res.MemSwap+res.MemPhysical {
|
||||
@ -211,7 +318,7 @@ func (m *Manager) canHandleRequest(wid WorkerID, w *workerHandle, req *workerReq
|
||||
return false, nil
|
||||
}
|
||||
|
||||
if needRes.MultiThread {
|
||||
if needRes.MultiThread() {
|
||||
if w.cpuUse > 0 {
|
||||
log.Debugf("sched: not scheduling on worker %d; multicore process needs %d threads, %d in use, target %d", wid, res.CPUs, w.cpuUse, res.CPUs)
|
||||
return false, nil
|
||||
@ -228,22 +335,27 @@ func (m *Manager) canHandleRequest(wid WorkerID, w *workerHandle, req *workerReq
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func (m *Manager) schedNewWorker(w *workerHandle) {
|
||||
m.workersLk.Lock()
|
||||
defer m.workersLk.Unlock()
|
||||
func (sh *scheduler) schedNewWorker(w *workerHandle) {
|
||||
sh.workersLk.Lock()
|
||||
defer sh.workersLk.Unlock()
|
||||
|
||||
id := m.nextWorker
|
||||
m.workers[id] = w
|
||||
m.nextWorker++
|
||||
id := sh.nextWorker
|
||||
sh.workers[id] = w
|
||||
sh.nextWorker++
|
||||
}
|
||||
|
||||
func (m *Manager) schedClose() {
|
||||
m.workersLk.Lock()
|
||||
defer m.workersLk.Unlock()
|
||||
func (sh *scheduler) schedClose() {
|
||||
sh.workersLk.Lock()
|
||||
defer sh.workersLk.Unlock()
|
||||
|
||||
for i, w := range m.workers {
|
||||
for i, w := range sh.workers {
|
||||
if err := w.w.Close(); err != nil {
|
||||
log.Errorf("closing worker %d: %+v", i, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (sh *scheduler) Close() error {
|
||||
close(sh.closing)
|
||||
return nil
|
||||
}
|
||||
|
6
stats.go
6
stats.go
@ -3,12 +3,12 @@ package sectorstorage
|
||||
import "github.com/filecoin-project/sector-storage/storiface"
|
||||
|
||||
func (m *Manager) WorkerStats() map[uint64]storiface.WorkerStats {
|
||||
m.workersLk.Lock()
|
||||
defer m.workersLk.Unlock()
|
||||
m.sched.workersLk.Lock()
|
||||
defer m.sched.workersLk.Unlock()
|
||||
|
||||
out := map[uint64]storiface.WorkerStats{}
|
||||
|
||||
for id, handle := range m.workers {
|
||||
for id, handle := range m.sched.workers {
|
||||
out[uint64(id)] = storiface.WorkerStats{
|
||||
Info: handle.info,
|
||||
MemUsedMin: handle.memUsedMin,
|
||||
|
Loading…
Reference in New Issue
Block a user