feat: sealing: Put scheduler assign logic behind an interface
parent df98a2a089 · commit 9ac19cb14b
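This commit exports the scheduler's core types (Scheduler, RequestQueue, WorkerRequest, WorkerHandle, SchedWindow, SchedWindowRequest and their fields) and moves the window-assignment logic out of Scheduler.trySched and into a new single-method interface, added in sched.go below:

type Assigner interface {
	TrySched(sh *Scheduler)
}

The existing utilization-based logic becomes the default implementation, AssignerUtil, in the new file sched_assigner_util.go, and newScheduler wires it in with assigner: &AssignerUtil{}, so scheduling behaviour is unchanged; the interface only creates a seam for swapping in other assignment strategies.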
extern/sector-storage/manager.go (vendored): 2 changes

@@ -62,7 +62,7 @@ type Manager struct {
 	remoteHnd *stores.FetchHandler
 	index     stores.SectorIndex

-	sched *scheduler
+	sched *Scheduler
 	windowPoStSched  *poStScheduler
 	winningPoStSched *poStScheduler

extern/sector-storage/request_queue.go (vendored): 24 changes

@@ -2,34 +2,34 @@ package sectorstorage

 import "sort"

-type requestQueue []*workerRequest
+type RequestQueue []*WorkerRequest

-func (q requestQueue) Len() int { return len(q) }
+func (q RequestQueue) Len() int { return len(q) }

-func (q requestQueue) Less(i, j int) bool {
-	oneMuchLess, muchLess := q[i].taskType.MuchLess(q[j].taskType)
+func (q RequestQueue) Less(i, j int) bool {
+	oneMuchLess, muchLess := q[i].TaskType.MuchLess(q[j].TaskType)
 	if oneMuchLess {
 		return muchLess
 	}

-	if q[i].priority != q[j].priority {
-		return q[i].priority > q[j].priority
+	if q[i].Priority != q[j].Priority {
+		return q[i].Priority > q[j].Priority
 	}

-	if q[i].taskType != q[j].taskType {
-		return q[i].taskType.Less(q[j].taskType)
+	if q[i].TaskType != q[j].TaskType {
+		return q[i].TaskType.Less(q[j].TaskType)
 	}

-	return q[i].sector.ID.Number < q[j].sector.ID.Number // optimize minerActor.NewSectors bitfield
+	return q[i].Sector.ID.Number < q[j].Sector.ID.Number // optimize minerActor.NewSectors bitfield
 }

-func (q requestQueue) Swap(i, j int) {
+func (q RequestQueue) Swap(i, j int) {
 	q[i], q[j] = q[j], q[i]
 	q[i].index = i
 	q[j].index = j
 }

-func (q *requestQueue) Push(x *workerRequest) {
+func (q *RequestQueue) Push(x *WorkerRequest) {
 	n := len(*q)
 	item := x
 	item.index = n
@@ -37,7 +37,7 @@ func (q *requestQueue) Push(x *workerRequest) {
 	sort.Sort(q)
 }

-func (q *requestQueue) Remove(i int) *workerRequest {
+func (q *RequestQueue) Remove(i int) *WorkerRequest {
 	old := *q
 	n := len(old)
 	item := old[i]
extern/sector-storage/request_queue_test.go (vendored): 30 changes

@@ -8,13 +8,13 @@ import (
 )

 func TestRequestQueue(t *testing.T) {
-	rq := &requestQueue{}
+	rq := &RequestQueue{}

-	rq.Push(&workerRequest{taskType: sealtasks.TTAddPiece})
-	rq.Push(&workerRequest{taskType: sealtasks.TTPreCommit1})
-	rq.Push(&workerRequest{taskType: sealtasks.TTPreCommit2})
-	rq.Push(&workerRequest{taskType: sealtasks.TTPreCommit1})
-	rq.Push(&workerRequest{taskType: sealtasks.TTAddPiece})
+	rq.Push(&WorkerRequest{TaskType: sealtasks.TTAddPiece})
+	rq.Push(&WorkerRequest{TaskType: sealtasks.TTPreCommit1})
+	rq.Push(&WorkerRequest{TaskType: sealtasks.TTPreCommit2})
+	rq.Push(&WorkerRequest{TaskType: sealtasks.TTPreCommit1})
+	rq.Push(&WorkerRequest{TaskType: sealtasks.TTAddPiece})

 	dump := func(s string) {
 		fmt.Println("---")
@@ -22,7 +22,7 @@ func TestRequestQueue(t *testing.T) {

 		for sqi := 0; sqi < rq.Len(); sqi++ {
 			task := (*rq)[sqi]
-			fmt.Println(sqi, task.taskType)
+			fmt.Println(sqi, task.TaskType)
 		}
 	}

@@ -32,31 +32,31 @@ func TestRequestQueue(t *testing.T) {

 	dump("pop 1")

-	if pt.taskType != sealtasks.TTPreCommit2 {
-		t.Error("expected precommit2, got", pt.taskType)
+	if pt.TaskType != sealtasks.TTPreCommit2 {
+		t.Error("expected precommit2, got", pt.TaskType)
 	}

 	pt = rq.Remove(0)

 	dump("pop 2")

-	if pt.taskType != sealtasks.TTPreCommit1 {
-		t.Error("expected precommit1, got", pt.taskType)
+	if pt.TaskType != sealtasks.TTPreCommit1 {
+		t.Error("expected precommit1, got", pt.TaskType)
 	}

 	pt = rq.Remove(1)

 	dump("pop 3")

-	if pt.taskType != sealtasks.TTAddPiece {
-		t.Error("expected addpiece, got", pt.taskType)
+	if pt.TaskType != sealtasks.TTAddPiece {
+		t.Error("expected addpiece, got", pt.TaskType)
 	}

 	pt = rq.Remove(0)

 	dump("pop 4")

-	if pt.taskType != sealtasks.TTPreCommit1 {
-		t.Error("expected precommit1, got", pt.taskType)
+	if pt.TaskType != sealtasks.TTPreCommit1 {
+		t.Error("expected precommit1, got", pt.TaskType)
 	}
 }
extern/sector-storage/sched.go (vendored): 384 changes
@ -2,9 +2,6 @@ package sectorstorage
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
"math/rand"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@ -47,23 +44,26 @@ const mib = 1 << 20
|
||||
type WorkerAction func(ctx context.Context, w Worker) error
|
||||
|
||||
type WorkerSelector interface {
|
||||
Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, a *workerHandle) (bool, error) // true if worker is acceptable for performing a task
|
||||
Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, a *WorkerHandle) (bool, error) // true if worker is acceptable for performing a task
|
||||
|
||||
Cmp(ctx context.Context, task sealtasks.TaskType, a, b *workerHandle) (bool, error) // true if a is preferred over b
|
||||
Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error) // true if a is preferred over b
|
||||
}
|
||||
|
||||
type scheduler struct {
|
||||
workersLk sync.RWMutex
|
||||
workers map[storiface.WorkerID]*workerHandle
|
||||
type Scheduler struct {
|
||||
assigner Assigner
|
||||
|
||||
schedule chan *workerRequest
|
||||
windowRequests chan *schedWindowRequest
|
||||
workersLk sync.RWMutex
|
||||
|
||||
Workers map[storiface.WorkerID]*WorkerHandle
|
||||
|
||||
schedule chan *WorkerRequest
|
||||
windowRequests chan *SchedWindowRequest
|
||||
workerChange chan struct{} // worker added / changed/freed resources
|
||||
workerDisable chan workerDisableReq
|
||||
|
||||
// owned by the sh.runSched goroutine
|
||||
schedQueue *requestQueue
|
||||
openWindows []*schedWindowRequest
|
||||
SchedQueue *RequestQueue
|
||||
OpenWindows []*SchedWindowRequest
|
||||
|
||||
workTracker *workTracker
|
||||
|
||||
@ -74,24 +74,24 @@ type scheduler struct {
|
||||
testSync chan struct{} // used for testing
|
||||
}
|
||||
|
||||
type workerHandle struct {
|
||||
type WorkerHandle struct {
|
||||
workerRpc Worker
|
||||
|
||||
tasksCache map[sealtasks.TaskType]struct{}
|
||||
tasksUpdate time.Time
|
||||
tasksLk sync.Mutex
|
||||
|
||||
info storiface.WorkerInfo
|
||||
Info storiface.WorkerInfo
|
||||
|
||||
preparing *activeResources // use with workerHandle.lk
|
||||
active *activeResources // use with workerHandle.lk
|
||||
preparing *activeResources // use with WorkerHandle.lk
|
||||
active *activeResources // use with WorkerHandle.lk
|
||||
|
||||
lk sync.Mutex // can be taken inside sched.workersLk.RLock
|
||||
|
||||
wndLk sync.Mutex // can be taken inside sched.workersLk.RLock
|
||||
activeWindows []*schedWindow
|
||||
activeWindows []*SchedWindow
|
||||
|
||||
enabled bool
|
||||
Enabled bool
|
||||
|
||||
// for sync manager goroutine closing
|
||||
cleanupStarted bool
|
||||
@ -99,19 +99,19 @@ type workerHandle struct {
|
||||
closingMgr chan struct{}
|
||||
}
|
||||
|
||||
type schedWindowRequest struct {
|
||||
worker storiface.WorkerID
|
||||
type SchedWindowRequest struct {
|
||||
Worker storiface.WorkerID
|
||||
|
||||
done chan *schedWindow
|
||||
Done chan *SchedWindow
|
||||
}
|
||||
|
||||
type schedWindow struct {
|
||||
allocated activeResources
|
||||
todo []*workerRequest
|
||||
type SchedWindow struct {
|
||||
Allocated activeResources
|
||||
Todo []*WorkerRequest
|
||||
}
|
||||
|
||||
type workerDisableReq struct {
|
||||
activeWindows []*schedWindow
|
||||
activeWindows []*SchedWindow
|
||||
wid storiface.WorkerID
|
||||
done func()
|
||||
}
|
||||
@ -126,11 +126,11 @@ type activeResources struct {
|
||||
waiting int
|
||||
}
|
||||
|
||||
type workerRequest struct {
|
||||
sector storage.SectorRef
|
||||
taskType sealtasks.TaskType
|
||||
priority int // larger values more important
|
||||
sel WorkerSelector
|
||||
type WorkerRequest struct {
|
||||
Sector storage.SectorRef
|
||||
TaskType sealtasks.TaskType
|
||||
Priority int // larger values more important
|
||||
Sel WorkerSelector
|
||||
|
||||
prepare WorkerAction
|
||||
work WorkerAction
|
||||
@ -139,25 +139,27 @@ type workerRequest struct {
|
||||
|
||||
index int // The index of the item in the heap.
|
||||
|
||||
indexHeap int
|
||||
IndexHeap int
|
||||
ret chan<- workerResponse
|
||||
ctx context.Context
|
||||
Ctx context.Context
|
||||
}
|
||||
|
||||
type workerResponse struct {
|
||||
err error
|
||||
}
|
||||
|
||||
func newScheduler() *scheduler {
|
||||
return &scheduler{
|
||||
workers: map[storiface.WorkerID]*workerHandle{},
|
||||
func newScheduler() *Scheduler {
|
||||
return &Scheduler{
|
||||
assigner: &AssignerUtil{},
|
||||
|
||||
schedule: make(chan *workerRequest),
|
||||
windowRequests: make(chan *schedWindowRequest, 20),
|
||||
Workers: map[storiface.WorkerID]*WorkerHandle{},
|
||||
|
||||
schedule: make(chan *WorkerRequest),
|
||||
windowRequests: make(chan *SchedWindowRequest, 20),
|
||||
workerChange: make(chan struct{}, 20),
|
||||
workerDisable: make(chan workerDisableReq),
|
||||
|
||||
schedQueue: &requestQueue{},
|
||||
SchedQueue: &RequestQueue{},
|
||||
|
||||
workTracker: &workTracker{
|
||||
done: map[storiface.CallID]struct{}{},
|
||||
@ -172,15 +174,15 @@ func newScheduler() *scheduler {
|
||||
}
|
||||
}
|
||||
|
||||
func (sh *scheduler) Schedule(ctx context.Context, sector storage.SectorRef, taskType sealtasks.TaskType, sel WorkerSelector, prepare WorkerAction, work WorkerAction) error {
|
||||
func (sh *Scheduler) Schedule(ctx context.Context, sector storage.SectorRef, taskType sealtasks.TaskType, sel WorkerSelector, prepare WorkerAction, work WorkerAction) error {
|
||||
ret := make(chan workerResponse)
|
||||
|
||||
select {
|
||||
case sh.schedule <- &workerRequest{
|
||||
sector: sector,
|
||||
taskType: taskType,
|
||||
priority: getPriority(ctx),
|
||||
sel: sel,
|
||||
case sh.schedule <- &WorkerRequest{
|
||||
Sector: sector,
|
||||
TaskType: taskType,
|
||||
Priority: getPriority(ctx),
|
||||
Sel: sel,
|
||||
|
||||
prepare: prepare,
|
||||
work: work,
|
||||
@ -188,7 +190,7 @@ func (sh *scheduler) Schedule(ctx context.Context, sector storage.SectorRef, tas
|
||||
start: time.Now(),
|
||||
|
||||
ret: ret,
|
||||
ctx: ctx,
|
||||
Ctx: ctx,
|
||||
}:
|
||||
case <-sh.closing:
|
||||
return xerrors.New("closing")
|
||||
@ -206,10 +208,10 @@ func (sh *scheduler) Schedule(ctx context.Context, sector storage.SectorRef, tas
|
||||
}
|
||||
}
|
||||
|
||||
func (r *workerRequest) respond(err error) {
|
||||
func (r *WorkerRequest) respond(err error) {
|
||||
select {
|
||||
case r.ret <- workerResponse{err: err}:
|
||||
case <-r.ctx.Done():
|
||||
case <-r.Ctx.Done():
|
||||
log.Warnf("request got cancelled before we could respond")
|
||||
}
|
||||
}
|
||||
@ -225,7 +227,7 @@ type SchedDiagInfo struct {
|
||||
OpenWindows []string
|
||||
}
|
||||
|
||||
func (sh *scheduler) runSched() {
|
||||
func (sh *Scheduler) runSched() {
|
||||
defer close(sh.closed)
|
||||
|
||||
iw := time.After(InitWait)
|
||||
@ -242,14 +244,14 @@ func (sh *scheduler) runSched() {
|
||||
toDisable = append(toDisable, dreq)
|
||||
doSched = true
|
||||
case req := <-sh.schedule:
|
||||
sh.schedQueue.Push(req)
|
||||
sh.SchedQueue.Push(req)
|
||||
doSched = true
|
||||
|
||||
if sh.testSync != nil {
|
||||
sh.testSync <- struct{}{}
|
||||
}
|
||||
case req := <-sh.windowRequests:
|
||||
sh.openWindows = append(sh.openWindows, req)
|
||||
sh.OpenWindows = append(sh.OpenWindows, req)
|
||||
doSched = true
|
||||
case ireq := <-sh.info:
|
||||
ireq(sh.diag())
|
||||
@ -273,12 +275,12 @@ func (sh *scheduler) runSched() {
|
||||
case dreq := <-sh.workerDisable:
|
||||
toDisable = append(toDisable, dreq)
|
||||
case req := <-sh.schedule:
|
||||
sh.schedQueue.Push(req)
|
||||
sh.SchedQueue.Push(req)
|
||||
if sh.testSync != nil {
|
||||
sh.testSync <- struct{}{}
|
||||
}
|
||||
case req := <-sh.windowRequests:
|
||||
sh.openWindows = append(sh.openWindows, req)
|
||||
sh.OpenWindows = append(sh.OpenWindows, req)
|
||||
default:
|
||||
break loop
|
||||
}
|
||||
@ -286,21 +288,21 @@ func (sh *scheduler) runSched() {
|
||||
|
||||
for _, req := range toDisable {
|
||||
for _, window := range req.activeWindows {
|
||||
for _, request := range window.todo {
|
||||
sh.schedQueue.Push(request)
|
||||
for _, request := range window.Todo {
|
||||
sh.SchedQueue.Push(request)
|
||||
}
|
||||
}
|
||||
|
||||
openWindows := make([]*schedWindowRequest, 0, len(sh.openWindows))
|
||||
for _, window := range sh.openWindows {
|
||||
if window.worker != req.wid {
|
||||
openWindows := make([]*SchedWindowRequest, 0, len(sh.OpenWindows))
|
||||
for _, window := range sh.OpenWindows {
|
||||
if window.Worker != req.wid {
|
||||
openWindows = append(openWindows, window)
|
||||
}
|
||||
}
|
||||
sh.openWindows = openWindows
|
||||
sh.OpenWindows = openWindows
|
||||
|
||||
sh.workersLk.Lock()
|
||||
sh.workers[req.wid].enabled = false
|
||||
sh.Workers[req.wid].Enabled = false
|
||||
sh.workersLk.Unlock()
|
||||
|
||||
req.done()
|
||||
@ -312,281 +314,51 @@ func (sh *scheduler) runSched() {
|
||||
}
|
||||
}
|
||||
|
||||
func (sh *scheduler) diag() SchedDiagInfo {
|
||||
func (sh *Scheduler) diag() SchedDiagInfo {
|
||||
var out SchedDiagInfo
|
||||
|
||||
for sqi := 0; sqi < sh.schedQueue.Len(); sqi++ {
|
||||
task := (*sh.schedQueue)[sqi]
|
||||
for sqi := 0; sqi < sh.SchedQueue.Len(); sqi++ {
|
||||
task := (*sh.SchedQueue)[sqi]
|
||||
|
||||
out.Requests = append(out.Requests, SchedDiagRequestInfo{
|
||||
Sector: task.sector.ID,
|
||||
TaskType: task.taskType,
|
||||
Priority: task.priority,
|
||||
Sector: task.Sector.ID,
|
||||
TaskType: task.TaskType,
|
||||
Priority: task.Priority,
|
||||
})
|
||||
}
|
||||
|
||||
sh.workersLk.RLock()
|
||||
defer sh.workersLk.RUnlock()
|
||||
|
||||
for _, window := range sh.openWindows {
|
||||
out.OpenWindows = append(out.OpenWindows, uuid.UUID(window.worker).String())
|
||||
for _, window := range sh.OpenWindows {
|
||||
out.OpenWindows = append(out.OpenWindows, uuid.UUID(window.Worker).String())
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (sh *scheduler) trySched() {
|
||||
/*
|
||||
This assigns tasks to workers based on:
|
||||
- Task priority (achieved by handling sh.schedQueue in order, since it's already sorted by priority)
|
||||
- Worker resource availability
|
||||
- Task-specified worker preference (acceptableWindows array below sorted by this preference)
|
||||
- Window request age
|
||||
|
||||
1. For each task in the schedQueue find windows which can handle them
|
||||
1.1. Create list of windows capable of handling a task
|
||||
1.2. Sort windows according to task selector preferences
|
||||
2. Going through schedQueue again, assign task to first acceptable window
|
||||
with resources available
|
||||
3. Submit windows with scheduled tasks to workers
|
||||
|
||||
*/
|
||||
type Assigner interface {
|
||||
TrySched(sh *Scheduler)
|
||||
}
|
||||
|
||||
func (sh *Scheduler) trySched() {
|
||||
sh.workersLk.RLock()
|
||||
defer sh.workersLk.RUnlock()
|
||||
|
||||
windowsLen := len(sh.openWindows)
|
||||
queueLen := sh.schedQueue.Len()
|
||||
|
||||
log.Debugf("SCHED %d queued; %d open windows", queueLen, windowsLen)
|
||||
|
||||
if windowsLen == 0 || queueLen == 0 {
|
||||
// nothing to schedule on
|
||||
return
|
||||
}
|
||||
|
||||
windows := make([]schedWindow, windowsLen)
|
||||
acceptableWindows := make([][]int, queueLen) // QueueIndex -> []OpenWindowIndex
|
||||
|
||||
// Step 1
|
||||
throttle := make(chan struct{}, windowsLen)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(queueLen)
|
||||
for i := 0; i < queueLen; i++ {
|
||||
throttle <- struct{}{}
|
||||
|
||||
go func(sqi int) {
|
||||
defer wg.Done()
|
||||
defer func() {
|
||||
<-throttle
|
||||
}()
|
||||
|
||||
task := (*sh.schedQueue)[sqi]
|
||||
|
||||
task.indexHeap = sqi
|
||||
for wnd, windowRequest := range sh.openWindows {
|
||||
worker, ok := sh.workers[windowRequest.worker]
|
||||
if !ok {
|
||||
log.Errorf("worker referenced by windowRequest not found (worker: %s)", windowRequest.worker)
|
||||
// TODO: How to move forward here?
|
||||
continue
|
||||
}
|
||||
|
||||
if !worker.enabled {
|
||||
log.Debugw("skipping disabled worker", "worker", windowRequest.worker)
|
||||
continue
|
||||
}
|
||||
|
||||
needRes := worker.info.Resources.ResourceSpec(task.sector.ProofType, task.taskType)
|
||||
|
||||
// TODO: allow bigger windows
|
||||
if !windows[wnd].allocated.canHandleRequest(needRes, windowRequest.worker, "schedAcceptable", worker.info) {
|
||||
continue
|
||||
}
|
||||
|
||||
rpcCtx, cancel := context.WithTimeout(task.ctx, SelectorTimeout)
|
||||
ok, err := task.sel.Ok(rpcCtx, task.taskType, task.sector.ProofType, worker)
|
||||
cancel()
|
||||
if err != nil {
|
||||
log.Errorf("trySched(1) req.sel.Ok error: %+v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
acceptableWindows[sqi] = append(acceptableWindows[sqi], wnd)
|
||||
}
|
||||
|
||||
if len(acceptableWindows[sqi]) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Pick best worker (shuffle in case some workers are equally as good)
|
||||
rand.Shuffle(len(acceptableWindows[sqi]), func(i, j int) {
|
||||
acceptableWindows[sqi][i], acceptableWindows[sqi][j] = acceptableWindows[sqi][j], acceptableWindows[sqi][i] // nolint:scopelint
|
||||
})
|
||||
sort.SliceStable(acceptableWindows[sqi], func(i, j int) bool {
|
||||
wii := sh.openWindows[acceptableWindows[sqi][i]].worker // nolint:scopelint
|
||||
wji := sh.openWindows[acceptableWindows[sqi][j]].worker // nolint:scopelint
|
||||
|
||||
if wii == wji {
|
||||
// for the same worker prefer older windows
|
||||
return acceptableWindows[sqi][i] < acceptableWindows[sqi][j] // nolint:scopelint
|
||||
}
|
||||
|
||||
wi := sh.workers[wii]
|
||||
wj := sh.workers[wji]
|
||||
|
||||
rpcCtx, cancel := context.WithTimeout(task.ctx, SelectorTimeout)
|
||||
defer cancel()
|
||||
|
||||
r, err := task.sel.Cmp(rpcCtx, task.taskType, wi, wj)
|
||||
if err != nil {
|
||||
log.Errorf("selecting best worker: %s", err)
|
||||
}
|
||||
return r
|
||||
})
|
||||
}(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
log.Debugf("SCHED windows: %+v", windows)
|
||||
log.Debugf("SCHED Acceptable win: %+v", acceptableWindows)
|
||||
|
||||
// Step 2
|
||||
scheduled := 0
|
||||
rmQueue := make([]int, 0, queueLen)
|
||||
workerUtil := map[storiface.WorkerID]float64{}
|
||||
|
||||
for sqi := 0; sqi < queueLen; sqi++ {
|
||||
task := (*sh.schedQueue)[sqi]
|
||||
|
||||
selectedWindow := -1
|
||||
var needRes storiface.Resources
|
||||
var info storiface.WorkerInfo
|
||||
var bestWid storiface.WorkerID
|
||||
bestUtilization := math.MaxFloat64 // smaller = better
|
||||
|
||||
for i, wnd := range acceptableWindows[task.indexHeap] {
|
||||
wid := sh.openWindows[wnd].worker
|
||||
w := sh.workers[wid]
|
||||
|
||||
res := info.Resources.ResourceSpec(task.sector.ProofType, task.taskType)
|
||||
|
||||
log.Debugf("SCHED try assign sqi:%d sector %d to window %d (awi:%d)", sqi, task.sector.ID.Number, wnd, i)
|
||||
|
||||
// TODO: allow bigger windows
|
||||
if !windows[wnd].allocated.canHandleRequest(needRes, wid, "schedAssign", info) {
|
||||
continue
|
||||
}
|
||||
|
||||
wu, found := workerUtil[wid]
|
||||
if !found {
|
||||
wu = w.utilization()
|
||||
workerUtil[wid] = wu
|
||||
}
|
||||
if wu >= bestUtilization {
|
||||
// acceptable worker list is initially sorted by utilization, and the initially-best workers
|
||||
// will be assigned tasks first. This means that if we find a worker which isn't better, it
|
||||
// probably means that the other workers aren't better either.
|
||||
//
|
||||
// utilization
|
||||
// ^
|
||||
// | /
|
||||
// | \ /
|
||||
// | \ /
|
||||
// | *
|
||||
// #--------> acceptableWindow index
|
||||
//
|
||||
// * -> we're here
|
||||
break
|
||||
}
|
||||
|
||||
info = w.info
|
||||
needRes = res
|
||||
bestWid = wid
|
||||
selectedWindow = wnd
|
||||
bestUtilization = wu
|
||||
}
|
||||
|
||||
if selectedWindow < 0 {
|
||||
// all windows full
|
||||
continue
|
||||
}
|
||||
|
||||
log.Debugw("SCHED ASSIGNED",
|
||||
"sqi", sqi,
|
||||
"sector", task.sector.ID.Number,
|
||||
"task", task.taskType,
|
||||
"window", selectedWindow,
|
||||
"worker", bestWid,
|
||||
"utilization", bestUtilization)
|
||||
|
||||
workerUtil[bestWid] += windows[selectedWindow].allocated.add(info.Resources, needRes)
|
||||
windows[selectedWindow].todo = append(windows[selectedWindow].todo, task)
|
||||
|
||||
rmQueue = append(rmQueue, sqi)
|
||||
scheduled++
|
||||
}
|
||||
|
||||
if len(rmQueue) > 0 {
|
||||
for i := len(rmQueue) - 1; i >= 0; i-- {
|
||||
sh.schedQueue.Remove(rmQueue[i])
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3
|
||||
|
||||
if scheduled == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
scheduledWindows := map[int]struct{}{}
|
||||
for wnd, window := range windows {
|
||||
if len(window.todo) == 0 {
|
||||
// Nothing scheduled here, keep the window open
|
||||
continue
|
||||
}
|
||||
|
||||
scheduledWindows[wnd] = struct{}{}
|
||||
|
||||
window := window // copy
|
||||
select {
|
||||
case sh.openWindows[wnd].done <- &window:
|
||||
default:
|
||||
log.Error("expected sh.openWindows[wnd].done to be buffered")
|
||||
}
|
||||
}
|
||||
|
||||
// Rewrite sh.openWindows array, removing scheduled windows
|
||||
newOpenWindows := make([]*schedWindowRequest, 0, windowsLen-len(scheduledWindows))
|
||||
for wnd, window := range sh.openWindows {
|
||||
if _, scheduled := scheduledWindows[wnd]; scheduled {
|
||||
// keep unscheduled windows open
|
||||
continue
|
||||
}
|
||||
|
||||
newOpenWindows = append(newOpenWindows, window)
|
||||
}
|
||||
|
||||
sh.openWindows = newOpenWindows
|
||||
sh.assigner.TrySched(sh)
|
||||
}
|
||||
|
||||
func (sh *scheduler) schedClose() {
|
||||
func (sh *Scheduler) schedClose() {
|
||||
sh.workersLk.Lock()
|
||||
defer sh.workersLk.Unlock()
|
||||
log.Debugf("closing scheduler")
|
||||
|
||||
for i, w := range sh.workers {
|
||||
for i, w := range sh.Workers {
|
||||
sh.workerCleanup(i, w)
|
||||
}
|
||||
}
|
||||
|
||||
func (sh *scheduler) Info(ctx context.Context) (interface{}, error) {
|
||||
func (sh *Scheduler) Info(ctx context.Context) (interface{}, error) {
|
||||
ch := make(chan interface{}, 1)
|
||||
|
||||
sh.info <- func(res interface{}) {
|
||||
@ -601,7 +373,7 @@ func (sh *scheduler) Info(ctx context.Context) (interface{}, error) {
|
||||
}
|
||||
}
|
||||
|
||||
func (sh *scheduler) Close(ctx context.Context) error {
|
||||
func (sh *Scheduler) Close(ctx context.Context) error {
|
||||
close(sh.closing)
|
||||
select {
|
||||
case <-sh.closed:
|
||||
|
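With the assignment logic extracted, Scheduler.trySched in the hunks above reduces to a thin wrapper that takes the workers read-lock and delegates to the configured assigner; piecing the new lines of that hunk together gives:

func (sh *Scheduler) trySched() {
	sh.workersLk.RLock()
	defer sh.workersLk.RUnlock()

	sh.assigner.TrySched(sh)
}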
extern/sector-storage/sched_assigner_util.go (vendored, new file): 254 additions
@ -0,0 +1,254 @@
|
||||
package sectorstorage
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
"math/rand"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/filecoin-project/lotus/extern/sector-storage/storiface"
|
||||
)
|
||||
|
||||
// AssignerUtil is a task assigner assigning tasks to workers with lowest utilization
|
||||
type AssignerUtil struct{}
|
||||
|
||||
var _ Assigner = &AssignerUtil{}
|
||||
|
||||
func (a *AssignerUtil) TrySched(sh *Scheduler) {
|
||||
/*
|
||||
This assigns tasks to workers based on:
|
||||
- Task priority (achieved by handling sh.SchedQueue in order, since it's already sorted by priority)
|
||||
- Worker resource availability
|
||||
- Task-specified worker preference (acceptableWindows array below sorted by this preference)
|
||||
- Window request age
|
||||
|
||||
1. For each task in the SchedQueue find windows which can handle them
|
||||
1.1. Create list of windows capable of handling a task
|
||||
1.2. Sort windows according to task selector preferences
|
||||
2. Going through SchedQueue again, assign task to first acceptable window
|
||||
with resources available
|
||||
3. Submit windows with scheduled tasks to workers
|
||||
|
||||
*/
|
||||
|
||||
windowsLen := len(sh.OpenWindows)
|
||||
queueLen := sh.SchedQueue.Len()
|
||||
|
||||
log.Debugf("SCHED %d queued; %d open windows", queueLen, windowsLen)
|
||||
|
||||
if windowsLen == 0 || queueLen == 0 {
|
||||
// nothing to schedule on
|
||||
return
|
||||
}
|
||||
|
||||
windows := make([]SchedWindow, windowsLen)
|
||||
acceptableWindows := make([][]int, queueLen) // QueueIndex -> []OpenWindowIndex
|
||||
|
||||
// Step 1
|
||||
throttle := make(chan struct{}, windowsLen)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(queueLen)
|
||||
for i := 0; i < queueLen; i++ {
|
||||
throttle <- struct{}{}
|
||||
|
||||
go func(sqi int) {
|
||||
defer wg.Done()
|
||||
defer func() {
|
||||
<-throttle
|
||||
}()
|
||||
|
||||
task := (*sh.SchedQueue)[sqi]
|
||||
|
||||
task.IndexHeap = sqi
|
||||
for wnd, windowRequest := range sh.OpenWindows {
|
||||
worker, ok := sh.Workers[windowRequest.Worker]
|
||||
if !ok {
|
||||
log.Errorf("worker referenced by windowRequest not found (worker: %s)", windowRequest.Worker)
|
||||
// TODO: How to move forward here?
|
||||
continue
|
||||
}
|
||||
|
||||
if !worker.Enabled {
|
||||
log.Debugw("skipping disabled worker", "worker", windowRequest.Worker)
|
||||
continue
|
||||
}
|
||||
|
||||
needRes := worker.Info.Resources.ResourceSpec(task.Sector.ProofType, task.TaskType)
|
||||
|
||||
// TODO: allow bigger windows
|
||||
if !windows[wnd].Allocated.CanHandleRequest(needRes, windowRequest.Worker, "schedAcceptable", worker.Info) {
|
||||
continue
|
||||
}
|
||||
|
||||
rpcCtx, cancel := context.WithTimeout(task.Ctx, SelectorTimeout)
|
||||
ok, err := task.Sel.Ok(rpcCtx, task.TaskType, task.Sector.ProofType, worker)
|
||||
cancel()
|
||||
if err != nil {
|
||||
log.Errorf("trySched(1) req.Sel.Ok error: %+v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
acceptableWindows[sqi] = append(acceptableWindows[sqi], wnd)
|
||||
}
|
||||
|
||||
if len(acceptableWindows[sqi]) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Pick best worker (shuffle in case some workers are equally as good)
|
||||
rand.Shuffle(len(acceptableWindows[sqi]), func(i, j int) {
|
||||
acceptableWindows[sqi][i], acceptableWindows[sqi][j] = acceptableWindows[sqi][j], acceptableWindows[sqi][i] // nolint:scopelint
|
||||
})
|
||||
sort.SliceStable(acceptableWindows[sqi], func(i, j int) bool {
|
||||
wii := sh.OpenWindows[acceptableWindows[sqi][i]].Worker // nolint:scopelint
|
||||
wji := sh.OpenWindows[acceptableWindows[sqi][j]].Worker // nolint:scopelint
|
||||
|
||||
if wii == wji {
|
||||
// for the same worker prefer older windows
|
||||
return acceptableWindows[sqi][i] < acceptableWindows[sqi][j] // nolint:scopelint
|
||||
}
|
||||
|
||||
wi := sh.Workers[wii]
|
||||
wj := sh.Workers[wji]
|
||||
|
||||
rpcCtx, cancel := context.WithTimeout(task.Ctx, SelectorTimeout)
|
||||
defer cancel()
|
||||
|
||||
r, err := task.Sel.Cmp(rpcCtx, task.TaskType, wi, wj)
|
||||
if err != nil {
|
||||
log.Errorf("selecting best worker: %s", err)
|
||||
}
|
||||
return r
|
||||
})
|
||||
}(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
log.Debugf("SCHED windows: %+v", windows)
|
||||
log.Debugf("SCHED Acceptable win: %+v", acceptableWindows)
|
||||
|
||||
// Step 2
|
||||
scheduled := 0
|
||||
rmQueue := make([]int, 0, queueLen)
|
||||
workerUtil := map[storiface.WorkerID]float64{}
|
||||
|
||||
for sqi := 0; sqi < queueLen; sqi++ {
|
||||
task := (*sh.SchedQueue)[sqi]
|
||||
|
||||
selectedWindow := -1
|
||||
var needRes storiface.Resources
|
||||
var info storiface.WorkerInfo
|
||||
var bestWid storiface.WorkerID
|
||||
bestUtilization := math.MaxFloat64 // smaller = better
|
||||
|
||||
for i, wnd := range acceptableWindows[task.IndexHeap] {
|
||||
wid := sh.OpenWindows[wnd].Worker
|
||||
w := sh.Workers[wid]
|
||||
|
||||
res := info.Resources.ResourceSpec(task.Sector.ProofType, task.TaskType)
|
||||
|
||||
log.Debugf("SCHED try assign sqi:%d sector %d to window %d (awi:%d)", sqi, task.Sector.ID.Number, wnd, i)
|
||||
|
||||
// TODO: allow bigger windows
|
||||
if !windows[wnd].Allocated.CanHandleRequest(needRes, wid, "schedAssign", info) {
|
||||
continue
|
||||
}
|
||||
|
||||
wu, found := workerUtil[wid]
|
||||
if !found {
|
||||
wu = w.Utilization()
|
||||
workerUtil[wid] = wu
|
||||
}
|
||||
if wu >= bestUtilization {
|
||||
// acceptable worker list is initially sorted by utilization, and the initially-best workers
|
||||
// will be assigned tasks first. This means that if we find a worker which isn't better, it
|
||||
// probably means that the other workers aren't better either.
|
||||
//
|
||||
// utilization
|
||||
// ^
|
||||
// | /
|
||||
// | \ /
|
||||
// | \ /
|
||||
// | *
|
||||
// #--------> acceptableWindow index
|
||||
//
|
||||
// * -> we're here
|
||||
break
|
||||
}
|
||||
|
||||
info = w.Info
|
||||
needRes = res
|
||||
bestWid = wid
|
||||
selectedWindow = wnd
|
||||
bestUtilization = wu
|
||||
}
|
||||
|
||||
if selectedWindow < 0 {
|
||||
// all windows full
|
||||
continue
|
||||
}
|
||||
|
||||
log.Debugw("SCHED ASSIGNED",
|
||||
"sqi", sqi,
|
||||
"sector", task.Sector.ID.Number,
|
||||
"task", task.TaskType,
|
||||
"window", selectedWindow,
|
||||
"worker", bestWid,
|
||||
"utilization", bestUtilization)
|
||||
|
||||
workerUtil[bestWid] += windows[selectedWindow].Allocated.Add(info.Resources, needRes)
|
||||
windows[selectedWindow].Todo = append(windows[selectedWindow].Todo, task)
|
||||
|
||||
rmQueue = append(rmQueue, sqi)
|
||||
scheduled++
|
||||
}
|
||||
|
||||
if len(rmQueue) > 0 {
|
||||
for i := len(rmQueue) - 1; i >= 0; i-- {
|
||||
sh.SchedQueue.Remove(rmQueue[i])
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3
|
||||
|
||||
if scheduled == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
scheduledWindows := map[int]struct{}{}
|
||||
for wnd, window := range windows {
|
||||
if len(window.Todo) == 0 {
|
||||
// Nothing scheduled here, keep the window open
|
||||
continue
|
||||
}
|
||||
|
||||
scheduledWindows[wnd] = struct{}{}
|
||||
|
||||
window := window // copy
|
||||
select {
|
||||
case sh.OpenWindows[wnd].Done <- &window:
|
||||
default:
|
||||
log.Error("expected sh.OpenWindows[wnd].Done to be buffered")
|
||||
}
|
||||
}
|
||||
|
||||
// Rewrite sh.OpenWindows array, removing scheduled windows
|
||||
newOpenWindows := make([]*SchedWindowRequest, 0, windowsLen-len(scheduledWindows))
|
||||
for wnd, window := range sh.OpenWindows {
|
||||
if _, scheduled := scheduledWindows[wnd]; scheduled {
|
||||
// keep unscheduled windows open
|
||||
continue
|
||||
}
|
||||
|
||||
newOpenWindows = append(newOpenWindows, window)
|
||||
}
|
||||
|
||||
sh.OpenWindows = newOpenWindows
|
||||
}
|
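To illustrate the seam this commit opens, here is a minimal sketch (not part of this commit) of an alternative Assigner: a first-fit strategy that drops each queued request into the first open window whose worker is enabled and has capacity. It uses only identifiers introduced or exported above (Assigner, Scheduler, SchedWindow, WorkerRequest, CanHandleRequest, Add); the name firstFitAssigner is hypothetical, and the window submission and queue-trimming bookkeeping that AssignerUtil.TrySched performs is deliberately omitted.

package sectorstorage

// firstFitAssigner is a hypothetical Assigner shown only to illustrate the new
// interface. Like AssignerUtil.TrySched, it runs with sh.workersLk already
// read-locked by Scheduler.trySched.
type firstFitAssigner struct{}

var _ Assigner = &firstFitAssigner{}

func (a *firstFitAssigner) TrySched(sh *Scheduler) {
	// One SchedWindow of pending work per open window request.
	windows := make([]SchedWindow, len(sh.OpenWindows))

	for sqi := 0; sqi < sh.SchedQueue.Len(); sqi++ {
		task := (*sh.SchedQueue)[sqi]

		for wnd, req := range sh.OpenWindows {
			worker, ok := sh.Workers[req.Worker]
			if !ok || !worker.Enabled {
				continue
			}

			// Same resource check AssignerUtil applies to its candidate windows.
			needRes := worker.Info.Resources.ResourceSpec(task.Sector.ProofType, task.TaskType)
			if !windows[wnd].Allocated.CanHandleRequest(needRes, req.Worker, "firstFit", worker.Info) {
				continue
			}

			// First window that fits wins.
			windows[wnd].Allocated.Add(worker.Info.Resources, needRes)
			windows[wnd].Todo = append(windows[wnd].Todo, task)
			break
		}
	}

	// Sending the filled windows on their Done channels, removing assigned
	// requests from sh.SchedQueue, and rewriting sh.OpenWindows is omitted
	// here; AssignerUtil.TrySched above shows that bookkeeping in full.
}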
extern/sector-storage/sched_post.go (vendored): 28 changes
@ -17,7 +17,7 @@ import (
|
||||
|
||||
type poStScheduler struct {
|
||||
lk sync.RWMutex
|
||||
workers map[storiface.WorkerID]*workerHandle
|
||||
workers map[storiface.WorkerID]*WorkerHandle
|
||||
cond *sync.Cond
|
||||
|
||||
postType sealtasks.TaskType
|
||||
@ -25,14 +25,14 @@ type poStScheduler struct {
|
||||
|
||||
func newPoStScheduler(t sealtasks.TaskType) *poStScheduler {
|
||||
ps := &poStScheduler{
|
||||
workers: map[storiface.WorkerID]*workerHandle{},
|
||||
workers: map[storiface.WorkerID]*WorkerHandle{},
|
||||
postType: t,
|
||||
}
|
||||
ps.cond = sync.NewCond(&ps.lk)
|
||||
return ps
|
||||
}
|
||||
|
||||
func (ps *poStScheduler) MaybeAddWorker(wid storiface.WorkerID, tasks map[sealtasks.TaskType]struct{}, w *workerHandle) bool {
|
||||
func (ps *poStScheduler) MaybeAddWorker(wid storiface.WorkerID, tasks map[sealtasks.TaskType]struct{}, w *WorkerHandle) bool {
|
||||
if _, ok := tasks[ps.postType]; !ok {
|
||||
return false
|
||||
}
|
||||
@ -49,10 +49,10 @@ func (ps *poStScheduler) MaybeAddWorker(wid storiface.WorkerID, tasks map[sealta
|
||||
return true
|
||||
}
|
||||
|
||||
func (ps *poStScheduler) delWorker(wid storiface.WorkerID) *workerHandle {
|
||||
func (ps *poStScheduler) delWorker(wid storiface.WorkerID) *WorkerHandle {
|
||||
ps.lk.Lock()
|
||||
defer ps.lk.Unlock()
|
||||
var w *workerHandle = nil
|
||||
var w *WorkerHandle = nil
|
||||
if wh, ok := ps.workers[wid]; ok {
|
||||
w = wh
|
||||
delete(ps.workers, wid)
|
||||
@ -68,7 +68,7 @@ func (ps *poStScheduler) CanSched(ctx context.Context) bool {
|
||||
}
|
||||
|
||||
for _, w := range ps.workers {
|
||||
if w.enabled {
|
||||
if w.Enabled {
|
||||
return true
|
||||
}
|
||||
}
|
||||
@ -105,7 +105,7 @@ func (ps *poStScheduler) Schedule(ctx context.Context, primary bool, spt abi.Reg
|
||||
selected := candidates[0]
|
||||
worker := ps.workers[selected.id]
|
||||
|
||||
return worker.active.withResources(selected.id, worker.info, selected.res, &ps.lk, func() error {
|
||||
return worker.active.withResources(selected.id, worker.Info, selected.res, &ps.lk, func() error {
|
||||
ps.lk.Unlock()
|
||||
defer ps.lk.Lock()
|
||||
|
||||
@ -122,9 +122,9 @@ func (ps *poStScheduler) readyWorkers(spt abi.RegisteredSealProof) (bool, []cand
|
||||
var accepts []candidateWorker
|
||||
//if the gpus of the worker are insufficient or it's disabled, it cannot be scheduled
|
||||
for wid, wr := range ps.workers {
|
||||
needRes := wr.info.Resources.ResourceSpec(spt, ps.postType)
|
||||
needRes := wr.Info.Resources.ResourceSpec(spt, ps.postType)
|
||||
|
||||
if !wr.active.canHandleRequest(needRes, wid, "post-readyWorkers", wr.info) {
|
||||
if !wr.active.CanHandleRequest(needRes, wid, "post-readyWorkers", wr.Info) {
|
||||
continue
|
||||
}
|
||||
|
||||
@ -145,16 +145,16 @@ func (ps *poStScheduler) readyWorkers(spt abi.RegisteredSealProof) (bool, []cand
|
||||
func (ps *poStScheduler) disable(wid storiface.WorkerID) {
|
||||
ps.lk.Lock()
|
||||
defer ps.lk.Unlock()
|
||||
ps.workers[wid].enabled = false
|
||||
ps.workers[wid].Enabled = false
|
||||
}
|
||||
|
||||
func (ps *poStScheduler) enable(wid storiface.WorkerID) {
|
||||
ps.lk.Lock()
|
||||
defer ps.lk.Unlock()
|
||||
ps.workers[wid].enabled = true
|
||||
ps.workers[wid].Enabled = true
|
||||
}
|
||||
|
||||
func (ps *poStScheduler) watch(wid storiface.WorkerID, worker *workerHandle) {
|
||||
func (ps *poStScheduler) watch(wid storiface.WorkerID, worker *WorkerHandle) {
|
||||
heartbeatTimer := time.NewTicker(stores.HeartbeatInterval)
|
||||
defer heartbeatTimer.Stop()
|
||||
|
||||
@ -197,7 +197,7 @@ func (ps *poStScheduler) watch(wid storiface.WorkerID, worker *workerHandle) {
|
||||
}
|
||||
}
|
||||
|
||||
func (ps *poStScheduler) workerCleanup(wid storiface.WorkerID, w *workerHandle) {
|
||||
func (ps *poStScheduler) workerCleanup(wid storiface.WorkerID, w *WorkerHandle) {
|
||||
select {
|
||||
case <-w.closingMgr:
|
||||
default:
|
||||
@ -223,7 +223,7 @@ func (ps *poStScheduler) schedClose() {
|
||||
}
|
||||
}
|
||||
|
||||
func (ps *poStScheduler) WorkerStats(ctx context.Context, cb func(ctx context.Context, wid storiface.WorkerID, worker *workerHandle)) {
|
||||
func (ps *poStScheduler) WorkerStats(ctx context.Context, cb func(ctx context.Context, wid storiface.WorkerID, worker *WorkerHandle)) {
|
||||
ps.lk.RLock()
|
||||
defer ps.lk.RUnlock()
|
||||
for id, w := range ps.workers {
|
||||
|
extern/sector-storage/sched_resources.go (vendored): 20 changes
@ -10,7 +10,7 @@ import (
|
||||
)
|
||||
|
||||
func (a *activeResources) withResources(id storiface.WorkerID, wr storiface.WorkerInfo, r storiface.Resources, locker sync.Locker, cb func() error) error {
|
||||
for !a.canHandleRequest(r, id, "withResources", wr) {
|
||||
for !a.CanHandleRequest(r, id, "withResources", wr) {
|
||||
if a.cond == nil {
|
||||
a.cond = sync.NewCond(locker)
|
||||
}
|
||||
@ -19,7 +19,7 @@ func (a *activeResources) withResources(id storiface.WorkerID, wr storiface.Work
|
||||
a.waiting--
|
||||
}
|
||||
|
||||
a.add(wr.Resources, r)
|
||||
a.Add(wr.Resources, r)
|
||||
|
||||
err := cb()
|
||||
|
||||
@ -34,7 +34,7 @@ func (a *activeResources) hasWorkWaiting() bool {
|
||||
}
|
||||
|
||||
// add task resources to activeResources and return utilization difference
|
||||
func (a *activeResources) add(wr storiface.WorkerResources, r storiface.Resources) float64 {
|
||||
func (a *activeResources) Add(wr storiface.WorkerResources, r storiface.Resources) float64 {
|
||||
startUtil := a.utilization(wr)
|
||||
|
||||
if r.GPUUtilization > 0 {
|
||||
@ -60,9 +60,9 @@ func (a *activeResources) free(wr storiface.WorkerResources, r storiface.Resourc
|
||||
}
|
||||
}
|
||||
|
||||
// canHandleRequest evaluates if the worker has enough available resources to
|
||||
// CanHandleRequest evaluates if the worker has enough available resources to
|
||||
// handle the request.
|
||||
func (a *activeResources) canHandleRequest(needRes storiface.Resources, wid storiface.WorkerID, caller string, info storiface.WorkerInfo) bool {
|
||||
func (a *activeResources) CanHandleRequest(needRes storiface.Resources, wid storiface.WorkerID, caller string, info storiface.WorkerInfo) bool {
|
||||
if info.IgnoreResources {
|
||||
// shortcircuit; if this worker is ignoring resources, it can always handle the request.
|
||||
return true
|
||||
@ -145,14 +145,14 @@ func (a *activeResources) utilization(wr storiface.WorkerResources) float64 {
|
||||
return max
|
||||
}
|
||||
|
||||
func (wh *workerHandle) utilization() float64 {
|
||||
func (wh *WorkerHandle) Utilization() float64 {
|
||||
wh.lk.Lock()
|
||||
u := wh.active.utilization(wh.info.Resources)
|
||||
u += wh.preparing.utilization(wh.info.Resources)
|
||||
u := wh.active.utilization(wh.Info.Resources)
|
||||
u += wh.preparing.utilization(wh.Info.Resources)
|
||||
wh.lk.Unlock()
|
||||
wh.wndLk.Lock()
|
||||
for _, window := range wh.activeWindows {
|
||||
u += window.allocated.utilization(wh.info.Resources)
|
||||
u += window.Allocated.utilization(wh.Info.Resources)
|
||||
}
|
||||
wh.wndLk.Unlock()
|
||||
|
||||
@ -161,7 +161,7 @@ func (wh *workerHandle) utilization() float64 {
|
||||
|
||||
var tasksCacheTimeout = 30 * time.Second
|
||||
|
||||
func (wh *workerHandle) TaskTypes(ctx context.Context) (t map[sealtasks.TaskType]struct{}, err error) {
|
||||
func (wh *WorkerHandle) TaskTypes(ctx context.Context) (t map[sealtasks.TaskType]struct{}, err error) {
|
||||
wh.tasksLk.Lock()
|
||||
defer wh.tasksLk.Unlock()
|
||||
|
||||
|
extern/sector-storage/sched_test.go (vendored): 66 changes
@ -183,7 +183,7 @@ func (s *schedTestWorker) Close() error {
|
||||
|
||||
var _ Worker = &schedTestWorker{}
|
||||
|
||||
func addTestWorker(t *testing.T, sched *scheduler, index *stores.Index, name string, taskTypes map[sealtasks.TaskType]struct{}, resources storiface.WorkerResources, ignoreResources bool) {
|
||||
func addTestWorker(t *testing.T, sched *Scheduler, index *stores.Index, name string, taskTypes map[sealtasks.TaskType]struct{}, resources storiface.WorkerResources, ignoreResources bool) {
|
||||
w := &schedTestWorker{
|
||||
name: name,
|
||||
taskTypes: taskTypes,
|
||||
@ -259,13 +259,13 @@ func TestSched(t *testing.T) {
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
type task func(*testing.T, *scheduler, *stores.Index, *runMeta)
|
||||
type task func(*testing.T, *Scheduler, *stores.Index, *runMeta)
|
||||
|
||||
sched := func(taskName, expectWorker string, sid abi.SectorNumber, taskType sealtasks.TaskType) task {
|
||||
_, _, l, _ := runtime.Caller(1)
|
||||
_, _, l2, _ := runtime.Caller(2)
|
||||
|
||||
return func(t *testing.T, sched *scheduler, index *stores.Index, rm *runMeta) {
|
||||
return func(t *testing.T, sched *Scheduler, index *stores.Index, rm *runMeta) {
|
||||
done := make(chan struct{})
|
||||
rm.done[taskName] = done
|
||||
|
||||
@ -314,7 +314,7 @@ func TestSched(t *testing.T) {
|
||||
taskStarted := func(name string) task {
|
||||
_, _, l, _ := runtime.Caller(1)
|
||||
_, _, l2, _ := runtime.Caller(2)
|
||||
return func(t *testing.T, sched *scheduler, index *stores.Index, rm *runMeta) {
|
||||
return func(t *testing.T, sched *Scheduler, index *stores.Index, rm *runMeta) {
|
||||
select {
|
||||
case rm.done[name] <- struct{}{}:
|
||||
case <-ctx.Done():
|
||||
@ -326,7 +326,7 @@ func TestSched(t *testing.T) {
|
||||
taskDone := func(name string) task {
|
||||
_, _, l, _ := runtime.Caller(1)
|
||||
_, _, l2, _ := runtime.Caller(2)
|
||||
return func(t *testing.T, sched *scheduler, index *stores.Index, rm *runMeta) {
|
||||
return func(t *testing.T, sched *Scheduler, index *stores.Index, rm *runMeta) {
|
||||
select {
|
||||
case rm.done[name] <- struct{}{}:
|
||||
case <-ctx.Done():
|
||||
@ -339,7 +339,7 @@ func TestSched(t *testing.T) {
|
||||
taskNotScheduled := func(name string) task {
|
||||
_, _, l, _ := runtime.Caller(1)
|
||||
_, _, l2, _ := runtime.Caller(2)
|
||||
return func(t *testing.T, sched *scheduler, index *stores.Index, rm *runMeta) {
|
||||
return func(t *testing.T, sched *Scheduler, index *stores.Index, rm *runMeta) {
|
||||
select {
|
||||
case rm.done[name] <- struct{}{}:
|
||||
t.Fatal("not expected", l, l2)
|
||||
@ -378,7 +378,7 @@ func TestSched(t *testing.T) {
|
||||
}
|
||||
|
||||
multTask := func(tasks ...task) task {
|
||||
return func(t *testing.T, s *scheduler, index *stores.Index, meta *runMeta) {
|
||||
return func(t *testing.T, s *Scheduler, index *stores.Index, meta *runMeta) {
|
||||
for _, tsk := range tasks {
|
||||
tsk(t, s, index, meta)
|
||||
}
|
||||
@ -492,7 +492,7 @@ func TestSched(t *testing.T) {
|
||||
}
|
||||
|
||||
diag := func() task {
|
||||
return func(t *testing.T, s *scheduler, index *stores.Index, meta *runMeta) {
|
||||
return func(t *testing.T, s *Scheduler, index *stores.Index, meta *runMeta) {
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
for _, request := range s.diag().Requests {
|
||||
log.Infof("!!! sDIAG: sid(%d) task(%s)", request.Sector.Number, request.TaskType)
|
||||
@ -582,12 +582,12 @@ func TestSched(t *testing.T) {
|
||||
|
||||
type slowishSelector bool
|
||||
|
||||
func (s slowishSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, a *workerHandle) (bool, error) {
|
||||
func (s slowishSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, a *WorkerHandle) (bool, error) {
|
||||
time.Sleep(200 * time.Microsecond)
|
||||
return bool(s), nil
|
||||
}
|
||||
|
||||
func (s slowishSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *workerHandle) (bool, error) {
|
||||
func (s slowishSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {
|
||||
time.Sleep(100 * time.Microsecond)
|
||||
return true, nil
|
||||
}
|
||||
@ -605,9 +605,9 @@ func BenchmarkTrySched(b *testing.B) {
|
||||
b.StopTimer()
|
||||
|
||||
sched := newScheduler()
|
||||
sched.workers[storiface.WorkerID{}] = &workerHandle{
|
||||
sched.Workers[storiface.WorkerID{}] = &WorkerHandle{
|
||||
workerRpc: nil,
|
||||
info: storiface.WorkerInfo{
|
||||
Info: storiface.WorkerInfo{
|
||||
Hostname: "t",
|
||||
Resources: decentWorkerResources,
|
||||
},
|
||||
@ -616,17 +616,17 @@ func BenchmarkTrySched(b *testing.B) {
|
||||
}
|
||||
|
||||
for i := 0; i < windows; i++ {
|
||||
sched.openWindows = append(sched.openWindows, &schedWindowRequest{
|
||||
worker: storiface.WorkerID{},
|
||||
done: make(chan *schedWindow, 1000),
|
||||
sched.OpenWindows = append(sched.OpenWindows, &SchedWindowRequest{
|
||||
Worker: storiface.WorkerID{},
|
||||
Done: make(chan *SchedWindow, 1000),
|
||||
})
|
||||
}
|
||||
|
||||
for i := 0; i < queue; i++ {
|
||||
sched.schedQueue.Push(&workerRequest{
|
||||
taskType: sealtasks.TTCommit2,
|
||||
sel: slowishSelector(true),
|
||||
ctx: ctx,
|
||||
sched.SchedQueue.Push(&WorkerRequest{
|
||||
TaskType: sealtasks.TTCommit2,
|
||||
Sel: slowishSelector(true),
|
||||
Ctx: ctx,
|
||||
})
|
||||
}
|
||||
|
||||
@ -644,26 +644,26 @@ func BenchmarkTrySched(b *testing.B) {
|
||||
}
|
||||
|
||||
func TestWindowCompact(t *testing.T) {
|
||||
sh := scheduler{}
|
||||
sh := Scheduler{}
|
||||
spt := abi.RegisteredSealProof_StackedDrg32GiBV1
|
||||
|
||||
test := func(start [][]sealtasks.TaskType, expect [][]sealtasks.TaskType) func(t *testing.T) {
|
||||
return func(t *testing.T) {
|
||||
wh := &workerHandle{
|
||||
info: storiface.WorkerInfo{
|
||||
wh := &WorkerHandle{
|
||||
Info: storiface.WorkerInfo{
|
||||
Resources: decentWorkerResources,
|
||||
},
|
||||
}
|
||||
|
||||
for _, windowTasks := range start {
|
||||
window := &schedWindow{}
|
||||
window := &SchedWindow{}
|
||||
|
||||
for _, task := range windowTasks {
|
||||
window.todo = append(window.todo, &workerRequest{
|
||||
taskType: task,
|
||||
sector: storage.SectorRef{ProofType: spt},
|
||||
window.Todo = append(window.Todo, &WorkerRequest{
|
||||
TaskType: task,
|
||||
Sector: storage.SectorRef{ProofType: spt},
|
||||
})
|
||||
window.allocated.add(wh.info.Resources, storiface.ResourceTable[task][spt])
|
||||
window.Allocated.Add(wh.Info.Resources, storiface.ResourceTable[task][spt])
|
||||
}
|
||||
|
||||
wh.activeWindows = append(wh.activeWindows, window)
|
||||
@ -681,14 +681,14 @@ func TestWindowCompact(t *testing.T) {
|
||||
var expectRes activeResources
|
||||
|
||||
for ti, task := range tasks {
|
||||
require.Equal(t, task, wh.activeWindows[wi].todo[ti].taskType, "%d, %d", wi, ti)
|
||||
expectRes.add(wh.info.Resources, storiface.ResourceTable[task][spt])
|
||||
require.Equal(t, task, wh.activeWindows[wi].Todo[ti].TaskType, "%d, %d", wi, ti)
|
||||
expectRes.Add(wh.Info.Resources, storiface.ResourceTable[task][spt])
|
||||
}
|
||||
|
||||
require.Equal(t, expectRes.cpuUse, wh.activeWindows[wi].allocated.cpuUse, "%d", wi)
|
||||
require.Equal(t, expectRes.gpuUsed, wh.activeWindows[wi].allocated.gpuUsed, "%d", wi)
|
||||
require.Equal(t, expectRes.memUsedMin, wh.activeWindows[wi].allocated.memUsedMin, "%d", wi)
|
||||
require.Equal(t, expectRes.memUsedMax, wh.activeWindows[wi].allocated.memUsedMax, "%d", wi)
|
||||
require.Equal(t, expectRes.cpuUse, wh.activeWindows[wi].Allocated.cpuUse, "%d", wi)
|
||||
require.Equal(t, expectRes.gpuUsed, wh.activeWindows[wi].Allocated.gpuUsed, "%d", wi)
|
||||
require.Equal(t, expectRes.memUsedMin, wh.activeWindows[wi].Allocated.memUsedMin, "%d", wi)
|
||||
require.Equal(t, expectRes.memUsedMax, wh.activeWindows[wi].Allocated.memUsedMax, "%d", wi)
|
||||
}
|
||||
|
||||
}
|
||||
|
extern/sector-storage/sched_worker.go (vendored): 144 changes
@ -12,31 +12,31 @@ import (
|
||||
)
|
||||
|
||||
type schedWorker struct {
|
||||
sched *scheduler
|
||||
worker *workerHandle
|
||||
sched *Scheduler
|
||||
worker *WorkerHandle
|
||||
|
||||
wid storiface.WorkerID
|
||||
|
||||
heartbeatTimer *time.Ticker
|
||||
scheduledWindows chan *schedWindow
|
||||
scheduledWindows chan *SchedWindow
|
||||
taskDone chan struct{}
|
||||
|
||||
windowsRequested int
|
||||
}
|
||||
|
||||
func newWorkerHandle(ctx context.Context, w Worker) (*workerHandle, error) {
|
||||
func newWorkerHandle(ctx context.Context, w Worker) (*WorkerHandle, error) {
|
||||
info, err := w.Info(ctx)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("getting worker info: %w", err)
|
||||
}
|
||||
|
||||
worker := &workerHandle{
|
||||
worker := &WorkerHandle{
|
||||
workerRpc: w,
|
||||
info: info,
|
||||
Info: info,
|
||||
|
||||
preparing: &activeResources{},
|
||||
active: &activeResources{},
|
||||
enabled: true,
|
||||
Enabled: true,
|
||||
|
||||
closingMgr: make(chan struct{}),
|
||||
closedMgr: make(chan struct{}),
|
||||
@ -46,9 +46,9 @@ func newWorkerHandle(ctx context.Context, w Worker) (*workerHandle, error) {
|
||||
}
|
||||
|
||||
// context only used for startup
|
||||
func (sh *scheduler) runWorker(ctx context.Context, wid storiface.WorkerID, worker *workerHandle) error {
|
||||
func (sh *Scheduler) runWorker(ctx context.Context, wid storiface.WorkerID, worker *WorkerHandle) error {
|
||||
sh.workersLk.Lock()
|
||||
_, exist := sh.workers[wid]
|
||||
_, exist := sh.Workers[wid]
|
||||
if exist {
|
||||
log.Warnw("duplicated worker added", "id", wid)
|
||||
|
||||
@ -57,7 +57,7 @@ func (sh *scheduler) runWorker(ctx context.Context, wid storiface.WorkerID, work
|
||||
return nil
|
||||
}
|
||||
|
||||
sh.workers[wid] = worker
|
||||
sh.Workers[wid] = worker
|
||||
sh.workersLk.Unlock()
|
||||
|
||||
sw := &schedWorker{
|
||||
@ -67,7 +67,7 @@ func (sh *scheduler) runWorker(ctx context.Context, wid storiface.WorkerID, work
|
||||
wid: wid,
|
||||
|
||||
heartbeatTimer: time.NewTicker(stores.HeartbeatInterval),
|
||||
scheduledWindows: make(chan *schedWindow, SchedWindows),
|
||||
scheduledWindows: make(chan *SchedWindow, SchedWindows),
|
||||
taskDone: make(chan struct{}, 1),
|
||||
|
||||
windowsRequested: 0,
|
||||
@ -94,7 +94,7 @@ func (sw *schedWorker) handleWorker() {
|
||||
}
|
||||
|
||||
sched.workersLk.Lock()
|
||||
delete(sched.workers, sw.wid)
|
||||
delete(sched.Workers, sw.wid)
|
||||
sched.workersLk.Unlock()
|
||||
}()
|
||||
|
||||
@ -103,7 +103,7 @@ func (sw *schedWorker) handleWorker() {
|
||||
for {
|
||||
{
|
||||
sched.workersLk.Lock()
|
||||
enabled := worker.enabled
|
||||
enabled := worker.Enabled
|
||||
sched.workersLk.Unlock()
|
||||
|
||||
// ask for more windows if we need them (non-blocking)
|
||||
@ -124,8 +124,8 @@ func (sw *schedWorker) handleWorker() {
|
||||
// session looks good
|
||||
{
|
||||
sched.workersLk.Lock()
|
||||
enabled := worker.enabled
|
||||
worker.enabled = true
|
||||
enabled := worker.Enabled
|
||||
worker.Enabled = true
|
||||
sched.workersLk.Unlock()
|
||||
|
||||
if !enabled {
|
||||
@ -248,9 +248,9 @@ func (sw *schedWorker) checkSession(ctx context.Context) bool {
|
||||
func (sw *schedWorker) requestWindows() bool {
|
||||
for ; sw.windowsRequested < SchedWindows; sw.windowsRequested++ {
|
||||
select {
|
||||
case sw.sched.windowRequests <- &schedWindowRequest{
|
||||
worker: sw.wid,
|
||||
done: sw.scheduledWindows,
|
||||
case sw.sched.windowRequests <- &SchedWindowRequest{
|
||||
Worker: sw.wid,
|
||||
Done: sw.scheduledWindows,
|
||||
}:
|
||||
case <-sw.sched.closing:
|
||||
return false
|
||||
@ -290,21 +290,21 @@ func (sw *schedWorker) workerCompactWindows() {
|
||||
lower := worker.activeWindows[wi]
|
||||
var moved []int
|
||||
|
||||
for ti, todo := range window.todo {
|
||||
needRes := worker.info.Resources.ResourceSpec(todo.sector.ProofType, todo.taskType)
|
||||
if !lower.allocated.canHandleRequest(needRes, sw.wid, "compactWindows", worker.info) {
|
||||
for ti, todo := range window.Todo {
|
||||
needRes := worker.Info.Resources.ResourceSpec(todo.Sector.ProofType, todo.TaskType)
|
||||
if !lower.Allocated.CanHandleRequest(needRes, sw.wid, "compactWindows", worker.Info) {
|
||||
continue
|
||||
}
|
||||
|
||||
moved = append(moved, ti)
|
||||
lower.todo = append(lower.todo, todo)
|
||||
lower.allocated.add(worker.info.Resources, needRes)
|
||||
window.allocated.free(worker.info.Resources, needRes)
|
||||
lower.Todo = append(lower.Todo, todo)
|
||||
lower.Allocated.Add(worker.Info.Resources, needRes)
|
||||
window.Allocated.free(worker.Info.Resources, needRes)
|
||||
}
|
||||
|
||||
if len(moved) > 0 {
|
||||
newTodo := make([]*workerRequest, 0, len(window.todo)-len(moved))
|
||||
for i, t := range window.todo {
|
||||
newTodo := make([]*WorkerRequest, 0, len(window.Todo)-len(moved))
|
||||
for i, t := range window.Todo {
|
||||
if len(moved) > 0 && moved[0] == i {
|
||||
moved = moved[1:]
|
||||
continue
|
||||
@ -312,16 +312,16 @@ func (sw *schedWorker) workerCompactWindows() {
|
||||
|
||||
newTodo = append(newTodo, t)
|
||||
}
|
||||
window.todo = newTodo
|
||||
window.Todo = newTodo
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var compacted int
|
||||
var newWindows []*schedWindow
|
||||
var newWindows []*SchedWindow
|
||||
|
||||
for _, window := range worker.activeWindows {
|
||||
if len(window.todo) == 0 {
|
||||
if len(window.Todo) == 0 {
|
||||
compacted++
|
||||
continue
|
||||
}
|
||||
@ -347,13 +347,13 @@ assignLoop:
|
||||
firstWindow := worker.activeWindows[0]
|
||||
|
||||
// process tasks within a window, preferring tasks at lower indexes
|
||||
for len(firstWindow.todo) > 0 {
|
||||
for len(firstWindow.Todo) > 0 {
|
||||
tidx := -1
|
||||
|
||||
worker.lk.Lock()
|
||||
for t, todo := range firstWindow.todo {
|
||||
needRes := worker.info.Resources.ResourceSpec(todo.sector.ProofType, todo.taskType)
|
||||
if worker.preparing.canHandleRequest(needRes, sw.wid, "startPreparing", worker.info) {
|
||||
for t, todo := range firstWindow.Todo {
|
||||
needRes := worker.Info.Resources.ResourceSpec(todo.Sector.ProofType, todo.TaskType)
if worker.preparing.CanHandleRequest(needRes, sw.wid, "startPreparing", worker.Info) {
tidx = t
break
}
@ -364,9 +364,9 @@ assignLoop:
break assignLoop
}

todo := firstWindow.todo[tidx]
todo := firstWindow.Todo[tidx]

log.Debugf("assign worker sector %d", todo.sector.ID.Number)
log.Debugf("assign worker sector %d", todo.Sector.ID.Number)
err := sw.startProcessingTask(todo)

if err != nil {
@ -375,9 +375,9 @@ assignLoop:
}

// Note: we're not freeing window.allocated resources here very much on purpose
copy(firstWindow.todo[tidx:], firstWindow.todo[tidx+1:])
firstWindow.todo[len(firstWindow.todo)-1] = nil
firstWindow.todo = firstWindow.todo[:len(firstWindow.todo)-1]
copy(firstWindow.Todo[tidx:], firstWindow.Todo[tidx+1:])
firstWindow.Todo[len(firstWindow.Todo)-1] = nil
firstWindow.Todo = firstWindow.Todo[:len(firstWindow.Todo)-1]
}

copy(worker.activeWindows, worker.activeWindows[1:])
@ -405,16 +405,16 @@ assignLoop:
firstWindow := worker.activeWindows[0]

// process tasks within a window, preferring tasks at lower indexes
for len(firstWindow.todo) > 0 {
for len(firstWindow.Todo) > 0 {
tidx := -1

for t, todo := range firstWindow.todo {
if todo.taskType != sealtasks.TTCommit1 && todo.taskType != sealtasks.TTCommit2 { // todo put in task
for t, todo := range firstWindow.Todo {
if todo.TaskType != sealtasks.TTCommit1 && todo.TaskType != sealtasks.TTCommit2 { // todo put in task
continue
}

needRes := storiface.ResourceTable[todo.taskType][todo.sector.ProofType]
if worker.active.canHandleRequest(needRes, sw.wid, "startPreparing", worker.info) {
needRes := storiface.ResourceTable[todo.TaskType][todo.Sector.ProofType]
if worker.active.CanHandleRequest(needRes, sw.wid, "startPreparing", worker.Info) {
tidx = t
break
}
@ -424,9 +424,9 @@ assignLoop:
break assignLoop
}

todo := firstWindow.todo[tidx]
todo := firstWindow.Todo[tidx]

log.Debugf("assign worker sector %d (ready)", todo.sector.ID.Number)
log.Debugf("assign worker sector %d (ready)", todo.Sector.ID.Number)
err := sw.startProcessingReadyTask(todo)

if err != nil {
@ -435,9 +435,9 @@ assignLoop:
}

// Note: we're not freeing window.allocated resources here very much on purpose
copy(firstWindow.todo[tidx:], firstWindow.todo[tidx+1:])
firstWindow.todo[len(firstWindow.todo)-1] = nil
firstWindow.todo = firstWindow.todo[:len(firstWindow.todo)-1]
copy(firstWindow.Todo[tidx:], firstWindow.Todo[tidx+1:])
firstWindow.Todo[len(firstWindow.Todo)-1] = nil
firstWindow.Todo = firstWindow.Todo[:len(firstWindow.Todo)-1]
}

copy(worker.activeWindows, worker.activeWindows[1:])
@ -448,24 +448,24 @@ assignLoop:
}
}

func (sw *schedWorker) startProcessingTask(req *workerRequest) error {
func (sw *schedWorker) startProcessingTask(req *WorkerRequest) error {
w, sh := sw.worker, sw.sched

needRes := w.info.Resources.ResourceSpec(req.sector.ProofType, req.taskType)
needRes := w.Info.Resources.ResourceSpec(req.Sector.ProofType, req.TaskType)

w.lk.Lock()
w.preparing.add(w.info.Resources, needRes)
w.preparing.Add(w.Info.Resources, needRes)
w.lk.Unlock()

go func() {
// first run the prepare step (e.g. fetching sector data from other worker)
tw := sh.workTracker.worker(sw.wid, w.info, w.workerRpc)
tw := sh.workTracker.worker(sw.wid, w.Info, w.workerRpc)
tw.start()
err := req.prepare(req.ctx, tw)
err := req.prepare(req.Ctx, tw)
w.lk.Lock()

if err != nil {
w.preparing.free(w.info.Resources, needRes)
w.preparing.free(w.Info.Resources, needRes)
w.lk.Unlock()

select {
@ -477,7 +477,7 @@ func (sw *schedWorker) startProcessingTask(req *workerRequest) error {

select {
case req.ret <- workerResponse{err: err}:
case <-req.ctx.Done():
case <-req.Ctx.Done():
log.Warnf("request got cancelled before we could respond (prepare error: %+v)", err)
case <-sh.closing:
log.Warnf("scheduler closed while sending response (prepare error: %+v)", err)
@ -485,17 +485,17 @@ func (sw *schedWorker) startProcessingTask(req *workerRequest) error {
return
}

tw = sh.workTracker.worker(sw.wid, w.info, w.workerRpc)
tw = sh.workTracker.worker(sw.wid, w.Info, w.workerRpc)

// start tracking work first early in case we need to wait for resources
werr := make(chan error, 1)
go func() {
werr <- req.work(req.ctx, tw)
werr <- req.work(req.Ctx, tw)
}()

// wait (if needed) for resources in the 'active' window
err = w.active.withResources(sw.wid, w.info, needRes, &w.lk, func() error {
w.preparing.free(w.info.Resources, needRes)
err = w.active.withResources(sw.wid, w.Info, needRes, &w.lk, func() error {
w.preparing.free(w.Info.Resources, needRes)
w.lk.Unlock()
defer w.lk.Lock() // we MUST return locked from this function

@ -511,7 +511,7 @@ func (sw *schedWorker) startProcessingTask(req *workerRequest) error {

select {
case req.ret <- workerResponse{err: err}:
case <-req.ctx.Done():
case <-req.Ctx.Done():
log.Warnf("request got cancelled before we could respond")
case <-sh.closing:
log.Warnf("scheduler closed while sending response")
@ -531,22 +531,22 @@ func (sw *schedWorker) startProcessingTask(req *workerRequest) error {
return nil
}

func (sw *schedWorker) startProcessingReadyTask(req *workerRequest) error {
func (sw *schedWorker) startProcessingReadyTask(req *WorkerRequest) error {
w, sh := sw.worker, sw.sched

needRes := w.info.Resources.ResourceSpec(req.sector.ProofType, req.taskType)
needRes := w.Info.Resources.ResourceSpec(req.Sector.ProofType, req.TaskType)

w.active.add(w.info.Resources, needRes)
w.active.Add(w.Info.Resources, needRes)

go func() {
// Do the work!
tw := sh.workTracker.worker(sw.wid, w.info, w.workerRpc)
tw := sh.workTracker.worker(sw.wid, w.Info, w.workerRpc)
tw.start()
err := req.work(req.ctx, tw)
err := req.work(req.Ctx, tw)

select {
case req.ret <- workerResponse{err: err}:
case <-req.ctx.Done():
case <-req.Ctx.Done():
log.Warnf("request got cancelled before we could respond")
case <-sh.closing:
log.Warnf("scheduler closed while sending response")
@ -554,7 +554,7 @@ func (sw *schedWorker) startProcessingReadyTask(req *workerRequest) error {

w.lk.Lock()

w.active.free(w.info.Resources, needRes)
w.active.free(w.Info.Resources, needRes)

select {
case sw.taskDone <- struct{}{}:
@ -574,7 +574,7 @@ func (sw *schedWorker) startProcessingReadyTask(req *workerRequest) error {
return nil
}

func (sh *scheduler) workerCleanup(wid storiface.WorkerID, w *workerHandle) {
func (sh *Scheduler) workerCleanup(wid storiface.WorkerID, w *WorkerHandle) {
select {
case <-w.closingMgr:
default:
@ -592,13 +592,13 @@ func (sh *scheduler) workerCleanup(wid storiface.WorkerID, w *workerHandle) {
if !w.cleanupStarted {
w.cleanupStarted = true

newWindows := make([]*schedWindowRequest, 0, len(sh.openWindows))
for _, window := range sh.openWindows {
if window.worker != wid {
newWindows := make([]*SchedWindowRequest, 0, len(sh.OpenWindows))
for _, window := range sh.OpenWindows {
if window.Worker != wid {
newWindows = append(newWindows, window)
}
}
sh.openWindows = newWindows
sh.OpenWindows = newWindows

log.Debugf("worker %s dropped", wid)
}
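
The three-line removal applied to firstWindow.Todo above is Go's standard order-preserving slice delete; clearing the vacated tail slot lets the removed *WorkerRequest be garbage-collected. A standalone sketch of the same idiom, with hypothetical names (removeAt is illustrative, not part of this commit):

package main

import "fmt"

// removeAt deletes the element at index i while keeping the remaining order,
// mirroring the copy/clear/shrink sequence used on firstWindow.Todo.
func removeAt(todo []string, i int) []string {
	copy(todo[i:], todo[i+1:]) // shift later entries one slot left
	todo[len(todo)-1] = ""     // clear the duplicated tail slot (nil in the pointer-slice case)
	return todo[:len(todo)-1]  // shrink the slice by one
}

func main() {
	todo := []string{"AddPiece", "PreCommit1", "PreCommit2"}
	fmt.Println(removeAt(todo, 1)) // [AddPiece PreCommit2]
}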
6
extern/sector-storage/selector_alloc.go
vendored
6
extern/sector-storage/selector_alloc.go
vendored
@ -26,7 +26,7 @@ func newAllocSelector(index stores.SectorIndex, alloc storiface.SectorFileType,
}
}

func (s *allocSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *workerHandle) (bool, error) {
func (s *allocSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, error) {
tasks, err := whnd.TaskTypes(ctx)
if err != nil {
return false, xerrors.Errorf("getting supported worker task types: %w", err)
@ -64,8 +64,8 @@ func (s *allocSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi
return false, nil
}

func (s *allocSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *workerHandle) (bool, error) {
return a.utilization() < b.utilization(), nil
func (s *allocSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {
return a.Utilization() < b.Utilization(), nil
}

var _ WorkerSelector = &allocSelector{}
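
The var _ WorkerSelector = &allocSelector{} assertion above, together with the new Ok/Cmp signatures, implies that the selector interface now deals in the exported *WorkerHandle. A sketch of that interface as reconstructed from these signatures (an assumption; the actual definition lives elsewhere in the scheduler and may differ in doc comments or method set). The same two signatures recur in selector_existing.go and selector_task.go below.

package sectorstorage

import (
	"context"

	"github.com/filecoin-project/go-state-types/abi"

	"github.com/filecoin-project/lotus/extern/sector-storage/sealtasks"
)

// WorkerSelector, as implied by the selector methods in this diff (assumed shape).
type WorkerSelector interface {
	// Ok reports whether the worker behind whnd can run the given task for the given proof type.
	Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, error)

	// Cmp reports whether worker a should be preferred over worker b for the task.
	Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error)
}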
6
extern/sector-storage/selector_existing.go
vendored
6
extern/sector-storage/selector_existing.go
vendored
@ -28,7 +28,7 @@ func newExistingSelector(index stores.SectorIndex, sector abi.SectorID, alloc st
}
}

func (s *existingSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *workerHandle) (bool, error) {
func (s *existingSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, error) {
tasks, err := whnd.TaskTypes(ctx)
if err != nil {
return false, xerrors.Errorf("getting supported worker task types: %w", err)
@ -66,8 +66,8 @@ func (s *existingSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt
return false, nil
}

func (s *existingSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *workerHandle) (bool, error) {
return a.utilization() < b.utilization(), nil
func (s *existingSelector) Cmp(ctx context.Context, task sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {
return a.Utilization() < b.Utilization(), nil
}

var _ WorkerSelector = &existingSelector{}
6
extern/sector-storage/selector_task.go
vendored
6
extern/sector-storage/selector_task.go
vendored
@ -19,7 +19,7 @@ func newTaskSelector() *taskSelector {
return &taskSelector{}
}

func (s *taskSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *workerHandle) (bool, error) {
func (s *taskSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, error) {
tasks, err := whnd.TaskTypes(ctx)
if err != nil {
return false, xerrors.Errorf("getting supported worker task types: %w", err)
@ -29,7 +29,7 @@ func (s *taskSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.
return supported, nil
}

func (s *taskSelector) Cmp(ctx context.Context, _ sealtasks.TaskType, a, b *workerHandle) (bool, error) {
func (s *taskSelector) Cmp(ctx context.Context, _ sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {
atasks, err := a.TaskTypes(ctx)
if err != nil {
return false, xerrors.Errorf("getting supported worker task types: %w", err)
@ -43,7 +43,7 @@ func (s *taskSelector) Cmp(ctx context.Context, _ sealtasks.TaskType, a, b *work
return len(atasks) < len(btasks), nil // prefer workers which can do less
}

return a.utilization() < b.utilization(), nil
return a.Utilization() < b.Utilization(), nil
}

var _ WorkerSelector = &taskSelector{}
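
taskSelector's Cmp above prefers the worker that supports fewer task types and only then falls back to utilization. Any selector following the same two-method shape can now be written against the exported handle; below is a hypothetical sketch (idleFirstSelector is illustrative only, assumed to live in the sectorstorage package, and it assumes TaskTypes returns a set keyed by task type, as the selectors above suggest):

package sectorstorage

import (
	"context"

	"golang.org/x/xerrors"

	"github.com/filecoin-project/go-state-types/abi"

	"github.com/filecoin-project/lotus/extern/sector-storage/sealtasks"
)

// idleFirstSelector picks any worker that supports the task, preferring the less utilized one.
type idleFirstSelector struct{}

func (s *idleFirstSelector) Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, whnd *WorkerHandle) (bool, error) {
	tasks, err := whnd.TaskTypes(ctx)
	if err != nil {
		return false, xerrors.Errorf("getting supported worker task types: %w", err)
	}
	_, supported := tasks[task]
	return supported, nil
}

func (s *idleFirstSelector) Cmp(ctx context.Context, _ sealtasks.TaskType, a, b *WorkerHandle) (bool, error) {
	// Ignore task-type counts; simply prefer the less utilized worker.
	return a.Utilization() < b.Utilization(), nil
}

var _ WorkerSelector = &idleFirstSelector{}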
16
extern/sector-storage/stats.go
vendored
16
extern/sector-storage/stats.go
vendored
@ -15,7 +15,7 @@ func (m *Manager) WorkerStats(ctx context.Context) map[uuid.UUID]storiface.Worke

out := map[uuid.UUID]storiface.WorkerStats{}

cb := func(ctx context.Context, id storiface.WorkerID, handle *workerHandle) {
cb := func(ctx context.Context, id storiface.WorkerID, handle *WorkerHandle) {
handle.lk.Lock()

ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
@ -32,9 +32,9 @@ func (m *Manager) WorkerStats(ctx context.Context) map[uuid.UUID]storiface.Worke
}

out[uuid.UUID(id)] = storiface.WorkerStats{
Info: handle.info,
Info: handle.Info,
Tasks: taskList,
Enabled: handle.enabled,
Enabled: handle.Enabled,
MemUsedMin: handle.active.memUsedMin,
MemUsedMax: handle.active.memUsedMax,
GpuUsed: handle.active.gpuUsed,
@ -43,7 +43,7 @@ func (m *Manager) WorkerStats(ctx context.Context) map[uuid.UUID]storiface.Worke
handle.lk.Unlock()
}

for id, handle := range m.sched.workers {
for id, handle := range m.sched.Workers {
cb(ctx, id, handle)
}

@ -72,14 +72,14 @@ func (m *Manager) WorkerJobs() map[uuid.UUID][]storiface.WorkerJob {

m.sched.workersLk.RLock()

for id, handle := range m.sched.workers {
for id, handle := range m.sched.Workers {
handle.wndLk.Lock()
for wi, window := range handle.activeWindows {
for _, request := range window.todo {
for _, request := range window.Todo {
out[uuid.UUID(id)] = append(out[uuid.UUID(id)], storiface.WorkerJob{
ID: storiface.UndefCall,
Sector: request.sector.ID,
Task: request.taskType,
Sector: request.Sector.ID,
Task: request.TaskType,
RunWait: wi + 2,
Start: request.start,
})
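
WorkerStats above now reads the exported handle fields (Info, Enabled) and returns them keyed by worker UUID. A hypothetical caller-side sketch, using only the signature and fields visible in the stats.go hunks above (logWorkerStats and the monitoring package are illustrative, not part of this commit):

package monitoring // hypothetical consumer package

import (
	"context"
	"fmt"

	sectorstorage "github.com/filecoin-project/lotus/extern/sector-storage"
)

// logWorkerStats prints a one-line summary per worker from Manager.WorkerStats.
func logWorkerStats(ctx context.Context, m *sectorstorage.Manager) {
	for id, st := range m.WorkerStats(ctx) {
		fmt.Printf("worker %s enabled=%t memUsedMin=%d memUsedMax=%d gpuUsed=%v\n",
			id, st.Enabled, st.MemUsedMin, st.MemUsedMax, st.GpuUsed)
	}
}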