lotus/extern/sector-storage/sched.go

846 lines
19 KiB
Go
Raw Normal View History

package sectorstorage
2020-03-23 11:40:02 +00:00
import (
"context"
2020-07-17 10:59:12 +00:00
"fmt"
2020-07-09 10:58:52 +00:00
"math/rand"
"sort"
"sync"
2020-06-23 09:42:47 +00:00
"time"
2020-03-23 11:40:02 +00:00
"golang.org/x/xerrors"
"github.com/filecoin-project/specs-actors/actors/abi"
"github.com/filecoin-project/lotus/extern/sector-storage/sealtasks"
"github.com/filecoin-project/lotus/extern/sector-storage/storiface"
2020-03-23 11:40:02 +00:00
)
2020-06-24 21:06:56 +00:00
type schedPrioCtxKey int
var SchedPriorityKey schedPrioCtxKey
var DefaultSchedPriority = 0
2020-07-09 10:58:52 +00:00
var SelectorTimeout = 5 * time.Second
var InitWait = 3 * time.Second
2020-07-09 10:58:52 +00:00
var (
SchedWindows = 2
)
2020-06-24 21:06:56 +00:00
func getPriority(ctx context.Context) int {
sp := ctx.Value(SchedPriorityKey)
if p, ok := sp.(int); ok {
return p
}
return DefaultSchedPriority
}
func WithPriority(ctx context.Context, priority int) context.Context {
return context.WithValue(ctx, SchedPriorityKey, priority)
}
2020-03-23 11:40:02 +00:00
const mib = 1 << 20
type WorkerAction func(ctx context.Context, w Worker) error
type WorkerSelector interface {
2020-06-15 12:32:17 +00:00
Ok(ctx context.Context, task sealtasks.TaskType, spt abi.RegisteredSealProof, a *workerHandle) (bool, error) // true if worker is acceptable for performing a task
Cmp(ctx context.Context, task sealtasks.TaskType, a, b *workerHandle) (bool, error) // true if a is preferred over b
}
type scheduler struct {
2020-06-15 12:32:17 +00:00
spt abi.RegisteredSealProof
2020-08-03 12:18:11 +00:00
workersLk sync.RWMutex
nextWorker WorkerID
workers map[WorkerID]*workerHandle
newWorkers chan *workerHandle
2020-05-01 18:00:17 +00:00
watchClosing chan WorkerID
workerClosing chan WorkerID
2020-07-09 10:58:52 +00:00
schedule chan *workerRequest
windowRequests chan *schedWindowRequest
// owned by the sh.runSched goroutine
schedQueue *requestQueue
openWindows []*schedWindowRequest
info chan func(interface{})
2020-07-16 23:32:49 +00:00
closing chan struct{}
2020-07-17 10:59:12 +00:00
closed chan struct{}
2020-07-16 23:26:55 +00:00
testSync chan struct{} // used for testing
2020-07-09 10:58:52 +00:00
}
type workerHandle struct {
w Worker
info storiface.WorkerInfo
preparing *activeResources
active *activeResources
2020-07-17 10:59:12 +00:00
2020-08-03 12:18:11 +00:00
lk sync.Mutex
2020-08-27 21:14:46 +00:00
wndLk sync.Mutex
activeWindows []*schedWindow
2020-07-21 18:01:25 +00:00
// stats / tracking
wt *workTracker
2020-07-17 10:59:12 +00:00
// for sync manager goroutine closing
cleanupStarted bool
closedMgr chan struct{}
closingMgr chan struct{}
2020-07-09 10:58:52 +00:00
}
type schedWindowRequest struct {
worker WorkerID
done chan *schedWindow
}
type schedWindow struct {
2020-07-09 12:40:53 +00:00
allocated activeResources
2020-07-09 10:58:52 +00:00
todo []*workerRequest
}
type activeResources struct {
memUsedMin uint64
memUsedMax uint64
gpuUsed bool
cpuUse uint64
cond *sync.Cond
}
type workerRequest struct {
sector abi.SectorID
taskType sealtasks.TaskType
priority int // larger values more important
sel WorkerSelector
prepare WorkerAction
work WorkerAction
start time.Time
2020-07-09 10:58:52 +00:00
index int // The index of the item in the heap.
2020-07-27 06:21:29 +00:00
indexHeap int
2020-07-27 11:21:36 +00:00
ret chan<- workerResponse
ctx context.Context
2020-07-09 10:58:52 +00:00
}
2020-07-09 10:58:52 +00:00
type workerResponse struct {
err error
}
2020-06-15 12:32:17 +00:00
func newScheduler(spt abi.RegisteredSealProof) *scheduler {
return &scheduler{
spt: spt,
nextWorker: 0,
workers: map[WorkerID]*workerHandle{},
2020-05-01 18:04:21 +00:00
newWorkers: make(chan *workerHandle),
2020-05-01 18:00:17 +00:00
watchClosing: make(chan WorkerID),
workerClosing: make(chan WorkerID),
2020-07-09 12:40:53 +00:00
schedule: make(chan *workerRequest),
windowRequests: make(chan *schedWindowRequest, 20),
2020-05-07 23:38:05 +00:00
schedQueue: &requestQueue{},
2020-07-09 12:40:53 +00:00
info: make(chan func(interface{})),
2020-07-09 12:40:53 +00:00
closing: make(chan struct{}),
2020-07-17 10:59:12 +00:00
closed: make(chan struct{}),
}
}
func (sh *scheduler) Schedule(ctx context.Context, sector abi.SectorID, taskType sealtasks.TaskType, sel WorkerSelector, prepare WorkerAction, work WorkerAction) error {
ret := make(chan workerResponse)
select {
case sh.schedule <- &workerRequest{
sector: sector,
taskType: taskType,
2020-06-24 21:06:56 +00:00
priority: getPriority(ctx),
sel: sel,
prepare: prepare,
work: work,
start: time.Now(),
ret: ret,
ctx: ctx,
}:
case <-sh.closing:
return xerrors.New("closing")
case <-ctx.Done():
return ctx.Err()
}
select {
case resp := <-ret:
return resp.err
case <-sh.closing:
return xerrors.New("closing")
case <-ctx.Done():
return ctx.Err()
}
}
func (r *workerRequest) respond(err error) {
2020-03-23 11:40:02 +00:00
select {
case r.ret <- workerResponse{err: err}:
case <-r.ctx.Done():
2020-03-23 11:40:02 +00:00
log.Warnf("request got cancelled before we could respond")
}
}
type SchedDiagRequestInfo struct {
Sector abi.SectorID
TaskType sealtasks.TaskType
Priority int
}
type SchedDiagInfo struct {
2020-07-27 11:21:36 +00:00
Requests []SchedDiagRequestInfo
OpenWindows []WorkerID
}
func (sh *scheduler) runSched() {
2020-07-17 10:59:12 +00:00
defer close(sh.closed)
2020-05-01 18:00:17 +00:00
go sh.runWorkerWatcher()
iw := time.After(InitWait)
var initialised bool
2020-03-23 11:40:02 +00:00
for {
var doSched bool
2020-03-23 11:40:02 +00:00
select {
case w := <-sh.newWorkers:
2020-07-09 10:58:52 +00:00
sh.newWorker(w)
2020-05-01 18:00:17 +00:00
case wid := <-sh.workerClosing:
2020-07-09 10:58:52 +00:00
sh.dropWorker(wid)
2020-03-23 11:40:02 +00:00
2020-07-09 10:58:52 +00:00
case req := <-sh.schedule:
sh.schedQueue.Push(req)
doSched = true
2020-07-09 10:58:52 +00:00
2020-07-16 23:26:55 +00:00
if sh.testSync != nil {
sh.testSync <- struct{}{}
}
2020-07-09 10:58:52 +00:00
case req := <-sh.windowRequests:
sh.openWindows = append(sh.openWindows, req)
doSched = true
case ireq := <-sh.info:
ireq(sh.diag())
case <-iw:
initialised = true
iw = nil
doSched = true
case <-sh.closing:
sh.schedClose()
2020-03-24 23:49:45 +00:00
return
2020-03-23 11:40:02 +00:00
}
if doSched && initialised {
// First gather any pending tasks, so we go through the scheduling loop
// once for every added task
loop:
for {
select {
case req := <-sh.schedule:
sh.schedQueue.Push(req)
if sh.testSync != nil {
sh.testSync <- struct{}{}
}
case req := <-sh.windowRequests:
sh.openWindows = append(sh.openWindows, req)
default:
break loop
}
}
sh.trySched()
}
2020-03-23 11:40:02 +00:00
}
}
func (sh *scheduler) diag() SchedDiagInfo {
var out SchedDiagInfo
for sqi := 0; sqi < sh.schedQueue.Len(); sqi++ {
task := (*sh.schedQueue)[sqi]
out.Requests = append(out.Requests, SchedDiagRequestInfo{
Sector: task.sector,
TaskType: task.taskType,
Priority: task.priority,
})
}
for _, window := range sh.openWindows {
out.OpenWindows = append(out.OpenWindows, window.worker)
}
return out
}
2020-07-09 10:58:52 +00:00
func (sh *scheduler) trySched() {
/*
This assigns tasks to workers based on:
- Task priority (achieved by handling sh.schedQueue in order, since it's already sorted by priority)
- Worker resource availability
- Task-specified worker preference (acceptableWindows array below sorted by this preference)
- Window request age
2020-05-01 18:00:17 +00:00
2020-07-09 10:58:52 +00:00
1. For each task in the schedQueue find windows which can handle them
1.1. Create list of windows capable of handling a task
1.2. Sort windows according to task selector preferences
2. Going through schedQueue again, assign task to first acceptable window
with resources available
3. Submit windows with scheduled tasks to workers
2020-07-09 10:58:52 +00:00
*/
2020-07-09 10:58:52 +00:00
windows := make([]schedWindow, len(sh.openWindows))
acceptableWindows := make([][]int, sh.schedQueue.Len())
2020-03-23 11:40:02 +00:00
log.Debugf("SCHED %d queued; %d open windows", sh.schedQueue.Len(), len(windows))
2020-07-16 23:26:55 +00:00
2020-08-03 12:18:11 +00:00
sh.workersLk.RLock()
defer sh.workersLk.RUnlock()
if len(sh.openWindows) == 0 {
// nothing to schedule on
return
}
2020-08-03 12:18:11 +00:00
2020-07-09 10:58:52 +00:00
// Step 1
concurrency := len(sh.openWindows)
throttle := make(chan struct{}, concurrency)
var wg sync.WaitGroup
wg.Add(sh.schedQueue.Len())
for i := 0; i < sh.schedQueue.Len(); i++ {
throttle <- struct{}{}
go func(sqi int) {
defer wg.Done()
defer func() {
<-throttle
}()
task := (*sh.schedQueue)[sqi]
needRes := ResourceTable[task.taskType][sh.spt]
task.indexHeap = sqi
for wnd, windowRequest := range sh.openWindows {
worker, ok := sh.workers[windowRequest.worker]
if !ok {
log.Errorf("worker referenced by windowRequest not found (worker: %d)", windowRequest.worker)
// TODO: How to move forward here?
continue
}
// TODO: allow bigger windows
if !windows[wnd].allocated.canHandleRequest(needRes, windowRequest.worker, "schedAcceptable", worker.info.Resources) {
continue
}
rpcCtx, cancel := context.WithTimeout(task.ctx, SelectorTimeout)
ok, err := task.sel.Ok(rpcCtx, task.taskType, sh.spt, worker)
cancel()
if err != nil {
log.Errorf("trySched(1) req.sel.Ok error: %+v", err)
continue
}
if !ok {
continue
}
acceptableWindows[sqi] = append(acceptableWindows[sqi], wnd)
2020-07-09 10:58:52 +00:00
}
if len(acceptableWindows[sqi]) == 0 {
return
2020-07-09 10:58:52 +00:00
}
2020-06-23 09:42:47 +00:00
// Pick best worker (shuffle in case some workers are equally as good)
rand.Shuffle(len(acceptableWindows[sqi]), func(i, j int) {
acceptableWindows[sqi][i], acceptableWindows[sqi][j] = acceptableWindows[sqi][j], acceptableWindows[sqi][i] // nolint:scopelint
})
sort.SliceStable(acceptableWindows[sqi], func(i, j int) bool {
wii := sh.openWindows[acceptableWindows[sqi][i]].worker // nolint:scopelint
wji := sh.openWindows[acceptableWindows[sqi][j]].worker // nolint:scopelint
if wii == wji {
// for the same worker prefer older windows
return acceptableWindows[sqi][i] < acceptableWindows[sqi][j] // nolint:scopelint
}
wi := sh.workers[wii]
wj := sh.workers[wji]
rpcCtx, cancel := context.WithTimeout(task.ctx, SelectorTimeout)
defer cancel()
r, err := task.sel.Cmp(rpcCtx, task.taskType, wi, wj)
if err != nil {
log.Error("selecting best worker: %s", err)
}
return r
})
}(i)
}
wg.Wait()
log.Debugf("SCHED windows: %+v", windows)
log.Debugf("SCHED Acceptable win: %+v", acceptableWindows)
2020-07-09 10:58:52 +00:00
// Step 2
scheduled := 0
2020-07-09 10:58:52 +00:00
for sqi := 0; sqi < sh.schedQueue.Len(); sqi++ {
task := (*sh.schedQueue)[sqi]
needRes := ResourceTable[task.taskType][sh.spt]
2020-07-09 10:58:52 +00:00
selectedWindow := -1
2020-07-27 06:21:29 +00:00
for _, wnd := range acceptableWindows[task.indexHeap] {
2020-07-09 10:58:52 +00:00
wid := sh.openWindows[wnd].worker
wr := sh.workers[wid].info.Resources
log.Debugf("SCHED try assign sqi:%d sector %d to window %d", sqi, task.sector.Number, wnd)
2020-07-16 23:26:55 +00:00
2020-07-09 10:58:52 +00:00
// TODO: allow bigger windows
if !windows[wnd].allocated.canHandleRequest(needRes, wid, "schedAssign", wr) {
2020-07-09 10:58:52 +00:00
continue
}
log.Debugf("SCHED ASSIGNED sqi:%d sector %d task %s to window %d", sqi, task.sector.Number, task.taskType, wnd)
2020-07-16 23:26:55 +00:00
2020-07-09 10:58:52 +00:00
windows[wnd].allocated.add(wr, needRes)
// TODO: We probably want to re-sort acceptableWindows here based on new
// workerHandle.utilization + windows[wnd].allocated.utilization (workerHandle.utilization is used in all
// task selectors, but not in the same way, so need to figure out how to do that in a non-O(n^2 way), and
// without additional network roundtrips (O(n^2) could be avoided by turning acceptableWindows.[] into heaps))
2020-03-23 11:40:02 +00:00
2020-07-09 10:58:52 +00:00
selectedWindow = wnd
break
}
2020-03-23 11:40:02 +00:00
2020-07-09 12:40:53 +00:00
if selectedWindow < 0 {
// all windows full
continue
}
2020-07-09 10:58:52 +00:00
windows[selectedWindow].todo = append(windows[selectedWindow].todo, task)
2020-03-23 11:40:02 +00:00
sh.schedQueue.Remove(sqi)
2020-07-09 10:58:52 +00:00
sqi--
scheduled++
}
2020-03-23 11:40:02 +00:00
2020-07-09 10:58:52 +00:00
// Step 3
2020-03-23 11:40:02 +00:00
2020-07-09 10:58:52 +00:00
if scheduled == 0 {
return
}
2020-07-09 10:58:52 +00:00
scheduledWindows := map[int]struct{}{}
for wnd, window := range windows {
if len(window.todo) == 0 {
// Nothing scheduled here, keep the window open
continue
}
2020-07-09 10:58:52 +00:00
scheduledWindows[wnd] = struct{}{}
2020-07-09 12:40:53 +00:00
window := window // copy
2020-07-09 10:58:52 +00:00
select {
case sh.openWindows[wnd].done <- &window:
default:
log.Error("expected sh.openWindows[wnd].done to be buffered")
}
2020-07-09 10:58:52 +00:00
}
2020-07-09 10:58:52 +00:00
// Rewrite sh.openWindows array, removing scheduled windows
newOpenWindows := make([]*schedWindowRequest, 0, len(sh.openWindows)-len(scheduledWindows))
for wnd, window := range sh.openWindows {
2020-07-09 17:17:15 +00:00
if _, scheduled := scheduledWindows[wnd]; scheduled {
2020-07-09 10:58:52 +00:00
// keep unscheduled windows open
continue
}
2020-07-09 10:58:52 +00:00
newOpenWindows = append(newOpenWindows, window)
}
2020-07-09 10:58:52 +00:00
sh.openWindows = newOpenWindows
}
2020-07-09 10:58:52 +00:00
func (sh *scheduler) runWorker(wid WorkerID) {
2020-07-17 10:59:12 +00:00
var ready sync.WaitGroup
ready.Add(1)
defer ready.Wait()
2020-07-09 10:58:52 +00:00
go func() {
2020-08-03 12:18:11 +00:00
sh.workersLk.RLock()
2020-07-17 10:59:12 +00:00
worker, found := sh.workers[wid]
2020-08-03 12:18:11 +00:00
sh.workersLk.RUnlock()
2020-07-16 23:46:59 +00:00
2020-07-17 10:59:12 +00:00
ready.Done()
if !found {
panic(fmt.Sprintf("worker %d not found", wid))
}
defer close(worker.closedMgr)
2020-07-09 11:49:01 +00:00
scheduledWindows := make(chan *schedWindow, SchedWindows)
taskDone := make(chan struct{}, 1)
windowsRequested := 0
2020-04-28 10:31:08 +00:00
2020-07-09 11:49:01 +00:00
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
2020-07-09 11:49:01 +00:00
workerClosing, err := worker.w.Closing(ctx)
if err != nil {
return
}
2020-07-09 11:49:01 +00:00
defer func() {
log.Warnw("Worker closing", "workerid", wid)
2020-07-09 11:49:01 +00:00
// TODO: close / return all queued tasks
}()
2020-03-23 11:40:02 +00:00
2020-07-09 11:49:01 +00:00
for {
// ask for more windows if we need them
for ; windowsRequested < SchedWindows; windowsRequested++ {
select {
case sh.windowRequests <- &schedWindowRequest{
worker: wid,
done: scheduledWindows,
}:
case <-sh.closing:
return
case <-workerClosing:
return
2020-07-17 10:59:12 +00:00
case <-worker.closingMgr:
return
2020-07-09 11:49:01 +00:00
}
}
2020-07-09 11:49:01 +00:00
select {
case w := <-scheduledWindows:
worker.wndLk.Lock()
worker.activeWindows = append(worker.activeWindows, w)
worker.wndLk.Unlock()
2020-07-09 11:49:01 +00:00
case <-taskDone:
2020-07-09 12:40:53 +00:00
log.Debugw("task done", "workerid", wid)
2020-07-09 11:49:01 +00:00
case <-sh.closing:
return
case <-workerClosing:
return
2020-07-17 10:59:12 +00:00
case <-worker.closingMgr:
return
2020-07-09 11:49:01 +00:00
}
sh.workersLk.RLock()
worker.wndLk.Lock()
windowsRequested -= sh.workerCompactWindows(worker, wid)
2020-07-09 11:49:01 +00:00
assignLoop:
// process windows in order
for len(worker.activeWindows) > 0 {
firstWindow := worker.activeWindows[0]
2020-07-09 11:49:01 +00:00
// process tasks within a window, preferring tasks at lower indexes
for len(firstWindow.todo) > 0 {
tidx := -1
2020-08-03 12:18:11 +00:00
worker.lk.Lock()
for t, todo := range firstWindow.todo {
needRes := ResourceTable[todo.taskType][sh.spt]
if worker.preparing.canHandleRequest(needRes, wid, "startPreparing", worker.info.Resources) {
tidx = t
break
}
}
2020-08-03 12:18:11 +00:00
worker.lk.Unlock()
if tidx == -1 {
2020-07-09 11:49:01 +00:00
break assignLoop
}
todo := firstWindow.todo[tidx]
2020-07-16 23:26:55 +00:00
log.Debugf("assign worker sector %d", todo.sector.Number)
2020-07-09 11:49:01 +00:00
err := sh.assignWorker(taskDone, wid, worker, todo)
if err != nil {
log.Error("assignWorker error: %+v", err)
go todo.respond(xerrors.Errorf("assignWorker error: %w", err))
}
// Note: we're not freeing window.allocated resources here very much on purpose
copy(firstWindow.todo[tidx:], firstWindow.todo[tidx+1:])
firstWindow.todo[len(firstWindow.todo)-1] = nil
firstWindow.todo = firstWindow.todo[:len(firstWindow.todo)-1]
2020-07-09 11:49:01 +00:00
}
copy(worker.activeWindows, worker.activeWindows[1:])
worker.activeWindows[len(worker.activeWindows)-1] = nil
worker.activeWindows = worker.activeWindows[:len(worker.activeWindows)-1]
2020-07-09 11:49:01 +00:00
windowsRequested--
}
worker.wndLk.Unlock()
sh.workersLk.RUnlock()
2020-07-09 11:49:01 +00:00
}
}()
}
func (sh *scheduler) workerCompactWindows(worker *workerHandle, wid WorkerID) int {
// move tasks from older windows to newer windows if older windows
// still can fit them
if len(worker.activeWindows) > 1 {
for wi, window := range worker.activeWindows[1:] {
lower := worker.activeWindows[wi]
var moved []int
for ti, todo := range window.todo {
needRes := ResourceTable[todo.taskType][sh.spt]
if !lower.allocated.canHandleRequest(needRes, wid, "compactWindows", worker.info.Resources) {
continue
}
moved = append(moved, ti)
lower.todo = append(lower.todo, todo)
lower.allocated.add(worker.info.Resources, needRes)
window.allocated.free(worker.info.Resources, needRes)
}
if len(moved) > 0 {
newTodo := make([]*workerRequest, 0, len(window.todo)-len(moved))
for i, t := range window.todo {
if len(moved) > 0 && moved[0] == i {
moved = moved[1:]
continue
}
newTodo = append(newTodo, t)
}
window.todo = newTodo
}
}
}
var compacted int
var newWindows []*schedWindow
for _, window := range worker.activeWindows {
if len(window.todo) == 0 {
compacted++
continue
}
newWindows = append(newWindows, window)
}
worker.activeWindows = newWindows
return compacted
}
2020-07-09 11:49:01 +00:00
func (sh *scheduler) assignWorker(taskDone chan struct{}, wid WorkerID, w *workerHandle, req *workerRequest) error {
needRes := ResourceTable[req.taskType][sh.spt]
2020-08-03 12:18:11 +00:00
w.lk.Lock()
2020-07-09 11:49:01 +00:00
w.preparing.add(w.info.Resources, needRes)
2020-08-03 12:18:11 +00:00
w.lk.Unlock()
2020-07-09 11:49:01 +00:00
go func() {
2020-07-21 18:01:25 +00:00
err := req.prepare(req.ctx, w.wt.worker(w.w))
2020-07-09 11:49:01 +00:00
sh.workersLk.Lock()
2020-03-23 11:40:02 +00:00
2020-07-09 11:49:01 +00:00
if err != nil {
2020-08-03 12:18:11 +00:00
w.lk.Lock()
2020-07-09 11:49:01 +00:00
w.preparing.free(w.info.Resources, needRes)
2020-08-03 12:18:11 +00:00
w.lk.Unlock()
2020-07-09 11:49:01 +00:00
sh.workersLk.Unlock()
2020-07-09 11:49:01 +00:00
select {
case taskDone <- struct{}{}:
case <-sh.closing:
log.Warnf("scheduler closed while sending response (prepare error: %+v)", err)
}
2020-03-23 11:40:02 +00:00
2020-07-09 11:49:01 +00:00
select {
case req.ret <- workerResponse{err: err}:
case <-req.ctx.Done():
log.Warnf("request got cancelled before we could respond (prepare error: %+v)", err)
case <-sh.closing:
log.Warnf("scheduler closed while sending response (prepare error: %+v)", err)
}
return
}
2020-03-23 11:40:02 +00:00
2020-07-09 11:49:01 +00:00
err = w.active.withResources(wid, w.info.Resources, needRes, &sh.workersLk, func() error {
2020-08-03 12:18:11 +00:00
w.lk.Lock()
2020-07-09 11:49:01 +00:00
w.preparing.free(w.info.Resources, needRes)
2020-08-03 12:18:11 +00:00
w.lk.Unlock()
2020-07-09 11:49:01 +00:00
sh.workersLk.Unlock()
defer sh.workersLk.Lock() // we MUST return locked from this function
2020-03-23 11:40:02 +00:00
2020-07-09 11:49:01 +00:00
select {
case taskDone <- struct{}{}:
case <-sh.closing:
}
2020-03-23 11:40:02 +00:00
2020-07-21 18:01:25 +00:00
err = req.work(req.ctx, w.wt.worker(w.w))
2020-07-09 11:49:01 +00:00
select {
case req.ret <- workerResponse{err: err}:
case <-req.ctx.Done():
log.Warnf("request got cancelled before we could respond")
case <-sh.closing:
log.Warnf("scheduler closed while sending response")
}
2020-07-09 11:49:01 +00:00
return nil
})
2020-07-09 11:49:01 +00:00
sh.workersLk.Unlock()
2020-07-09 11:49:01 +00:00
// This error should always be nil, since nothing is setting it, but just to be safe:
if err != nil {
log.Errorf("error executing worker (withResources): %+v", err)
}
}()
return nil
}
2020-07-09 10:58:52 +00:00
func (sh *scheduler) newWorker(w *workerHandle) {
2020-07-17 10:59:12 +00:00
w.closedMgr = make(chan struct{})
w.closingMgr = make(chan struct{})
sh.workersLk.Lock()
2020-03-23 11:40:02 +00:00
id := sh.nextWorker
sh.workers[id] = w
sh.nextWorker++
2020-05-01 18:00:17 +00:00
sh.workersLk.Unlock()
2020-07-17 10:59:12 +00:00
sh.runWorker(id)
2020-05-01 18:00:17 +00:00
select {
case sh.watchClosing <- id:
case <-sh.closing:
return
}
}
2020-07-09 10:58:52 +00:00
func (sh *scheduler) dropWorker(wid WorkerID) {
2020-05-01 18:00:17 +00:00
sh.workersLk.Lock()
defer sh.workersLk.Unlock()
w := sh.workers[wid]
2020-07-17 10:59:12 +00:00
sh.workerCleanup(wid, w)
2020-05-01 18:00:17 +00:00
delete(sh.workers, wid)
2020-07-17 10:59:12 +00:00
}
2020-05-01 18:00:17 +00:00
2020-07-17 10:59:12 +00:00
func (sh *scheduler) workerCleanup(wid WorkerID, w *workerHandle) {
2020-09-02 15:37:19 +00:00
select {
case <-w.closingMgr:
default:
2020-07-17 10:59:12 +00:00
close(w.closingMgr)
}
2020-09-02 15:37:19 +00:00
sh.workersLk.Unlock()
2020-07-17 10:59:12 +00:00
select {
case <-w.closedMgr:
case <-time.After(time.Second):
log.Errorf("timeout closing worker manager goroutine %d", wid)
}
2020-09-02 15:37:19 +00:00
sh.workersLk.Lock()
2020-07-17 10:59:12 +00:00
if !w.cleanupStarted {
w.cleanupStarted = true
2020-07-17 10:59:12 +00:00
newWindows := make([]*schedWindowRequest, 0, len(sh.openWindows))
for _, window := range sh.openWindows {
if window.worker != wid {
newWindows = append(newWindows, window)
}
2020-05-01 18:00:17 +00:00
}
2020-07-17 10:59:12 +00:00
sh.openWindows = newWindows
log.Debugf("dropWorker %d", wid)
go func() {
if err := w.w.Close(); err != nil {
log.Warnf("closing worker %d: %+v", err)
}
}()
}
2020-03-23 11:40:02 +00:00
}
2020-03-24 23:49:45 +00:00
func (sh *scheduler) schedClose() {
sh.workersLk.Lock()
defer sh.workersLk.Unlock()
2020-07-17 10:59:12 +00:00
log.Debugf("closing scheduler")
2020-03-24 23:49:45 +00:00
for i, w := range sh.workers {
2020-07-17 10:59:12 +00:00
sh.workerCleanup(i, w)
2020-03-24 23:49:45 +00:00
}
}
func (sh *scheduler) Info(ctx context.Context) (interface{}, error) {
ch := make(chan interface{}, 1)
sh.info <- func(res interface{}) {
ch <- res
}
select {
2020-07-27 11:21:36 +00:00
case res := <-ch:
return res, nil
case <-ctx.Done():
return nil, ctx.Err()
}
}
2020-07-17 10:59:12 +00:00
func (sh *scheduler) Close(ctx context.Context) error {
close(sh.closing)
2020-07-17 10:59:12 +00:00
select {
case <-sh.closed:
case <-ctx.Done():
return ctx.Err()
}
return nil
}