2020-08-17 13:26:18 +00:00
package sectorstorage
2020-07-09 11:49:01 +00:00
import (
2022-04-06 16:31:42 +00:00
"context"
2020-07-09 11:49:01 +00:00
"sync"
2022-04-06 16:31:42 +00:00
"time"
2020-07-09 11:49:01 +00:00
2022-04-06 16:31:42 +00:00
"github.com/filecoin-project/lotus/extern/sector-storage/sealtasks"
2020-08-17 13:26:18 +00:00
"github.com/filecoin-project/lotus/extern/sector-storage/storiface"
2020-07-09 11:49:01 +00:00
)
2022-05-27 14:15:52 +00:00
// ActiveResources tracks the resources currently committed on a single
// worker: memory, CPU threads, GPU utilization, and per-task-type counts.
// Access must be synchronized by the caller (see withResources, which also
// uses the caller's lock for the condition variable below).
type ActiveResources struct {
	memUsedMin uint64  // sum of MinMemory of all tasks currently accounted here
	memUsedMax uint64  // sum of MaxMemory of all tasks currently accounted here
	gpuUsed    float64 // sum of GPUUtilization of all tasks currently accounted here
	cpuUse     uint64  // number of CPU threads in use

	// taskCounters counts running tasks per seal task type; used to enforce
	// Resources.MaxConcurrent in CanHandleRequest.
	taskCounters map[sealtasks.SealTaskType]int

	cond    *sync.Cond // lazily created in withResources; broadcast in Free
	waiting int        // number of goroutines blocked in withResources
}
2022-05-27 14:15:52 +00:00
func NewActiveResources ( ) * ActiveResources {
return & ActiveResources {
2022-05-25 12:44:11 +00:00
taskCounters : map [ sealtasks . SealTaskType ] int { } ,
}
}
2022-05-27 14:15:52 +00:00
// withResources blocks until the worker described by wr has enough free
// resources to run a task of type tt needing r, reserves those resources,
// runs cb, and releases them again when cb returns. locker must be the lock
// guarding this ActiveResources instance and must be held by the caller; it
// is the lock the condition variable waits on.
func (a *ActiveResources) withResources(id storiface.WorkerID, wr storiface.WorkerInfo, tt sealtasks.SealTaskType, r storiface.Resources, locker sync.Locker, cb func() error) error {
	for !a.CanHandleRequest(tt, r, id, "withResources", wr) {
		// Lazily create the condition variable on first contention; Free
		// broadcasts on it whenever resources are released.
		if a.cond == nil {
			a.cond = sync.NewCond(locker)
		}
		a.waiting++
		a.cond.Wait()
		a.waiting--
	}

	a.Add(tt, wr.Resources, r)

	err := cb()

	a.Free(tt, wr.Resources, r)

	return err
}
2021-09-15 14:37:27 +00:00
// must be called with the same lock as the one passed to withResources
2022-05-27 14:15:52 +00:00
func ( a * ActiveResources ) hasWorkWaiting ( ) bool {
2021-09-15 14:37:27 +00:00
return a . waiting > 0
}
2022-05-27 14:15:52 +00:00
// add task resources to ActiveResources and return utilization difference
func ( a * ActiveResources ) Add ( tt sealtasks . SealTaskType , wr storiface . WorkerResources , r storiface . Resources ) float64 {
2022-04-06 20:55:28 +00:00
startUtil := a . utilization ( wr )
2021-09-01 01:59:25 +00:00
if r . GPUUtilization > 0 {
a . gpuUsed += r . GPUUtilization
2020-10-28 18:52:33 +00:00
}
2021-11-29 11:40:54 +00:00
a . cpuUse += r . Threads ( wr . CPUs , len ( wr . GPUs ) )
2020-07-09 11:49:01 +00:00
a . memUsedMin += r . MinMemory
a . memUsedMax += r . MaxMemory
2022-05-25 15:30:00 +00:00
a . taskCounters [ tt ] ++
2022-04-06 20:55:28 +00:00
return a . utilization ( wr ) - startUtil
2020-07-09 11:49:01 +00:00
}
2022-05-27 14:15:52 +00:00
func ( a * ActiveResources ) Free ( tt sealtasks . SealTaskType , wr storiface . WorkerResources , r storiface . Resources ) {
2021-09-01 01:59:25 +00:00
if r . GPUUtilization > 0 {
a . gpuUsed -= r . GPUUtilization
2020-07-09 11:49:01 +00:00
}
2021-11-29 11:40:54 +00:00
a . cpuUse -= r . Threads ( wr . CPUs , len ( wr . GPUs ) )
2020-07-09 11:49:01 +00:00
a . memUsedMin -= r . MinMemory
a . memUsedMax -= r . MaxMemory
2022-05-25 12:44:11 +00:00
a . taskCounters [ tt ] --
2021-09-15 14:37:27 +00:00
if a . cond != nil {
a . cond . Broadcast ( )
}
2020-07-09 11:49:01 +00:00
}
2022-05-18 13:47:08 +00:00
// CanHandleRequest evaluates if the worker has enough available resources to
// handle the request.
func (a *ActiveResources) CanHandleRequest(tt sealtasks.SealTaskType, needRes storiface.Resources, wid storiface.WorkerID, caller string, info storiface.WorkerInfo) bool {
	// Per-task-type concurrency cap (0 means no cap). Checked before the
	// IgnoreResources shortcircuit below, so the cap applies even to
	// resource-ignoring workers.
	if needRes.MaxConcurrent > 0 {
		if a.taskCounters[tt] >= needRes.MaxConcurrent {
			log.Debugf("sched: not scheduling on worker %s for %s; at task limit tt=%s, curcount=%d", wid, caller, tt, a.taskCounters[tt])
			return false
		}
	}

	if info.IgnoreResources {
		// shortcircuit; if this worker is ignoring resources, it can always handle the request.
		return true
	}

	res := info.Resources

	// TODO: dedupe needRes.BaseMinMemory per task type (don't add if that task is already running)

	// Physical memory check against MinMemory only.
	memNeeded := needRes.MinMemory + needRes.BaseMinMemory
	memUsed := a.memUsedMin
	// assume that MemUsed can be swapped, so only check it in the vmem Check
	// NOTE(review): assumes memUsed <= res.MemPhysical; the uint64
	// subtraction would wrap otherwise — confirm the scheduler upholds this.
	memAvail := res.MemPhysical - memUsed
	if memNeeded > memAvail {
		log.Debugf("sched: not scheduling on worker %s for %s; not enough physical memory - need: %dM, have %dM available", wid, caller, memNeeded/mib, memAvail/mib)
		return false
	}

	// Virtual memory (physical + swap) check against MaxMemory, also
	// accounting for memory used outside the worker process.
	vmemNeeded := needRes.MaxMemory + needRes.BaseMinMemory
	vmemUsed := a.memUsedMax
	workerMemoryReserved := res.MemUsed + res.MemSwapUsed // memory used outside lotus-worker (used by the OS, etc.)
	if vmemUsed < workerMemoryReserved {
		vmemUsed = workerMemoryReserved
	}
	vmemAvail := (res.MemPhysical + res.MemSwap) - vmemUsed

	if vmemNeeded > vmemAvail {
		log.Debugf("sched: not scheduling on worker %s for %s; not enough virtual memory - need: %dM, have %dM available", wid, caller, vmemNeeded/mib, vmemAvail/mib)
		return false
	}

	// CPU thread check; Threads derives the task's thread need from the
	// worker's CPU/GPU counts.
	if a.cpuUse+needRes.Threads(res.CPUs, len(res.GPUs)) > res.CPUs {
		log.Debugf("sched: not scheduling on worker %s for %s; not enough threads, need %d, %d in use, target %d", wid, caller, needRes.Threads(res.CPUs, len(res.GPUs)), a.cpuUse, res.CPUs)
		return false
	}

	// GPU check; only applies when the worker has GPUs and the task wants one.
	if len(res.GPUs) > 0 && needRes.GPUUtilization > 0 {
		if a.gpuUsed+needRes.GPUUtilization > float64(len(res.GPUs)) {
			log.Debugf("sched: not scheduling on worker %s for %s; GPU(s) in use", wid, caller)
			return false
		}
	}

	return true
}
2022-04-06 20:55:28 +00:00
// utilization returns a number in 0..1 range indicating fraction of used resources
2022-05-27 14:15:52 +00:00
func ( a * ActiveResources ) utilization ( wr storiface . WorkerResources ) float64 { // todo task type
2020-07-09 11:49:01 +00:00
var max float64
cpu := float64 ( a . cpuUse ) / float64 ( wr . CPUs )
max = cpu
2021-09-09 21:41:59 +00:00
memUsed := a . memUsedMin
if memUsed < wr . MemUsed {
memUsed = wr . MemUsed
}
memMin := float64 ( memUsed ) / float64 ( wr . MemPhysical )
2020-07-09 11:49:01 +00:00
if memMin > max {
max = memMin
}
2021-09-09 21:41:59 +00:00
vmemUsed := a . memUsedMax
if a . memUsedMax < wr . MemUsed + wr . MemSwapUsed {
vmemUsed = wr . MemUsed + wr . MemSwapUsed
}
memMax := float64 ( vmemUsed ) / float64 ( wr . MemPhysical + wr . MemSwap )
2020-07-09 11:49:01 +00:00
if memMax > max {
max = memMax
}
2022-04-06 20:55:28 +00:00
if len ( wr . GPUs ) > 0 {
gpuMax := a . gpuUsed / float64 ( len ( wr . GPUs ) )
if gpuMax > max {
max = gpuMax
}
}
2020-07-09 11:49:01 +00:00
return max
}
2020-08-31 11:31:11 +00:00
2022-05-18 13:47:08 +00:00
func ( wh * WorkerHandle ) Utilization ( ) float64 {
2020-08-31 11:31:11 +00:00
wh . lk . Lock ( )
2022-05-18 13:47:08 +00:00
u := wh . active . utilization ( wh . Info . Resources )
u += wh . preparing . utilization ( wh . Info . Resources )
2020-08-31 11:31:11 +00:00
wh . lk . Unlock ( )
wh . wndLk . Lock ( )
for _ , window := range wh . activeWindows {
2022-05-18 13:47:08 +00:00
u += window . Allocated . utilization ( wh . Info . Resources )
2020-08-31 11:31:11 +00:00
}
wh . wndLk . Unlock ( )
return u
}
2022-04-06 16:31:42 +00:00
// tasksCacheTimeout is how long a cached TaskTypes RPC result stays valid.
var tasksCacheTimeout = 30 * time.Second
2022-05-18 13:47:08 +00:00
func ( wh * WorkerHandle ) TaskTypes ( ctx context . Context ) ( t map [ sealtasks . TaskType ] struct { } , err error ) {
2022-04-06 20:55:28 +00:00
wh . tasksLk . Lock ( )
defer wh . tasksLk . Unlock ( )
2022-04-06 16:31:42 +00:00
if wh . tasksCache == nil || time . Now ( ) . Sub ( wh . tasksUpdate ) > tasksCacheTimeout {
wh . tasksCache , err = wh . workerRpc . TaskTypes ( ctx )
if err != nil {
return nil , err
}
wh . tasksUpdate = time . Now ( )
}
return wh . tasksCache , nil
}