package sealer

import (
	"sync"

	"github.com/filecoin-project/lotus/storage/sealer/sealtasks"
	"github.com/filecoin-project/lotus/storage/sealer/storiface"
)
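
// ActiveResources tracks the resources currently committed to tasks on a
// worker: memory, CPU threads, GPU utilization, and per-task-type counts.
// It also lets callers block until enough resources become free.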
type ActiveResources struct {
	memUsedMin uint64
	memUsedMax uint64
	gpuUsed    float64
	cpuUse     uint64

	taskCounters *taskCounter

	cond    *sync.Cond
	waiting int
}
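
// taskCounter tracks the number of in-flight tasks of each seal task type.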
type taskCounter struct {
	taskCounters map[sealtasks.SealTaskType]int

	// this lock is technically redundant, as ActiveResources is always accessed
	// with the worker lock, but let's not panic if we ever change that
	lk sync.Mutex
}

func newTaskCounter() *taskCounter {
	return &taskCounter{
		taskCounters: map[sealtasks.SealTaskType]int{},
	}
}

func (tc *taskCounter) Add(tt sealtasks.SealTaskType) {
	tc.lk.Lock()
	defer tc.lk.Unlock()
	tc.taskCounters[tt]++
}

func (tc *taskCounter) Free(tt sealtasks.SealTaskType) {
	tc.lk.Lock()
	defer tc.lk.Unlock()
	tc.taskCounters[tt]--
}

func (tc *taskCounter) Get(tt sealtasks.SealTaskType) int {
	tc.lk.Lock()
	defer tc.lk.Unlock()
	return tc.taskCounters[tt]
}

func (tc *taskCounter) Sum() int {
	tc.lk.Lock()
	defer tc.lk.Unlock()
	sum := 0
	for _, v := range tc.taskCounters {
		sum += v
	}
	return sum
}

func (tc *taskCounter) ForEach(cb func(tt sealtasks.SealTaskType, count int)) {
	tc.lk.Lock()
	defer tc.lk.Unlock()
	for tt, count := range tc.taskCounters {
		cb(tt, count)
	}
}
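
// NewActiveResources creates a resource tracker backed by the given task
// counter. Passing the same counter to multiple trackers makes them share
// per-task-type concurrency counts.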
func NewActiveResources(tc *taskCounter) *ActiveResources {
	return &ActiveResources{
		taskCounters: tc,
	}
}
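
// withResources blocks until the worker has enough free resources to run the
// given task, reserves them, runs cb, and then releases them. locker must be
// the lock protecting this ActiveResources, and the caller must hold it when
// calling; it backs the condition variable used while waiting.
//
// A minimal usage sketch (all names other than withResources and
// sealtasks.TTPreCommit1 are hypothetical):
//
//	err := active.withResources(wid, workerInfo, sealtasks.TTPreCommit1, needRes, &schedLk, func() error {
//		return runTask() // executes while the resources are reserved
//	})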
func (a *ActiveResources) withResources(id storiface.WorkerID, wr storiface.WorkerInfo, tt sealtasks.SealTaskType, r storiface.Resources, locker sync.Locker, cb func() error) error {
	for !a.CanHandleRequest(tt, r, id, "withResources", wr) {
		if a.cond == nil {
			a.cond = sync.NewCond(locker)
		}
		a.waiting++
		a.cond.Wait()
		a.waiting--
	}

	a.Add(tt, wr.Resources, r)

	err := cb()

	a.Free(tt, wr.Resources, r)

	return err
}

// must be called with the same lock as the one passed to withResources
func (a *ActiveResources) hasWorkWaiting() bool {
	return a.waiting > 0
}

// Add adds task resources to ActiveResources and returns the utilization difference
func (a *ActiveResources) Add(tt sealtasks.SealTaskType, wr storiface.WorkerResources, r storiface.Resources) float64 {
	startUtil := a.utilization(wr)

	if r.GPUUtilization > 0 {
		a.gpuUsed += r.GPUUtilization
	}
	a.cpuUse += r.Threads(wr.CPUs, len(wr.GPUs))
	a.memUsedMin += r.MinMemory
	a.memUsedMax += r.MaxMemory
	a.taskCounters.Add(tt)

	return a.utilization(wr) - startUtil
}
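
// Free releases the resources previously committed with Add and wakes up any
// goroutines blocked in withResources.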
func (a *ActiveResources) Free(tt sealtasks.SealTaskType, wr storiface.WorkerResources, r storiface.Resources) {
	if r.GPUUtilization > 0 {
		a.gpuUsed -= r.GPUUtilization
	}
	a.cpuUse -= r.Threads(wr.CPUs, len(wr.GPUs))
	a.memUsedMin -= r.MinMemory
	a.memUsedMax -= r.MaxMemory
	a.taskCounters.Free(tt)

	if a.cond != nil {
		a.cond.Broadcast()
	}
}

// CanHandleRequest evaluates if the worker has enough available resources to
// handle the request.
func (a *ActiveResources) CanHandleRequest(tt sealtasks.SealTaskType, needRes storiface.Resources, wid storiface.WorkerID, caller string, info storiface.WorkerInfo) bool {
	if needRes.MaxConcurrent > 0 {
		if a.taskCounters.Get(tt) >= needRes.MaxConcurrent {
			log.Debugf("sched: not scheduling on worker %s for %s; at task limit tt=%s, curcount=%d", wid, caller, tt, a.taskCounters.Get(tt))
			return false
		}
	}

	if info.IgnoreResources {
		// short-circuit; if this worker is ignoring resources, it can always handle the request
		return true
	}

	res := info.Resources

	// TODO: dedupe needRes.BaseMinMemory per task type (don't add if that task is already running)
	memNeeded := needRes.MinMemory + needRes.BaseMinMemory
	memUsed := a.memUsedMin
	// assume that MemUsed can be swapped, so only check it in the vmem check
	memAvail := res.MemPhysical - memUsed
	if memNeeded > memAvail {
		log.Debugf("sched: not scheduling on worker %s for %s; not enough physical memory - need: %dM, have %dM available", wid, caller, memNeeded/mib, memAvail/mib)
		return false
	}

	vmemNeeded := needRes.MaxMemory + needRes.BaseMinMemory
	vmemUsed := a.memUsedMax
	workerMemoryReserved := res.MemUsed + res.MemSwapUsed // memory used outside lotus-worker (used by the OS, etc.)

	if vmemUsed < workerMemoryReserved {
		vmemUsed = workerMemoryReserved
	}
	vmemAvail := (res.MemPhysical + res.MemSwap) - vmemUsed

	if vmemNeeded > vmemAvail {
		log.Debugf("sched: not scheduling on worker %s for %s; not enough virtual memory - need: %dM, have %dM available", wid, caller, vmemNeeded/mib, vmemAvail/mib)
		return false
	}

	if a.cpuUse+needRes.Threads(res.CPUs, len(res.GPUs)) > res.CPUs {
		log.Debugf("sched: not scheduling on worker %s for %s; not enough threads, need %d, %d in use, target %d", wid, caller, needRes.Threads(res.CPUs, len(res.GPUs)), a.cpuUse, res.CPUs)
		return false
	}

	if len(res.GPUs) > 0 && needRes.GPUUtilization > 0 {
		if a.gpuUsed+needRes.GPUUtilization > float64(len(res.GPUs)) {
			log.Debugf("sched: not scheduling on worker %s for %s; GPU(s) in use", wid, caller)
			return false
		}
	}

	return true
}

// utilization returns a number in the 0..1 range indicating the fraction of used resources
func (a *ActiveResources) utilization(wr storiface.WorkerResources) float64 { // todo task type
	var max float64

	cpu := float64(a.cpuUse) / float64(wr.CPUs)
	max = cpu

	memUsed := a.memUsedMin
	if memUsed < wr.MemUsed {
		memUsed = wr.MemUsed
	}
	memMin := float64(memUsed) / float64(wr.MemPhysical)
	if memMin > max {
		max = memMin
	}

	vmemUsed := a.memUsedMax
	if a.memUsedMax < wr.MemUsed+wr.MemSwapUsed {
		vmemUsed = wr.MemUsed + wr.MemSwapUsed
	}
	memMax := float64(vmemUsed) / float64(wr.MemPhysical+wr.MemSwap)
	if memMax > max {
		max = memMax
	}

	if len(wr.GPUs) > 0 {
		gpuMax := a.gpuUsed / float64(len(wr.GPUs))
		if gpuMax > max {
			max = gpuMax
		}
	}

	return max
}
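
// taskCount returns how many tasks of the given type are tracked here; a nil
// task type means the total across all types.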
func (a *ActiveResources) taskCount(tt *sealtasks.SealTaskType) int {
	// nil means all tasks
	if tt == nil {
		return a.taskCounters.Sum()
	}

	return a.taskCounters.Get(*tt)
}
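
// Utilization returns the aggregate resource utilization of this worker,
// summing the active, preparing, and queued (assigned-window) allocations,
// so the result can exceed 1 when work is queued up.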
func (wh *WorkerHandle) Utilization() float64 {
	wh.lk.Lock()
	u := wh.active.utilization(wh.Info.Resources)
	u += wh.preparing.utilization(wh.Info.Resources)
	wh.lk.Unlock()

	wh.wndLk.Lock()
	for _, window := range wh.activeWindows {
		u += window.Allocated.utilization(wh.Info.Resources)
	}
	wh.wndLk.Unlock()

	return u
}
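
// TaskCounts returns the total number of tasks on this worker, across the
// active, preparing, and queued (assigned-window) sets.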
func (wh *WorkerHandle) TaskCounts() int {
	wh.lk.Lock()
	u := wh.active.taskCount(nil)
	u += wh.preparing.taskCount(nil)
	wh.lk.Unlock()

	wh.wndLk.Lock()
	for _, window := range wh.activeWindows {
		u += window.Allocated.taskCount(nil)
	}
	wh.wndLk.Unlock()

	return u
}

func (wh *WorkerHandle) TaskCount(tt *sealtasks.SealTaskType) int {
	wh.lk.Lock()
	u := wh.active.taskCount(tt)
	u += wh.preparing.taskCount(tt)
	wh.lk.Unlock()

	wh.wndLk.Lock()
	for _, window := range wh.activeWindows {
		u += window.Allocated.taskCount(tt)
	}
	wh.wndLk.Unlock()

	return u
}