Use a float to represent GPU utilization

Before this change, a worker could only be allocated one GPU task,
regardless of how much of the GPU that task actually used or how many
GPUs were in the system.

This makes GPUUtilization a float, so a task can declare that it needs
a fraction of a GPU or several GPUs. GPUs are now accounted for like RAM
and CPUs, so workers with more GPUs can be allocated more tasks.
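
The accounting is simple enough to sketch. The following is a minimal,
self-contained illustration, not the actual scheduler code: gpuAccount,
canHandle, add and free are made-up stand-ins for activeResources and
its methods.

    package main

    import "fmt"

    // gpuAccount is a simplified stand-in for the scheduler's per-worker
    // activeResources: gpuUsed sums the GPUUtilization of assigned tasks,
    // gpuCount stands in for len(res.GPUs).
    type gpuAccount struct {
        gpuUsed  float64
        gpuCount int
    }

    // canHandle mirrors the new check: a task asking for util GPUs (a fraction
    // or several) fits as long as the running total stays within gpuCount.
    func (a *gpuAccount) canHandle(util float64) bool {
        return a.gpuUsed+util <= float64(a.gpuCount)
    }

    func (a *gpuAccount) add(util float64)  { a.gpuUsed += util }
    func (a *gpuAccount) free(util float64) { a.gpuUsed -= util }

    func main() {
        w := gpuAccount{gpuCount: 2}
        fmt.Println(w.canHandle(1.0)) // true:  0.0 + 1.0 <= 2
        w.add(1.0)
        fmt.Println(w.canHandle(1.0)) // true:  1.0 + 1.0 <= 2
        w.add(1.0)
        fmt.Println(w.canHandle(0.5)) // false: 2.0 + 0.5 > 2
        w.free(1.0)
        fmt.Println(w.canHandle(0.5)) // true:  1.0 + 0.5 <= 2
    }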

A known issue is that PC2 cannot use multiple GPUs: even if a worker has
multiple GPUs and is allocated multiple PC2 tasks, those tasks will all
run on the first GPU.

This could result in unexpected behavior when a worker with multiple
GPUs is assigned multiple PC2 tasks. It should not surprise existing
users who upgrade, though: anyone already running workers with multiple
GPUs knows about this limitation and runs one worker per GPU for PC2.
Those users now have the freedom to set the GPU utilization of PC2 to
less than one and effectively run multiple PC2 processes in a single
worker.
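
As a worked illustration (the 0.5 figure and the idea of adjusting the
PC2 entry are hypothetical, not a recommendation or an existing knob),
lowering PC2's per-task GPUUtilization lets the new check pack two PC2
tasks onto one GPU:

    package main

    import "fmt"

    func main() {
        // Hypothetical: PC2 configured with GPUUtilization = 0.5 on a worker
        // with a single GPU. The new scheduler check rejects a task when
        // gpuUsed + util > float64(gpuCount).
        const util, gpuCount = 0.5, 1
        gpuUsed := 0.0
        for task := 1; task <= 3; task++ {
            if gpuUsed+util > float64(gpuCount) {
                fmt.Printf("PC2 task %d: waits     (%.1f/%d GPU in use)\n", task, gpuUsed, gpuCount)
                continue
            }
            gpuUsed += util
            fmt.Printf("PC2 task %d: scheduled (%.1f/%d GPU in use)\n", task, gpuUsed, gpuCount)
        }
    }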

C2 is capable of utilizing multiple GPUs, and now workers can be
customized for C2 accordingly.
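
A hypothetical sketch of that kind of customization (the 2.0 value is
illustrative only and is not an entry from the actual ResourceTable):

    package main

    import "fmt"

    func main() {
        // Hypothetical: C2 configured with GPUUtilization = 2.0 on a two-GPU
        // worker, so a single C2 task claims the whole worker and the check
        // gpuUsed + util > float64(gpuCount) keeps other GPU tasks off it.
        const c2Util, gpuCount = 2.0, 2
        gpuUsed := c2Util                            // one C2 task assigned
        fmt.Println(gpuUsed+1.0 > float64(gpuCount)) // true: a second GPU task would not fit
    }
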
Author: Clint Armstrong, 2021-08-31 21:59:25 -04:00 (committed by Łukasz Magiera)
Parent: c4f46171ae
Commit: 93e4656a27
7 changed files with 32 additions and 26 deletions


@@ -241,7 +241,7 @@ func init() {
 Enabled: true,
 MemUsedMin: 0,
 MemUsedMax: 0,
-GpuUsed: false,
+GpuUsed: 0,
 CpuUse: 0,
 },
 })


@@ -58,7 +58,7 @@ var (
 FullAPIVersion1 = newVer(2, 1, 0)
 MinerAPIVersion0 = newVer(1, 2, 0)
-WorkerAPIVersion0 = newVer(1, 2, 0)
+WorkerAPIVersion0 = newVer(1, 3, 0)
 )
 //nolint:varcheck,deadcode


@@ -89,7 +89,7 @@ var sealingWorkersCmd = &cli.Command{
 for _, stat := range st {
 gpuUse := "not "
 gpuCol := color.FgBlue
-if stat.GpuUsed {
+if stat.GpuUsed > 0 {
 gpuCol = color.FgGreen
 gpuUse = ""
 }
@@ -132,6 +132,12 @@ var sealingWorkersCmd = &cli.Command{
 types.SizeStr(types.NewInt(vmemTasks+vmemReserved)),
 types.SizeStr(types.NewInt(vmemTotal)))
+if len(stat.Info.Resources.GPUs) > 0 {
+gpuBar := barString(float64(len(stat.Info.Resources.GPUs)), 0, stat.GpuUsed)
+fmt.Printf("\tGPU: [%s] %.f%% %.2f/%d gpu(s) in use\n", color.GreenString(gpuBar),
+stat.GpuUsed*100/float64(len(stat.Info.Resources.GPUs)),
+stat.GpuUsed, len(stat.Info.Resources.GPUs))
+}
 for _, gpu := range stat.Info.Resources.GPUs {
 fmt.Printf("\tGPU: %s\n", color.New(gpuCol).Sprintf("%s, %sused", gpu, gpuUse))
 }


@@ -11,7 +11,7 @@ type Resources struct {
 MaxMemory uint64 // Memory required (swap + ram)
 MaxParallelism int // -1 = multithread
-CanGPU bool
+GPUUtilization float64
 BaseMinMemory uint64 // What Must be in RAM for decent perf (shared between threads)
 }
@@ -135,7 +135,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 30 << 30,
 MaxParallelism: -1,
-CanGPU: true,
+GPUUtilization: 1.0,
 BaseMinMemory: 1 << 30,
 },
@@ -144,7 +144,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 15 << 30,
 MaxParallelism: -1,
-CanGPU: true,
+GPUUtilization: 1.0,
 BaseMinMemory: 1 << 30,
 },
@@ -221,7 +221,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 60 << 30,
 MaxParallelism: -1,
-CanGPU: true,
+GPUUtilization: 1.0,
 BaseMinMemory: 64 << 30, // params
 },
@@ -230,7 +230,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 30 << 30,
 MaxParallelism: -1,
-CanGPU: true,
+GPUUtilization: 1.0,
 BaseMinMemory: 32 << 30, // params
 },
@@ -239,7 +239,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 1 << 30,
 MaxParallelism: 1, // This is fine
-CanGPU: true,
+GPUUtilization: 1.0,
 BaseMinMemory: 10 << 30,
 },
@@ -248,7 +248,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 2 << 10,
 MaxParallelism: 1,
-CanGPU: true,
+GPUUtilization: 1.0,
 BaseMinMemory: 2 << 10,
 },
@@ -257,7 +257,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 8 << 20,
 MaxParallelism: 1,
-CanGPU: true,
+GPUUtilization: 1.0,
 BaseMinMemory: 8 << 20,
 },
@@ -268,7 +268,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 1 << 20,
 MaxParallelism: 0,
-CanGPU: false,
+GPUUtilization: 0,
 BaseMinMemory: 0,
 },
@@ -277,7 +277,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 1 << 20,
 MaxParallelism: 0,
-CanGPU: false,
+GPUUtilization: 0,
 BaseMinMemory: 0,
 },
@@ -286,7 +286,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 1 << 20,
 MaxParallelism: 0,
-CanGPU: false,
+GPUUtilization: 0,
 BaseMinMemory: 0,
 },
@@ -295,7 +295,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 1 << 20,
 MaxParallelism: 0,
-CanGPU: false,
+GPUUtilization: 0,
 BaseMinMemory: 0,
 },
@@ -304,7 +304,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
 MinMemory: 1 << 20,
 MaxParallelism: 0,
-CanGPU: false,
+GPUUtilization: 0,
 BaseMinMemory: 0,
 },


@@ -114,7 +114,7 @@ type workerDisableReq struct {
 type activeResources struct {
 memUsedMin uint64
 memUsedMax uint64
-gpuUsed bool
+gpuUsed float64
 cpuUse uint64
 cond *sync.Cond


@@ -31,8 +31,8 @@ func (a *activeResources) hasWorkWaiting() bool {
 }
 func (a *activeResources) add(wr storiface.WorkerResources, r Resources) {
-if r.CanGPU {
-a.gpuUsed = true
+if r.GPUUtilization > 0 {
+a.gpuUsed += r.GPUUtilization
 }
 a.cpuUse += r.Threads(wr.CPUs)
 a.memUsedMin += r.MinMemory
@@ -40,8 +40,8 @@ func (a *activeResources) add(wr storiface.WorkerResources, r Resources) {
 }
 func (a *activeResources) free(wr storiface.WorkerResources, r Resources) {
-if r.CanGPU {
-a.gpuUsed = false
+if r.GPUUtilization > 0 {
+a.gpuUsed -= r.GPUUtilization
 }
 a.cpuUse -= r.Threads(wr.CPUs)
 a.memUsedMin -= r.MinMemory
@@ -89,9 +89,9 @@ func (a *activeResources) canHandleRequest(needRes Resources, wid WorkerID, call
 return false
 }
-if len(res.GPUs) > 0 && needRes.CanGPU {
-if a.gpuUsed {
-log.Debugf("sched: not scheduling on worker %s for %s; GPU in use", wid, caller)
+if len(res.GPUs) > 0 && needRes.GPUUtilization > 0 {
+if a.gpuUsed+needRes.GPUUtilization > float64(len(res.GPUs)) {
+log.Debugf("sched: not scheduling on worker %s for %s; GPU(s) in use", wid, caller)
 return false
 }
 }


@@ -42,8 +42,8 @@ type WorkerStats struct {
 MemUsedMin uint64
 MemUsedMax uint64
-GpuUsed bool // nolint
-CpuUse uint64 // nolint
+GpuUsed float64 // nolint
+CpuUse uint64 // nolint
 }
 const (