Use a float to represent GPU utilization
Before this change, workers could only be allocated one GPU task, regardless of how much of the GPU's resources that task uses or how many GPUs are in the system. This change makes GPUUtilization a float, which can represent that a task needs a portion of a GPU or multiple GPUs. GPUs are accounted for like RAM and CPUs, so that workers with more GPUs can be allocated more tasks. A known issue is that PC2 cannot use multiple GPUs: even if the worker has multiple GPUs and is allocated multiple PC2 tasks, those tasks will only run on the first GPU. This could result in unexpected behavior when a worker with multiple GPUs is assigned multiple PC2 tasks. However, this should not surprise any existing users who upgrade, as any existing users who run workers with multiple GPUs should already know this and be running a worker per GPU for PC2. Those users now have the freedom to customize the GPU utilization of PC2 to be less than one and effectively run multiple PC2 processes in a single worker. C2 is capable of utilizing multiple GPUs, and workers can now be customized for C2 accordingly.
This commit is contained in:
parent
c4f46171ae
commit
93e4656a27
@ -241,7 +241,7 @@ func init() {
|
||||
Enabled: true,
|
||||
MemUsedMin: 0,
|
||||
MemUsedMax: 0,
|
||||
GpuUsed: false,
|
||||
GpuUsed: 0,
|
||||
CpuUse: 0,
|
||||
},
|
||||
})
|
||||
|
@ -58,7 +58,7 @@ var (
|
||||
FullAPIVersion1 = newVer(2, 1, 0)
|
||||
|
||||
MinerAPIVersion0 = newVer(1, 2, 0)
|
||||
WorkerAPIVersion0 = newVer(1, 2, 0)
|
||||
WorkerAPIVersion0 = newVer(1, 3, 0)
|
||||
)
|
||||
|
||||
//nolint:varcheck,deadcode
|
||||
|
@ -89,7 +89,7 @@ var sealingWorkersCmd = &cli.Command{
|
||||
for _, stat := range st {
|
||||
gpuUse := "not "
|
||||
gpuCol := color.FgBlue
|
||||
if stat.GpuUsed {
|
||||
if stat.GpuUsed > 0 {
|
||||
gpuCol = color.FgGreen
|
||||
gpuUse = ""
|
||||
}
|
||||
@ -132,6 +132,12 @@ var sealingWorkersCmd = &cli.Command{
|
||||
types.SizeStr(types.NewInt(vmemTasks+vmemReserved)),
|
||||
types.SizeStr(types.NewInt(vmemTotal)))
|
||||
|
||||
if len(stat.Info.Resources.GPUs) > 0 {
|
||||
gpuBar := barString(float64(len(stat.Info.Resources.GPUs)), 0, stat.GpuUsed)
|
||||
fmt.Printf("\tGPU: [%s] %.f%% %.2f/%d gpu(s) in use\n", color.GreenString(gpuBar),
|
||||
stat.GpuUsed*100/float64(len(stat.Info.Resources.GPUs)),
|
||||
stat.GpuUsed, len(stat.Info.Resources.GPUs))
|
||||
}
|
||||
for _, gpu := range stat.Info.Resources.GPUs {
|
||||
fmt.Printf("\tGPU: %s\n", color.New(gpuCol).Sprintf("%s, %sused", gpu, gpuUse))
|
||||
}
|
||||
|
26
extern/sector-storage/resources.go
vendored
26
extern/sector-storage/resources.go
vendored
@ -11,7 +11,7 @@ type Resources struct {
|
||||
MaxMemory uint64 // Memory required (swap + ram)
|
||||
|
||||
MaxParallelism int // -1 = multithread
|
||||
CanGPU bool
|
||||
GPUUtilization float64
|
||||
|
||||
BaseMinMemory uint64 // What Must be in RAM for decent perf (shared between threads)
|
||||
}
|
||||
@ -135,7 +135,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 30 << 30,
|
||||
|
||||
MaxParallelism: -1,
|
||||
CanGPU: true,
|
||||
GPUUtilization: 1.0,
|
||||
|
||||
BaseMinMemory: 1 << 30,
|
||||
},
|
||||
@ -144,7 +144,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 15 << 30,
|
||||
|
||||
MaxParallelism: -1,
|
||||
CanGPU: true,
|
||||
GPUUtilization: 1.0,
|
||||
|
||||
BaseMinMemory: 1 << 30,
|
||||
},
|
||||
@ -221,7 +221,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 60 << 30,
|
||||
|
||||
MaxParallelism: -1,
|
||||
CanGPU: true,
|
||||
GPUUtilization: 1.0,
|
||||
|
||||
BaseMinMemory: 64 << 30, // params
|
||||
},
|
||||
@ -230,7 +230,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 30 << 30,
|
||||
|
||||
MaxParallelism: -1,
|
||||
CanGPU: true,
|
||||
GPUUtilization: 1.0,
|
||||
|
||||
BaseMinMemory: 32 << 30, // params
|
||||
},
|
||||
@ -239,7 +239,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 1 << 30,
|
||||
|
||||
MaxParallelism: 1, // This is fine
|
||||
CanGPU: true,
|
||||
GPUUtilization: 1.0,
|
||||
|
||||
BaseMinMemory: 10 << 30,
|
||||
},
|
||||
@ -248,7 +248,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 2 << 10,
|
||||
|
||||
MaxParallelism: 1,
|
||||
CanGPU: true,
|
||||
GPUUtilization: 1.0,
|
||||
|
||||
BaseMinMemory: 2 << 10,
|
||||
},
|
||||
@ -257,7 +257,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 8 << 20,
|
||||
|
||||
MaxParallelism: 1,
|
||||
CanGPU: true,
|
||||
GPUUtilization: 1.0,
|
||||
|
||||
BaseMinMemory: 8 << 20,
|
||||
},
|
||||
@ -268,7 +268,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 1 << 20,
|
||||
|
||||
MaxParallelism: 0,
|
||||
CanGPU: false,
|
||||
GPUUtilization: 0,
|
||||
|
||||
BaseMinMemory: 0,
|
||||
},
|
||||
@ -277,7 +277,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 1 << 20,
|
||||
|
||||
MaxParallelism: 0,
|
||||
CanGPU: false,
|
||||
GPUUtilization: 0,
|
||||
|
||||
BaseMinMemory: 0,
|
||||
},
|
||||
@ -286,7 +286,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 1 << 20,
|
||||
|
||||
MaxParallelism: 0,
|
||||
CanGPU: false,
|
||||
GPUUtilization: 0,
|
||||
|
||||
BaseMinMemory: 0,
|
||||
},
|
||||
@ -295,7 +295,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 1 << 20,
|
||||
|
||||
MaxParallelism: 0,
|
||||
CanGPU: false,
|
||||
GPUUtilization: 0,
|
||||
|
||||
BaseMinMemory: 0,
|
||||
},
|
||||
@ -304,7 +304,7 @@ var ResourceTable = map[sealtasks.TaskType]map[abi.RegisteredSealProof]Resources
|
||||
MinMemory: 1 << 20,
|
||||
|
||||
MaxParallelism: 0,
|
||||
CanGPU: false,
|
||||
GPUUtilization: 0,
|
||||
|
||||
BaseMinMemory: 0,
|
||||
},
|
||||
|
2
extern/sector-storage/sched.go
vendored
2
extern/sector-storage/sched.go
vendored
@ -114,7 +114,7 @@ type workerDisableReq struct {
|
||||
type activeResources struct {
|
||||
memUsedMin uint64
|
||||
memUsedMax uint64
|
||||
gpuUsed bool
|
||||
gpuUsed float64
|
||||
cpuUse uint64
|
||||
|
||||
cond *sync.Cond
|
||||
|
14
extern/sector-storage/sched_resources.go
vendored
14
extern/sector-storage/sched_resources.go
vendored
@ -31,8 +31,8 @@ func (a *activeResources) hasWorkWaiting() bool {
|
||||
}
|
||||
|
||||
func (a *activeResources) add(wr storiface.WorkerResources, r Resources) {
|
||||
if r.CanGPU {
|
||||
a.gpuUsed = true
|
||||
if r.GPUUtilization > 0 {
|
||||
a.gpuUsed += r.GPUUtilization
|
||||
}
|
||||
a.cpuUse += r.Threads(wr.CPUs)
|
||||
a.memUsedMin += r.MinMemory
|
||||
@ -40,8 +40,8 @@ func (a *activeResources) add(wr storiface.WorkerResources, r Resources) {
|
||||
}
|
||||
|
||||
func (a *activeResources) free(wr storiface.WorkerResources, r Resources) {
|
||||
if r.CanGPU {
|
||||
a.gpuUsed = false
|
||||
if r.GPUUtilization > 0 {
|
||||
a.gpuUsed -= r.GPUUtilization
|
||||
}
|
||||
a.cpuUse -= r.Threads(wr.CPUs)
|
||||
a.memUsedMin -= r.MinMemory
|
||||
@ -89,9 +89,9 @@ func (a *activeResources) canHandleRequest(needRes Resources, wid WorkerID, call
|
||||
return false
|
||||
}
|
||||
|
||||
if len(res.GPUs) > 0 && needRes.CanGPU {
|
||||
if a.gpuUsed {
|
||||
log.Debugf("sched: not scheduling on worker %s for %s; GPU in use", wid, caller)
|
||||
if len(res.GPUs) > 0 && needRes.GPUUtilization > 0 {
|
||||
if a.gpuUsed+needRes.GPUUtilization > float64(len(res.GPUs)) {
|
||||
log.Debugf("sched: not scheduling on worker %s for %s; GPU(s) in use", wid, caller)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
4
extern/sector-storage/storiface/worker.go
vendored
4
extern/sector-storage/storiface/worker.go
vendored
@ -42,8 +42,8 @@ type WorkerStats struct {
|
||||
|
||||
MemUsedMin uint64
|
||||
MemUsedMax uint64
|
||||
GpuUsed bool // nolint
|
||||
CpuUse uint64 // nolint
|
||||
GpuUsed float64 // nolint
|
||||
CpuUse uint64 // nolint
|
||||
}
|
||||
|
||||
const (
|
||||
|
Loading…
Reference in New Issue
Block a user