harmonytask - final review comments
This commit is contained in:
parent
ec8fd28834
commit
72917c19cd
@ -609,7 +609,6 @@ var runCmd = &cli.Command{
|
|||||||
if err := srv.Shutdown(context.TODO()); err != nil {
|
if err := srv.Shutdown(context.TODO()); err != nil {
|
||||||
log.Errorf("shutting down RPC server failed: %s", err)
|
log.Errorf("shutting down RPC server failed: %s", err)
|
||||||
}
|
}
|
||||||
//taskManager.GracefullyTerminate(5*time.Hour)
|
|
||||||
log.Warn("Graceful shutdown successful")
|
log.Warn("Graceful shutdown successful")
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
@ -21,7 +21,7 @@ import (
|
|||||||
type task1 struct {
|
type task1 struct {
|
||||||
toAdd []int
|
toAdd []int
|
||||||
myPersonalTableLock sync.Mutex
|
myPersonalTableLock sync.Mutex
|
||||||
myPersonalTable map[harmonytask.TaskID]int // This would typicallyb be a DB table
|
myPersonalTable map[harmonytask.TaskID]int // This would typically be a DB table
|
||||||
WorkCompleted []string
|
WorkCompleted []string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,8 +23,9 @@ Mental Model:
|
|||||||
- Async Listener task (for chain, etc)
|
- Async Listener task (for chain, etc)
|
||||||
- Followers: Tasks get added because another task completed
|
- Followers: Tasks get added because another task completed
|
||||||
When Follower collectors run:
|
When Follower collectors run:
|
||||||
- If both sides are process-local, then
|
- If both sides are process-local, then this process will pick it up.
|
||||||
- Otherwise, at the listen interval during db scrape
|
- If properly registered already, the http endpoint will be tried to start it.
|
||||||
|
- Otherwise, at the listen interval during db scrape it will be found.
|
||||||
How duplicate tasks are avoided:
|
How duplicate tasks are avoided:
|
||||||
- that's up to the task definition, but probably a unique key
|
- that's up to the task definition, but probably a unique key
|
||||||
|
|
||||||
|
@ -56,7 +56,7 @@ type TaskInterface interface {
|
|||||||
// CanAccept should return if the task can run on this machine. It should
|
// CanAccept should return if the task can run on this machine. It should
|
||||||
// return null if the task type is not allowed on this machine.
|
// return null if the task type is not allowed on this machine.
|
||||||
// It should select the task it most wants to accomplish.
|
// It should select the task it most wants to accomplish.
|
||||||
// It is also responsible for determining disk space (including scratch).
|
// It is also responsible for determining & reserving disk space (including scratch).
|
||||||
CanAccept([]TaskID) (*TaskID, error)
|
CanAccept([]TaskID) (*TaskID, error)
|
||||||
|
|
||||||
// TypeDetails() returns static details about how this task behaves and
|
// TypeDetails() returns static details about how this task behaves and
|
||||||
@ -181,7 +181,7 @@ func New(
|
|||||||
if h == nil {
|
if h == nil {
|
||||||
_, err := db.Exec(e.ctx, `UPDATE harmony_task SET owner=NULL WHERE id=$1`, w.ID)
|
_, err := db.Exec(e.ctx, `UPDATE harmony_task SET owner=NULL WHERE id=$1`, w.ID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("Cannot remove self from owner field: ", err)
|
log.Errorw("Cannot remove self from owner field", "error", err)
|
||||||
continue // not really fatal, but not great
|
continue // not really fatal, but not great
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -206,12 +206,14 @@ func (e *TaskEngine) GracefullyTerminate(deadline time.Duration) {
|
|||||||
e.reg.Shutdown()
|
e.reg.Shutdown()
|
||||||
deadlineChan := time.NewTimer(deadline).C
|
deadlineChan := time.NewTimer(deadline).C
|
||||||
|
|
||||||
|
ctx := context.TODO()
|
||||||
|
|
||||||
// block bumps & follows by unreg from DBs.
|
// block bumps & follows by unreg from DBs.
|
||||||
_, err := e.db.Exec(context.TODO(), `DELETE FROM harmony_task_impl WHERE owner_id=$1`, e.ownerID)
|
_, err := e.db.Exec(ctx, `DELETE FROM harmony_task_impl WHERE owner_id=$1`, e.ownerID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Could not clean-up impl table: %w", err)
|
log.Warn("Could not clean-up impl table: %w", err)
|
||||||
}
|
}
|
||||||
_, err = e.db.Exec(context.Background(), `DELETE FROM harmony_task_follow WHERE owner_id=$1`, e.ownerID)
|
_, err = e.db.Exec(ctx, `DELETE FROM harmony_task_follow WHERE owner_id=$1`, e.ownerID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("Could not clean-up impl table: %w", err)
|
log.Warn("Could not clean-up impl table: %w", err)
|
||||||
}
|
}
|
||||||
@ -271,7 +273,7 @@ func (e *TaskEngine) followWorkInDB() {
|
|||||||
// we need to create this task
|
// we need to create this task
|
||||||
if !src.h.Follows[fromName](TaskID(workAlreadyDone), src.h.AddTask) {
|
if !src.h.Follows[fromName](TaskID(workAlreadyDone), src.h.AddTask) {
|
||||||
// But someone may have beaten us to it.
|
// But someone may have beaten us to it.
|
||||||
log.Infof("Unable to add task %s following Task(%d, %s)", src.h.Name, workAlreadyDone, fromName)
|
log.Debugf("Unable to add task %s following Task(%d, %s)", src.h.Name, workAlreadyDone, fromName)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -386,11 +388,21 @@ func (e *TaskEngine) bump(taskType string) {
|
|||||||
// resourcesInUse requires workListsMutex to be already locked.
|
// resourcesInUse requires workListsMutex to be already locked.
|
||||||
func (e *TaskEngine) resourcesInUse() resources.Resources {
|
func (e *TaskEngine) resourcesInUse() resources.Resources {
|
||||||
tmp := e.reg.Resources
|
tmp := e.reg.Resources
|
||||||
|
copy(tmp.GpuRam, e.reg.Resources.GpuRam)
|
||||||
for _, t := range e.handlers {
|
for _, t := range e.handlers {
|
||||||
ct := t.Count.Load()
|
ct := t.Count.Load()
|
||||||
tmp.Cpu -= int(ct) * t.Cost.Cpu
|
tmp.Cpu -= int(ct) * t.Cost.Cpu
|
||||||
tmp.Gpu -= float64(ct) * t.Cost.Gpu
|
tmp.Gpu -= float64(ct) * t.Cost.Gpu
|
||||||
tmp.Ram -= uint64(ct) * t.Cost.Ram
|
tmp.Ram -= uint64(ct) * t.Cost.Ram
|
||||||
|
for i := int32(0); i < ct; i++ {
|
||||||
|
for grIdx, j := range tmp.GpuRam {
|
||||||
|
if j > t.Cost.GpuRam[0] {
|
||||||
|
tmp.GpuRam[grIdx] = j - t.Cost.GpuRam[0]
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.Warn("We should never get out of gpuram for what's consumed.")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return tmp
|
return tmp
|
||||||
}
|
}
|
||||||
|
@ -223,6 +223,13 @@ func (h *taskTypeHandler) AssertMachineHasCapacity() error {
|
|||||||
if r.Gpu-h.Cost.Gpu < 0 {
|
if r.Gpu-h.Cost.Gpu < 0 {
|
||||||
return errors.New("Did not accept " + h.Name + " task: out of available GPU")
|
return errors.New("Did not accept " + h.Name + " task: out of available GPU")
|
||||||
}
|
}
|
||||||
|
for _, u := range r.GpuRam {
|
||||||
|
if u > h.Cost.GpuRam[0] {
|
||||||
|
goto enoughGpuRam
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return errors.New("Did not accept " + h.Name + " task: out of GPURam")
|
||||||
|
enoughGpuRam:
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
@ -1,3 +1,5 @@
|
|||||||
|
// Package cl was borrowed from the go-opencl library which is more complex and
|
||||||
|
// doesn't compile well for our needs.
|
||||||
package cl
|
package cl
|
||||||
|
|
||||||
// #include "cl.h"
|
// #include "cl.h"
|
@ -27,7 +27,7 @@ var LOOKS_DEAD_TIMEOUT = 10 * time.Minute // Time w/o minute heartbeats
|
|||||||
type Resources struct {
|
type Resources struct {
|
||||||
Cpu int
|
Cpu int
|
||||||
Gpu float64
|
Gpu float64
|
||||||
GpuRam uint64
|
GpuRam []uint64
|
||||||
Ram uint64
|
Ram uint64
|
||||||
MachineID int
|
MachineID int
|
||||||
}
|
}
|
||||||
@ -72,7 +72,8 @@ func Register(db *harmonydb.DB, hostnameAndPort string) (*Reg, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
CleanupMachines(context.Background(), db)
|
cleaned := CleanupMachines(context.Background(), db)
|
||||||
|
logger.Infow("Cleaned up machines", "count", cleaned)
|
||||||
}
|
}
|
||||||
go func() {
|
go func() {
|
||||||
for {
|
for {
|
||||||
@ -138,21 +139,24 @@ func getResources() (res Resources, err error) {
|
|||||||
return res, nil
|
return res, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func getGpuRam() uint64 {
|
func getGpuRam() (res []uint64) {
|
||||||
platforms, err := cl.GetPlatforms()
|
platforms, err := cl.GetPlatforms()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Error(err)
|
logger.Error(err)
|
||||||
return 0
|
return res
|
||||||
}
|
}
|
||||||
|
|
||||||
return uint64(lo.SumBy(platforms, func(p *cl.Platform) int64 {
|
lo.ForEach(platforms, func(p *cl.Platform, i int) {
|
||||||
d, err := p.GetAllDevices()
|
d, err := p.GetAllDevices()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Error(err)
|
logger.Error(err)
|
||||||
return 0
|
return
|
||||||
}
|
}
|
||||||
return lo.SumBy(d, func(d *cl.Device) int64 { return d.GlobalMemSize() })
|
lo.ForEach(d, func(d *cl.Device, i int) {
|
||||||
}))
|
res = append(res, uint64(d.GlobalMemSize()))
|
||||||
|
})
|
||||||
|
})
|
||||||
|
return res
|
||||||
}
|
}
|
||||||
|
|
||||||
func DiskFree(path string) (uint64, error) {
|
func DiskFree(path string) (uint64, error) {
|
||||||
@ -164,17 +168,3 @@ func DiskFree(path string) (uint64, error) {
|
|||||||
|
|
||||||
return s.Bfree * uint64(s.Bsize), nil
|
return s.Bfree * uint64(s.Bsize), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
/* NOT for Darwin.
|
|
||||||
func GetMemFree() uint64 {
|
|
||||||
in := unix.Sysinfo_t{}
|
|
||||||
err := unix.Sysinfo(&in)
|
|
||||||
if err != nil {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
// If this is a 32-bit system, then these fields are
|
|
||||||
// uint32 instead of uint64.
|
|
||||||
// So we always convert to uint64 to match signature.
|
|
||||||
return uint64(in.Freeram) * uint64(in.Unit)
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
Loading…
Reference in New Issue
Block a user