2023-08-14 16:40:12 +00:00
|
|
|
package resources
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"context"
|
|
|
|
"os/exec"
|
|
|
|
"regexp"
|
|
|
|
"runtime"
|
|
|
|
"sync/atomic"
|
|
|
|
"time"
|
|
|
|
|
2024-01-12 10:03:37 +00:00
|
|
|
"github.com/elastic/go-sysinfo"
|
2023-08-14 16:40:12 +00:00
|
|
|
logging "github.com/ipfs/go-log/v2"
|
2023-08-16 22:54:26 +00:00
|
|
|
"golang.org/x/sys/unix"
|
2023-11-08 17:24:17 +00:00
|
|
|
"golang.org/x/xerrors"
|
2023-08-21 16:26:26 +00:00
|
|
|
|
|
|
|
"github.com/filecoin-project/lotus/lib/harmony/harmonydb"
|
2023-08-14 16:40:12 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// LOOKS_DEAD_TIMEOUT is how long a machine may go without refreshing its
// last_contact before CleanupMachines deletes its harmony_machines row.
// Heartbeats are written once a minute. It is a variable rather than a
// constant so unit tests can change the timeout.
var LOOKS_DEAD_TIMEOUT = 10 * time.Minute // Time w/o minute heartbeats
|
|
|
|
|
|
|
|
type Resources struct {
|
|
|
|
Cpu int
|
|
|
|
Gpu float64
|
|
|
|
Ram uint64
|
|
|
|
MachineID int
|
2024-02-27 18:47:58 +00:00
|
|
|
Storage
|
|
|
|
}
|
|
|
|
|
|
|
|
// Storage is optional storage management that can be embedded in Resources.
type Storage interface {
	// HasCapacity reports whether the implementation currently has capacity
	// (the exact meaning is defined by the implementation).
	HasCapacity() bool

	// Claim allows some other system to claim space for the task identified
	// by taskID. On success it returns a cleanup function for the claim.
	Claim(taskID int) (func() error, error)
}
|
|
|
|
type Reg struct {
|
|
|
|
Resources
|
|
|
|
shutdown atomic.Bool
|
|
|
|
}
|
|
|
|
|
|
|
|
// logger is the package-level logger, created under the name "harmonytask".
var logger = logging.Logger("harmonytask")
|
|
|
|
|
2023-08-16 21:56:09 +00:00
|
|
|
// lotusRE matches the process names that getResources counts in `ps -ef`
// output when checking whether this process is sharing the machine.
var lotusRE = regexp.MustCompile(`lotus-worker|lotus-harmony|yugabyted|yb-master|yb-tserver`)
|
2023-08-14 16:40:12 +00:00
|
|
|
|
|
|
|
func Register(db *harmonydb.DB, hostnameAndPort string) (*Reg, error) {
|
|
|
|
var reg Reg
|
|
|
|
var err error
|
|
|
|
reg.Resources, err = getResources()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
ctx := context.Background()
|
|
|
|
{ // Learn our owner_id while updating harmony_machines
|
2023-11-04 10:04:46 +00:00
|
|
|
var ownerID *int
|
2023-08-14 16:40:12 +00:00
|
|
|
|
2023-10-31 14:05:33 +00:00
|
|
|
// Upsert query with last_contact update, fetch the machine ID
|
|
|
|
// (note this isn't a simple insert .. on conflict because host_and_port isn't unique)
|
|
|
|
err := db.QueryRow(ctx, `
|
|
|
|
WITH upsert AS (
|
|
|
|
UPDATE harmony_machines
|
2023-10-31 22:16:04 +00:00
|
|
|
SET cpu = $2, ram = $3, gpu = $4, last_contact = CURRENT_TIMESTAMP
|
2023-10-31 14:05:33 +00:00
|
|
|
WHERE host_and_port = $1
|
|
|
|
RETURNING id
|
|
|
|
),
|
|
|
|
inserted AS (
|
2023-11-04 10:04:46 +00:00
|
|
|
INSERT INTO harmony_machines (host_and_port, cpu, ram, gpu, last_contact)
|
2023-10-31 22:16:04 +00:00
|
|
|
SELECT $1, $2, $3, $4, CURRENT_TIMESTAMP
|
2023-10-31 14:05:33 +00:00
|
|
|
WHERE NOT EXISTS (SELECT id FROM upsert)
|
|
|
|
RETURNING id
|
|
|
|
)
|
|
|
|
SELECT id FROM upsert
|
|
|
|
UNION ALL
|
|
|
|
SELECT id FROM inserted;
|
2023-10-31 22:16:04 +00:00
|
|
|
`, hostnameAndPort, reg.Cpu, reg.Ram, reg.Gpu).Scan(&ownerID)
|
2023-08-14 16:40:12 +00:00
|
|
|
if err != nil {
|
2023-11-04 10:04:46 +00:00
|
|
|
return nil, xerrors.Errorf("inserting machine entry: %w", err)
|
|
|
|
}
|
|
|
|
if ownerID == nil {
|
|
|
|
return nil, xerrors.Errorf("no owner id")
|
2023-08-14 16:40:12 +00:00
|
|
|
}
|
|
|
|
|
2023-11-04 10:04:46 +00:00
|
|
|
reg.MachineID = *ownerID
|
2023-10-31 14:05:33 +00:00
|
|
|
|
2023-08-25 21:11:31 +00:00
|
|
|
cleaned := CleanupMachines(context.Background(), db)
|
|
|
|
logger.Infow("Cleaned up machines", "count", cleaned)
|
2023-08-14 16:40:12 +00:00
|
|
|
}
|
|
|
|
go func() {
|
|
|
|
for {
|
|
|
|
time.Sleep(time.Minute)
|
|
|
|
if reg.shutdown.Load() {
|
|
|
|
return
|
|
|
|
}
|
2024-02-17 22:12:11 +00:00
|
|
|
_, err := db.Exec(ctx, `UPDATE harmony_machines SET last_contact=CURRENT_TIMESTAMP where id=$1`, reg.MachineID)
|
2023-08-14 16:40:12 +00:00
|
|
|
if err != nil {
|
|
|
|
logger.Error("Cannot keepalive ", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
return ®, nil
|
|
|
|
}
|
2023-11-04 10:04:46 +00:00
|
|
|
|
2023-08-14 16:40:12 +00:00
|
|
|
func CleanupMachines(ctx context.Context, db *harmonydb.DB) int {
|
2023-11-15 14:01:32 +00:00
|
|
|
ct, err := db.Exec(ctx,
|
2023-11-16 02:02:47 +00:00
|
|
|
`DELETE FROM harmony_machines WHERE last_contact < CURRENT_TIMESTAMP - INTERVAL '1 MILLISECOND' * $1 `,
|
|
|
|
LOOKS_DEAD_TIMEOUT.Milliseconds()) // ms enables unit testing to change timeout.
|
2023-08-14 16:40:12 +00:00
|
|
|
if err != nil {
|
|
|
|
logger.Warn("unable to delete old machines: ", err)
|
|
|
|
}
|
|
|
|
return ct
|
|
|
|
}
|
|
|
|
|
|
|
|
func (res *Reg) Shutdown() {
|
|
|
|
res.shutdown.Store(true)
|
|
|
|
}
|
|
|
|
|
|
|
|
func getResources() (res Resources, err error) {
|
|
|
|
b, err := exec.Command(`ps`, `-ef`).CombinedOutput()
|
|
|
|
if err != nil {
|
|
|
|
logger.Warn("Could not safety check for 2+ processes: ", err)
|
|
|
|
} else {
|
|
|
|
found := 0
|
|
|
|
for _, b := range bytes.Split(b, []byte("\n")) {
|
|
|
|
if lotusRE.Match(b) {
|
|
|
|
found++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if found > 1 {
|
2024-03-15 21:38:13 +00:00
|
|
|
logger.Warn("curio's defaults are for running alone. Use task maximums or CGroups.")
|
2023-08-14 16:40:12 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-23 18:46:57 +00:00
|
|
|
h, err := sysinfo.Host()
|
|
|
|
if err != nil {
|
|
|
|
return Resources{}, err
|
|
|
|
}
|
|
|
|
|
|
|
|
mem, err := h.Memory()
|
|
|
|
if err != nil {
|
|
|
|
return Resources{}, err
|
|
|
|
}
|
|
|
|
|
2023-08-14 16:40:12 +00:00
|
|
|
res = Resources{
|
2023-10-31 22:13:16 +00:00
|
|
|
Cpu: runtime.NumCPU(),
|
2023-12-23 18:46:57 +00:00
|
|
|
Ram: mem.Available,
|
2023-10-31 22:13:16 +00:00
|
|
|
Gpu: getGPUDevices(),
|
2023-08-14 16:40:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return res, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func DiskFree(path string) (uint64, error) {
|
|
|
|
s := unix.Statfs_t{}
|
|
|
|
err := unix.Statfs(path, &s)
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return s.Bfree * uint64(s.Bsize), nil
|
|
|
|
}
|