Merge pull request #4779 from filecoin-project/feat/work-track-hostname
storage: Track worker hostnames with work
This commit is contained in:
commit
18aa97f962
@ -187,10 +187,14 @@ var sealingJobsCmd = &cli.Command{
|
||||
|
||||
for _, l := range lines {
|
||||
state := "running"
|
||||
if l.RunWait > 0 {
|
||||
switch {
|
||||
case l.RunWait > 0:
|
||||
state = fmt.Sprintf("assigned(%d)", l.RunWait-1)
|
||||
}
|
||||
if l.RunWait == -1 {
|
||||
case l.RunWait == storiface.RWRetDone:
|
||||
state = "ret-done"
|
||||
case l.RunWait == storiface.RWReturned:
|
||||
state = "returned"
|
||||
case l.RunWait == storiface.RWRetWait:
|
||||
state = "ret-wait"
|
||||
}
|
||||
dur := "n/a"
|
||||
@ -198,11 +202,16 @@ var sealingJobsCmd = &cli.Command{
|
||||
dur = time.Now().Sub(l.Start).Truncate(time.Millisecond * 100).String()
|
||||
}
|
||||
|
||||
hostname, ok := workerHostnames[l.wid]
|
||||
if !ok {
|
||||
hostname = l.Hostname
|
||||
}
|
||||
|
||||
_, _ = fmt.Fprintf(tw, "%s\t%d\t%s\t%s\t%s\t%s\t%s\n",
|
||||
hex.EncodeToString(l.ID.ID[10:]),
|
||||
l.Sector.Number,
|
||||
hex.EncodeToString(l.wid[5:]),
|
||||
workerHostnames[l.wid],
|
||||
hostname,
|
||||
l.Task.Short(),
|
||||
state,
|
||||
dur)
|
||||
|
84
extern/sector-storage/cbor_gen.go
vendored
84
extern/sector-storage/cbor_gen.go
vendored
@ -199,7 +199,7 @@ func (t *WorkState) MarshalCBOR(w io.Writer) error {
|
||||
_, err := w.Write(cbg.CborNull)
|
||||
return err
|
||||
}
|
||||
if _, err := w.Write([]byte{164}); err != nil {
|
||||
if _, err := w.Write([]byte{166}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@ -282,6 +282,51 @@ func (t *WorkState) MarshalCBOR(w io.Writer) error {
|
||||
if _, err := io.WriteString(w, string(t.WorkError)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// t.WorkerHostname (string) (string)
|
||||
if len("WorkerHostname") > cbg.MaxLength {
|
||||
return xerrors.Errorf("Value in field \"WorkerHostname\" was too long")
|
||||
}
|
||||
|
||||
if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajTextString, uint64(len("WorkerHostname"))); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := io.WriteString(w, string("WorkerHostname")); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(t.WorkerHostname) > cbg.MaxLength {
|
||||
return xerrors.Errorf("Value in field t.WorkerHostname was too long")
|
||||
}
|
||||
|
||||
if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajTextString, uint64(len(t.WorkerHostname))); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := io.WriteString(w, string(t.WorkerHostname)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// t.StartTime (int64) (int64)
|
||||
if len("StartTime") > cbg.MaxLength {
|
||||
return xerrors.Errorf("Value in field \"StartTime\" was too long")
|
||||
}
|
||||
|
||||
if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajTextString, uint64(len("StartTime"))); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := io.WriteString(w, string("StartTime")); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if t.StartTime >= 0 {
|
||||
if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajUnsignedInt, uint64(t.StartTime)); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajNegativeInt, uint64(-t.StartTime-1)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -360,6 +405,43 @@ func (t *WorkState) UnmarshalCBOR(r io.Reader) error {
|
||||
|
||||
t.WorkError = string(sval)
|
||||
}
|
||||
// t.WorkerHostname (string) (string)
|
||||
case "WorkerHostname":
|
||||
|
||||
{
|
||||
sval, err := cbg.ReadStringBuf(br, scratch)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
t.WorkerHostname = string(sval)
|
||||
}
|
||||
// t.StartTime (int64) (int64)
|
||||
case "StartTime":
|
||||
{
|
||||
maj, extra, err := cbg.CborReadHeaderBuf(br, scratch)
|
||||
var extraI int64
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
switch maj {
|
||||
case cbg.MajUnsignedInt:
|
||||
extraI = int64(extra)
|
||||
if extraI < 0 {
|
||||
return fmt.Errorf("int64 positive overflow")
|
||||
}
|
||||
case cbg.MajNegativeInt:
|
||||
extraI = int64(extra)
|
||||
if extraI < 0 {
|
||||
return fmt.Errorf("int64 negative oveflow")
|
||||
}
|
||||
extraI = -1 - extraI
|
||||
default:
|
||||
return fmt.Errorf("wrong type for int64 field: %d", maj)
|
||||
}
|
||||
|
||||
t.StartTime = int64(extraI)
|
||||
}
|
||||
|
||||
default:
|
||||
return fmt.Errorf("unknown struct field %d: '%s'", i, name)
|
||||
|
8
extern/sector-storage/manager.go
vendored
8
extern/sector-storage/manager.go
vendored
@ -383,7 +383,7 @@ func (m *Manager) SealPreCommit1(ctx context.Context, sector abi.SectorID, ticke
|
||||
selector := newAllocSelector(m.index, storiface.FTCache|storiface.FTSealed, storiface.PathSealing)
|
||||
|
||||
err = m.sched.Schedule(ctx, sector, sealtasks.TTPreCommit1, selector, m.schedFetch(sector, storiface.FTUnsealed, storiface.PathSealing, storiface.AcquireMove), func(ctx context.Context, w Worker) error {
|
||||
err := m.startWork(ctx, wk)(w.SealPreCommit1(ctx, sector, ticket, pieces))
|
||||
err := m.startWork(ctx, w, wk)(w.SealPreCommit1(ctx, sector, ticket, pieces))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@ -430,7 +430,7 @@ func (m *Manager) SealPreCommit2(ctx context.Context, sector abi.SectorID, phase
|
||||
selector := newExistingSelector(m.index, sector, storiface.FTCache|storiface.FTSealed, true)
|
||||
|
||||
err = m.sched.Schedule(ctx, sector, sealtasks.TTPreCommit2, selector, m.schedFetch(sector, storiface.FTCache|storiface.FTSealed, storiface.PathSealing, storiface.AcquireMove), func(ctx context.Context, w Worker) error {
|
||||
err := m.startWork(ctx, wk)(w.SealPreCommit2(ctx, sector, phase1Out))
|
||||
err := m.startWork(ctx, w, wk)(w.SealPreCommit2(ctx, sector, phase1Out))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@ -480,7 +480,7 @@ func (m *Manager) SealCommit1(ctx context.Context, sector abi.SectorID, ticket a
|
||||
selector := newExistingSelector(m.index, sector, storiface.FTCache|storiface.FTSealed, false)
|
||||
|
||||
err = m.sched.Schedule(ctx, sector, sealtasks.TTCommit1, selector, m.schedFetch(sector, storiface.FTCache|storiface.FTSealed, storiface.PathSealing, storiface.AcquireMove), func(ctx context.Context, w Worker) error {
|
||||
err := m.startWork(ctx, wk)(w.SealCommit1(ctx, sector, ticket, seed, pieces, cids))
|
||||
err := m.startWork(ctx, w, wk)(w.SealCommit1(ctx, sector, ticket, seed, pieces, cids))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@ -520,7 +520,7 @@ func (m *Manager) SealCommit2(ctx context.Context, sector abi.SectorID, phase1Ou
|
||||
selector := newTaskSelector()
|
||||
|
||||
err = m.sched.Schedule(ctx, sector, sealtasks.TTCommit2, selector, schedNop, func(ctx context.Context, w Worker) error {
|
||||
err := m.startWork(ctx, wk)(w.SealCommit2(ctx, sector, phase1Out))
|
||||
err := m.startWork(ctx, w, wk)(w.SealCommit2(ctx, sector, phase1Out))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
33
extern/sector-storage/manager_calltracker.go
vendored
33
extern/sector-storage/manager_calltracker.go
vendored
@ -8,6 +8,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
@ -41,6 +42,9 @@ type WorkState struct {
|
||||
|
||||
WorkerCall storiface.CallID // Set when entering wsRunning
|
||||
WorkError string // Status = wsDone, set when failed to start work
|
||||
|
||||
WorkerHostname string // hostname of last worker handling this job
|
||||
StartTime int64 // unix seconds
|
||||
}
|
||||
|
||||
func newWorkID(method sealtasks.TaskType, params ...interface{}) (WorkID, error) {
|
||||
@ -85,8 +89,7 @@ func (m *Manager) setupWorkTracker() {
|
||||
log.Errorf("cleannig up work state for %s", wid)
|
||||
}
|
||||
case wsDone:
|
||||
// realistically this shouldn't ever happen as we return results
|
||||
// immediately after getting them
|
||||
// can happen after restart, abandoning work, and another restart
|
||||
log.Warnf("dropping done work, no result, wid %s", wid)
|
||||
|
||||
if err := m.work.Get(wid).End(); err != nil {
|
||||
@ -167,8 +170,16 @@ func (m *Manager) getWork(ctx context.Context, method sealtasks.TaskType, params
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (m *Manager) startWork(ctx context.Context, wk WorkID) func(callID storiface.CallID, err error) error {
|
||||
func (m *Manager) startWork(ctx context.Context, w Worker, wk WorkID) func(callID storiface.CallID, err error) error {
|
||||
return func(callID storiface.CallID, err error) error {
|
||||
var hostname string
|
||||
info, ierr := w.Info(ctx)
|
||||
if ierr != nil {
|
||||
hostname = "[err]"
|
||||
} else {
|
||||
hostname = info.Hostname
|
||||
}
|
||||
|
||||
m.workLk.Lock()
|
||||
defer m.workLk.Unlock()
|
||||
|
||||
@ -194,6 +205,8 @@ func (m *Manager) startWork(ctx context.Context, wk WorkID) func(callID storifac
|
||||
ws.Status = wsRunning
|
||||
}
|
||||
ws.WorkerCall = callID
|
||||
ws.WorkerHostname = hostname
|
||||
ws.StartTime = time.Now().Unix()
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
@ -379,6 +392,20 @@ func (m *Manager) returnResult(callID storiface.CallID, r interface{}, serr stri
|
||||
|
||||
m.results[wid] = res
|
||||
|
||||
err = m.work.Get(wid).Mutate(func(ws *WorkState) error {
|
||||
ws.Status = wsDone
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
// in the unlikely case:
|
||||
// * manager has restarted, and we're still tracking this work, and
|
||||
// * the work is abandoned (storage-fsm doesn't do a matching call on the sector), and
|
||||
// * the call is returned from the worker, and
|
||||
// * this errors
|
||||
// the user will get jobs stuck in ret-wait state
|
||||
log.Errorf("marking work as done: %+v", err)
|
||||
}
|
||||
|
||||
_, found := m.waitRes[wid]
|
||||
if found {
|
||||
close(m.waitRes[wid])
|
||||
|
24
extern/sector-storage/stats.go
vendored
24
extern/sector-storage/stats.go
vendored
@ -67,12 +67,26 @@ func (m *Manager) WorkerJobs() map[uuid.UUID][]storiface.WorkerJob {
|
||||
continue
|
||||
}
|
||||
|
||||
var ws WorkState
|
||||
if err := m.work.Get(work).Get(&ws); err != nil {
|
||||
log.Errorf("WorkerJobs: get work %s: %+v", work, err)
|
||||
}
|
||||
|
||||
wait := storiface.RWRetWait
|
||||
if _, ok := m.results[work]; ok {
|
||||
wait = storiface.RWReturned
|
||||
}
|
||||
if ws.Status == wsDone {
|
||||
wait = storiface.RWRetDone
|
||||
}
|
||||
|
||||
out[uuid.UUID{}] = append(out[uuid.UUID{}], storiface.WorkerJob{
|
||||
ID: id,
|
||||
Sector: id.Sector,
|
||||
Task: work.Method,
|
||||
RunWait: -1,
|
||||
Start: time.Time{},
|
||||
ID: id,
|
||||
Sector: id.Sector,
|
||||
Task: work.Method,
|
||||
RunWait: wait,
|
||||
Start: time.Unix(ws.StartTime, 0),
|
||||
Hostname: ws.WorkerHostname,
|
||||
})
|
||||
}
|
||||
|
||||
|
15
extern/sector-storage/storiface/worker.go
vendored
15
extern/sector-storage/storiface/worker.go
vendored
@ -41,13 +41,26 @@ type WorkerStats struct {
|
||||
CpuUse uint64 // nolint
|
||||
}
|
||||
|
||||
const (
|
||||
RWRetWait = -1
|
||||
RWReturned = -2
|
||||
RWRetDone = -3
|
||||
)
|
||||
|
||||
type WorkerJob struct {
|
||||
ID CallID
|
||||
Sector abi.SectorID
|
||||
Task sealtasks.TaskType
|
||||
|
||||
RunWait int // -1 - ret-wait, 0 - running, 1+ - assigned
|
||||
// 1+ - assigned
|
||||
// 0 - running
|
||||
// -1 - ret-wait
|
||||
// -2 - returned
|
||||
// -3 - ret-done
|
||||
RunWait int
|
||||
Start time.Time
|
||||
|
||||
Hostname string `json:",omitempty"` // optional, set for ret-wait jobs
|
||||
}
|
||||
|
||||
type CallID struct {
|
||||
|
Loading…
Reference in New Issue
Block a user