storage: Track abandoned work more correctly

This commit is contained in:
Łukasz Magiera 2020-11-09 23:38:20 +01:00
parent f819e71d12
commit 5caa277261
4 changed files with 36 additions and 7 deletions

View File

@ -190,9 +190,11 @@ var sealingJobsCmd = &cli.Command{
switch { switch {
case l.RunWait > 0: case l.RunWait > 0:
state = fmt.Sprintf("assigned(%d)", l.RunWait-1) state = fmt.Sprintf("assigned(%d)", l.RunWait-1)
case l.RunWait == -2: case l.RunWait == storiface.RWRetDone:
state = "ret-done"
case l.RunWait == storiface.RWReturned:
state = "returned" state = "returned"
case l.RunWait == -1: case l.RunWait == storiface.RWRetWait:
state = "ret-wait" state = "ret-wait"
} }
dur := "n/a" dur := "n/a"

View File

@ -89,8 +89,7 @@ func (m *Manager) setupWorkTracker() {
log.Errorf("cleannig up work state for %s", wid) log.Errorf("cleannig up work state for %s", wid)
} }
case wsDone: case wsDone:
// realistically this shouldn't ever happen as we return results // can happen after restart, abandoning work, and another restart
// immediately after getting them
log.Warnf("dropping done work, no result, wid %s", wid) log.Warnf("dropping done work, no result, wid %s", wid)
if err := m.work.Get(wid).End(); err != nil { if err := m.work.Get(wid).End(); err != nil {
@ -393,6 +392,20 @@ func (m *Manager) returnResult(callID storiface.CallID, r interface{}, serr stri
m.results[wid] = res m.results[wid] = res
err = m.work.Get(wid).Mutate(func(ws *WorkState) error {
ws.Status = wsDone
return nil
})
if err != nil {
// in the unlikely case:
// * manager has restarted, and we're still tracking this work, and
// * the work is abandoned (storage-fsm doesn't do a matching call on the sector), and
// * the call is returned from the worker, and
// * this errors
// the user will get jobs stuck in ret-wait state
log.Errorf("marking work as done: %+v", err)
}
_, found := m.waitRes[wid] _, found := m.waitRes[wid]
if found { if found {
close(m.waitRes[wid]) close(m.waitRes[wid])

View File

@ -72,9 +72,12 @@ func (m *Manager) WorkerJobs() map[uuid.UUID][]storiface.WorkerJob {
log.Errorf("WorkerJobs: get work %s: %+v", work, err) log.Errorf("WorkerJobs: get work %s: %+v", work, err)
} }
wait := -1 wait := storiface.RWRetWait
if _, ok := m.results[work]; ok { if _, ok := m.results[work]; ok {
wait = -2 // mark as returned instead of ret-wait wait = storiface.RWReturned
}
if ws.Status == wsDone {
wait = storiface.RWRetDone
} }
out[uuid.UUID{}] = append(out[uuid.UUID{}], storiface.WorkerJob{ out[uuid.UUID{}] = append(out[uuid.UUID{}], storiface.WorkerJob{

View File

@ -41,12 +41,23 @@ type WorkerStats struct {
CpuUse uint64 // nolint CpuUse uint64 // nolint
} }
// Negative values of WorkerJob.RunWait. Non-negative values mean the job is
// running (0) or assigned (1+).
const (
	// RWRetWait is reported as "ret-wait": the default for tracked work
	// that has no stored result.
	RWRetWait = -1

	// RWReturned is reported as "returned": a result for the work is held
	// in the manager's results map.
	RWReturned = -2

	// RWRetDone is reported as "ret-done": the tracked work state has
	// status wsDone.
	RWRetDone = -3
)
type WorkerJob struct { type WorkerJob struct {
ID CallID ID CallID
Sector abi.SectorID Sector abi.SectorID
Task sealtasks.TaskType Task sealtasks.TaskType
RunWait int // -2 - returned, -1 - ret-wait, 0 - running, 1+ - assigned // 1+ - assigned
// 0 - running
// -1 - ret-wait
// -2 - returned
// -3 - ret-done
RunWait int
Start time.Time Start time.Time
Hostname string `json:",omitempty"` // optional, set for ret-wait jobs Hostname string `json:",omitempty"` // optional, set for ret-wait jobs