storage: Track abandoned work more correctly

This commit is contained in:
Łukasz Magiera 2020-11-09 23:38:20 +01:00
parent f819e71d12
commit 5caa277261
4 changed files with 36 additions and 7 deletions

View File

@ -190,9 +190,11 @@ var sealingJobsCmd = &cli.Command{
switch {
case l.RunWait > 0:
state = fmt.Sprintf("assigned(%d)", l.RunWait-1)
case l.RunWait == -2:
case l.RunWait == storiface.RWRetDone:
state = "ret-done"
case l.RunWait == storiface.RWReturned:
state = "returned"
case l.RunWait == -1:
case l.RunWait == storiface.RWRetWait:
state = "ret-wait"
}
dur := "n/a"

View File

@ -89,8 +89,7 @@ func (m *Manager) setupWorkTracker() {
log.Errorf("cleannig up work state for %s", wid)
}
case wsDone:
// realistically this shouldn't ever happen as we return results
// immediately after getting them
// can happen after restart, abandoning work, and another restart
log.Warnf("dropping done work, no result, wid %s", wid)
if err := m.work.Get(wid).End(); err != nil {
@ -393,6 +392,20 @@ func (m *Manager) returnResult(callID storiface.CallID, r interface{}, serr stri
m.results[wid] = res
err = m.work.Get(wid).Mutate(func(ws *WorkState) error {
ws.Status = wsDone
return nil
})
if err != nil {
// in the unlikely case:
// * manager has restarted, and we're still tracking this work, and
// * the work is abandoned (storage-fsm doesn't do a matching call on the sector), and
// * the call is returned from the worker, and
// * this errors
// the user will get jobs stuck in ret-wait state
log.Errorf("marking work as done: %+v", err)
}
_, found := m.waitRes[wid]
if found {
close(m.waitRes[wid])

View File

@ -72,9 +72,12 @@ func (m *Manager) WorkerJobs() map[uuid.UUID][]storiface.WorkerJob {
log.Errorf("WorkerJobs: get work %s: %+v", work, err)
}
wait := -1
wait := storiface.RWRetWait
if _, ok := m.results[work]; ok {
wait = -2 // mark as returned instead of ret-wait
wait = storiface.RWReturned
}
if ws.Status == wsDone {
wait = storiface.RWRetDone
}
out[uuid.UUID{}] = append(out[uuid.UUID{}], storiface.WorkerJob{

View File

@ -41,12 +41,23 @@ type WorkerStats struct {
CpuUse uint64 // nolint
}
// Negative values of WorkerJob.RunWait. Non-negative values mean the job is
// running (0) or assigned (1+); the values below are reported by
// Manager.WorkerJobs for tracked work and rendered by the sealing jobs
// command using the labels given here.
const (
// RWRetWait is shown as "ret-wait". It is the default value WorkerJobs
// assigns to tracked work before checking for a stored result or a done
// work state.
RWRetWait = -1
// RWReturned is shown as "returned". It is used when the manager holds a
// result for the work in its results map.
RWReturned = -2
// RWRetDone is shown as "ret-done". It is used when the tracked work state
// has status wsDone; this check runs last, so it takes precedence over
// RWReturned.
RWRetDone = -3
)
type WorkerJob struct {
ID CallID
Sector abi.SectorID
Task sealtasks.TaskType
RunWait int // -2 - returned, -1 - ret-wait, 0 - running, 1+ - assigned
// 1+ - assigned
// 0 - running
// -1 - ret-wait
// -2 - returned
// -3 - ret-done
RunWait int
Start time.Time
Hostname string `json:",omitempty"` // optional, set for ret-wait jobs