Merge pull request #4666 from filecoin-project/fix/sched-issues
Fix worker reenabling, handle multiple restarts in worker
commit 094ea3fe97
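For orientation: the lotus-seal-worker change below wraps the blocking WaitQuiet call in a channel so that worker registration can sit in a select alongside heartbeats and shutdown, and repeated miner restarts are handled by simply looping again. The sketch that follows is a minimal, self-contained illustration of that pattern only; the helper names, the simulated session change, and the timings are made up for the example and are not lotus APIs.

package main

import (
	"context"
	"fmt"
	"time"
)

// waitQuietCh wraps a blocking wait in a channel so it can be used in a select.
func waitQuietCh(waitQuiet func()) chan struct{} {
	out := make(chan struct{})
	go func() {
		waitQuiet() // blocks until in-flight local tasks finish
		close(out)
	}()
	return out
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
	defer cancel()

	// Simulate one miner restart partway through the run.
	sessionChanged := time.After(100 * time.Millisecond)

	var readyCh chan struct{}
	for { // one iteration per (re)connection to the miner
		if readyCh == nil { // start the quiet-wait only if none is pending
			readyCh = waitQuietCh(func() { time.Sleep(30 * time.Millisecond) })
		}

	inner:
		for {
			select {
			case <-readyCh:
				fmt.Println("register worker") // WorkerConnect in the real code
				readyCh = nil                  // receive on nil channel blocks forever: case is disarmed
			case <-sessionChanged:
				fmt.Println("miner session changed, reconnecting")
				break inner
			case <-ctx.Done():
				return
			}
		}
	}
}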
@@ -451,14 +451,24 @@ var runCmd = &cli.Command{
 			return xerrors.Errorf("getting miner session: %w", err)
 		}
 
+		waitQuietCh := func() chan struct{} {
+			out := make(chan struct{})
+			go func() {
+				workerApi.LocalWorker.WaitQuiet()
+				close(out)
+			}()
+			return out
+		}
+
 		go func() {
 			heartbeats := time.NewTicker(stores.HeartbeatInterval)
 			defer heartbeats.Stop()
 
-			var connected, reconnect bool
+			var redeclareStorage bool
+			var readyCh chan struct{}
 			for {
 				// If we're reconnecting, redeclare storage first
-				if reconnect {
+				if redeclareStorage {
 					log.Info("Redeclaring local storage")
 
 					if err := localStore.Redeclare(ctx); err != nil {
@@ -471,14 +481,13 @@ var runCmd = &cli.Command{
 						}
 						continue
 					}
-
-					connected = false
 				}
 
-				log.Info("Making sure no local tasks are running")
-
 				// TODO: we could get rid of this, but that requires tracking resources for restarted tasks correctly
-				workerApi.LocalWorker.WaitQuiet()
+				if readyCh == nil {
+					log.Info("Making sure no local tasks are running")
+					readyCh = waitQuietCh()
+				}
 
 				for {
 					curSession, err := nodeApi.Session(ctx)
@@ -489,29 +498,28 @@ var runCmd = &cli.Command{
 							minerSession = curSession
 							break
 						}
 
-						if !connected {
-							if err := nodeApi.WorkerConnect(ctx, "http://"+address+"/rpc/v0"); err != nil {
-								log.Errorf("Registering worker failed: %+v", err)
-								cancel()
-								return
-							}
-
-							log.Info("Worker registered successfully, waiting for tasks")
-							connected = true
-						}
 					}
 
 					select {
+					case <-readyCh:
+						if err := nodeApi.WorkerConnect(ctx, "http://"+address+"/rpc/v0"); err != nil {
+							log.Errorf("Registering worker failed: %+v", err)
+							cancel()
+							return
+						}
+
+						log.Info("Worker registered successfully, waiting for tasks")
+
+						readyCh = nil
+					case <-heartbeats.C:
 					case <-ctx.Done():
 						return // graceful shutdown
-					case <-heartbeats.C:
 					}
 				}
 
 				log.Errorf("LOTUS-MINER CONNECTION LOST")
 
-				reconnect = true
+				redeclareStorage = true
 			}
 		}()
extern/sector-storage/manager_test.go | 75 (vendored)
@@ -10,6 +10,7 @@ import (
 	"path/filepath"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"testing"
 	"time"
 
@@ -376,3 +377,77 @@ func TestRestartWorker(t *testing.T) {
 	require.NoError(t, err)
 	require.Empty(t, uf)
 }
+
+func TestReenableWorker(t *testing.T) {
+	logging.SetAllLoggers(logging.LevelDebug)
+	stores.HeartbeatInterval = 5 * time.Millisecond
+
+	ctx, done := context.WithCancel(context.Background())
+	defer done()
+
+	ds := datastore.NewMapDatastore()
+
+	m, lstor, stor, idx, cleanup := newTestMgr(ctx, t, ds)
+	defer cleanup()
+
+	localTasks := []sealtasks.TaskType{
+		sealtasks.TTAddPiece, sealtasks.TTPreCommit1, sealtasks.TTCommit1, sealtasks.TTFinalize, sealtasks.TTFetch,
+	}
+
+	wds := datastore.NewMapDatastore()
+
+	arch := make(chan chan apres)
+	w := newLocalWorker(func() (ffiwrapper.Storage, error) {
+		return &testExec{apch: arch}, nil
+	}, WorkerConfig{
+		SealProof: 0,
+		TaskTypes: localTasks,
+	}, stor, lstor, idx, m, statestore.New(wds))
+
+	err := m.AddWorker(ctx, w)
+	require.NoError(t, err)
+
+	time.Sleep(time.Millisecond * 100)
+
+	i, _ := m.sched.Info(ctx)
+	require.Len(t, i.(SchedDiagInfo).OpenWindows, 2)
+
+	// disable
+	atomic.StoreInt64(&w.testDisable, 1)
+
+	for i := 0; i < 100; i++ {
+		if !m.WorkerStats()[w.session].Enabled {
+			break
+		}
+
+		time.Sleep(time.Millisecond * 3)
+	}
+	require.False(t, m.WorkerStats()[w.session].Enabled)
+
+	i, _ = m.sched.Info(ctx)
+	require.Len(t, i.(SchedDiagInfo).OpenWindows, 0)
+
+	// reenable
+	atomic.StoreInt64(&w.testDisable, 0)
+
+	for i := 0; i < 100; i++ {
+		if m.WorkerStats()[w.session].Enabled {
+			break
+		}
+
+		time.Sleep(time.Millisecond * 3)
+	}
+	require.True(t, m.WorkerStats()[w.session].Enabled)
+
+	for i := 0; i < 100; i++ {
+		info, _ := m.sched.Info(ctx)
+		if len(info.(SchedDiagInfo).OpenWindows) != 0 {
+			break
+		}
+
+		time.Sleep(time.Millisecond * 3)
+	}
+
+	i, _ = m.sched.Info(ctx)
+	require.Len(t, i.(SchedDiagInfo).OpenWindows, 2)
+}
extern/sector-storage/sched.go | 5 (vendored)
@@ -7,6 +7,7 @@ import (
 	"sync"
 	"time"
 
+	"github.com/google/uuid"
 	"golang.org/x/xerrors"
 
 	"github.com/filecoin-project/go-state-types/abi"
@@ -217,7 +218,7 @@ type SchedDiagRequestInfo struct {
 
 type SchedDiagInfo struct {
 	Requests    []SchedDiagRequestInfo
-	OpenWindows []WorkerID
+	OpenWindows []string
 }
 
 func (sh *scheduler) runSched() {
@@ -324,7 +325,7 @@ func (sh *scheduler) diag() SchedDiagInfo {
 	defer sh.workersLk.RUnlock()
 
 	for _, window := range sh.openWindows {
-		out.OpenWindows = append(out.OpenWindows, window.worker)
+		out.OpenWindows = append(out.OpenWindows, uuid.UUID(window.worker).String())
 	}
 
 	return out
extern/sector-storage/sched_worker.go | 24 (vendored)
@@ -104,14 +104,16 @@ func (sw *schedWorker) handleWorker() {
 	defer sw.heartbeatTimer.Stop()
 
 	for {
-		sched.workersLk.Lock()
-		enabled := worker.enabled
-		sched.workersLk.Unlock()
+		{
+			sched.workersLk.Lock()
+			enabled := worker.enabled
+			sched.workersLk.Unlock()
 
 			// ask for more windows if we need them (non-blocking)
 			if enabled {
 				if !sw.requestWindows() {
 					return // graceful shutdown
+				}
 			}
 		}
 
@@ -123,12 +125,16 @@ func (sw *schedWorker) handleWorker() {
 		}
 
 		// session looks good
-		if !enabled {
+		{
 			sched.workersLk.Lock()
+			enabled := worker.enabled
 			worker.enabled = true
 			sched.workersLk.Unlock()
 
-			// we'll send window requests on the next loop
+			if !enabled {
+				// go send window requests
+				break
+			}
 		}
 
 		// wait for more tasks to be assigned by the main scheduler or for the worker
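The sched_worker.go hunks above are the reenabling fix from the PR title: after a successful session check the loop now re-reads worker.enabled under the lock, and when the worker transitions from disabled to enabled it breaks out immediately so new scheduling windows are requested, instead of only flipping the flag. A minimal sketch of that check-and-set-under-lock pattern, with generic names rather than the lotus scheduler types:

package main

import (
	"fmt"
	"sync"
)

type worker struct {
	lk      sync.Mutex
	enabled bool
}

// markEnabled flips the worker to enabled and reports whether it was
// previously disabled, i.e. whether the caller should immediately go
// request new scheduling windows rather than wait for the next loop.
func (w *worker) markEnabled() (wasDisabled bool) {
	w.lk.Lock()
	defer w.lk.Unlock()
	wasDisabled = !w.enabled
	w.enabled = true
	return wasDisabled
}

func main() {
	w := &worker{}

	if w.markEnabled() {
		fmt.Println("worker was disabled: request windows now")
	}
	if !w.markEnabled() {
		fmt.Println("already enabled: nothing extra to do")
	}
}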
extern/sector-storage/worker_local.go | 10 (vendored)
@@ -8,6 +8,7 @@ import (
 	"reflect"
 	"runtime"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/elastic/go-sysinfo"
@@ -51,8 +52,9 @@ type LocalWorker struct {
 	acceptTasks map[sealtasks.TaskType]struct{}
 	running     sync.WaitGroup
 
 	session     uuid.UUID
+	testDisable int64
 	closing     chan struct{}
 }
 
 func newLocalWorker(executor ExecutorFunc, wcfg WorkerConfig, store stores.Store, local *stores.Local, sindex stores.SectorIndex, ret storiface.WorkerReturn, cst *statestore.StateStore) *LocalWorker {
@@ -501,6 +503,10 @@ func (l *LocalWorker) Info(context.Context) (storiface.WorkerInfo, error) {
 }
 
 func (l *LocalWorker) Session(ctx context.Context) (uuid.UUID, error) {
+	if atomic.LoadInt64(&l.testDisable) == 1 {
+		return uuid.UUID{}, xerrors.Errorf("disabled")
+	}
+
 	select {
 	case <-l.closing:
 		return ClosedWorkerID, nil