worker: Handle multiple manager restarts while waiting for tasks

This commit is contained in:
Łukasz Magiera 2020-10-30 18:00:48 +01:00
parent c3d00b0ac6
commit af1d45d969

View File

@@ -451,14 +451,24 @@ var runCmd = &cli.Command{
return xerrors.Errorf("getting miner session: %w", err) return xerrors.Errorf("getting miner session: %w", err)
} }
waitQuietCh := func() chan struct{} {
out := make(chan struct{})
go func() {
workerApi.LocalWorker.WaitQuiet()
close(out)
}()
return out
}
go func() { go func() {
heartbeats := time.NewTicker(stores.HeartbeatInterval) heartbeats := time.NewTicker(stores.HeartbeatInterval)
defer heartbeats.Stop() defer heartbeats.Stop()
var connected, reconnect bool var redeclareStorage bool
var readyCh chan struct{}
for { for {
// If we're reconnecting, redeclare storage first // If we're reconnecting, redeclare storage first
if reconnect { if redeclareStorage {
log.Info("Redeclaring local storage") log.Info("Redeclaring local storage")
if err := localStore.Redeclare(ctx); err != nil { if err := localStore.Redeclare(ctx); err != nil {
@@ -471,14 +481,13 @@ var runCmd = &cli.Command{
} }
continue continue
} }
connected = false
} }
log.Info("Making sure no local tasks are running")
// TODO: we could get rid of this, but that requires tracking resources for restarted tasks correctly // TODO: we could get rid of this, but that requires tracking resources for restarted tasks correctly
workerApi.LocalWorker.WaitQuiet() if readyCh == nil {
log.Info("Making sure no local tasks are running")
readyCh = waitQuietCh()
}
for { for {
curSession, err := nodeApi.Session(ctx) curSession, err := nodeApi.Session(ctx)
@@ -489,8 +498,10 @@ var runCmd = &cli.Command{
minerSession = curSession minerSession = curSession
break break
} }
}
if !connected { select {
case <-readyCh:
if err := nodeApi.WorkerConnect(ctx, "http://"+address+"/rpc/v0"); err != nil { if err := nodeApi.WorkerConnect(ctx, "http://"+address+"/rpc/v0"); err != nil {
log.Errorf("Registering worker failed: %+v", err) log.Errorf("Registering worker failed: %+v", err)
cancel() cancel()
@@ -498,20 +509,17 @@ var runCmd = &cli.Command{
} }
log.Info("Worker registered successfully, waiting for tasks") log.Info("Worker registered successfully, waiting for tasks")
connected = true
}
}
select { readyCh = nil
case <-heartbeats.C:
case <-ctx.Done(): case <-ctx.Done():
return // graceful shutdown return // graceful shutdown
case <-heartbeats.C:
} }
} }
log.Errorf("LOTUS-MINER CONNECTION LOST") log.Errorf("LOTUS-MINER CONNECTION LOST")
reconnect = true redeclareStorage = true
} }
}() }()