Merge pull request #2053 from filecoin-project/feat/worker-reconnect

seal-worker: Auto-restart if miner dies
This commit is contained in:
Łukasz Magiera 2020-06-17 19:48:29 +02:00 committed by GitHub
commit 2800fd919b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 68 additions and 5 deletions

View File

@ -38,6 +38,8 @@ type Common interface {
// trigger graceful shutdown // trigger graceful shutdown
Shutdown(context.Context) error Shutdown(context.Context) error
Closing(context.Context) (<-chan struct{}, error)
} }
// Version provides various build-time information // Version provides various build-time information

View File

@ -50,7 +50,8 @@ type CommonStruct struct {
LogList func(context.Context) ([]string, error) `perm:"write"` LogList func(context.Context) ([]string, error) `perm:"write"`
LogSetLevel func(context.Context, string, string) error `perm:"write"` LogSetLevel func(context.Context, string, string) error `perm:"write"`
Shutdown func(context.Context) error `perm:"admin"` Shutdown func(context.Context) error `perm:"admin"`
Closing func(context.Context) (<-chan struct{}, error) `perm:"read"`
} }
} }
@ -313,6 +314,10 @@ func (c *CommonStruct) Shutdown(ctx context.Context) error {
return c.Internal.Shutdown(ctx) return c.Internal.Shutdown(ctx)
} }
// Closing forwards the call to the Internal.Closing function field and hands
// back the channel and error it produced, unchanged.
func (c *CommonStruct) Closing(ctx context.Context) (<-chan struct{}, error) {
	closing, err := c.Internal.Closing(ctx)
	return closing, err
}
// FullNodeStruct // FullNodeStruct
func (c *FullNodeStruct) ClientListImports(ctx context.Context) ([]api.Import, error) { func (c *FullNodeStruct) ClientListImports(ctx context.Context) ([]api.Import, error) {

View File

@ -3,11 +3,14 @@ package main
import ( import (
"context" "context"
"encoding/json" "encoding/json"
"fmt"
"io/ioutil" "io/ioutil"
"net" "net"
"net/http" "net/http"
"os" "os"
"path/filepath" "path/filepath"
"syscall"
"time"
"github.com/google/uuid" "github.com/google/uuid"
"github.com/gorilla/mux" "github.com/gorilla/mux"
@ -117,11 +120,19 @@ var runCmd = &cli.Command{
} }
// Connect to storage-miner // Connect to storage-miner
var nodeApi api.StorageMiner
nodeApi, closer, err := lcli.GetStorageMinerAPI(cctx) var closer func()
if err != nil { var err error
return xerrors.Errorf("getting miner api: %w", err) for {
nodeApi, closer, err = lcli.GetStorageMinerAPI(cctx)
if err == nil {
break
}
fmt.Printf("\r\x1b[0KConnecting to miner API... (%s)", err)
time.Sleep(time.Second)
continue
} }
defer closer() defer closer()
ctx := lcli.ReqContext(cctx) ctx := lcli.ReqContext(cctx)
ctx, cancel := context.WithCancel(ctx) ctx, cancel := context.WithCancel(ctx)
@ -136,6 +147,8 @@ var runCmd = &cli.Command{
} }
log.Infof("Remote version %s", v) log.Infof("Remote version %s", v)
watchMinerConn(ctx, cctx, nodeApi)
// Check params // Check params
act, err := nodeApi.ActorAddress(ctx) act, err := nodeApi.ActorAddress(ctx)
@ -317,3 +330,42 @@ var runCmd = &cli.Command{
return srv.Serve(nl) return srv.Serve(nl)
}, },
} }
// watchMinerConn starts a background goroutine that watches the connection to
// the miner node and re-executes the current binary when that connection is
// lost.
//
// The goroutine asks nodeApi for its closing channel and then blocks until
// either that channel fires or ctx is done. If ctx is done, this is treated as
// a graceful shutdown and nothing further happens. Otherwise the process image
// is replaced via syscall.Exec, re-running the "run" subcommand with the
// address, no-local-storage, precommit1, precommit2 and commit flag values
// read from cctx, and the current environment.
//
// The function itself returns immediately; failures inside the goroutine are
// logged and end the watch without restarting.
func watchMinerConn(ctx context.Context, cctx *cli.Context, nodeApi api.StorageMiner) {
	go func() {
		closing, err := nodeApi.Closing(ctx)
		if err != nil {
			log.Errorf("failed to get remote closing channel: %+v", err)
			// Without a closing channel there is nothing to watch; bail out
			// instead of parking on a nil channel until ctx ends.
			return
		}

		select {
		case <-closing:
		case <-ctx.Done():
		}

		if ctx.Err() != nil {
			return // graceful shutdown
		}

		log.Warnf("Connection with miner node lost, restarting")

		exe, err := os.Executable()
		if err != nil {
			log.Errorf("getting executable for auto-restart: %+v", err)
			// An unknown executable path cannot be exec'd; do not fall
			// through and call Exec with an empty path.
			return
		}

		// Called right before the process image is replaced; presumably this
		// flushes buffered log output so it is not lost across the exec.
		log.Sync()

		// TODO: there are probably cleaner/more graceful ways to restart,
		// but this is good enough for now (FSM can recover from the mess this creates)
		args := []string{exe, "run",
			fmt.Sprintf("--address=%s", cctx.String("address")),
			fmt.Sprintf("--no-local-storage=%t", cctx.Bool("no-local-storage")),
			fmt.Sprintf("--precommit1=%t", cctx.Bool("precommit1")),
			fmt.Sprintf("--precommit2=%t", cctx.Bool("precommit2")),
			fmt.Sprintf("--commit=%t", cctx.Bool("commit")),
		}

		// On success Exec does not return; reaching the error branch means
		// the restart did not happen.
		if err := syscall.Exec(exe, args, os.Environ()); err != nil {
			log.Errorf("re-executing %q for auto-restart: %+v", exe, err)
		}
	}()
}

View File

@ -139,4 +139,8 @@ func (a *CommonAPI) Shutdown(ctx context.Context) error {
return nil return nil
} }
// Closing returns a newly made channel together with a nil error. Nothing in
// this method closes or sends on that channel; as the inline note says, the
// behavior relies on jsonrpc closing.
func (a *CommonAPI) Closing(ctx context.Context) (<-chan struct{}, error) {
	never := make(chan struct{}) // relies on jsonrpc closing
	return never, nil
}
var _ api.Common = &CommonAPI{} var _ api.Common = &CommonAPI{}