Merge pull request #2053 from filecoin-project/feat/worker-reconnect
seal-worker: Auto-restart if miner dies
This commit is contained in:
commit
2800fd919b
@ -38,6 +38,8 @@ type Common interface {
|
|||||||
|
|
||||||
// trigger graceful shutdown
|
// trigger graceful shutdown
|
||||||
Shutdown(context.Context) error
|
Shutdown(context.Context) error
|
||||||
|
|
||||||
|
Closing(context.Context) (<-chan struct{}, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Version provides various build-time information
|
// Version provides various build-time information
|
||||||
|
@ -51,6 +51,7 @@ type CommonStruct struct {
|
|||||||
LogSetLevel func(context.Context, string, string) error `perm:"write"`
|
LogSetLevel func(context.Context, string, string) error `perm:"write"`
|
||||||
|
|
||||||
Shutdown func(context.Context) error `perm:"admin"`
|
Shutdown func(context.Context) error `perm:"admin"`
|
||||||
|
Closing func(context.Context) (<-chan struct{}, error) `perm:"read"`
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -313,6 +314,10 @@ func (c *CommonStruct) Shutdown(ctx context.Context) error {
|
|||||||
return c.Internal.Shutdown(ctx)
|
return c.Internal.Shutdown(ctx)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *CommonStruct) Closing(ctx context.Context) (<-chan struct{}, error) {
|
||||||
|
return c.Internal.Closing(ctx)
|
||||||
|
}
|
||||||
|
|
||||||
// FullNodeStruct
|
// FullNodeStruct
|
||||||
|
|
||||||
func (c *FullNodeStruct) ClientListImports(ctx context.Context) ([]api.Import, error) {
|
func (c *FullNodeStruct) ClientListImports(ctx context.Context) ([]api.Import, error) {
|
||||||
|
@ -3,11 +3,14 @@ package main
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
"github.com/gorilla/mux"
|
"github.com/gorilla/mux"
|
||||||
@ -117,11 +120,19 @@ var runCmd = &cli.Command{
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Connect to storage-miner
|
// Connect to storage-miner
|
||||||
|
var nodeApi api.StorageMiner
|
||||||
nodeApi, closer, err := lcli.GetStorageMinerAPI(cctx)
|
var closer func()
|
||||||
if err != nil {
|
var err error
|
||||||
return xerrors.Errorf("getting miner api: %w", err)
|
for {
|
||||||
|
nodeApi, closer, err = lcli.GetStorageMinerAPI(cctx)
|
||||||
|
if err == nil {
|
||||||
|
break
|
||||||
}
|
}
|
||||||
|
fmt.Printf("\r\x1b[0KConnecting to miner API... (%s)", err)
|
||||||
|
time.Sleep(time.Second)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
defer closer()
|
defer closer()
|
||||||
ctx := lcli.ReqContext(cctx)
|
ctx := lcli.ReqContext(cctx)
|
||||||
ctx, cancel := context.WithCancel(ctx)
|
ctx, cancel := context.WithCancel(ctx)
|
||||||
@ -136,6 +147,8 @@ var runCmd = &cli.Command{
|
|||||||
}
|
}
|
||||||
log.Infof("Remote version %s", v)
|
log.Infof("Remote version %s", v)
|
||||||
|
|
||||||
|
watchMinerConn(ctx, cctx, nodeApi)
|
||||||
|
|
||||||
// Check params
|
// Check params
|
||||||
|
|
||||||
act, err := nodeApi.ActorAddress(ctx)
|
act, err := nodeApi.ActorAddress(ctx)
|
||||||
@ -317,3 +330,42 @@ var runCmd = &cli.Command{
|
|||||||
return srv.Serve(nl)
|
return srv.Serve(nl)
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func watchMinerConn(ctx context.Context, cctx *cli.Context, nodeApi api.StorageMiner) {
|
||||||
|
go func() {
|
||||||
|
closing, err := nodeApi.Closing(ctx)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("failed to get remote closing channel: %+v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-closing:
|
||||||
|
case <-ctx.Done():
|
||||||
|
}
|
||||||
|
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return // graceful shutdown
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Warnf("Connection with miner node lost, restarting")
|
||||||
|
|
||||||
|
exe, err := os.Executable()
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("getting executable for auto-restart: %+v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Sync()
|
||||||
|
|
||||||
|
// TODO: there are probably cleaner/more graceful ways to restart,
|
||||||
|
// but this is good enough for now (FSM can recover from the mess this creates)
|
||||||
|
if err := syscall.Exec(exe, []string{exe, "run",
|
||||||
|
fmt.Sprintf("--address=%s", cctx.String("address")),
|
||||||
|
fmt.Sprintf("--no-local-storage=%t", cctx.Bool("no-local-storage")),
|
||||||
|
fmt.Sprintf("--precommit1=%t", cctx.Bool("precommit1")),
|
||||||
|
fmt.Sprintf("--precommit2=%t", cctx.Bool("precommit2")),
|
||||||
|
fmt.Sprintf("--commit=%t", cctx.Bool("commit")),
|
||||||
|
}, os.Environ()); err != nil {
|
||||||
|
fmt.Println(err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
@ -139,4 +139,8 @@ func (a *CommonAPI) Shutdown(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *CommonAPI) Closing(ctx context.Context) (<-chan struct{}, error) {
|
||||||
|
return make(chan struct{}), nil // relies on jsonrpc closing
|
||||||
|
}
|
||||||
|
|
||||||
var _ api.Common = &CommonAPI{}
|
var _ api.Common = &CommonAPI{}
|
||||||
|
Loading…
Reference in New Issue
Block a user