diff --git a/api/api_common.go b/api/api_common.go index e5556d16f..6d47e35f7 100644 --- a/api/api_common.go +++ b/api/api_common.go @@ -38,6 +38,8 @@ type Common interface { // trigger graceful shutdown Shutdown(context.Context) error + + Closing(context.Context) (<-chan struct{}, error) } // Version provides various build-time information diff --git a/api/apistruct/struct.go b/api/apistruct/struct.go index 02d9e5155..dfa7a44ee 100644 --- a/api/apistruct/struct.go +++ b/api/apistruct/struct.go @@ -50,7 +50,8 @@ type CommonStruct struct { LogList func(context.Context) ([]string, error) `perm:"write"` LogSetLevel func(context.Context, string, string) error `perm:"write"` - Shutdown func(context.Context) error `perm:"admin"` + Shutdown func(context.Context) error `perm:"admin"` + Closing func(context.Context) (<-chan struct{}, error) `perm:"read"` } } @@ -313,6 +314,10 @@ func (c *CommonStruct) Shutdown(ctx context.Context) error { return c.Internal.Shutdown(ctx) } +func (c *CommonStruct) Closing(ctx context.Context) (<-chan struct{}, error) { + return c.Internal.Closing(ctx) +} + // FullNodeStruct func (c *FullNodeStruct) ClientListImports(ctx context.Context) ([]api.Import, error) { diff --git a/cmd/lotus-seal-worker/main.go b/cmd/lotus-seal-worker/main.go index 65418ead8..ff45687f8 100644 --- a/cmd/lotus-seal-worker/main.go +++ b/cmd/lotus-seal-worker/main.go @@ -3,11 +3,14 @@ package main import ( "context" "encoding/json" + "fmt" "io/ioutil" "net" "net/http" "os" "path/filepath" + "syscall" + "time" "github.com/google/uuid" "github.com/gorilla/mux" @@ -117,11 +120,19 @@ var runCmd = &cli.Command{ } // Connect to storage-miner - - nodeApi, closer, err := lcli.GetStorageMinerAPI(cctx) - if err != nil { - return xerrors.Errorf("getting miner api: %w", err) + var nodeApi api.StorageMiner + var closer func() + var err error + for { + nodeApi, closer, err = lcli.GetStorageMinerAPI(cctx) + if err == nil { + break + } + fmt.Printf("\r\x1b[0KConnecting to miner API... (%s)", err) + time.Sleep(time.Second) + continue } + defer closer() ctx := lcli.ReqContext(cctx) ctx, cancel := context.WithCancel(ctx) @@ -136,6 +147,8 @@ var runCmd = &cli.Command{ } log.Infof("Remote version %s", v) + watchMinerConn(ctx, cctx, nodeApi) + // Check params act, err := nodeApi.ActorAddress(ctx) @@ -317,3 +330,42 @@ var runCmd = &cli.Command{ return srv.Serve(nl) }, } + +func watchMinerConn(ctx context.Context, cctx *cli.Context, nodeApi api.StorageMiner) { + go func() { + closing, err := nodeApi.Closing(ctx) + if err != nil { + log.Errorf("failed to get remote closing channel: %+v", err) + } + + select { + case <-closing: + case <-ctx.Done(): + } + + if ctx.Err() != nil { + return // graceful shutdown + } + + log.Warnf("Connection with miner node lost, restarting") + + exe, err := os.Executable() + if err != nil { + log.Errorf("getting executable for auto-restart: %+v", err) + } + + log.Sync() + + // TODO: there are probably cleaner/more graceful ways to restart, + // but this is good enough for now (FSM can recover from the mess this creates) + if err := syscall.Exec(exe, []string{exe, "run", + fmt.Sprintf("--address=%s", cctx.String("address")), + fmt.Sprintf("--no-local-storage=%t", cctx.Bool("no-local-storage")), + fmt.Sprintf("--precommit1=%t", cctx.Bool("precommit1")), + fmt.Sprintf("--precommit2=%t", cctx.Bool("precommit2")), + fmt.Sprintf("--commit=%t", cctx.Bool("commit")), + }, os.Environ()); err != nil { + fmt.Println(err) + } + }() +} diff --git a/node/impl/common/common.go b/node/impl/common/common.go index aa85d9182..3a42872d9 100644 --- a/node/impl/common/common.go +++ b/node/impl/common/common.go @@ -139,4 +139,8 @@ func (a *CommonAPI) Shutdown(ctx context.Context) error { return nil } +func (a *CommonAPI) Closing(ctx context.Context) (<-chan struct{}, error) { + return make(chan struct{}), nil // relies on jsonrpc closing +} + var _ api.Common = &CommonAPI{}