// Source: lotus/cmd/lotus-seal-worker/main.go (427 lines, 11 KiB, Go)

package main
2019-11-21 00:52:59 +00:00
import (
2020-03-13 01:37:38 +00:00
"context"
2020-03-16 18:46:02 +00:00
"encoding/json"
"fmt"
2020-03-16 18:46:02 +00:00
"io/ioutil"
2020-03-16 17:50:07 +00:00
"net"
2020-03-13 01:37:38 +00:00
"net/http"
2019-11-21 00:52:59 +00:00
"os"
2020-03-16 18:46:02 +00:00
"path/filepath"
"strings"
"syscall"
"time"
2020-03-16 18:46:02 +00:00
"github.com/google/uuid"
2020-03-13 01:37:38 +00:00
"github.com/gorilla/mux"
logging "github.com/ipfs/go-log/v2"
"github.com/urfave/cli/v2"
2020-06-05 22:59:01 +00:00
"golang.org/x/xerrors"
2019-11-21 00:52:59 +00:00
2020-05-20 17:43:22 +00:00
"github.com/filecoin-project/go-jsonrpc"
2020-05-20 18:23:51 +00:00
"github.com/filecoin-project/go-jsonrpc/auth"
2020-03-11 01:57:52 +00:00
paramfetch "github.com/filecoin-project/go-paramfetch"
"github.com/filecoin-project/sector-storage/ffiwrapper"
2020-03-23 11:40:02 +00:00
2020-03-11 01:57:52 +00:00
"github.com/filecoin-project/lotus/api"
2020-03-13 01:37:38 +00:00
"github.com/filecoin-project/lotus/api/apistruct"
2019-11-21 00:52:59 +00:00
"github.com/filecoin-project/lotus/build"
2020-03-11 01:57:52 +00:00
lcli "github.com/filecoin-project/lotus/cli"
2020-01-08 13:49:34 +00:00
"github.com/filecoin-project/lotus/lib/lotuslog"
"github.com/filecoin-project/lotus/node/repo"
sectorstorage "github.com/filecoin-project/sector-storage"
2020-03-27 23:00:21 +00:00
"github.com/filecoin-project/sector-storage/sealtasks"
"github.com/filecoin-project/sector-storage/stores"
2019-11-21 00:52:59 +00:00
)
// log is the package-level logger used throughout this file; it is created
// with the name "main".
var log = logging.Logger("main")
// FlagWorkerRepo is the name of the CLI flag that selects the worker repo
// path.
const FlagWorkerRepo = "worker-repo"

// FlagWorkerRepoDeprecation is the old spelling of FlagWorkerRepo, still
// registered as an alias of that flag.
//
// TODO remove after deprecation period
const FlagWorkerRepoDeprecation = "workerrepo"
2020-03-13 01:37:38 +00:00
2019-11-21 00:52:59 +00:00
func main() {
2020-01-08 13:49:34 +00:00
lotuslog.SetupLogLevels()
2019-11-21 00:52:59 +00:00
log.Info("Starting lotus worker")
local := []*cli.Command{
runCmd,
}
app := &cli.App{
Name: "lotus-worker",
Usage: "Remote miner worker",
2020-06-01 18:43:51 +00:00
Version: build.UserVersion(),
2019-11-21 00:52:59 +00:00
Flags: []cli.Flag{
&cli.StringFlag{
2020-07-08 10:38:59 +00:00
Name: FlagWorkerRepo,
2020-07-10 12:18:09 +00:00
Aliases: []string{FlagWorkerRepoDeprecation},
EnvVars: []string{"LOTUS_WORKER_PATH", "WORKER_PATH"},
2019-11-21 00:52:59 +00:00
Value: "~/.lotusworker", // TODO: Consider XDG_DATA_HOME
2020-07-10 12:18:09 +00:00
Usage: fmt.Sprintf("Specify worker repo path. flag %s and env WORKER_PATH are DEPRECATION, will REMOVE SOON", FlagWorkerRepoDeprecation),
2019-11-21 00:52:59 +00:00
},
&cli.StringFlag{
2020-07-08 10:38:59 +00:00
Name: "miner-repo",
2020-07-10 12:18:09 +00:00
Aliases: []string{"storagerepo"},
EnvVars: []string{"LOTUS_MINER_PATH", "LOTUS_STORAGE_PATH"},
2020-07-08 10:38:59 +00:00
Value: "~/.lotusminer", // TODO: Consider XDG_DATA_HOME
2020-07-10 12:18:09 +00:00
Usage: fmt.Sprintf("Specify miner repo path. flag storagerepo and env LOTUS_STORAGE_PATH are DEPRECATION, will REMOVE SOON"),
2019-11-21 00:52:59 +00:00
},
2019-12-07 14:19:46 +00:00
&cli.BoolFlag{
Name: "enable-gpu-proving",
Usage: "enable use of GPU for mining operations",
Value: true,
},
2019-11-21 00:52:59 +00:00
},
Commands: local,
}
app.Setup()
2020-03-25 21:15:10 +00:00
app.Metadata["repoType"] = repo.Worker
2019-11-21 00:52:59 +00:00
if err := app.Run(os.Args); err != nil {
2019-11-21 18:38:43 +00:00
log.Warnf("%+v", err)
2019-11-21 00:52:59 +00:00
return
}
}
var runCmd = &cli.Command{
Name: "run",
Usage: "Start lotus worker",
2020-03-16 17:50:07 +00:00
Flags: []cli.Flag{
&cli.StringFlag{
Name: "address",
Usage: "locally reachable address",
Value: "0.0.0.0:3456",
2020-03-16 17:50:07 +00:00
},
2020-03-16 18:46:02 +00:00
&cli.BoolFlag{
Name: "no-local-storage",
Usage: "don't use storageminer repo for sector storage",
},
2020-03-25 21:15:10 +00:00
&cli.BoolFlag{
Name: "precommit1",
Usage: "enable precommit1 (32G sectors: 1 core, 128GiB Memory)",
Value: true,
},
&cli.BoolFlag{
Name: "precommit2",
Usage: "enable precommit2 (32G sectors: all cores, 96GiB Memory)",
Value: true,
},
&cli.BoolFlag{
Name: "commit",
Usage: "enable commit (32G sectors: all cores or GPUs, 128GiB Memory + 64GiB swap)",
Value: true,
},
&cli.IntFlag{
Name: "parallel-fetch-limit",
Usage: "maximum fetch operations to run in parallel",
Value: 5,
},
&cli.StringFlag{
Name: "timeout",
Usage: "used when address is unspecified. must be a valid duration recognized by golang's time.ParseDuration function",
Value: "30m",
},
2020-03-16 17:50:07 +00:00
},
2019-11-21 00:52:59 +00:00
Action: func(cctx *cli.Context) error {
2020-03-11 01:57:52 +00:00
if !cctx.Bool("enable-gpu-proving") {
if err := os.Setenv("BELLMAN_NO_GPU", "true"); err != nil {
return xerrors.Errorf("could not set no-gpu env: %+v", err)
}
2020-03-11 01:57:52 +00:00
}
2020-03-16 17:50:07 +00:00
// Connect to storage-miner
var nodeApi api.StorageMiner
var closer func()
var err error
for {
nodeApi, closer, err = lcli.GetStorageMinerAPI(cctx)
if err == nil {
break
}
fmt.Printf("\r\x1b[0KConnecting to miner API... (%s)", err)
time.Sleep(time.Second)
continue
2020-03-11 01:57:52 +00:00
}
2020-03-11 01:57:52 +00:00
defer closer()
ctx := lcli.ReqContext(cctx)
2020-03-18 23:23:28 +00:00
ctx, cancel := context.WithCancel(ctx)
defer cancel()
2020-03-11 01:57:52 +00:00
v, err := nodeApi.Version(ctx)
if err != nil {
return err
}
if v.APIVersion != build.APIVersion {
2020-07-08 10:38:59 +00:00
return xerrors.Errorf("lotus-miner API version doesn't match: local: ", api.Version{APIVersion: build.APIVersion})
2020-03-11 01:57:52 +00:00
}
2020-03-13 01:37:38 +00:00
log.Infof("Remote version %s", v)
2020-03-11 01:57:52 +00:00
watchMinerConn(ctx, cctx, nodeApi)
2020-03-16 17:50:07 +00:00
// Check params
2020-03-11 01:57:52 +00:00
act, err := nodeApi.ActorAddress(ctx)
if err != nil {
return err
}
ssize, err := nodeApi.ActorSectorSize(ctx, act)
if err != nil {
return err
}
2020-03-25 21:15:10 +00:00
if cctx.Bool("commit") {
if err := paramfetch.GetParams(ctx, build.ParametersJSON(), uint64(ssize)); err != nil {
2020-03-25 21:15:10 +00:00
return xerrors.Errorf("get params: %w", err)
}
}
var taskTypes []sealtasks.TaskType
2020-06-04 13:54:54 +00:00
taskTypes = append(taskTypes, sealtasks.TTFetch, sealtasks.TTCommit1, sealtasks.TTFinalize)
2020-03-25 21:15:10 +00:00
if cctx.Bool("precommit1") {
taskTypes = append(taskTypes, sealtasks.TTPreCommit1)
}
if cctx.Bool("precommit2") {
taskTypes = append(taskTypes, sealtasks.TTPreCommit2)
}
if cctx.Bool("commit") {
taskTypes = append(taskTypes, sealtasks.TTCommit2)
}
if len(taskTypes) == 0 {
return xerrors.Errorf("no task types specified")
2020-03-11 01:57:52 +00:00
}
2020-03-16 17:50:07 +00:00
// Open repo
2020-07-08 10:38:59 +00:00
repoPath := cctx.String(FlagWorkerRepo)
2020-03-13 01:37:38 +00:00
r, err := repo.NewFS(repoPath)
if err != nil {
return err
}
ok, err := r.Exists()
if err != nil {
return err
}
if !ok {
2020-03-16 18:46:02 +00:00
if err := r.Init(repo.Worker); err != nil {
return err
}
lr, err := r.Lock(repo.Worker)
if err != nil {
return err
}
var localPaths []stores.LocalPath
2020-03-16 18:46:02 +00:00
if !cctx.Bool("no-local-storage") {
2020-03-19 15:10:19 +00:00
b, err := json.MarshalIndent(&stores.LocalStorageMeta{
2020-03-16 18:46:02 +00:00
ID: stores.ID(uuid.New().String()),
Weight: 10,
CanSeal: true,
CanStore: false,
}, "", " ")
if err != nil {
return xerrors.Errorf("marshaling storage config: %w", err)
}
if err := ioutil.WriteFile(filepath.Join(lr.Path(), "sectorstore.json"), b, 0644); err != nil {
return xerrors.Errorf("persisting storage metadata (%s): %w", filepath.Join(lr.Path(), "sectorstore.json"), err)
}
localPaths = append(localPaths, stores.LocalPath{
2020-03-16 18:46:02 +00:00
Path: lr.Path(),
})
}
if err := lr.SetStorage(func(sc *stores.StorageConfig) {
2020-03-16 18:46:02 +00:00
sc.StoragePaths = append(sc.StoragePaths, localPaths...)
}); err != nil {
return xerrors.Errorf("set storage config: %w", err)
}
2020-03-18 23:23:28 +00:00
{
// init datastore for r.Exists
2020-07-10 14:13:35 +00:00
_, err := lr.Datastore("/metadata")
2020-03-18 23:23:28 +00:00
if err != nil {
return err
}
}
2020-03-16 18:46:02 +00:00
if err := lr.Close(); err != nil {
return xerrors.Errorf("close repo: %w", err)
}
2020-03-13 01:37:38 +00:00
}
lr, err := r.Lock(repo.Worker)
if err != nil {
return err
}
2020-03-19 15:10:19 +00:00
log.Info("Opening local storage; connecting to master")
const unspecifiedAddress = "0.0.0.0"
address := cctx.String("address")
addressSlice := strings.Split(address, ":")
if ip := net.ParseIP(addressSlice[0]); ip != nil {
if ip.String() == unspecifiedAddress {
timeout, err := time.ParseDuration(cctx.String("timeout"))
if err != nil {
return err
}
rip, err := extractRoutableIP(timeout)
if err != nil {
return err
}
address = rip + ":" + addressSlice[1]
}
}
2020-03-19 15:10:19 +00:00
localStore, err := stores.NewLocal(ctx, lr, nodeApi, []string{"http://" + address + "/remote"})
2020-03-13 01:37:38 +00:00
if err != nil {
return err
}
2020-03-16 17:50:07 +00:00
// Setup remote sector store
2020-04-10 21:29:05 +00:00
spt, err := ffiwrapper.SealProofTypeFromSectorSize(ssize)
2020-03-13 01:37:38 +00:00
if err != nil {
return xerrors.Errorf("getting proof type: %w", err)
}
sminfo, err := lcli.GetAPIInfo(cctx, repo.StorageMiner)
if err != nil {
return xerrors.Errorf("could not get api info: %w", err)
}
remote := stores.NewRemote(localStore, nodeApi, sminfo.AuthHeader(), cctx.Int("parallel-fetch-limit"))
2020-03-13 01:37:38 +00:00
2020-03-16 17:50:07 +00:00
// Create / expose the worker
2020-03-13 01:37:38 +00:00
workerApi := &worker{
2020-03-23 11:40:02 +00:00
LocalWorker: sectorstorage.NewLocalWorker(sectorstorage.WorkerConfig{
SealProof: spt,
2020-03-25 21:15:10 +00:00
TaskTypes: taskTypes,
}, remote, localStore, nodeApi),
2020-03-13 01:37:38 +00:00
}
mux := mux.NewRouter()
log.Info("Setting up control endpoint at " + address)
2020-03-18 04:40:25 +00:00
2020-03-13 01:37:38 +00:00
rpcServer := jsonrpc.NewServer()
rpcServer.Register("Filecoin", apistruct.PermissionedWorkerAPI(workerApi))
mux.Handle("/rpc/v0", rpcServer)
mux.PathPrefix("/remote").HandlerFunc((&stores.FetchHandler{Local: localStore}).ServeHTTP)
2020-03-13 01:37:38 +00:00
mux.PathPrefix("/").Handler(http.DefaultServeMux) // pprof
ah := &auth.Handler{
Verify: nodeApi.AuthVerify,
Next: mux.ServeHTTP,
}
2020-03-18 23:23:28 +00:00
srv := &http.Server{
Handler: ah,
BaseContext: func(listener net.Listener) context.Context {
return ctx
},
}
2020-03-13 01:37:38 +00:00
go func() {
2020-03-18 04:40:25 +00:00
<-ctx.Done()
2020-06-02 19:30:45 +00:00
log.Warn("Shutting down...")
2020-03-13 01:37:38 +00:00
if err := srv.Shutdown(context.TODO()); err != nil {
log.Errorf("shutting down RPC server failed: %s", err)
}
log.Warn("Graceful shutdown successful")
}()
2020-03-16 17:50:07 +00:00
nl, err := net.Listen("tcp", address)
2020-03-16 17:50:07 +00:00
if err != nil {
return err
}
2020-03-18 04:40:25 +00:00
log.Info("Waiting for tasks")
2020-03-18 23:23:28 +00:00
go func() {
if err := nodeApi.WorkerConnect(ctx, "ws://"+address+"/rpc/v0"); err != nil {
2020-03-18 23:23:28 +00:00
log.Errorf("Registering worker failed: %+v", err)
cancel()
return
}
}()
2020-03-16 17:50:07 +00:00
return srv.Serve(nl)
2019-11-21 00:52:59 +00:00
},
}
// watchMinerConn starts a goroutine that waits for either the miner
// connection's closing channel or ctx to fire. If ctx has ended, the
// goroutine returns without doing anything. Otherwise the connection is
// considered lost and the process re-executes itself via syscall.Exec with
// the flag values of the current invocation, so the restarted worker keeps
// the same configuration.
func watchMinerConn(ctx context.Context, cctx *cli.Context, nodeApi api.StorageMiner) {
	go func() {
		closing, err := nodeApi.Closing(ctx)
		if err != nil {
			// NOTE(review): presumably closing is nil here, in which case the
			// select below only wakes on ctx cancellation — confirm.
			log.Errorf("failed to get remote closing channel: %+v", err)
		}

		select {
		case <-closing:
		case <-ctx.Done():
		}

		if ctx.Err() != nil {
			return // graceful shutdown
		}

		log.Warnf("Connection with miner node lost, restarting")

		exe, err := os.Executable()
		if err != nil {
			// Without the executable path there is nothing to exec.
			log.Errorf("getting executable for auto-restart: %+v", err)
			return
		}

		// Flush buffered log output: a successful Exec replaces this process
		// image and never returns.
		log.Sync()

		// TODO: there are probably cleaner/more graceful ways to restart,
		// but this is good enough for now (FSM can recover from the mess this creates)
		args := []string{exe,
			// Global flags go before the subcommand name.
			fmt.Sprintf("--%s=%s", FlagWorkerRepo, cctx.String(FlagWorkerRepo)),
			fmt.Sprintf("--miner-repo=%s", cctx.String("miner-repo")),
			fmt.Sprintf("--enable-gpu-proving=%t", cctx.Bool("enable-gpu-proving")),
			"run",
			fmt.Sprintf("--address=%s", cctx.String("address")),
			fmt.Sprintf("--no-local-storage=%t", cctx.Bool("no-local-storage")),
			fmt.Sprintf("--precommit1=%t", cctx.Bool("precommit1")),
			fmt.Sprintf("--precommit2=%t", cctx.Bool("precommit2")),
			fmt.Sprintf("--commit=%t", cctx.Bool("commit")),
			fmt.Sprintf("--parallel-fetch-limit=%d", cctx.Int("parallel-fetch-limit")),
			fmt.Sprintf("--timeout=%s", cctx.String("timeout")),
		}
		if err := syscall.Exec(exe, args, os.Environ()); err != nil {
			fmt.Println(err)
		}
	}()
}
func extractRoutableIP(timeout time.Duration) (string, error) {
minerMultiAddrKey := "MINER_API_INFO"
deprecatedMinerMultiAddrKey := "STORAGE_API_INFO"
env, ok := os.LookupEnv(minerMultiAddrKey)
if !ok {
// TODO remove after deprecation period
env, ok = os.LookupEnv(deprecatedMinerMultiAddrKey)
if ok {
log.Warnf("Using a deprecated env(%s) value, please use env(%s) instead.", deprecatedMinerMultiAddrKey, minerMultiAddrKey)
}
return "", xerrors.New("MINER_API_INFO environment variable required to extract IP")
}
minerAddr := strings.Split(env, "/")
conn, err := net.DialTimeout("tcp", minerAddr[2]+":"+minerAddr[4], timeout)
if err != nil {
return "", err
}
defer conn.Close()
localAddr := conn.LocalAddr().(*net.TCPAddr)
return strings.Split(localAddr.IP.String(), ":")[0], nil
}