lotus/extern/sector-storage/worker_local.go
2020-10-30 18:01:37 +01:00

557 lines
16 KiB
Go

package sectorstorage
import (
"context"
"encoding/json"
"io"
"os"
"reflect"
"runtime"
"sync"
"sync/atomic"
"time"
"github.com/elastic/go-sysinfo"
"github.com/google/uuid"
"github.com/hashicorp/go-multierror"
"github.com/ipfs/go-cid"
"golang.org/x/xerrors"
ffi "github.com/filecoin-project/filecoin-ffi"
"github.com/filecoin-project/go-state-types/abi"
"github.com/filecoin-project/go-statestore"
storage2 "github.com/filecoin-project/specs-storage/storage"
"github.com/filecoin-project/lotus/extern/sector-storage/ffiwrapper"
"github.com/filecoin-project/lotus/extern/sector-storage/sealtasks"
"github.com/filecoin-project/lotus/extern/sector-storage/stores"
"github.com/filecoin-project/lotus/extern/sector-storage/storiface"
)
var pathTypes = []storiface.SectorFileType{storiface.FTUnsealed, storiface.FTSealed, storiface.FTCache}
type WorkerConfig struct {
SealProof abi.RegisteredSealProof
TaskTypes []sealtasks.TaskType
NoSwap bool
}
// used do provide custom proofs impl (mostly used in testing)
type ExecutorFunc func() (ffiwrapper.Storage, error)
type LocalWorker struct {
scfg *ffiwrapper.Config
storage stores.Store
localStore *stores.Local
sindex stores.SectorIndex
ret storiface.WorkerReturn
executor ExecutorFunc
noSwap bool
ct *workerCallTracker
acceptTasks map[sealtasks.TaskType]struct{}
running sync.WaitGroup
session uuid.UUID
testDisable int64
closing chan struct{}
}
func newLocalWorker(executor ExecutorFunc, wcfg WorkerConfig, store stores.Store, local *stores.Local, sindex stores.SectorIndex, ret storiface.WorkerReturn, cst *statestore.StateStore) *LocalWorker {
acceptTasks := map[sealtasks.TaskType]struct{}{}
for _, taskType := range wcfg.TaskTypes {
acceptTasks[taskType] = struct{}{}
}
w := &LocalWorker{
scfg: &ffiwrapper.Config{
SealProofType: wcfg.SealProof,
},
storage: store,
localStore: local,
sindex: sindex,
ret: ret,
ct: &workerCallTracker{
st: cst,
},
acceptTasks: acceptTasks,
executor: executor,
noSwap: wcfg.NoSwap,
session: uuid.New(),
closing: make(chan struct{}),
}
if w.executor == nil {
w.executor = w.ffiExec
}
unfinished, err := w.ct.unfinished()
if err != nil {
log.Errorf("reading unfinished tasks: %+v", err)
return w
}
go func() {
for _, call := range unfinished {
err := xerrors.Errorf("worker restarted")
// TODO: Handle restarting PC1 once support is merged
if doReturn(context.TODO(), call.RetType, call.ID, ret, nil, err) {
if err := w.ct.onReturned(call.ID); err != nil {
log.Errorf("marking call as returned failed: %s: %+v", call.RetType, err)
}
}
}
}()
return w
}
func NewLocalWorker(wcfg WorkerConfig, store stores.Store, local *stores.Local, sindex stores.SectorIndex, ret storiface.WorkerReturn, cst *statestore.StateStore) *LocalWorker {
return newLocalWorker(nil, wcfg, store, local, sindex, ret, cst)
}
type localWorkerPathProvider struct {
w *LocalWorker
op storiface.AcquireMode
}
func (l *localWorkerPathProvider) AcquireSector(ctx context.Context, sector abi.SectorID, existing storiface.SectorFileType, allocate storiface.SectorFileType, sealing storiface.PathType) (storiface.SectorPaths, func(), error) {
ssize, err := l.w.scfg.SealProofType.SectorSize()
if err != nil {
return storiface.SectorPaths{}, nil, err
}
paths, storageIDs, err := l.w.storage.AcquireSector(ctx, sector, ssize, existing, allocate, sealing, l.op)
if err != nil {
return storiface.SectorPaths{}, nil, err
}
releaseStorage, err := l.w.localStore.Reserve(ctx, sector, ssize, allocate, storageIDs, storiface.FSOverheadSeal)
if err != nil {
return storiface.SectorPaths{}, nil, xerrors.Errorf("reserving storage space: %w", err)
}
log.Debugf("acquired sector %d (e:%d; a:%d): %v", sector, existing, allocate, paths)
return paths, func() {
releaseStorage()
for _, fileType := range pathTypes {
if fileType&allocate == 0 {
continue
}
sid := storiface.PathByType(storageIDs, fileType)
if err := l.w.sindex.StorageDeclareSector(ctx, stores.ID(sid), sector, fileType, l.op == storiface.AcquireMove); err != nil {
log.Errorf("declare sector error: %+v", err)
}
}
}, nil
}
func (l *LocalWorker) ffiExec() (ffiwrapper.Storage, error) {
return ffiwrapper.New(&localWorkerPathProvider{w: l}, l.scfg)
}
type ReturnType string
// in: func(WorkerReturn, context.Context, CallID, err string)
// in: func(WorkerReturn, context.Context, CallID, ret T, err string)
func rfunc(in interface{}) func(context.Context, storiface.CallID, storiface.WorkerReturn, interface{}, error) error {
rf := reflect.ValueOf(in)
ft := rf.Type()
withRet := ft.NumIn() == 5
return func(ctx context.Context, ci storiface.CallID, wr storiface.WorkerReturn, i interface{}, err error) error {
rctx := reflect.ValueOf(ctx)
rwr := reflect.ValueOf(wr)
rerr := reflect.ValueOf(errstr(err))
rci := reflect.ValueOf(ci)
var ro []reflect.Value
if withRet {
ret := reflect.ValueOf(i)
if i == nil {
ret = reflect.Zero(rf.Type().In(3))
}
ro = rf.Call([]reflect.Value{rwr, rctx, rci, ret, rerr})
} else {
ro = rf.Call([]reflect.Value{rwr, rctx, rci, rerr})
}
if !ro[0].IsNil() {
return ro[0].Interface().(error)
}
return nil
}
}
var returnFunc = map[ReturnType]func(context.Context, storiface.CallID, storiface.WorkerReturn, interface{}, error) error{
"AddPiece": rfunc(storiface.WorkerReturn.ReturnAddPiece),
"SealPreCommit1": rfunc(storiface.WorkerReturn.ReturnSealPreCommit1),
"SealPreCommit2": rfunc(storiface.WorkerReturn.ReturnSealPreCommit2),
"SealCommit1": rfunc(storiface.WorkerReturn.ReturnSealCommit1),
"SealCommit2": rfunc(storiface.WorkerReturn.ReturnSealCommit2),
"FinalizeSector": rfunc(storiface.WorkerReturn.ReturnFinalizeSector),
"ReleaseUnsealed": rfunc(storiface.WorkerReturn.ReturnReleaseUnsealed),
"MoveStorage": rfunc(storiface.WorkerReturn.ReturnMoveStorage),
"UnsealPiece": rfunc(storiface.WorkerReturn.ReturnUnsealPiece),
"ReadPiece": rfunc(storiface.WorkerReturn.ReturnReadPiece),
"Fetch": rfunc(storiface.WorkerReturn.ReturnFetch),
}
func (l *LocalWorker) asyncCall(ctx context.Context, sector abi.SectorID, rt ReturnType, work func(ctx context.Context, ci storiface.CallID) (interface{}, error)) (storiface.CallID, error) {
ci := storiface.CallID{
Sector: sector,
ID: uuid.New(),
}
if err := l.ct.onStart(ci, rt); err != nil {
log.Errorf("tracking call (start): %+v", err)
}
l.running.Add(1)
go func() {
defer l.running.Done()
ctx := &wctx{
vals: ctx,
closing: l.closing,
}
res, err := work(ctx, ci)
if err != nil {
rb, err := json.Marshal(res)
if err != nil {
log.Errorf("tracking call (marshaling results): %+v", err)
} else {
if err := l.ct.onDone(ci, rb); err != nil {
log.Errorf("tracking call (done): %+v", err)
}
}
}
if doReturn(ctx, rt, ci, l.ret, res, err) {
if err := l.ct.onReturned(ci); err != nil {
log.Errorf("tracking call (done): %+v", err)
}
}
}()
return ci, nil
}
// doReturn tries to send the result to manager, returns true if successful
func doReturn(ctx context.Context, rt ReturnType, ci storiface.CallID, ret storiface.WorkerReturn, res interface{}, rerr error) bool {
for {
err := returnFunc[rt](ctx, ci, ret, res, rerr)
if err == nil {
break
}
log.Errorf("return error, will retry in 5s: %s: %+v", rt, err)
select {
case <-time.After(5 * time.Second):
case <-ctx.Done():
log.Errorf("failed to return results: %s", ctx.Err())
// fine to just return, worker is most likely shutting down, and
// we didn't mark the result as returned yet, so we'll try to
// re-submit it on restart
return false
}
}
return true
}
func errstr(err error) string {
if err != nil {
return err.Error()
}
return ""
}
func (l *LocalWorker) NewSector(ctx context.Context, sector abi.SectorID) error {
sb, err := l.executor()
if err != nil {
return err
}
return sb.NewSector(ctx, sector)
}
func (l *LocalWorker) AddPiece(ctx context.Context, sector abi.SectorID, epcs []abi.UnpaddedPieceSize, sz abi.UnpaddedPieceSize, r io.Reader) (storiface.CallID, error) {
sb, err := l.executor()
if err != nil {
return storiface.UndefCall, err
}
return l.asyncCall(ctx, sector, "AddPiece", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
return sb.AddPiece(ctx, sector, epcs, sz, r)
})
}
func (l *LocalWorker) Fetch(ctx context.Context, sector abi.SectorID, fileType storiface.SectorFileType, ptype storiface.PathType, am storiface.AcquireMode) (storiface.CallID, error) {
return l.asyncCall(ctx, sector, "Fetch", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
_, done, err := (&localWorkerPathProvider{w: l, op: am}).AcquireSector(ctx, sector, fileType, storiface.FTNone, ptype)
if err == nil {
done()
}
return nil, err
})
}
func (l *LocalWorker) SealPreCommit1(ctx context.Context, sector abi.SectorID, ticket abi.SealRandomness, pieces []abi.PieceInfo) (storiface.CallID, error) {
return l.asyncCall(ctx, sector, "SealPreCommit1", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
{
// cleanup previous failed attempts if they exist
if err := l.storage.Remove(ctx, sector, storiface.FTSealed, true); err != nil {
return nil, xerrors.Errorf("cleaning up sealed data: %w", err)
}
if err := l.storage.Remove(ctx, sector, storiface.FTCache, true); err != nil {
return nil, xerrors.Errorf("cleaning up cache data: %w", err)
}
}
sb, err := l.executor()
if err != nil {
return nil, err
}
return sb.SealPreCommit1(ctx, sector, ticket, pieces)
})
}
func (l *LocalWorker) SealPreCommit2(ctx context.Context, sector abi.SectorID, phase1Out storage2.PreCommit1Out) (storiface.CallID, error) {
sb, err := l.executor()
if err != nil {
return storiface.UndefCall, err
}
return l.asyncCall(ctx, sector, "SealPreCommit2", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
return sb.SealPreCommit2(ctx, sector, phase1Out)
})
}
func (l *LocalWorker) SealCommit1(ctx context.Context, sector abi.SectorID, ticket abi.SealRandomness, seed abi.InteractiveSealRandomness, pieces []abi.PieceInfo, cids storage2.SectorCids) (storiface.CallID, error) {
sb, err := l.executor()
if err != nil {
return storiface.UndefCall, err
}
return l.asyncCall(ctx, sector, "SealCommit1", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
return sb.SealCommit1(ctx, sector, ticket, seed, pieces, cids)
})
}
func (l *LocalWorker) SealCommit2(ctx context.Context, sector abi.SectorID, phase1Out storage2.Commit1Out) (storiface.CallID, error) {
sb, err := l.executor()
if err != nil {
return storiface.UndefCall, err
}
return l.asyncCall(ctx, sector, "SealCommit2", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
return sb.SealCommit2(ctx, sector, phase1Out)
})
}
func (l *LocalWorker) FinalizeSector(ctx context.Context, sector abi.SectorID, keepUnsealed []storage2.Range) (storiface.CallID, error) {
sb, err := l.executor()
if err != nil {
return storiface.UndefCall, err
}
return l.asyncCall(ctx, sector, "FinalizeSector", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
if err := sb.FinalizeSector(ctx, sector, keepUnsealed); err != nil {
return nil, xerrors.Errorf("finalizing sector: %w", err)
}
if len(keepUnsealed) == 0 {
if err := l.storage.Remove(ctx, sector, storiface.FTUnsealed, true); err != nil {
return nil, xerrors.Errorf("removing unsealed data: %w", err)
}
}
return nil, err
})
}
func (l *LocalWorker) ReleaseUnsealed(ctx context.Context, sector abi.SectorID, safeToFree []storage2.Range) (storiface.CallID, error) {
return storiface.UndefCall, xerrors.Errorf("implement me")
}
func (l *LocalWorker) Remove(ctx context.Context, sector abi.SectorID) error {
var err error
if rerr := l.storage.Remove(ctx, sector, storiface.FTSealed, true); rerr != nil {
err = multierror.Append(err, xerrors.Errorf("removing sector (sealed): %w", rerr))
}
if rerr := l.storage.Remove(ctx, sector, storiface.FTCache, true); rerr != nil {
err = multierror.Append(err, xerrors.Errorf("removing sector (cache): %w", rerr))
}
if rerr := l.storage.Remove(ctx, sector, storiface.FTUnsealed, true); rerr != nil {
err = multierror.Append(err, xerrors.Errorf("removing sector (unsealed): %w", rerr))
}
return err
}
func (l *LocalWorker) MoveStorage(ctx context.Context, sector abi.SectorID, types storiface.SectorFileType) (storiface.CallID, error) {
return l.asyncCall(ctx, sector, "MoveStorage", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
ssize, err := l.scfg.SealProofType.SectorSize()
if err != nil {
return nil, err
}
return nil, l.storage.MoveStorage(ctx, sector, ssize, types)
})
}
func (l *LocalWorker) UnsealPiece(ctx context.Context, sector abi.SectorID, index storiface.UnpaddedByteIndex, size abi.UnpaddedPieceSize, randomness abi.SealRandomness, cid cid.Cid) (storiface.CallID, error) {
sb, err := l.executor()
if err != nil {
return storiface.UndefCall, err
}
return l.asyncCall(ctx, sector, "UnsealPiece", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
if err = sb.UnsealPiece(ctx, sector, index, size, randomness, cid); err != nil {
return nil, xerrors.Errorf("unsealing sector: %w", err)
}
if err = l.storage.RemoveCopies(ctx, sector, storiface.FTSealed); err != nil {
return nil, xerrors.Errorf("removing source data: %w", err)
}
if err = l.storage.RemoveCopies(ctx, sector, storiface.FTCache); err != nil {
return nil, xerrors.Errorf("removing source data: %w", err)
}
return nil, nil
})
}
func (l *LocalWorker) ReadPiece(ctx context.Context, writer io.Writer, sector abi.SectorID, index storiface.UnpaddedByteIndex, size abi.UnpaddedPieceSize) (storiface.CallID, error) {
sb, err := l.executor()
if err != nil {
return storiface.UndefCall, err
}
return l.asyncCall(ctx, sector, "ReadPiece", func(ctx context.Context, ci storiface.CallID) (interface{}, error) {
return sb.ReadPiece(ctx, writer, sector, index, size)
})
}
func (l *LocalWorker) TaskTypes(context.Context) (map[sealtasks.TaskType]struct{}, error) {
return l.acceptTasks, nil
}
func (l *LocalWorker) Paths(ctx context.Context) ([]stores.StoragePath, error) {
return l.localStore.Local(ctx)
}
func (l *LocalWorker) Info(context.Context) (storiface.WorkerInfo, error) {
hostname, err := os.Hostname() // TODO: allow overriding from config
if err != nil {
panic(err)
}
gpus, err := ffi.GetGPUDevices()
if err != nil {
log.Errorf("getting gpu devices failed: %+v", err)
}
h, err := sysinfo.Host()
if err != nil {
return storiface.WorkerInfo{}, xerrors.Errorf("getting host info: %w", err)
}
mem, err := h.Memory()
if err != nil {
return storiface.WorkerInfo{}, xerrors.Errorf("getting memory info: %w", err)
}
memSwap := mem.VirtualTotal
if l.noSwap {
memSwap = 0
}
return storiface.WorkerInfo{
Hostname: hostname,
Resources: storiface.WorkerResources{
MemPhysical: mem.Total,
MemSwap: memSwap,
MemReserved: mem.VirtualUsed + mem.Total - mem.Available, // TODO: sub this process
CPUs: uint64(runtime.NumCPU()),
GPUs: gpus,
},
}, nil
}
func (l *LocalWorker) Session(ctx context.Context) (uuid.UUID, error) {
if atomic.LoadInt64(&l.testDisable) == 1 {
return uuid.UUID{}, xerrors.Errorf("disabled")
}
select {
case <-l.closing:
return ClosedWorkerID, nil
default:
return l.session, nil
}
}
func (l *LocalWorker) Close() error {
close(l.closing)
return nil
}
// WaitQuiet blocks as long as there are tasks running
func (l *LocalWorker) WaitQuiet() {
l.running.Wait()
}
type wctx struct {
vals context.Context
closing chan struct{}
}
func (w *wctx) Deadline() (time.Time, bool) {
return time.Time{}, false
}
func (w *wctx) Done() <-chan struct{} {
return w.closing
}
func (w *wctx) Err() error {
select {
case <-w.closing:
return context.Canceled
default:
return nil
}
}
func (w *wctx) Value(key interface{}) interface{} {
return w.vals.Value(key)
}
var _ context.Context = &wctx{}
var _ Worker = &LocalWorker{}