lotus/storage/pipeline/states_failed.go
Andrew Jackson (Ajax) a520ee85d6 master-to-sturdy
2023-09-21 10:37:02 -05:00

600 lines
20 KiB
Go

package sealing
import (
"context"
"time"
"github.com/hashicorp/go-multierror"
"golang.org/x/xerrors"
"github.com/filecoin-project/go-address"
"github.com/filecoin-project/go-commp-utils/zerocomm"
"github.com/filecoin-project/go-state-types/abi"
"github.com/filecoin-project/go-state-types/builtin/v9/miner"
"github.com/filecoin-project/go-state-types/exitcode"
"github.com/filecoin-project/go-statemachine"
"github.com/filecoin-project/lotus/api"
"github.com/filecoin-project/lotus/chain/actors/builtin/market"
"github.com/filecoin-project/lotus/chain/types"
"github.com/filecoin-project/lotus/storage/sealer/storiface"
)
var MinRetryTime = 1 * time.Minute
func failedCooldown(ctx statemachine.Context, sector SectorInfo) error {
// TODO: Exponential backoff when we see consecutive failures
retryStart := time.Unix(int64(sector.Log[len(sector.Log)-1].Timestamp), 0).Add(MinRetryTime)
if len(sector.Log) > 0 && !time.Now().After(retryStart) {
log.Infof("%s(%d), waiting %s before retrying", sector.State, sector.SectorNumber, time.Until(retryStart))
select {
case <-time.After(time.Until(retryStart)):
case <-ctx.Context().Done():
return ctx.Context().Err()
}
}
return nil
}
func (m *Sealing) checkPreCommitted(ctx statemachine.Context, sector SectorInfo) (*miner.SectorPreCommitOnChainInfo, bool) {
ts, err := m.Api.ChainHead(ctx.Context())
if err != nil {
log.Errorf("handleSealPrecommit1Failed(%d): temp error: %+v", sector.SectorNumber, err)
return nil, false
}
info, err := m.Api.StateSectorPreCommitInfo(ctx.Context(), m.maddr, sector.SectorNumber, ts.Key())
if err != nil {
log.Errorf("handleSealPrecommit1Failed(%d): temp error: %+v", sector.SectorNumber, err)
return nil, false
}
return info, true
}
var MaxPreCommit1Retries = uint64(3)
func (m *Sealing) handleSealPrecommit1Failed(ctx statemachine.Context, sector SectorInfo) error {
if sector.PreCommit1Fails > MaxPreCommit1Retries {
return ctx.Send(SectorRemove{})
}
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRetrySealPreCommit1{})
}
func (m *Sealing) handleSealPrecommit2Failed(ctx statemachine.Context, sector SectorInfo) error {
if err := failedCooldown(ctx, sector); err != nil {
return err
}
if sector.PreCommit2Fails > 3 {
return ctx.Send(SectorRetrySealPreCommit1{})
}
return ctx.Send(SectorRetrySealPreCommit2{})
}
func (m *Sealing) handlePreCommitFailed(ctx statemachine.Context, sector SectorInfo) error {
ts, err := m.Api.ChainHead(ctx.Context())
if err != nil {
log.Errorf("handlePreCommitFailed: api error, not proceeding: %+v", err)
return nil
}
if sector.PreCommitMessage != nil {
mw, err := m.Api.StateSearchMsg(ctx.Context(), ts.Key(), *sector.PreCommitMessage, api.LookbackNoLimit, true)
if err != nil {
// API error
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRetryPreCommitWait{})
}
if mw == nil {
// API error in precommit
return ctx.Send(SectorRetryPreCommitWait{})
}
switch mw.Receipt.ExitCode {
case exitcode.Ok:
// API error in PreCommitWait
return ctx.Send(SectorRetryPreCommitWait{})
case exitcode.SysErrOutOfGas:
// API error in PreCommitWait AND gas estimator guessed a wrong number in PreCommit
return ctx.Send(SectorRetryPreCommit{})
default:
// something else went wrong
}
}
if err := checkPrecommit(ctx.Context(), m.Address(), sector, ts.Key(), ts.Height(), m.Api); err != nil {
switch err.(type) {
case *ErrApi:
log.Errorf("handlePreCommitFailed: api error, not proceeding: %+v", err)
return nil
case *ErrBadCommD: // TODO: Should this just back to packing? (not really needed since handlePreCommit1 will do that too)
return ctx.Send(SectorSealPreCommit1Failed{xerrors.Errorf("bad CommD error: %w", err)})
case *ErrExpiredTicket:
return ctx.Send(SectorSealPreCommit1Failed{xerrors.Errorf("ticket expired error: %w", err)})
case *ErrBadTicket:
return ctx.Send(SectorSealPreCommit1Failed{xerrors.Errorf("bad expired: %w", err)})
case *ErrInvalidDeals:
log.Warnf("invalid deals in sector %d: %v", sector.SectorNumber, err)
return ctx.Send(SectorInvalidDealIDs{Return: RetPreCommitFailed})
case *ErrExpiredDeals:
return ctx.Send(SectorDealsExpired{xerrors.Errorf("sector deals expired: %w", err)})
case *ErrNoPrecommit:
return ctx.Send(SectorRetryPreCommit{})
case *ErrPrecommitOnChain:
// noop
case *ErrSectorNumberAllocated:
log.Errorf("handlePreCommitFailed: sector number already allocated, not proceeding: %+v", err)
// TODO: check if the sector is committed (not sure how we'd end up here)
// TODO: check on-chain state, adjust local sector number counter to not give out allocated numbers
return nil
default:
return xerrors.Errorf("checkPrecommit sanity check error: %w", err)
}
}
if pci, is := m.checkPreCommitted(ctx, sector); is && pci != nil {
if sector.PreCommitMessage == nil {
log.Warnf("sector %d is precommitted on chain, but we don't have precommit message", sector.SectorNumber)
return ctx.Send(SectorPreCommitLanded{TipSet: ts.Key()})
}
if pci.Info.SealedCID != *sector.CommR {
log.Warnf("sector %d is precommitted on chain, with different CommR: %s != %s", sector.SectorNumber, pci.Info.SealedCID, sector.CommR)
return nil // TODO: remove when the actor allows re-precommit
}
// TODO: we could compare more things, but I don't think we really need to
// CommR tells us that CommD (and CommPs), and the ticket are all matching
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRetryWaitSeed{})
}
if sector.PreCommitMessage != nil {
log.Warn("retrying precommit even though the message failed to apply")
}
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRetryPreCommit{})
}
func (m *Sealing) handleComputeProofFailed(ctx statemachine.Context, sector SectorInfo) error {
// TODO: Check sector files
if err := failedCooldown(ctx, sector); err != nil {
return err
}
if sector.InvalidProofs > 1 {
return ctx.Send(SectorSealPreCommit1Failed{xerrors.Errorf("consecutive compute fails")})
}
return ctx.Send(SectorRetryComputeProof{})
}
func (m *Sealing) handleRemoteCommitFailed(ctx statemachine.Context, sector SectorInfo) error {
if err := failedCooldown(ctx, sector); err != nil {
return err
}
if sector.InvalidProofs > 1 {
log.Errorw("consecutive remote commit fails", "sector", sector.SectorNumber, "c1url", sector.RemoteCommit1Endpoint, "c2url", sector.RemoteCommit2Endpoint)
}
return ctx.Send(SectorRetryComputeProof{})
}
func (m *Sealing) handleSubmitReplicaUpdateFailed(ctx statemachine.Context, sector SectorInfo) error {
if err := failedCooldown(ctx, sector); err != nil {
return err
}
if sector.ReplicaUpdateMessage != nil {
mw, err := m.Api.StateSearchMsg(ctx.Context(), types.EmptyTSK, *sector.ReplicaUpdateMessage, api.LookbackNoLimit, true)
if err != nil {
// API error
return ctx.Send(SectorRetrySubmitReplicaUpdateWait{})
}
if mw == nil {
return ctx.Send(SectorRetrySubmitReplicaUpdateWait{})
}
switch mw.Receipt.ExitCode {
case exitcode.Ok:
return ctx.Send(SectorRetrySubmitReplicaUpdateWait{})
case exitcode.SysErrOutOfGas:
return ctx.Send(SectorRetrySubmitReplicaUpdate{})
default:
// something else went wrong
}
}
ts, err := m.Api.ChainHead(ctx.Context())
if err != nil {
log.Errorf("handleSubmitReplicaUpdateFailed: api error, not proceeding: %+v", err)
return nil
}
if err := checkReplicaUpdate(ctx.Context(), m.maddr, sector, ts.Key(), m.Api); err != nil {
switch err.(type) {
case *ErrApi:
log.Errorf("handleSubmitReplicaUpdateFailed: api error, not proceeding: %+v", err)
return nil
case *ErrBadRU:
log.Errorf("bad replica update: %+v", err)
return ctx.Send(SectorRetryReplicaUpdate{})
case *ErrBadPR:
log.Errorf("bad PR1: +%v", err)
return ctx.Send(SectorRetryProveReplicaUpdate{})
case *ErrInvalidDeals:
return ctx.Send(SectorInvalidDealIDs{})
case *ErrExpiredDeals:
return ctx.Send(SectorDealsExpired{xerrors.Errorf("expired dealIDs in sector: %w", err)})
default:
log.Errorf("sanity check error, not proceeding: +%v", err)
return xerrors.Errorf("checkReplica sanity check error: %w", err)
}
}
// Abort upgrade for sectors that went faulty since being marked for upgrade
active, err := m.sectorActive(ctx.Context(), ts.Key(), sector.SectorNumber)
if err != nil {
log.Errorf("sector active check: api error, not proceeding: %+v", err)
return nil
}
if !active {
err := xerrors.Errorf("sector marked for upgrade %d no longer active, aborting upgrade", sector.SectorNumber)
log.Errorf(err.Error())
return ctx.Send(SectorAbortUpgrade{err})
}
return ctx.Send(SectorRetrySubmitReplicaUpdate{})
}
func (m *Sealing) handleReleaseSectorKeyFailed(ctx statemachine.Context, sector SectorInfo) error {
// not much we can do, wait for a bit and try again
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorUpdateActive{})
}
func (m *Sealing) handleCommitFailed(ctx statemachine.Context, sector SectorInfo) error {
ts, err := m.Api.ChainHead(ctx.Context())
if err != nil {
log.Errorf("handleCommitting: api error, not proceeding: %+v", err)
return nil
}
if sector.CommitMessage != nil {
mw, err := m.Api.StateSearchMsg(ctx.Context(), ts.Key(), *sector.CommitMessage, api.LookbackNoLimit, true)
if err != nil {
// API error
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRetryCommitWait{})
}
if mw == nil {
// API error in commit
return ctx.Send(SectorRetryCommitWait{})
}
switch mw.Receipt.ExitCode {
case exitcode.Ok:
si, err := m.Api.StateSectorGetInfo(ctx.Context(), m.maddr, sector.SectorNumber, mw.TipSet)
if err != nil {
// API error
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRetryCommitWait{})
}
if si != nil {
// API error in CommitWait?
return ctx.Send(SectorRetryCommitWait{})
}
// if si == nil, something else went wrong; Likely expired deals, we'll
// find out in checkCommit
case exitcode.SysErrOutOfGas:
// API error in CommitWait AND gas estimator guessed a wrong number in SubmitCommit
return ctx.Send(SectorRetrySubmitCommit{})
default:
// something else went wrong
}
}
if err := m.checkCommit(ctx.Context(), sector, sector.Proof, ts.Key()); err != nil {
switch err.(type) {
case *ErrApi:
log.Errorf("handleCommitFailed: api error, not proceeding: %+v", err)
return nil
case *ErrBadSeed:
log.Errorf("seed changed, will retry: %+v", err)
return ctx.Send(SectorRetryWaitSeed{})
case *ErrInvalidProof:
if err := failedCooldown(ctx, sector); err != nil {
return err
}
if sector.InvalidProofs > 0 {
return ctx.Send(SectorSealPreCommit1Failed{xerrors.Errorf("consecutive invalid proofs")})
}
return ctx.Send(SectorRetryInvalidProof{})
case *ErrPrecommitOnChain:
log.Errorf("no precommit on chain, will retry: %+v", err)
return ctx.Send(SectorRetryPreCommitWait{})
case *ErrNoPrecommit:
return ctx.Send(SectorRetryPreCommit{})
case *ErrInvalidDeals:
log.Warnf("invalid deals in sector %d: %v", sector.SectorNumber, err)
return ctx.Send(SectorInvalidDealIDs{Return: RetCommitFailed})
case *ErrExpiredDeals:
return ctx.Send(SectorDealsExpired{xerrors.Errorf("sector deals expired: %w", err)})
case *ErrCommitWaitFailed:
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRetryCommitWait{})
default:
return xerrors.Errorf("checkCommit sanity check error (%T): %w", err, err)
}
}
// TODO: Check sector files
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRetryComputeProof{})
}
func (m *Sealing) handleFinalizeFailed(ctx statemachine.Context, sector SectorInfo) error {
// TODO: Check sector files
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRetryFinalize{})
}
func (m *Sealing) handleRemoveFailed(ctx statemachine.Context, sector SectorInfo) error {
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorRemove{})
}
func (m *Sealing) handleTerminateFailed(ctx statemachine.Context, sector SectorInfo) error {
// ignoring error as it's most likely an API error - `pci` will be nil, and we'll go back to
// the Terminating state after cooldown. If the API is still failing, well get back to here
// with the error in SectorInfo log.
pci, _ := m.Api.StateSectorPreCommitInfo(ctx.Context(), m.maddr, sector.SectorNumber, types.EmptyTSK)
if pci != nil {
return nil // pause the fsm, needs manual user action
}
if err := failedCooldown(ctx, sector); err != nil {
return err
}
return ctx.Send(SectorTerminate{})
}
func (m *Sealing) handleDealsExpired(ctx statemachine.Context, sector SectorInfo) error {
// First make vary sure the sector isn't committed
si, err := m.Api.StateSectorGetInfo(ctx.Context(), m.maddr, sector.SectorNumber, types.EmptyTSK)
if err != nil {
return xerrors.Errorf("getting sector info: %w", err)
}
if si != nil {
// TODO: this should never happen, but in case it does, try to go back to
// the proving state after running some checks
return xerrors.Errorf("sector is committed on-chain, but we're in DealsExpired")
}
// Not much to do here, we can't go back in time to commit this sector
return ctx.Send(SectorRemove{})
}
func (m *Sealing) handleDealsExpiredSnapDeals(ctx statemachine.Context, sector SectorInfo) error {
if !sector.CCUpdate {
// Should be impossible
return xerrors.Errorf("should never reach SnapDealsDealsExpired as a non-CCUpdate sector")
}
return ctx.Send(SectorAbortUpgrade{xerrors.Errorf("one of upgrade deals expired")})
}
func (m *Sealing) handleAbortUpgrade(ctx statemachine.Context, sector SectorInfo) error {
if !sector.CCUpdate {
return xerrors.Errorf("should never reach AbortUpgrade as a non-CCUpdate sector")
}
m.cleanupAssignedDeals(sector)
// Remove snap deals replica if any
// This removes update / update-cache from all storage
if err := m.sealer.ReleaseReplicaUpgrade(ctx.Context(), m.minerSector(sector.SectorType, sector.SectorNumber)); err != nil {
return xerrors.Errorf("removing CC update files from sector storage")
}
// This removes the unsealed file from all storage
// note: we're not keeping anything unsealed because we're reverting to CC
if err := m.sealer.ReleaseUnsealed(ctx.Context(), m.minerSector(sector.SectorType, sector.SectorNumber), []storiface.Range{}); err != nil {
log.Error(err)
}
// and makes sure sealed/cache files only exist in long-term-storage
if err := m.sealer.FinalizeSector(ctx.Context(), m.minerSector(sector.SectorType, sector.SectorNumber)); err != nil {
log.Error(err)
}
return ctx.Send(SectorRevertUpgradeToProving{})
}
// failWith is a mutator or global mutator
func (m *Sealing) handleRecoverDealIDsOrFailWith(ctx statemachine.Context, sector SectorInfo, failWith interface{}) error {
toFix, paddingPieces, err := recoveryPiecesToFix(ctx.Context(), m.Api, sector, m.maddr)
if err != nil {
return err
}
ts, err := m.Api.ChainHead(ctx.Context())
if err != nil {
return err
}
failed := map[int]error{}
updates := map[int]abi.DealID{}
for _, i := range toFix {
p := sector.Pieces[i]
if p.DealInfo.PublishCid == nil {
// TODO: check if we are in an early enough state try to remove this piece
log.Errorf("can't fix sector deals: piece %d (of %d) of sector %d has nil DealInfo.PublishCid (refers to deal %d)", i, len(sector.Pieces), sector.SectorNumber, p.DealInfo.DealID)
// Not much to do here (and this can only happen for old spacerace sectors)
return ctx.Send(failWith)
}
var dp *market.DealProposal
if p.DealInfo.DealProposal != nil {
mdp := *p.DealInfo.DealProposal
dp = &mdp
}
res, err := m.DealInfo.GetCurrentDealInfo(ctx.Context(), ts.Key(), dp, *p.DealInfo.PublishCid)
if err != nil {
failed[i] = xerrors.Errorf("getting current deal info for piece %d: %w", i, err)
continue
}
if res.MarketDeal == nil {
failed[i] = xerrors.Errorf("nil market deal (%d,%d,%d,%s)", i, sector.SectorNumber, p.DealInfo.DealID, p.Piece.PieceCID)
continue
}
if res.MarketDeal.Proposal.PieceCID != p.Piece.PieceCID {
failed[i] = xerrors.Errorf("recovered piece (%d) deal in sector %d (dealid %d) has different PieceCID %s != %s", i, sector.SectorNumber, p.DealInfo.DealID, p.Piece.PieceCID, res.MarketDeal.Proposal.PieceCID)
continue
}
updates[i] = res.DealID
}
if len(failed) > 0 {
var merr error
for _, e := range failed {
merr = multierror.Append(merr, e)
}
if len(failed)+paddingPieces == len(sector.Pieces) {
log.Errorf("removing sector %d: all deals expired or unrecoverable: %+v", sector.SectorNumber, merr)
return ctx.Send(failWith)
}
// todo: try to remove bad pieces (hard; see the todo above)
// for now removing sectors is probably better than having them stuck in RecoverDealIDs
// and expire anyways
log.Errorf("removing sector %d: deals expired or unrecoverable: %+v", sector.SectorNumber, merr)
return ctx.Send(failWith)
}
// Not much to do here, we can't go back in time to commit this sector
return ctx.Send(SectorUpdateDealIDs{Updates: updates})
}
func (m *Sealing) HandleRecoverDealIDs(ctx statemachine.Context, sector SectorInfo) error {
return m.handleRecoverDealIDsOrFailWith(ctx, sector, SectorRemove{})
}
func (m *Sealing) handleSnapDealsRecoverDealIDs(ctx statemachine.Context, sector SectorInfo) error {
return m.handleRecoverDealIDsOrFailWith(ctx, sector, SectorAbortUpgrade{xerrors.New("failed recovering deal ids")})
}
func recoveryPiecesToFix(ctx context.Context, api SealingAPI, sector SectorInfo, maddr address.Address) ([]int, int, error) {
ts, err := api.ChainHead(ctx)
if err != nil {
return nil, 0, xerrors.Errorf("getting chain head: %w", err)
}
var toFix []int
paddingPieces := 0
for i, p := range sector.Pieces {
// if no deal is associated with the piece, ensure that we added it as
// filler (i.e. ensure that it has a zero PieceCID)
if p.DealInfo == nil {
exp := zerocomm.ZeroPieceCommitment(p.Piece.Size.Unpadded())
if !p.Piece.PieceCID.Equals(exp) {
return nil, 0, xerrors.Errorf("sector %d piece %d had non-zero PieceCID %+v", sector.SectorNumber, i, p.Piece.PieceCID)
}
paddingPieces++
continue
}
deal, err := api.StateMarketStorageDeal(ctx, p.DealInfo.DealID, ts.Key())
if err != nil {
log.Warnf("getting deal %d for piece %d: %+v", p.DealInfo.DealID, i, err)
toFix = append(toFix, i)
continue
}
if deal.Proposal.Provider != maddr {
log.Warnf("piece %d (of %d) of sector %d refers deal %d with wrong provider: %s != %s", i, len(sector.Pieces), sector.SectorNumber, p.DealInfo.DealID, deal.Proposal.Provider, maddr)
toFix = append(toFix, i)
continue
}
if deal.Proposal.PieceCID != p.Piece.PieceCID {
log.Warnf("piece %d (of %d) of sector %d refers deal %d with wrong PieceCID: %s != %s", i, len(sector.Pieces), sector.SectorNumber, p.DealInfo.DealID, p.Piece.PieceCID, deal.Proposal.PieceCID)
toFix = append(toFix, i)
continue
}
if p.Piece.Size != deal.Proposal.PieceSize {
log.Warnf("piece %d (of %d) of sector %d refers deal %d with different size: %d != %d", i, len(sector.Pieces), sector.SectorNumber, p.DealInfo.DealID, p.Piece.Size, deal.Proposal.PieceSize)
toFix = append(toFix, i)
continue
}
if ts.Height() >= deal.Proposal.StartEpoch {
// TODO: check if we are in an early enough state (before precommit), try to remove the offending pieces
// (tricky as we have to 'defragment' the sector while doing that, and update piece references for retrieval)
return nil, 0, xerrors.Errorf("can't fix sector deals: piece %d (of %d) of sector %d refers expired deal %d - should start at %d, head %d", i, len(sector.Pieces), sector.SectorNumber, p.DealInfo.DealID, deal.Proposal.StartEpoch, ts.Height())
}
}
return toFix, paddingPieces, nil
}