lotus/chain/stmgr/forks.go

package stmgr

import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"os"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/ipfs/go-cid"
	"github.com/ipfs/go-datastore"
	"golang.org/x/xerrors"

	"github.com/filecoin-project/go-address"
	"github.com/filecoin-project/go-state-types/abi"
	"github.com/filecoin-project/go-state-types/big"
	"github.com/filecoin-project/go-state-types/network"
	"github.com/filecoin-project/specs-actors/v8/actors/migration/nv16"

	"github.com/filecoin-project/lotus/build"
	"github.com/filecoin-project/lotus/chain/actors/adt"
	"github.com/filecoin-project/lotus/chain/actors/builtin"
	init_ "github.com/filecoin-project/lotus/chain/actors/builtin/init"
	"github.com/filecoin-project/lotus/chain/state"
	"github.com/filecoin-project/lotus/chain/types"
	"github.com/filecoin-project/lotus/chain/vm"
)

// EnvDisablePreMigrations when set to '1' stops pre-migrations from running
const EnvDisablePreMigrations = "LOTUS_DISABLE_PRE_MIGRATIONS"

// MigrationCache can be used to cache information used by a migration. This is primarily useful to
// "pre-compute" some migration state ahead of time, and make it accessible in the migration itself.
type MigrationCache interface {
	Write(key string, value cid.Cid) error
	Read(key string) (bool, cid.Cid, error)
	Load(key string, loadFunc func() (cid.Cid, error)) (cid.Cid, error)
}

// MigrationFunc is a migration function run at every upgrade.
//
//   - The cache is a per-upgrade cache, pre-populated by pre-migrations.
//   - The oldState is the state produced by the upgrade epoch.
//   - The returned newState is the new state that will be used by the next epoch.
//   - The height is the upgrade epoch height (already executed).
//   - The tipset is the first non-null tipset after the upgrade height (the tipset in
//     which the upgrade is executed). Do not assume that ts.Height() is the upgrade height.
//
// NOTE: In StateCompute and CallWithGas, the passed tipset is actually the tipset _before_ the
// upgrade. The tipset should really only be used for referencing the "current chain".
type MigrationFunc func(
	ctx context.Context,
	sm *StateManager, cache MigrationCache,
	cb ExecMonitor,
	oldState cid.Cid,
	height abi.ChainEpoch, ts *types.TipSet,
) (newState cid.Cid, err error)

// PreMigrationFunc is a function run _before_ a network upgrade to pre-compute part of the network
// upgrade and speed it up.
type PreMigrationFunc func(
	ctx context.Context,
	sm *StateManager, cache MigrationCache,
	oldState cid.Cid,
	height abi.ChainEpoch, ts *types.TipSet,
) error

// PreMigration describes a pre-migration step to prepare for a network state upgrade. Pre-migrations
// are optimizations, are not guaranteed to run, and may be canceled and/or run multiple times.
type PreMigration struct {
	// PreMigration is the pre-migration function to run at the specified time. This function is
	// run asynchronously and must abort promptly when canceled.
	PreMigration PreMigrationFunc

	// StartWithin specifies that this pre-migration should be started at most StartWithin
	// epochs before the upgrade.
	StartWithin abi.ChainEpoch

	// DontStartWithin specifies that this pre-migration should not be started DontStartWithin
	// epochs before the final upgrade epoch.
	//
	// This should be set such that the pre-migration is likely to complete before StopWithin.
	DontStartWithin abi.ChainEpoch

	// StopWithin specifies that this pre-migration should be stopped StopWithin epochs of the
	// final upgrade epoch.
	StopWithin abi.ChainEpoch
}

type Upgrade struct {
	Height    abi.ChainEpoch
	Network   network.Version
	Expensive bool
	Migration MigrationFunc

	// PreMigrations specifies a set of pre-migration functions to run at the indicated epochs.
	// These functions should fill the given cache with information that can speed up the
	// eventual full migration at the upgrade epoch.
	PreMigrations []PreMigration
}

type UpgradeSchedule []Upgrade

func (us UpgradeSchedule) Validate() error {
	// Make sure each upgrade is valid.
	for _, u := range us {
		if u.Network <= 0 {
			return xerrors.Errorf("cannot upgrade to version <= 0: %d", u.Network)
		}

		for _, m := range u.PreMigrations {
			if m.StartWithin <= 0 {
				return xerrors.Errorf("pre-migration must specify a positive start-within epoch")
			}

			if m.DontStartWithin < 0 || m.StopWithin < 0 {
				return xerrors.Errorf("pre-migration must specify non-negative epochs")
			}

			if m.StartWithin <= m.StopWithin {
				return xerrors.Errorf("pre-migration start-within must come before stop-within")
			}

			// If we have a dont-start-within.
			if m.DontStartWithin != 0 {
				if m.DontStartWithin < m.StopWithin {
					return xerrors.Errorf("pre-migration dont-start-within must come before stop-within")
				}
				if m.StartWithin <= m.DontStartWithin {
					return xerrors.Errorf("pre-migration start-within must come after dont-start-within")
				}
			}
		}
		if !sort.SliceIsSorted(u.PreMigrations, func(i, j int) bool {
			return u.PreMigrations[i].StartWithin > u.PreMigrations[j].StartWithin //nolint:scopelint,gosec
		}) {
			return xerrors.Errorf("pre-migrations must be sorted by start epoch")
		}
	}

	// Make sure the upgrade order makes sense.
	for i := 1; i < len(us); i++ {
		prev := &us[i-1]
		curr := &us[i]
		if !(prev.Network <= curr.Network) {
			return xerrors.Errorf("cannot downgrade from version %d to version %d", prev.Network, curr.Network)
		}
		// Make sure the heights make sense.
		if prev.Height < 0 {
			// Previous upgrade was disabled.
			continue
		}
		if !(prev.Height < curr.Height) {
			return xerrors.Errorf("upgrade heights must be strictly increasing: upgrade %d was at height %d, followed by upgrade %d at height %d", i-1, prev.Height, i, curr.Height)
		}
	}
	return nil
}

func (us UpgradeSchedule) GetNtwkVersion(e abi.ChainEpoch) (network.Version, error) {
	// Traverse from newest to oldest returning upgrade active during epoch e
	for i := len(us) - 1; i >= 0; i-- {
		u := us[i]
		// u.Height is the last epoch before u.Network becomes the active version
		if u.Height < e {
			return u.Network, nil
		}
	}

	return build.GenesisNetworkVersion, nil
}

func (sm *StateManager) HandleStateForks(ctx context.Context, root cid.Cid, height abi.ChainEpoch, cb ExecMonitor, ts *types.TipSet) (cid.Cid, error) {
	retCid := root
	u := sm.stateMigrations[height]
	if u != nil && u.upgrade != nil {
		if height != build.UpgradeWatermelonFixHeight {
			migCid, ok, err := u.migrationResultCache.Get(ctx, root)
			if err == nil {
				if ok {
					log.Infow("CACHED migration", "height", height, "from", root, "to", migCid)
					return migCid, nil
				}
			} else if !errors.Is(err, datastore.ErrNotFound) {
				log.Errorw("failed to lookup previous migration result", "err", err)
			} else {
				log.Debug("no cached migration found, migrating from scratch")
			}
		}

		startTime := time.Now()
		log.Warnw("STARTING migration", "height", height, "from", root)
		// Yes, we clone the cache, even for the final upgrade epoch. Why? Reverts. We may
		// have to migrate multiple times.
		tmpCache := u.cache.Clone()
		var err error
		retCid, err = u.upgrade(ctx, sm, tmpCache, cb, root, height, ts)
		if err != nil {
			log.Errorw("FAILED migration", "height", height, "from", root, "error", err)
			return cid.Undef, err
		}
		// Yes, we update the cache, even for the final upgrade epoch. Why? Reverts. This
		// can save us a _lot_ of time because very few actors will have changed if we
		// do a small revert then need to re-run the migration.
		u.cache.Update(tmpCache)
		log.Warnw("COMPLETED migration",
			"height", height,
			"from", root,
			"to", retCid,
			"duration", time.Since(startTime),
		)

		// Only set if migration ran, we do not want a root => root mapping
		if err := u.migrationResultCache.Store(ctx, root, retCid); err != nil {
			log.Errorw("failed to store migration result", "err", err)
		}
	}

	return retCid, nil
}

// Returns true executing tipsets between the specified heights would trigger an expensive
// migration. NOTE: migrations occurring _at_ the target height are not included, as they're
// executed _after_ the target height.
func (sm *StateManager) hasExpensiveForkBetween(parent, height abi.ChainEpoch) bool {
	for h := parent; h < height; h++ {
		if _, ok := sm.expensiveUpgrades[h]; ok {
			return true
		}
	}
	return false
}

func (sm *StateManager) hasExpensiveFork(height abi.ChainEpoch) bool {
	_, ok := sm.expensiveUpgrades[height]
	return ok
}

func runPreMigration(ctx context.Context, sm *StateManager, fn PreMigrationFunc, cache *nv16.MemMigrationCache, ts *types.TipSet) {
	height := ts.Height()
	parent := ts.ParentState()

	if disabled := os.Getenv(EnvDisablePreMigrations); strings.TrimSpace(disabled) == "1" {
		log.Warnw("SKIPPING pre-migration", "height", height)
		return
	}

	startTime := time.Now()

	log.Warn("STARTING pre-migration")
	// Clone the cache so we don't actually _update_ it
	// till we're done. Otherwise, if we fail, the next
	// migration to use the cache may assume that
	// certain blocks exist, even if they don't.
	tmpCache := cache.Clone()
	err := fn(ctx, sm, tmpCache, parent, height, ts)
	if err != nil {
		log.Errorw("FAILED pre-migration", "error", err)
		return
	}
	// Finally, if everything worked, update the cache.
	cache.Update(tmpCache)
	log.Warnw("COMPLETED pre-migration", "duration", time.Since(startTime))
}

func (sm *StateManager) preMigrationWorker(ctx context.Context) {
	defer close(sm.shutdown)

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	type op struct {
		after    abi.ChainEpoch
		notAfter abi.ChainEpoch
		run      func(ts *types.TipSet)
	}

	var wg sync.WaitGroup
	defer wg.Wait()

	// Turn each pre-migration into an operation in a schedule.
	var schedule []op
	for upgradeEpoch, migration := range sm.stateMigrations {
		cache := migration.cache
		for _, prem := range migration.preMigrations {
			preCtx, preCancel := context.WithCancel(ctx)
			migrationFunc := prem.PreMigration

			afterEpoch := upgradeEpoch - prem.StartWithin
			notAfterEpoch := upgradeEpoch - prem.DontStartWithin
			stopEpoch := upgradeEpoch - prem.StopWithin
			// We can't start after we stop.
			if notAfterEpoch > stopEpoch {
				notAfterEpoch = stopEpoch - 1
			}

			// Add an op to start a pre-migration.
			schedule = append(schedule, op{
				after:    afterEpoch,
				notAfter: notAfterEpoch,

				// TODO: are these values correct?
				run: func(ts *types.TipSet) {
					wg.Add(1)
					go func() {
						defer wg.Done()
						runPreMigration(preCtx, sm, migrationFunc, cache, ts)
					}()
				},
			})

			// Add an op to cancel the pre-migration if it's still running.
			schedule = append(schedule, op{
				after:    stopEpoch,
				notAfter: -1,
				run:      func(ts *types.TipSet) { preCancel() },
			})
		}
	}

	// Then sort by epoch.
	sort.Slice(schedule, func(i, j int) bool {
		return schedule[i].after < schedule[j].after
	})

	// Finally, when the head changes, see if there's anything we need to do.
	//
	// We're intentionally ignoring reorgs as they don't matter for our purposes.
	for change := range sm.cs.SubHeadChanges(ctx) {
		for _, head := range change {
			for len(schedule) > 0 {
				op := &schedule[0]
				if head.Val.Height() < op.after {
					break
				}

				// If we haven't passed the pre-migration height...
				if op.notAfter < 0 || head.Val.Height() < op.notAfter {
					op.run(head.Val)
				}
				schedule = schedule[1:]
			}
		}
	}
}

func DoTransfer(tree types.StateTree, from, to address.Address, amt abi.TokenAmount, cb func(trace types.ExecutionTrace)) error {
	fromAct, err := tree.GetActor(from)
	if err != nil {
		return xerrors.Errorf("failed to get 'from' actor for transfer: %w", err)
	}

	fromAct.Balance = types.BigSub(fromAct.Balance, amt)
	if fromAct.Balance.Sign() < 0 {
		return xerrors.Errorf("(sanity) deducted more funds from target account than it had (%s, %s)", from, types.FIL(amt))
	}

	if err := tree.SetActor(from, fromAct); err != nil {
		return xerrors.Errorf("failed to persist from actor: %w", err)
	}

	toAct, err := tree.GetActor(to)
	if err != nil {
		return xerrors.Errorf("failed to get 'to' actor for transfer: %w", err)
	}

	toAct.Balance = types.BigAdd(toAct.Balance, amt)

	if err := tree.SetActor(to, toAct); err != nil {
		return xerrors.Errorf("failed to persist to actor: %w", err)
	}

	if cb != nil {
		// record the transfer in execution traces

		cb(types.ExecutionTrace{
			Msg: types.MessageTrace{
				From:  from,
				To:    to,
				Value: amt,
			},
		})
	}

	return nil
}

func TerminateActor(ctx context.Context, tree *state.StateTree, addr address.Address, em ExecMonitor, epoch abi.ChainEpoch, ts *types.TipSet) error {
	a, err := tree.GetActor(addr)
	if xerrors.Is(err, types.ErrActorNotFound) {
		return types.ErrActorNotFound
	} else if err != nil {
		return xerrors.Errorf("failed to get actor to delete: %w", err)
	}

	var trace types.ExecutionTrace
	if err := DoTransfer(tree, addr, builtin.BurntFundsActorAddr, a.Balance, func(t types.ExecutionTrace) {
		trace = t
	}); err != nil {
		return xerrors.Errorf("transferring terminated actor's balance: %w", err)
	}

	if em != nil {
		// record the transfer in execution traces

		fakeMsg := MakeFakeMsg(builtin.SystemActorAddr, addr, big.Zero(), uint64(epoch))

		if err := em.MessageApplied(ctx, ts, fakeMsg.Cid(), fakeMsg, &vm.ApplyRet{
			MessageReceipt: *MakeFakeRct(),
			ActorErr:       nil,
			ExecutionTrace: trace,
			Duration:       0,
			GasCosts:       nil,
		}, false); err != nil {
			return xerrors.Errorf("recording transfers: %w", err)
		}
	}

	err = tree.DeleteActor(addr)
	if err != nil {
		return xerrors.Errorf("deleting actor from tree: %w", err)
	}

	ia, err := tree.GetActor(init_.Address)
	if err != nil {
		return xerrors.Errorf("loading init actor: %w", err)
	}

	ias, err := init_.Load(&state.AdtStore{IpldStore: tree.Store}, ia)
	if err != nil {
		return xerrors.Errorf("loading init actor state: %w", err)
	}

	if err := ias.Remove(addr); err != nil {
		return xerrors.Errorf("deleting entry from address map: %w", err)
	}

	nih, err := tree.Store.Put(ctx, ias)
	if err != nil {
		return xerrors.Errorf("writing new init actor state: %w", err)
	}

	ia.Head = nih

	return tree.SetActor(init_.Address, ia)
}

func SetNetworkName(ctx context.Context, store adt.Store, tree *state.StateTree, name string) error {
	ia, err := tree.GetActor(init_.Address)
	if err != nil {
		return xerrors.Errorf("getting init actor: %w", err)
	}

	initState, err := init_.Load(store, ia)
	if err != nil {
		return xerrors.Errorf("reading init state: %w", err)
	}

	if err := initState.SetNetworkName(name); err != nil {
		return xerrors.Errorf("setting network name: %w", err)
	}

	ia.Head, err = store.Put(ctx, initState)
	if err != nil {
		return xerrors.Errorf("writing new init state: %w", err)
	}

	if err := tree.SetActor(init_.Address, ia); err != nil {
		return xerrors.Errorf("setting init actor: %w", err)
	}

	return nil
}

func MakeKeyAddr(splitAddr address.Address, count uint64) (address.Address, error) {
	var b bytes.Buffer
	if err := splitAddr.MarshalCBOR(&b); err != nil {
		return address.Undef, xerrors.Errorf("marshalling split address: %w", err)
	}

	if err := binary.Write(&b, binary.BigEndian, count); err != nil {
		return address.Undef, xerrors.Errorf("writing count into a buffer: %w", err)
	}

	if err := binary.Write(&b, binary.BigEndian, []byte("Ignition upgrade")); err != nil {
		return address.Undef, xerrors.Errorf("writing fork name into a buffer: %w", err)
	}

	addr, err := address.NewActorAddress(b.Bytes())
	if err != nil {
		return address.Undef, xerrors.Errorf("create actor address: %w", err)
	}

	return addr, nil
}

func MakeFakeMsg(from address.Address, to address.Address, amt abi.TokenAmount, nonce uint64) *types.Message {
	return &types.Message{
		From:  from,
		To:    to,
		Value: amt,
		Nonce: nonce,
	}
}

func MakeFakeRct() *types.MessageReceipt {
	return &types.MessageReceipt{
		ExitCode: 0,
		Return:   nil,
		GasUsed:  0,
	}
}