Merge pull request #5434 from filecoin-project/feat/pre-migrate

Implement pre-migration framework
Aayush Rajasekaran 2021-01-27 19:24:26 -05:00 committed by GitHub
commit 887d40bcbb
6 changed files with 511 additions and 60 deletions

View File

@ -14,7 +14,6 @@ import (
"github.com/filecoin-project/go-state-types/big"
"github.com/filecoin-project/go-state-types/crypto"
"github.com/google/uuid"
block "github.com/ipfs/go-block-format"
"github.com/ipfs/go-blockservice"
"github.com/ipfs/go-cid"
offline "github.com/ipfs/go-ipfs-exchange-offline"
@ -85,19 +84,6 @@ type ChainGen struct {
lr repo.LockedRepo
}
type mybs struct {
blockstore.Blockstore
}
func (m mybs) Get(c cid.Cid) (block.Block, error) {
b, err := m.Blockstore.Get(c)
if err != nil {
return nil, err
}
return b, nil
}
var rootkeyMultisig = genesis.MultisigMeta{
Signers: []address.Address{remAccTestKey},
Threshold: 1,
@ -152,8 +138,6 @@ func NewGeneratorWithSectors(numSectors int) (*ChainGen, error) {
}
}()
bs = mybs{bs}
ks, err := lr.KeyStore()
if err != nil {
return nil, xerrors.Errorf("getting repo keystore failed: %w", err)
@ -465,7 +449,12 @@ func (cg *ChainGen) NextTipSetFromMinersWithMessages(base *types.TipSet, miners
}
}
return store.NewFullTipSet(blks), nil
fts := store.NewFullTipSet(blks)
if err := cg.cs.PutTipSet(context.TODO(), fts.TipSet()); err != nil {
return nil, err
}
return fts, nil
}
func (cg *ChainGen) makeBlock(parents *types.TipSet, m address.Address, vrfticket *types.Ticket,

View File

@ -4,6 +4,10 @@ import (
"bytes"
"context"
"encoding/binary"
"runtime"
"sort"
"sync"
"time"
"github.com/filecoin-project/go-state-types/rt"
@ -36,20 +40,70 @@ import (
"golang.org/x/xerrors"
)
// UpgradeFunc is a migration function run at every upgrade.
// MigrationCache can be used to cache information used by a migration. This is primarily useful to
// "pre-compute" some migration state ahead of time, and make it accessible in the migration itself.
type MigrationCache interface {
Write(key string, value cid.Cid) error
Read(key string) (bool, cid.Cid, error)
Load(key string, loadFunc func() (cid.Cid, error)) (cid.Cid, error)
}
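// Illustrative sketch (not part of this change): how a pre-migration might use the cache.
// Load returns the value already stored under a key or computes, stores, and returns it;
// Read reports whether a key is present. The key name and compute callback are hypothetical.
func exampleCacheUsage(cache MigrationCache, compute func() (cid.Cid, error)) error {
// compute runs at most once per cache for this key.
root, err := cache.Load("example-root", compute)
if err != nil {
return err
}
// A later run (e.g. the final migration) can read the pre-computed value back.
found, cached, err := cache.Read("example-root")
if err != nil {
return err
}
if !found || cached != root {
return xerrors.Errorf("expected cached value for example-root")
}
return nil
}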
// MigrationFunc is a migration function run at every upgrade.
//
// - The cache is a per-upgrade cache, pre-populated by pre-migrations.
// - The oldState is the state produced by the upgrade epoch.
// - The returned newState is the new state that will be used by the next epoch.
// - The height is the upgrade epoch height (already executed).
// - The tipset is the tipset for the last non-null block before the upgrade. Do
// not assume that ts.Height() is the upgrade height.
type UpgradeFunc func(ctx context.Context, sm *StateManager, cb ExecCallback, oldState cid.Cid, height abi.ChainEpoch, ts *types.TipSet) (newState cid.Cid, err error)
type MigrationFunc func(
ctx context.Context,
sm *StateManager, cache MigrationCache,
cb ExecCallback, oldState cid.Cid,
height abi.ChainEpoch, ts *types.TipSet,
) (newState cid.Cid, err error)
// PreMigrationFunc is a function run _before_ a network upgrade to pre-compute part of the network
// upgrade and speed it up.
type PreMigrationFunc func(
ctx context.Context,
sm *StateManager, cache MigrationCache,
oldState cid.Cid,
height abi.ChainEpoch, ts *types.TipSet,
) error
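// Illustrative sketch (not part of this change): a minimal PreMigrationFunc that aborts
// promptly on cancellation and records a hypothetical value under a hypothetical cache key
// for the eventual MigrationFunc to pick up.
func examplePreMigration(ctx context.Context, _ *StateManager, cache MigrationCache,
oldState cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error {
select {
case <-ctx.Done():
// Pre-migrations run asynchronously and may be canceled at any time.
return ctx.Err()
default:
}
// Anything written here is only merged into the shared cache if this function succeeds.
return cache.Write("example-old-state", oldState)
}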
// PreMigration describes a pre-migration step to prepare for a network state upgrade. Pre-migrations
// are optimizations, are not guaranteed to run, and may be canceled and/or run multiple times.
type PreMigration struct {
// PreMigration is the pre-migration function to run at the specified time. This function is
// run asynchronously and must abort promptly when canceled.
PreMigration PreMigrationFunc
// StartWithin specifies that this pre-migration should be started at most StartWithin
// epochs before the upgrade.
StartWithin abi.ChainEpoch
// DontStartWithin specifies that this pre-migration must not be started within DontStartWithin
// epochs of the final upgrade epoch.
//
// This should be set such that the pre-migration is likely to complete before StopWithin.
DontStartWithin abi.ChainEpoch
// StopWithin specifies that this pre-migration should be stopped within StopWithin epochs
// of the final upgrade epoch.
StopWithin abi.ChainEpoch
}
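// The three windows above combine as follows (illustrative numbers): with an upgrade at
// epoch 1000, StartWithin=120, DontStartWithin=60 and StopWithin=35 mean the pre-migration
// may be started once the head reaches epoch 880 (1000-120), will not be started at or
// after epoch 940 (1000-60), and will be canceled by epoch 965 (1000-35) if still running.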
type Upgrade struct {
Height abi.ChainEpoch
Network network.Version
Expensive bool
Migration UpgradeFunc
Migration MigrationFunc
// PreMigrations specifies a set of pre-migration functions to run at the indicated epochs.
// These functions should fill the given cache with information that can speed up the
// eventual full migration at the upgrade epoch.
PreMigrations []PreMigration
}
type UpgradeSchedule []Upgrade
@ -121,6 +175,17 @@ func DefaultUpgradeSchedule() UpgradeSchedule {
Height: build.UpgradeActorsV3Height,
Network: network.Version10,
Migration: UpgradeActorsV3,
PreMigrations: []PreMigration{{
PreMigration: PreUpgradeActorsV3,
StartWithin: 120,
DontStartWithin: 60,
StopWithin: 35,
}, {
PreMigration: PreUpgradeActorsV3,
StartWithin: 30,
DontStartWithin: 15,
StopWithin: 5,
}},
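// Two passes with the values above: an early run does the bulk of the pre-computation,
// and a second, shorter run closer to the upgrade presumably refreshes the cache against
// more recent state so that less work is left for the migration itself.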
Expensive: true,
}}
@ -135,14 +200,43 @@ func DefaultUpgradeSchedule() UpgradeSchedule {
}
func (us UpgradeSchedule) Validate() error {
// Make sure we're not trying to upgrade to version 0.
// Make sure each upgrade is valid.
for _, u := range us {
if u.Network <= 0 {
return xerrors.Errorf("cannot upgrade to version <= 0: %d", u.Network)
}
for _, m := range u.PreMigrations {
if m.StartWithin <= 0 {
return xerrors.Errorf("pre-migration must specify a positive start-within epoch")
}
if m.DontStartWithin < 0 || m.StopWithin < 0 {
return xerrors.Errorf("pre-migration must specify non-negative epochs")
}
if m.StartWithin <= m.StopWithin {
return xerrors.Errorf("pre-migration start-within must come before stop-within")
}
// If we have a dont-start-within.
if m.DontStartWithin != 0 {
if m.DontStartWithin < m.StopWithin {
return xerrors.Errorf("pre-migration dont-start-within must come before stop-within")
}
if m.StartWithin <= m.DontStartWithin {
return xerrors.Errorf("pre-migration start-within must come after dont-start-within")
}
}
}
if !sort.SliceIsSorted(u.PreMigrations, func(i, j int) bool {
return u.PreMigrations[i].StartWithin > u.PreMigrations[j].StartWithin //nolint:scopelint,gosec
}) {
return xerrors.Errorf("pre-migrations must be sorted by start epoch")
}
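// E.g. the two ActorsV3 pre-migrations above pass this check because their StartWithin
// values are 120 then 30; listing them in the opposite order would be rejected here.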
}
// Make sure all the upgrades make sense.
// Make sure the upgrade order makes sense.
for i := 1; i < len(us); i++ {
prev := &us[i-1]
curr := &us[i]
@ -164,12 +258,26 @@ func (us UpgradeSchedule) Validate() error {
func (sm *StateManager) handleStateForks(ctx context.Context, root cid.Cid, height abi.ChainEpoch, cb ExecCallback, ts *types.TipSet) (cid.Cid, error) {
retCid := root
var err error
f, ok := sm.stateMigrations[height]
if ok {
retCid, err = f(ctx, sm, cb, root, height, ts)
u := sm.stateMigrations[height]
if u != nil && u.upgrade != nil {
startTime := time.Now()
log.Warnw("STARTING migration", "height", height)
// Yes, we clone the cache, even for the final upgrade epoch. Why? Reverts. We may
// have to migrate multiple times.
tmpCache := u.cache.Clone()
retCid, err = u.upgrade(ctx, sm, tmpCache, cb, root, height, ts)
if err != nil {
log.Errorw("FAILED migration", "height", height, "error", err)
return cid.Undef, err
}
// Yes, we update the cache, even for the final upgrade epoch. Why? Reverts. This
// can save us a _lot_ of time because very few actors will have changed if we
// do a small revert then need to re-run the migration.
u.cache.Update(tmpCache)
log.Warnw("COMPLETED migration",
"height", height,
"duration", time.Since(startTime),
)
}
return retCid, nil
@ -180,6 +288,109 @@ func (sm *StateManager) hasExpensiveFork(ctx context.Context, height abi.ChainEp
return ok
}
func runPreMigration(ctx context.Context, sm *StateManager, fn PreMigrationFunc, cache *nv10.MemMigrationCache, ts *types.TipSet) {
height := ts.Height()
parent := ts.ParentState()
startTime := time.Now()
log.Warn("STARTING pre-migration")
// Clone the cache so we don't actually _update_ it
// till we're done. Otherwise, if we fail, the next
// migration to use the cache may assume that
// certain blocks exist, even if they don't.
tmpCache := cache.Clone()
err := fn(ctx, sm, tmpCache, parent, height, ts)
if err != nil {
log.Errorw("FAILED pre-migration", "error", err)
return
}
// Finally, if everything worked, update the cache.
cache.Update(tmpCache)
log.Warnw("COMPLETED pre-migration", "duration", time.Since(startTime))
}
func (sm *StateManager) preMigrationWorker(ctx context.Context) {
defer close(sm.shutdown)
ctx, cancel := context.WithCancel(ctx)
defer cancel()
type op struct {
after abi.ChainEpoch
notAfter abi.ChainEpoch
run func(ts *types.TipSet)
}
var wg sync.WaitGroup
defer wg.Wait()
// Turn each pre-migration into an operation in a schedule.
var schedule []op
for upgradeEpoch, migration := range sm.stateMigrations {
cache := migration.cache
for _, prem := range migration.preMigrations {
preCtx, preCancel := context.WithCancel(ctx)
migrationFunc := prem.PreMigration
afterEpoch := upgradeEpoch - prem.StartWithin
notAfterEpoch := upgradeEpoch - prem.DontStartWithin
stopEpoch := upgradeEpoch - prem.StopWithin
// We can't start after we stop.
if notAfterEpoch > stopEpoch {
notAfterEpoch = stopEpoch - 1
}
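// This clamp mainly matters when DontStartWithin is left unset (0): notAfterEpoch would
// otherwise equal the upgrade epoch itself, so it is pulled below stopEpoch to ensure a
// pre-migration is never started after its cancel op has already fired.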
// Add an op to start a pre-migration.
schedule = append(schedule, op{
after: afterEpoch,
notAfter: notAfterEpoch,
// TODO: are these values correct?
run: func(ts *types.TipSet) {
wg.Add(1)
go func() {
defer wg.Done()
runPreMigration(preCtx, sm, migrationFunc, cache, ts)
}()
},
})
// Add an op to cancel the pre-migration if it's still running.
schedule = append(schedule, op{
after: stopEpoch,
notAfter: -1,
run: func(ts *types.TipSet) { preCancel() },
})
}
}
// Then sort by epoch.
sort.Slice(schedule, func(i, j int) bool {
return schedule[i].after < schedule[j].after
})
// Finally, when the head changes, see if there's anything we need to do.
//
// We're intentionally ignoring reorgs as they don't matter for our purposes.
for change := range sm.cs.SubHeadChanges(ctx) {
for _, head := range change {
for len(schedule) > 0 {
op := &schedule[0]
if head.Val.Height() < op.after {
break
}
// If we haven't passed the pre-migration height...
if op.notAfter < 0 || head.Val.Height() < op.notAfter {
op.run(head.Val)
}
schedule = schedule[1:]
}
}
}
}
func doTransfer(tree types.StateTree, from, to address.Address, amt abi.TokenAmount, cb func(trace types.ExecutionTrace)) error {
fromAct, err := tree.GetActor(from)
if err != nil {
@ -233,7 +444,7 @@ func doTransfer(tree types.StateTree, from, to address.Address, amt abi.TokenAmo
return nil
}
func UpgradeFaucetBurnRecovery(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
func UpgradeFaucetBurnRecovery(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
// Some initial parameters
FundsForMiners := types.FromFil(1_000_000)
LookbackEpoch := abi.ChainEpoch(32000)
@ -519,7 +730,7 @@ func UpgradeFaucetBurnRecovery(ctx context.Context, sm *StateManager, cb ExecCal
return tree.Flush(ctx)
}
func UpgradeIgnition(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
func UpgradeIgnition(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
store := sm.cs.Store(ctx)
if build.UpgradeLiftoffHeight <= epoch {
@ -574,7 +785,7 @@ func UpgradeIgnition(ctx context.Context, sm *StateManager, cb ExecCallback, roo
return tree.Flush(ctx)
}
func UpgradeRefuel(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
func UpgradeRefuel(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
store := sm.cs.Store(ctx)
tree, err := sm.StateTree(root)
@ -600,7 +811,7 @@ func UpgradeRefuel(ctx context.Context, sm *StateManager, cb ExecCallback, root
return tree.Flush(ctx)
}
func UpgradeActorsV2(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
func UpgradeActorsV2(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
buf := bufbstore.NewTieredBstore(sm.cs.Blockstore(), bstore.NewTemporarySync())
store := store.ActorStore(ctx, buf)
@ -646,7 +857,7 @@ func UpgradeActorsV2(ctx context.Context, sm *StateManager, cb ExecCallback, roo
return newRoot, nil
}
func UpgradeLiftoff(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
func UpgradeLiftoff(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
tree, err := sm.StateTree(root)
if err != nil {
return cid.Undef, xerrors.Errorf("getting state tree: %w", err)
@ -660,7 +871,7 @@ func UpgradeLiftoff(ctx context.Context, sm *StateManager, cb ExecCallback, root
return tree.Flush(ctx)
}
func UpgradeCalico(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
func UpgradeCalico(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
store := sm.cs.Store(ctx)
var stateRoot types.StateRoot
if err := store.Get(ctx, root, &stateRoot); err != nil {
@ -702,12 +913,56 @@ func UpgradeCalico(ctx context.Context, sm *StateManager, cb ExecCallback, root
return newRoot, nil
}
func UpgradeActorsV3(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
func UpgradeActorsV3(ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
// Use all the CPUs except 3.
workerCount := runtime.NumCPU() - 3
if workerCount <= 0 {
workerCount = 1
}
config := nv10.Config{MaxWorkers: uint(workerCount)}
newRoot, err := upgradeActorsV3Common(ctx, sm, cache, root, epoch, ts, config)
if err != nil {
return cid.Undef, xerrors.Errorf("migrating actors v3 state: %w", err)
}
// perform some basic sanity checks to make sure everything still works.
store := store.ActorStore(ctx, sm.ChainStore().Blockstore())
if newSm, err := state.LoadStateTree(store, newRoot); err != nil {
return cid.Undef, xerrors.Errorf("state tree sanity load failed: %w", err)
} else if newRoot2, err := newSm.Flush(ctx); err != nil {
return cid.Undef, xerrors.Errorf("state tree sanity flush failed: %w", err)
} else if newRoot2 != newRoot {
return cid.Undef, xerrors.Errorf("state-root mismatch: %s != %s", newRoot, newRoot2)
} else if _, err := newSm.GetActor(init_.Address); err != nil {
return cid.Undef, xerrors.Errorf("failed to load init actor after upgrade: %w", err)
}
return newRoot, nil
}
func PreUpgradeActorsV3(ctx context.Context, sm *StateManager, cache MigrationCache, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) error {
// Use half the CPUs for pre-migration, but leave at least 3.
workerCount := runtime.NumCPU()
if workerCount <= 4 {
workerCount = 1
} else {
workerCount /= 2
}
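// E.g. on an 8-core machine this pre-migration uses 4 workers while the full migration
// above uses 5 (NumCPU-3); with 4 cores or fewer it falls back to a single worker.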
config := nv10.Config{MaxWorkers: uint(workerCount)}
_, err := upgradeActorsV3Common(ctx, sm, cache, root, epoch, ts, config)
return err
}
func upgradeActorsV3Common(
ctx context.Context, sm *StateManager, cache MigrationCache,
root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet,
config nv10.Config,
) (cid.Cid, error) {
buf := bufbstore.NewTieredBstore(sm.cs.Blockstore(), bstore.NewTemporarySync())
store := store.ActorStore(ctx, buf)
// Load the state root.
var stateRoot types.StateRoot
if err := store.Get(ctx, root, &stateRoot); err != nil {
return cid.Undef, xerrors.Errorf("failed to decode state root: %w", err)
@ -721,18 +976,12 @@ func UpgradeActorsV3(ctx context.Context, sm *StateManager, cb ExecCallback, roo
}
// Perform the migration
// TODO: store this somewhere and pre-migrate
cache := nv10.NewMemMigrationCache()
// TODO: tune this.
config := nv10.Config{MaxWorkers: 1}
newHamtRoot, err := nv10.MigrateStateTree(ctx, store, stateRoot.Actors, epoch, config, migrationLogger{}, cache)
if err != nil {
return cid.Undef, xerrors.Errorf("upgrading to actors v3: %w", err)
}
// Persist the result.
newRoot, err := store.Put(ctx, &types.StateRoot{
Version: types.StateTreeVersion2,
Actors: newHamtRoot,
@ -742,19 +991,6 @@ func UpgradeActorsV3(ctx context.Context, sm *StateManager, cb ExecCallback, roo
return cid.Undef, xerrors.Errorf("failed to persist new state root: %w", err)
}
// Check the result.
// perform some basic sanity checks to make sure everything still works.
if newSm, err := state.LoadStateTree(store, newRoot); err != nil {
return cid.Undef, xerrors.Errorf("state tree sanity load failed: %w", err)
} else if newRoot2, err := newSm.Flush(ctx); err != nil {
return cid.Undef, xerrors.Errorf("state tree sanity flush failed: %w", err)
} else if newRoot2 != newRoot {
return cid.Undef, xerrors.Errorf("state-root mismatch: %s != %s", newRoot, newRoot2)
} else if _, err := newSm.GetActor(init_.Address); err != nil {
return cid.Undef, xerrors.Errorf("failed to load init actor after upgrade: %w", err)
}
// Persist the new tree.
{

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"io"
"sync"
"testing"
"github.com/ipfs/go-cid"
@ -122,7 +123,7 @@ func TestForkHeightTriggers(t *testing.T) {
cg.ChainStore(), UpgradeSchedule{{
Network: 1,
Height: testForkHeight,
Migration: func(ctx context.Context, sm *StateManager, cb ExecCallback,
Migration: func(ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback,
root cid.Cid, height abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
cst := ipldcbor.NewCborStore(sm.ChainStore().Blockstore())
@ -252,7 +253,7 @@ func TestForkRefuseCall(t *testing.T) {
Network: 1,
Expensive: true,
Height: testForkHeight,
Migration: func(ctx context.Context, sm *StateManager, cb ExecCallback,
Migration: func(ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback,
root cid.Cid, height abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
return root, nil
}}})
@ -317,3 +318,166 @@ func TestForkRefuseCall(t *testing.T) {
}
}
}
func TestForkPreMigration(t *testing.T) {
logging.SetAllLoggers(logging.LevelInfo)
cg, err := gen.NewGenerator()
if err != nil {
t.Fatal(err)
}
fooCid, err := abi.CidBuilder.Sum([]byte("foo"))
require.NoError(t, err)
barCid, err := abi.CidBuilder.Sum([]byte("bar"))
require.NoError(t, err)
failCid, err := abi.CidBuilder.Sum([]byte("fail"))
require.NoError(t, err)
var wait20 sync.WaitGroup
wait20.Add(3)
wasCanceled := make(chan struct{})
checkCache := func(t *testing.T, cache MigrationCache) {
found, value, err := cache.Read("foo")
require.NoError(t, err)
require.True(t, found)
require.Equal(t, fooCid, value)
found, value, err = cache.Read("bar")
require.NoError(t, err)
require.True(t, found)
require.Equal(t, barCid, value)
found, _, err = cache.Read("fail")
require.NoError(t, err)
require.False(t, found)
}
counter := make(chan struct{}, 10)
sm, err := NewStateManagerWithUpgradeSchedule(
cg.ChainStore(), UpgradeSchedule{{
Network: 1,
Height: testForkHeight,
Migration: func(ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback,
root cid.Cid, height abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
// Make sure the pre-migration that should be canceled is canceled.
select {
case <-wasCanceled:
case <-ctx.Done():
return cid.Undef, ctx.Err()
}
// The cache should be set up correctly.
checkCache(t, cache)
counter <- struct{}{}
return root, nil
},
PreMigrations: []PreMigration{{
StartWithin: 20,
PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache,
_ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error {
wait20.Done()
wait20.Wait()
err := cache.Write("foo", fooCid)
require.NoError(t, err)
counter <- struct{}{}
return nil
},
}, {
StartWithin: 20,
PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache,
_ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error {
wait20.Done()
wait20.Wait()
err := cache.Write("bar", barCid)
require.NoError(t, err)
counter <- struct{}{}
return nil
},
}, {
StartWithin: 20,
PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache,
_ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error {
wait20.Done()
wait20.Wait()
err := cache.Write("fail", failCid)
require.NoError(t, err)
counter <- struct{}{}
// Fail this pre-migration. The cached entry should not be persisted.
return fmt.Errorf("failed")
},
}, {
StartWithin: 15,
StopWithin: 5,
PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache,
_ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error {
<-ctx.Done()
close(wasCanceled)
counter <- struct{}{}
return nil
},
}, {
StartWithin: 10,
PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache,
_ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error {
checkCache(t, cache)
counter <- struct{}{}
return nil
},
}}},
})
if err != nil {
t.Fatal(err)
}
require.NoError(t, sm.Start(context.Background()))
defer func() {
require.NoError(t, sm.Stop(context.Background()))
}()
inv := vm.NewActorRegistry()
inv.Register(nil, testActor{})
sm.SetVMConstructor(func(ctx context.Context, vmopt *vm.VMOpts) (*vm.VM, error) {
nvm, err := vm.NewVM(ctx, vmopt)
if err != nil {
return nil, err
}
nvm.SetInvoker(inv)
return nvm, nil
})
cg.SetStateManager(sm)
for i := 0; i < 50; i++ {
_, err := cg.NextTipSet()
if err != nil {
t.Fatal(err)
}
}
// We have 5 pre-migration steps, and the migration. They should all have written something
// to this channel.
require.Equal(t, 6, len(counter))
}

View File

@ -20,6 +20,7 @@ import (
// Used for genesis.
msig0 "github.com/filecoin-project/specs-actors/actors/builtin/multisig"
"github.com/filecoin-project/specs-actors/v3/actors/migration/nv10"
// we use the same adt for all receipts
blockadt "github.com/filecoin-project/specs-actors/actors/util/adt"
@ -62,15 +63,24 @@ type versionSpec struct {
atOrBelow abi.ChainEpoch
}
type migration struct {
upgrade MigrationFunc
preMigrations []PreMigration
cache *nv10.MemMigrationCache
}
type StateManager struct {
cs *store.ChainStore
cancel context.CancelFunc
shutdown chan struct{}
// Determines the network version at any given epoch.
networkVersions []versionSpec
latestVersion network.Version
// Maps chain epochs to upgrade functions.
stateMigrations map[abi.ChainEpoch]UpgradeFunc
// Maps chain epochs to migrations.
stateMigrations map[abi.ChainEpoch]*migration
// A set of potentially expensive/time consuming upgrades. Explicit
// calls for, e.g., gas estimation fail against this epoch with
// ErrExpensiveFork.
@ -103,7 +113,7 @@ func NewStateManagerWithUpgradeSchedule(cs *store.ChainStore, us UpgradeSchedule
return nil, err
}
stateMigrations := make(map[abi.ChainEpoch]UpgradeFunc, len(us))
stateMigrations := make(map[abi.ChainEpoch]*migration, len(us))
expensiveUpgrades := make(map[abi.ChainEpoch]struct{}, len(us))
var networkVersions []versionSpec
lastVersion := network.Version0
@ -111,8 +121,13 @@ func NewStateManagerWithUpgradeSchedule(cs *store.ChainStore, us UpgradeSchedule
// If we have any upgrades, process them and create a version
// schedule.
for _, upgrade := range us {
if upgrade.Migration != nil {
stateMigrations[upgrade.Height] = upgrade.Migration
if upgrade.Migration != nil || upgrade.PreMigrations != nil {
migration := &migration{
upgrade: upgrade.Migration,
preMigrations: upgrade.PreMigrations,
cache: nv10.NewMemMigrationCache(),
}
stateMigrations[upgrade.Height] = migration
}
if upgrade.Expensive {
expensiveUpgrades[upgrade.Height] = struct{}{}
@ -148,6 +163,33 @@ func cidsToKey(cids []cid.Cid) string {
return out
}
// Start starts the state manager's optional background processes. At the moment, this schedules
// pre-migration functions to run ahead of network upgrades.
//
// This method is not safe to invoke from multiple threads or concurrently with Stop.
func (sm *StateManager) Start(context.Context) error {
var ctx context.Context
ctx, sm.cancel = context.WithCancel(context.Background())
sm.shutdown = make(chan struct{})
go sm.preMigrationWorker(ctx)
return nil
}
// Stop stops the state manager's background processes.
//
// This method is not safe to invoke concurrently with Start.
func (sm *StateManager) Stop(ctx context.Context) error {
if sm.cancel != nil {
sm.cancel()
select {
case <-sm.shutdown:
case <-ctx.Done():
return ctx.Err()
}
}
return nil
}
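// Illustrative usage (not part of this change): a caller that wants pre-migrations
// scheduled starts the manager and stops it on shutdown; the node wires this up via the
// fx lifecycle hook added in node/modules/stmgr.go below.
func exampleStartStop(ctx context.Context, sm *StateManager) error {
if err := sm.Start(ctx); err != nil {
return err
}
// ... run the node, process tipsets, etc. ...
return sm.Stop(ctx)
}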
func (sm *StateManager) TipSetState(ctx context.Context, ts *types.TipSet) (st cid.Cid, rec cid.Cid, err error) {
ctx, span := trace.StartSpan(ctx, "tipSetState")
defer span.End()

View File

@ -269,7 +269,7 @@ func Online() Option {
Override(new(vm.SyscallBuilder), vm.Syscalls),
Override(new(*store.ChainStore), modules.ChainStore),
Override(new(stmgr.UpgradeSchedule), stmgr.DefaultUpgradeSchedule()),
Override(new(*stmgr.StateManager), stmgr.NewStateManagerWithUpgradeSchedule),
Override(new(*stmgr.StateManager), modules.StateManager),
Override(new(*wallet.LocalWallet), wallet.NewWallet),
Override(new(wallet.Default), From(new(*wallet.LocalWallet))),
Override(new(api.WalletAPI), From(new(wallet.MultiWallet))),

node/modules/stmgr.go (new file, 20 lines)
View File

@ -0,0 +1,20 @@
package modules
import (
"go.uber.org/fx"
"github.com/filecoin-project/lotus/chain/stmgr"
"github.com/filecoin-project/lotus/chain/store"
)
func StateManager(lc fx.Lifecycle, cs *store.ChainStore, us stmgr.UpgradeSchedule) (*stmgr.StateManager, error) {
sm, err := stmgr.NewStateManagerWithUpgradeSchedule(cs, us)
if err != nil {
return nil, err
}
lc.Append(fx.Hook{
OnStart: sm.Start,
OnStop: sm.Stop,
})
return sm, nil
}