ipld-eth-state-snapshot/pkg/snapshot/service.go

// Copyright © 2020 Vulcanize, Inc
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package snapshot

import (
	"context"
	"fmt"
	"math/big"
	"sync"

	"github.com/cerc-io/eth-iterator-utils/tracker"
	statediff "github.com/cerc-io/plugeth-statediff"
	"github.com/cerc-io/plugeth-statediff/adapt"
	"github.com/cerc-io/plugeth-statediff/indexer"
	"github.com/cerc-io/plugeth-statediff/types"
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/core/rawdb"
	"github.com/ethereum/go-ethereum/core/state"
	"github.com/ethereum/go-ethereum/crypto"
	"github.com/ethereum/go-ethereum/ethdb"
	"github.com/ethereum/go-ethereum/rlp"
	log "github.com/sirupsen/logrus"
)

var (
	// emptyNode is the RLP encoding of an empty byte slice. The encoding error
	// is discarded.
	emptyNode, _      = rlp.EncodeToBytes(&[]byte{})
	// emptyCodeHash is the Keccak-256 digest of empty input, as raw bytes.
	emptyCodeHash     = crypto.Keccak256([]byte{})
	// emptyContractRoot is the Keccak-256 hash of emptyNode.
	emptyContractRoot = crypto.Keccak256Hash(emptyNode)

	// defaultBatchSize is the value NewSnapshotService assigns to
	// Service.maxBatchSize.
	defaultBatchSize = uint(100)
)

// Service holds the chain database and state database used to read chain
// data, and the indexer through which headers, state nodes and IPLD blocks
// are pushed when a snapshot is created.
type Service struct {
	// ethDB is the chain database that canonical hashes and headers are read from.
	ethDB        ethdb.Database
	// stateDB is the state database built on top of ethDB; it backs the state
	// view handed to the statediff builder.
	stateDB      state.Database
	// indexer receives the header, state nodes and IPLD blocks of a snapshot.
	indexer      indexer.Indexer
	// maxBatchSize is initialised to defaultBatchSize by NewSnapshotService.
	// NOTE(review): it is not read anywhere in the visible part of this file.
	maxBatchSize uint
	// recoveryFile is the path handed to the iterator tracker.
	recoveryFile string
}

// NewLevelDB opens the LevelDB key-value store at con.LevelDBPath and wraps it
// together with the ancient (freezer) store at con.AncientDBPath.
//
// Both stores are opened with the trailing read-only argument set to true.
// The caller owns the returned database and is responsible for closing it.
//
// An error is returned if either store cannot be opened. If the freezer fails
// to open, the already-opened key-value store is closed before returning so
// its handle is not leaked.
func NewLevelDB(con *EthConfig) (ethdb.Database, error) {
	kvdb, err := rawdb.NewLevelDBDatabase(con.LevelDBPath, 1024, 256, "ipld-eth-state-snapshot", true)
	if err != nil {
		return nil, fmt.Errorf("failed to connect LevelDB: %w", err)
	}
	edb, err := rawdb.NewDatabaseWithFreezer(kvdb, con.AncientDBPath, "ipld-eth-state-snapshot", true)
	if err != nil {
		// The key-value store was opened successfully but will never be handed
		// to the caller; release it here.
		if closeErr := kvdb.Close(); closeErr != nil {
			return nil, fmt.Errorf("failed to connect LevelDB freezer: %w (also failed to close LevelDB: %v)", err, closeErr)
		}
		return nil, fmt.Errorf("failed to connect LevelDB freezer: %w", err)
	}
	return edb, nil
}

// NewSnapshotService assembles a Service around the given chain database and
// indexer. A state database is created on top of edb, the batch size is set to
// defaultBatchSize, and recoveryFile is stored for the iterator tracker.
// The returned error is always nil.
func NewSnapshotService(edb ethdb.Database, indexer indexer.Indexer, recoveryFile string) (*Service, error) {
	svc := new(Service)
	svc.ethDB = edb
	svc.stateDB = state.NewDatabase(edb)
	svc.indexer = indexer
	svc.maxBatchSize = defaultBatchSize
	svc.recoveryFile = recoveryFile
	return svc, nil
}

// SnapshotParams carries the per-run inputs of Service.CreateSnapshot.
type SnapshotParams struct {
	// WatchedAddresses is forwarded to the statediff parameters.
	// NOTE(review): presumably an empty list means "all accounts" — confirm
	// against the statediff package.
	WatchedAddresses []common.Address
	// Height is the block number whose canonical header is snapshotted.
	Height           uint64
	// Workers is used both as the subtrie worker count of the statediff
	// builder and as the size argument of the iterator tracker.
	Workers          uint
}

// CreateSnapshot indexes the state of the canonical block at params.Height.
//
// It reads the canonical header from the chain database, opens an indexer
// batch for it, pushes the header, and then writes a state diff for the
// header's state root through the indexer using params.Workers subtrie
// workers. params.WatchedAddresses is forwarded to the state diff parameters.
// On success the batch is submitted; on any failure after the batch is opened
// the error is handed to the batch's RollbackOnFailure. Once the tracker has
// been created, its HaltAndDump is called on exit, and a failure there is
// logged rather than returned.
//
// An error is returned if the header cannot be read, or if pushing the
// header, writing the state diff, or submitting the batch fails.
func (s *Service) CreateSnapshot(params SnapshotParams) error {
	// extract header from lvldb and publish to PG-IPFS
	// hold onto the headerID so that we can link the state nodes to this header
	log.Infof("Creating snapshot at height %d", params.Height)
	hash := rawdb.ReadCanonicalHash(s.ethDB, params.Height)
	header := rawdb.ReadHeader(s.ethDB, hash, params.Height)
	if header == nil {
		return fmt.Errorf("unable to read canonical header at height %d", params.Height)
	}
	log.Infof("head hash: %s head height: %d", hash.Hex(), params.Height)

	// Context for snapshot work
	ctx, cancelCtx := context.WithCancel(context.Background())
	defer cancelCtx()

	var err error
	tx := s.indexer.BeginTx(header.Number, ctx)
	// The error must be read when the deferred call runs. A plain
	// `defer tx.RollbackOnFailure(err)` evaluates err right here, where it is
	// always nil, so the batch would never learn about a later failure.
	defer func() {
		// NOTE(review): wrapping the call in a closure means a recover() inside
		// RollbackOnFailure (if the implementation relies on one) would no
		// longer observe a panic, so panics are handled here explicitly.
		if p := recover(); p != nil {
			tx.RollbackOnFailure(fmt.Errorf("panic during snapshot: %v", p))
			panic(p)
		}
		tx.RollbackOnFailure(err)
	}()

	var headerid string
	headerid, err = s.indexer.PushHeader(tx, header, big.NewInt(0), big.NewInt(0))
	if err != nil {
		return err
	}

	// Named iterTracker so the imported tracker package is not shadowed.
	iterTracker := tracker.New(s.recoveryFile, params.Workers)
	iterTracker.CaptureSignal(cancelCtx)

	defer func() {
		if dumpErr := iterTracker.HaltAndDump(); dumpErr != nil {
			log.Errorf("failed to write recovery file: %v", dumpErr)
		}
	}()

	// Each sink serialises its pushes into the shared batch with its own mutex.
	// NOTE(review): presumably the sinks are invoked concurrently by the
	// subtrie workers — confirm against the statediff builder.
	var nodeMtx, ipldMtx sync.Mutex
	nodeSink := func(node types.StateLeafNode) error {
		nodeMtx.Lock()
		defer nodeMtx.Unlock()
		return s.indexer.PushStateNode(tx, node, headerid)
	}
	ipldSink := func(c types.IPLD) error {
		ipldMtx.Lock()
		defer ipldMtx.Unlock()
		return s.indexer.PushIPLD(tx, c)
	}

	// Build a diff compared against the zero hash to get a full snapshot
	sdargs := statediff.Args{
		NewStateRoot: header.Root,
		BlockHash:    header.Hash(),
		BlockNumber:  header.Number,
	}
	sdparams := statediff.Params{
		WatchedAddresses: params.WatchedAddresses,
	}
	sdparams.ComputeWatchedAddressesLeafPaths()
	builder := statediff.NewBuilder(adapt.GethStateView(s.stateDB))
	builder.SetSubtrieWorkers(params.Workers)
	if err = builder.WriteStateDiffTracked(sdargs, sdparams, nodeSink, ipldSink, &iterTracker); err != nil {
		return err
	}

	if err = tx.Submit(); err != nil {
		return fmt.Errorf("batch transaction submission failed: %w", err)
	}
	return nil
}

// CreateLatestSnapshot creates a snapshot at the current head header of the
// chain database. The head header hash is resolved to its block number, which
// is then passed to CreateSnapshot together with workers and watchedAddresses.
// An error is returned if the head header's number cannot be read, or if
// CreateSnapshot fails.
func (s *Service) CreateLatestSnapshot(workers uint, watchedAddresses []common.Address) error {
	log.Info("Creating snapshot at head")

	headHash := rawdb.ReadHeadHeaderHash(s.ethDB)
	number := rawdb.ReadHeaderNumber(s.ethDB, headHash)
	if number == nil {
		return fmt.Errorf("unable to read header height for header hash %s", headHash.String())
	}

	snapshotParams := SnapshotParams{
		WatchedAddresses: watchedAddresses,
		Height:           *number,
		Workers:          workers,
	}
	return s.CreateSnapshot(snapshotParams)
}
state snapshot extractor 2020-07-01 18:44:59 +00:00			`// Copyright © 2020 Vulcanize, Inc`
			`//`
			`// This program is free software: you can redistribute it and/or modify`
			`// it under the terms of the GNU Affero General Public License as published by`
			`// the Free Software Foundation, either version 3 of the License, or`
			`// (at your option) any later version.`
			`//`
			`// This program is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU Affero General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU Affero General Public License`
			`// along with this program. If not, see <http://www.gnu.org/licenses/>.`

			`package snapshot`

			`import (`
Account selective snapshot (#46) * snapshotter ignores nodes not along a path along those derived from a list of account addresses if one is provided * config and env updates * cmd update * Encode watched address path bytes to hex for comparison * actually ignore the subtries that are not along the paths of interest * Fixes for account selective snapshot * Use non-concurrent iterator when having a single worker * Only index root node when starting path of an iterator is nil * Upgrade deps * Avoid tracking iterators and skip recovery test * Fix recovery mechanism, use sync Map instead of buffered channels * Add test for account selective snapshot * Continue traversal with concurrent iterators with starting path nil * Use errgroup to simplify error handling with concurrent iterators * Check if all the nodes are indexed in the recovery test * Use concurrency safe sync Map in account selective snapshot test * Only track concurrent iterators and refactor code * Fix node and recovered path comparison * Revert back to using buffered channels for tracking iterators * Add a metric to monitor number of active iterators * Update docs * Update seeked path after node is processed * Return error on context cancellation from subtrie iteration * Add tests for account selective snapshot recovery * Explicity enforce concurrent iterator bounds to avoid duplicate nodes * Update full snapshot test to check nodes being indexed * Refactor code to simplify snapshot logic * Remove unnecessary function argument * Use ctx cancellation for handling signals * Add descriptive comments Co-authored-by: prathamesh0 <prathamesh.musale0@gmail.com> 2022-08-03 11:35:04 +00:00			`"context"`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`"fmt"`
Index block number using string in file mode 2022-05-13 08:30:40 +00:00			`"math/big"`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`"sync"`
state snapshot extractor 2020-07-01 18:44:59 +00:00
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`"github.com/cerc-io/eth-iterator-utils/tracker"`
			`statediff "github.com/cerc-io/plugeth-statediff"`
			`"github.com/cerc-io/plugeth-statediff/adapt"`
			`"github.com/cerc-io/plugeth-statediff/indexer"`
			`"github.com/cerc-io/plugeth-statediff/types"`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`"github.com/ethereum/go-ethereum/common"`
			`"github.com/ethereum/go-ethereum/core/rawdb"`
			`"github.com/ethereum/go-ethereum/core/state"`
			`"github.com/ethereum/go-ethereum/crypto"`
			`"github.com/ethereum/go-ethereum/ethdb"`
			`"github.com/ethereum/go-ethereum/rlp"`
logging tweaks 2022-03-09 13:37:33 +00:00			`log "github.com/sirupsen/logrus"`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`)`

			`var (`
Change ON CONFLICT clauses to do nothing and upgrade geth 2022-05-12 14:47:11 +00:00			`emptyNode, _ = rlp.EncodeToBytes(&[]byte{})`
store code bytes in ipfs blockstore table 2020-07-31 05:31:34 +00:00			`emptyCodeHash = crypto.Keccak256([]byte{})`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`emptyContractRoot = crypto.Keccak256Hash(emptyNode)`
Update geth and implement transaction batching. 2021-12-13 15:01:32 +00:00
			`defaultBatchSize = uint(100)`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`)`

Fix linting errors. 2021-12-14 06:50:19 +00:00			`// Service holds ethDB and stateDB to read data from lvldb and Publisher`
			`// to publish trie in postgres DB.`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`type Service struct {`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`ethDB ethdb.Database`
			`stateDB state.Database`
			`indexer indexer.Indexer`
			`maxBatchSize uint`
			`recoveryFile string`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`}`

split ethdb out of service 2022-01-11 05:37:27 +00:00			`func NewLevelDB(con *EthConfig) (ethdb.Database, error) {`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`kvdb, err := rawdb.NewLevelDBDatabase(con.LevelDBPath, 1024, 256, "ipld-eth-state-snapshot", true)`
			`if err != nil {`
			`return nil, fmt.Errorf("failed to connect LevelDB: %s", err)`
			`}`
refactor to work with v4 vdb geth v1.11.5 2023-03-31 15:39:27 +00:00			`edb, err := rawdb.NewDatabaseWithFreezer(kvdb, con.AncientDBPath, "ipld-eth-state-snapshot", true)`
add some logs and guards, update module name, update readme 2022-03-30 23:57:30 +00:00			`if err != nil {`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`return nil, fmt.Errorf("failed to connect LevelDB freezer: %s", err)`
add some logs and guards, update module name, update readme 2022-03-30 23:57:30 +00:00			`}`
			`return edb, nil`
split ethdb out of service 2022-01-11 05:37:27 +00:00			`}`
Update geth and implement transaction batching. 2021-12-13 15:01:32 +00:00
split ethdb out of service 2022-01-11 05:37:27 +00:00			`// NewSnapshotService creates Service.`
refactor tests 2023-08-27 09:49:46 +00:00			`func NewSnapshotService(edb ethdb.Database, indexer indexer.Indexer, recoveryFile string) (*Service, error) {`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`return &Service{`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`ethDB: edb,`
			`stateDB: state.NewDatabase(edb),`
			`indexer: indexer,`
			`maxBatchSize: defaultBatchSize,`
			`recoveryFile: recoveryFile,`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`}, nil`
			`}`

[wip] async traversal 2020-08-20 10:23:36 +00:00			`type SnapshotParams struct {`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`WatchedAddresses []common.Address`
Account selective snapshot (#46) * snapshotter ignores nodes not along a path along those derived from a list of account addresses if one is provided * config and env updates * cmd update * Encode watched address path bytes to hex for comparison * actually ignore the subtries that are not along the paths of interest * Fixes for account selective snapshot * Use non-concurrent iterator when having a single worker * Only index root node when starting path of an iterator is nil * Upgrade deps * Avoid tracking iterators and skip recovery test * Fix recovery mechanism, use sync Map instead of buffered channels * Add test for account selective snapshot * Continue traversal with concurrent iterators with starting path nil * Use errgroup to simplify error handling with concurrent iterators * Check if all the nodes are indexed in the recovery test * Use concurrency safe sync Map in account selective snapshot test * Only track concurrent iterators and refactor code * Fix node and recovered path comparison * Revert back to using buffered channels for tracking iterators * Add a metric to monitor number of active iterators * Update docs * Update seeked path after node is processed * Return error on context cancellation from subtrie iteration * Add tests for account selective snapshot recovery * Explicity enforce concurrent iterator bounds to avoid duplicate nodes * Update full snapshot test to check nodes being indexed * Refactor code to simplify snapshot logic * Remove unnecessary function argument * Use ctx cancellation for handling signals * Add descriptive comments Co-authored-by: prathamesh0 <prathamesh.musale0@gmail.com> 2022-08-03 11:35:04 +00:00			`Height uint64`
			`Workers uint`
[wip] async traversal 2020-08-20 10:23:36 +00:00			`}`

[wip] use new state iterator 2020-08-23 04:38:31 +00:00			`func (s *Service) CreateSnapshot(params SnapshotParams) error {`
option for defering to the latest height 2020-07-16 15:31:37 +00:00			`// extract header from lvldb and publish to PG-IPFS`
			`// hold onto the headerID so that we can link the state nodes to this header`
logging tweaks 2022-03-09 13:37:33 +00:00			`log.Infof("Creating snapshot at height %d", params.Height)`
[wip] use new state iterator 2020-08-23 04:38:31 +00:00			`hash := rawdb.ReadCanonicalHash(s.ethDB, params.Height)`
			`header := rawdb.ReadHeader(s.ethDB, hash, params.Height)`
option for defering to the latest height 2020-07-16 15:31:37 +00:00			`if header == nil {`
[wip] use new state iterator 2020-08-23 04:38:31 +00:00			`return fmt.Errorf("unable to read canonical header at height %d", params.Height)`
option for defering to the latest height 2020-07-16 15:31:37 +00:00			`}`
logging tweaks 2022-03-09 13:37:33 +00:00			`log.Infof("head hash: %s head height: %d", hash.Hex(), params.Height)`
Update geth and implement transaction batching. 2021-12-13 15:01:32 +00:00
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`// Context for snapshot work`
Account selective snapshot (#46) * snapshotter ignores nodes not along a path along those derived from a list of account addresses if one is provided * config and env updates * cmd update * Encode watched address path bytes to hex for comparison * actually ignore the subtries that are not along the paths of interest * Fixes for account selective snapshot * Use non-concurrent iterator when having a single worker * Only index root node when starting path of an iterator is nil * Upgrade deps * Avoid tracking iterators and skip recovery test * Fix recovery mechanism, use sync Map instead of buffered channels * Add test for account selective snapshot * Continue traversal with concurrent iterators with starting path nil * Use errgroup to simplify error handling with concurrent iterators * Check if all the nodes are indexed in the recovery test * Use concurrency safe sync Map in account selective snapshot test * Only track concurrent iterators and refactor code * Fix node and recovered path comparison * Revert back to using buffered channels for tracking iterators * Add a metric to monitor number of active iterators * Update docs * Update seeked path after node is processed * Return error on context cancellation from subtrie iteration * Add tests for account selective snapshot recovery * Explicity enforce concurrent iterator bounds to avoid duplicate nodes * Update full snapshot test to check nodes being indexed * Refactor code to simplify snapshot logic * Remove unnecessary function argument * Use ctx cancellation for handling signals * Add descriptive comments Co-authored-by: prathamesh0 <prathamesh.musale0@gmail.com> 2022-08-03 11:35:04 +00:00			`ctx, cancelCtx := context.WithCancel(context.Background())`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`defer cancelCtx()`

			`var err error`
			`tx := s.indexer.BeginTx(header.Number, ctx)`
			`defer tx.RollbackOnFailure(err)`
create csv file publisher; update geth and schema 2022-02-09 15:19:10 +00:00
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`var headerid string`
			`headerid, err = s.indexer.PushHeader(tx, header, big.NewInt(0), big.NewInt(0))`
implement job recovery; fix traversal & iterator 2022-02-18 11:12:53 +00:00			`if err != nil {`
			`return err`
			`}`
clean up logging 2022-06-08 12:08:17 +00:00
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`tracker := tracker.New(s.recoveryFile, params.Workers)`
			`tracker.CaptureSignal(cancelCtx)`
implement job recovery; fix traversal & iterator 2022-02-18 11:12:53 +00:00
			`defer func() {`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`err := tracker.HaltAndDump()`
implement job recovery; fix traversal & iterator 2022-02-18 11:12:53 +00:00			`if err != nil {`
Log progress info 2022-05-26 10:20:42 +00:00			`log.Errorf("failed to write recovery file: %v", err)`
implement job recovery; fix traversal & iterator 2022-02-18 11:12:53 +00:00			`}`
			`}()`

refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`var nodeMtx, ipldMtx sync.Mutex`
			`nodeSink := func(node types.StateLeafNode) error {`
			`nodeMtx.Lock()`
			`defer nodeMtx.Unlock()`
			`return s.indexer.PushStateNode(tx, node, headerid)`
[wip] async traversal 2020-08-20 10:23:36 +00:00			`}`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`ipldSink := func(c types.IPLD) error {`
			`ipldMtx.Lock()`
			`defer ipldMtx.Unlock()`
			`return s.indexer.PushIPLD(tx, c)`
add guard 2020-07-16 15:02:16 +00:00			`}`
[wip] async traversal 2020-08-20 10:23:36 +00:00
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`// Build a diff compared against the zero hash to get a full snapshot`
			`sdargs := statediff.Args{`
			`NewStateRoot: header.Root,`
			`BlockHash: header.Hash(),`
			`BlockNumber: header.Number,`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`}`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`sdparams := statediff.Params{`
			`WatchedAddresses: params.WatchedAddresses,`
fixes 2022-01-11 00:06:29 +00:00			`}`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`sdparams.ComputeWatchedAddressesLeafPaths()`
			`builder := statediff.NewBuilder(adapt.GethStateView(s.stateDB))`
refactor tests 2023-08-27 09:49:46 +00:00			`builder.SetSubtrieWorkers(params.Workers)`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`if err = builder.WriteStateDiffTracked(sdargs, sdparams, nodeSink, ipldSink, &tracker); err != nil {`
fix overshadowing of snap.Tx 2023-05-16 14:22:15 +00:00			`return err`
Account selective snapshot (#46) * snapshotter ignores nodes not along a path along those derived from a list of account addresses if one is provided * config and env updates * cmd update * Encode watched address path bytes to hex for comparison * actually ignore the subtries that are not along the paths of interest * Fixes for account selective snapshot * Use non-concurrent iterator when having a single worker * Only index root node when starting path of an iterator is nil * Upgrade deps * Avoid tracking iterators and skip recovery test * Fix recovery mechanism, use sync Map instead of buffered channels * Add test for account selective snapshot * Continue traversal with concurrent iterators with starting path nil * Use errgroup to simplify error handling with concurrent iterators * Check if all the nodes are indexed in the recovery test * Use concurrency safe sync Map in account selective snapshot test * Only track concurrent iterators and refactor code * Fix node and recovered path comparison * Revert back to using buffered channels for tracking iterators * Add a metric to monitor number of active iterators * Update docs * Update seeked path after node is processed * Return error on context cancellation from subtrie iteration * Add tests for account selective snapshot recovery * Explicity enforce concurrent iterator bounds to avoid duplicate nodes * Update full snapshot test to check nodes being indexed * Refactor code to simplify snapshot logic * Remove unnecessary function argument * Use ctx cancellation for handling signals * Add descriptive comments Co-authored-by: prathamesh0 <prathamesh.musale0@gmail.com> 2022-08-03 11:35:04 +00:00			`}`

refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`if err = tx.Submit(); err != nil {`
			`return fmt.Errorf("batch transaction submission failed: %w", err)`
Account selective snapshot (#46) * snapshotter ignores nodes not along a path along those derived from a list of account addresses if one is provided * config and env updates * cmd update * Encode watched address path bytes to hex for comparison * actually ignore the subtries that are not along the paths of interest * Fixes for account selective snapshot * Use non-concurrent iterator when having a single worker * Only index root node when starting path of an iterator is nil * Upgrade deps * Avoid tracking iterators and skip recovery test * Fix recovery mechanism, use sync Map instead of buffered channels * Add test for account selective snapshot * Continue traversal with concurrent iterators with starting path nil * Use errgroup to simplify error handling with concurrent iterators * Check if all the nodes are indexed in the recovery test * Use concurrency safe sync Map in account selective snapshot test * Only track concurrent iterators and refactor code * Fix node and recovered path comparison * Revert back to using buffered channels for tracking iterators * Add a metric to monitor number of active iterators * Update docs * Update seeked path after node is processed * Return error on context cancellation from subtrie iteration * Add tests for account selective snapshot recovery * Explicity enforce concurrent iterator bounds to avoid duplicate nodes * Update full snapshot test to check nodes being indexed * Refactor code to simplify snapshot logic * Remove unnecessary function argument * Use ctx cancellation for handling signals * Add descriptive comments Co-authored-by: prathamesh0 <prathamesh.musale0@gmail.com> 2022-08-03 11:35:04 +00:00			`}`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`return err`
update service 2023-04-12 18:07:42 +00:00			`}`

refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`// CreateLatestSnapshot snapshot at head (ignores height param)`
			`func (s *Service) CreateLatestSnapshot(workers uint, watchedAddresses []common.Address) error {`
			`log.Info("Creating snapshot at head")`
			`hash := rawdb.ReadHeadHeaderHash(s.ethDB)`
			`height := rawdb.ReadHeaderNumber(s.ethDB, hash)`
			`if height == nil {`
			`return fmt.Errorf("unable to read header height for header hash %s", hash.String())`
update service 2023-04-12 18:07:42 +00:00			`}`
refactor to use statediff plugin 2023-08-04 12:36:56 +00:00			`return s.CreateSnapshot(SnapshotParams{Height: *height, Workers: workers, WatchedAddresses: watchedAddresses})`
state snapshot extractor 2020-07-01 18:44:59 +00:00			`}`