diff --git a/README.md b/README.md
index 3e115f0..6592c27 100644
--- a/README.md
+++ b/README.md
@@ -4,64 +4,178 @@
 [![Go Report Card](https://goreportcard.com/badge/github.com/vulcanize/ipld-eth-state-snapshot)](https://goreportcard.com/report/github.com/vulcanize/ipld-eth-state-snapshot)

-## Usage
+## Setup

-For state snapshot from LevelDB
-```bash
-./ipld-eth-state-snapshot stateSnapshot --config={path to toml config file}
-```
+* Build the binary:

-For in-place snapshot in database
-```bash
-./ipld-eth-state-snapshot inPlaceStateSnapshot --config={path to toml config file}
-```
+  ```bash
+  make build
+  ```

-### Config
+## Configuration

 Config format:

 ```toml
 [snapshot]
-  mode = "file" # indicates output mode ("postgres" or "file")
-  workers = 4 # degree of concurrency, the state trie is subdivided into sectiosn that are traversed and processed concurrently
-  blockHeight = -1 # blockheight to perform the snapshot at (-1 indicates to use the latest blockheight found in leveldb)
-  recoveryFile = "recovery_file" # specifies a file to output recovery information on error or premature closure
+  mode = "file" # indicates output mode ("postgres" or "file")
+  workers = 4 # degree of concurrency; the state trie is subdivided into sections that are traversed and processed concurrently
+  blockHeight = -1 # block height to perform the snapshot at (-1 indicates to use the latest block height found in leveldb)
+  recoveryFile = "recovery_file" # specifies a file to output recovery information on error or premature closure

 [leveldb]
-  path = "/Users/user/Library/Ethereum/geth/chaindata" # path to geth leveldb
-  ancient = "/Users/user/Library/Ethereum/geth/chaindata/ancient" # path to geth ancient database
+  # path to geth leveldb
+  path = "/Users/user/Library/Ethereum/geth/chaindata" # LVL_DB_PATH
+  # path to geth ancient database
+  ancient = "/Users/user/Library/Ethereum/geth/chaindata/ancient" # ANCIENT_DB_PATH

 [database]
-  name = "vulcanize_public" # postgres database name
-  hostname = "localhost" # postgres host
-  port = 5432 # postgres port
-  user = "postgres" # postgres user
-  password = "" # postgres password
+  # db credentials, used when operating in 'postgres' output mode
+  name = "vulcanize_public" # DATABASE_NAME
+  hostname = "localhost" # DATABASE_HOSTNAME
+  port = 5432 # DATABASE_PORT
+  user = "postgres" # DATABASE_USER
+  password = "" # DATABASE_PASSWORD

 [file]
-  outputDir = "output_dir/" # when operating in 'file' output mode, this is the directory the files are written to
+  # directory the CSV files are written to, used when operating in 'file' output mode
+  outputDir = "output_dir/" # FILE_OUTPUT_DIR

 [log]
-  level = "info" # log level (trace, debug, info, warn, error, fatal, panic) (default: info)
-  file = "log_file" # file path for logging
+  level = "info" # log level (trace, debug, info, warn, error, fatal, panic) (default: info)
+  file = "log_file" # file path for logging; leave unset to log to stdout

 [prom]
-  metrics = true # enable prometheus metrics (default: false)
-  http = true # enable prometheus http service (default: false)
-  httpAddr = "0.0.0.0" # prometheus http host (default: 127.0.0.1)
-  httpPort = 9101 # prometheus http port (default: 8086)
-  dbStats = true # enable prometheus db stats (default: false)
+  # prometheus metrics
+  metrics = true # enable prometheus metrics (default: false)
+  http = true # enable prometheus http service (default: false)
+  httpAddr = "0.0.0.0" # prometheus http host (default: 127.0.0.1)
+  httpPort = 9101 # prometheus http port (default: 8086)
+  dbStats = true # enable prometheus db stats (default: false)

-# node info
 [ethereum]
-  clientName = "Geth" # $ETH_CLIENT_NAME
-  nodeID = "arch1" # $ETH_NODE_ID
-  networkID = "1" # $ETH_NETWORK_ID
-  chainID = "1" # $ETH_CHAIN_ID
-  genesisBlock = "0xd4e56740f876aef8c010b86a40d5f56745a118d0906a34e69aec8c0db1cb8fa3" # $ETH_GENESIS_BLOCK
+  # node info
+  clientName = "Geth" # ETH_CLIENT_NAME
+  nodeID = "arch1" # ETH_NODE_ID
+  networkID = "1" # ETH_NETWORK_ID
+  chainID = "1" # ETH_CHAIN_ID
+  genesisBlock = "0xd4e56740f876aef8c010b86a40d5f56745a118d0906a34e69aec8c0db1cb8fa3" # ETH_GENESIS_BLOCK
 ```

+## Usage
+
+* For a state snapshot from LevelDB:
+
+  ```bash
+  ./ipld-eth-state-snapshot stateSnapshot --config={path to toml config file}
+  ```
+
+* For an in-place snapshot in the database:
+
+  ```bash
+  ./ipld-eth-state-snapshot inPlaceStateSnapshot --config={path to toml config file}
+  ```
+
+## Monitoring
+
+* Enable metrics using the config parameters `prom.metrics` and `prom.http`.
+* `ipld-eth-state-snapshot` exposes the following Prometheus metrics at the `/metrics` endpoint:
+  * `state_node_count`: Number of state nodes processed.
+  * `storage_node_count`: Number of storage nodes processed.
+  * `code_node_count`: Number of code nodes processed.
+  * DB stats if operating in `postgres` mode.
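+* To quickly verify that the endpoint is serving (a minimal sketch, assuming the `httpAddr`/`httpPort` values shown in the config above):
+
+  ```bash
+  # fetch the exposed metrics and filter for the snapshot counters
+  curl -s http://localhost:9101/metrics | grep -E 'state_node_count|storage_node_count|code_node_count'
+  ```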
+
 ## Tests

-* Install [mockgen](https://github.com/golang/mock#installation)
-* `make test`
+* Run unit tests:
+
+  ```bash
+  # set up the db
+  docker-compose up -d
+
+  # run tests after db migrations are run
+  make dbtest
+
+  # tear down the db
+  docker-compose down -v --remove-orphans
+  ```
+
+## Import output data from file mode into a database
+
+* When `ipld-eth-state-snapshot stateSnapshot` is run in file mode (`snapshot.mode`), the output is in the form of CSV files.
+
+* Assume the output files are located in the host's `./output_dir` directory.
+
+* Data post-processing:
+
+  * Create a directory to store the post-processed output:
+
+    ```bash
+    mkdir -p output_dir/processed_output
+    ```
+
+  * Combine the output from multiple workers and copy it to the post-processed output directory:
+
+    ```bash
+    # public.blocks
+    cat {output_dir,output_dir/*}/public.blocks.csv > output_dir/processed_output/combined-public.blocks.csv
+
+    # eth.state_cids
+    cat output_dir/*/eth.state_cids.csv > output_dir/processed_output/combined-eth.state_cids.csv
+
+    # eth.storage_cids
+    cat output_dir/*/eth.storage_cids.csv > output_dir/processed_output/combined-eth.storage_cids.csv
+
+    # public.nodes
+    cp output_dir/public.nodes.csv output_dir/processed_output/public.nodes.csv
+
+    # eth.header_cids
+    cp output_dir/eth.header_cids.csv output_dir/processed_output/eth.header_cids.csv
+    ```
+
+  * De-duplicate the data:
+
+    ```bash
+    # public.blocks
+    sort -u output_dir/processed_output/combined-public.blocks.csv -o output_dir/processed_output/deduped-combined-public.blocks.csv
+
+    # eth.header_cids
+    sort -u output_dir/processed_output/eth.header_cids.csv -o output_dir/processed_output/deduped-eth.header_cids.csv
+
+    # eth.state_cids
+    sort -u output_dir/processed_output/combined-eth.state_cids.csv -o output_dir/processed_output/deduped-combined-eth.state_cids.csv
+
+    # eth.storage_cids
+    sort -u output_dir/processed_output/combined-eth.storage_cids.csv -o output_dir/processed_output/deduped-combined-eth.storage_cids.csv
+    ```
+
+* Copy the post-processed output files over to the DB server (say into `/output_dir`), for example as shown below.
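+
+  One way to do this (a sketch; the `db-server` host alias and destination path are placeholders for your environment):
+
+  ```bash
+  # copy the de-duplicated CSVs to the database host
+  rsync -av output_dir/processed_output/ db-server:/output_dir/processed_output/
+  ```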
+
+* Start `psql` to run the import commands (placeholders are the values from the `[database]` config section):
+
+  ```bash
+  psql -U <database-user> -h <database-hostname> -p <database-port> <database-name>
+  ```
+
+* Run the following to import the data:
+
+  ```sql
+  -- public.nodes
+  COPY public.nodes FROM '/output_dir/processed_output/public.nodes.csv' CSV;
+
+  -- public.blocks
+  COPY public.blocks FROM '/output_dir/processed_output/deduped-combined-public.blocks.csv' CSV;
+
+  -- eth.header_cids
+  COPY eth.header_cids FROM '/output_dir/processed_output/deduped-eth.header_cids.csv' CSV;
+
+  -- eth.state_cids
+  COPY eth.state_cids FROM '/output_dir/processed_output/deduped-combined-eth.state_cids.csv' CSV FORCE NOT NULL state_leaf_key;
+
+  -- eth.storage_cids
+  COPY eth.storage_cids FROM '/output_dir/processed_output/deduped-combined-eth.storage_cids.csv' CSV FORCE NOT NULL storage_leaf_key;
+  ```
+
+* NOTE: The `COPY` command on CSVs inserts empty strings as `NULL` in the DB. Passing `FORCE NOT NULL <column name>` forces it to insert empty strings instead. This is required to maintain compatibility of the imported snapshot data with the data generated by statediffing. Reference: https://www.postgresql.org/docs/14/sql-copy.html
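+
+* Optionally, verify the import by comparing row counts with the CSV line counts (a minimal sketch; connection parameters are placeholders as above):
+
+  ```bash
+  # rows imported into eth.state_cids ...
+  psql -U <database-user> -h <database-hostname> -p <database-port> <database-name> \
+    -c 'SELECT count(*) FROM eth.state_cids;'
+
+  # ... should match the line count of the corresponding de-duplicated CSV
+  wc -l output_dir/processed_output/deduped-combined-eth.state_cids.csv
+  ```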