V1.10.1 statediff 0.0.17 #60

Merged
telackey merged 3 commits from v1.10.1-statediff-0.0.17 into v1.10.1-statediff 2021-04-09 02:32:04 +00:00
18 changed files with 1912 additions and 58 deletions

View File

@ -32,7 +32,6 @@ import (
sdtypes "github.com/ethereum/go-ethereum/statediff/types" sdtypes "github.com/ethereum/go-ethereum/statediff/types"
) )
// TODO: add test that filters on address
var ( var (
contractLeafKey []byte contractLeafKey []byte
emptyDiffs = make([]sdtypes.StateNode, 0) emptyDiffs = make([]sdtypes.StateNode, 0)

View File

@ -0,0 +1,8 @@
-- +goose Up
CREATE TABLE IF NOT EXISTS public.blocks (
key TEXT UNIQUE NOT NULL,
data BYTEA NOT NULL
);
-- +goose Down
DROP TABLE public.blocks;

View File

@ -0,0 +1,13 @@
-- +goose Up
CREATE TABLE nodes (
id SERIAL PRIMARY KEY,
client_name VARCHAR,
genesis_block VARCHAR(66),
network_id VARCHAR,
node_id VARCHAR(128),
chain_id INTEGER DEFAULT 1,
CONSTRAINT node_uc UNIQUE (genesis_block, network_id, node_id, chain_id)
);
-- +goose Down
DROP TABLE nodes;

View File

@ -0,0 +1,5 @@
-- +goose Up
CREATE SCHEMA eth;
-- +goose Down
DROP SCHEMA eth;

View File

@ -0,0 +1,23 @@
-- +goose Up
CREATE TABLE eth.header_cids (
id SERIAL PRIMARY KEY,
block_number BIGINT NOT NULL,
block_hash VARCHAR(66) NOT NULL,
parent_hash VARCHAR(66) NOT NULL,
cid TEXT NOT NULL,
mh_key TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
td NUMERIC NOT NULL,
node_id INTEGER NOT NULL REFERENCES nodes (id) ON DELETE CASCADE,
reward NUMERIC NOT NULL,
state_root VARCHAR(66) NOT NULL,
tx_root VARCHAR(66) NOT NULL,
receipt_root VARCHAR(66) NOT NULL,
uncle_root VARCHAR(66) NOT NULL,
bloom BYTEA NOT NULL,
timestamp NUMERIC NOT NULL,
times_validated INTEGER NOT NULL DEFAULT 1,
UNIQUE (block_number, block_hash)
);
-- +goose Down
DROP TABLE eth.header_cids;

View File

@ -0,0 +1,14 @@
-- +goose Up
CREATE TABLE eth.uncle_cids (
id SERIAL PRIMARY KEY,
header_id INTEGER NOT NULL REFERENCES eth.header_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
block_hash VARCHAR(66) NOT NULL,
parent_hash VARCHAR(66) NOT NULL,
cid TEXT NOT NULL,
mh_key TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
reward NUMERIC NOT NULL,
UNIQUE (header_id, block_hash)
);
-- +goose Down
DROP TABLE eth.uncle_cids;

View File

@ -0,0 +1,16 @@
-- +goose Up
CREATE TABLE eth.transaction_cids (
id SERIAL PRIMARY KEY,
header_id INTEGER NOT NULL REFERENCES eth.header_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
tx_hash VARCHAR(66) NOT NULL,
index INTEGER NOT NULL,
cid TEXT NOT NULL,
mh_key TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
dst VARCHAR(66) NOT NULL,
src VARCHAR(66) NOT NULL,
tx_data BYTEA,
UNIQUE (header_id, tx_hash)
);
-- +goose Down
DROP TABLE eth.transaction_cids;

View File

@ -0,0 +1,20 @@
-- +goose Up
CREATE TABLE eth.receipt_cids (
id SERIAL PRIMARY KEY,
tx_id INTEGER NOT NULL REFERENCES eth.transaction_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
cid TEXT NOT NULL,
mh_key TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
contract VARCHAR(66),
contract_hash VARCHAR(66),
topic0s VARCHAR(66)[],
topic1s VARCHAR(66)[],
topic2s VARCHAR(66)[],
topic3s VARCHAR(66)[],
log_contracts VARCHAR(66)[],
post_state VARCHAR(66),
post_status INTEGER,
UNIQUE (tx_id)
);
-- +goose Down
DROP TABLE eth.receipt_cids;

View File

@ -0,0 +1,15 @@
-- +goose Up
CREATE TABLE eth.state_cids (
id BIGSERIAL PRIMARY KEY,
header_id INTEGER NOT NULL REFERENCES eth.header_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
state_leaf_key VARCHAR(66),
cid TEXT NOT NULL,
mh_key TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
state_path BYTEA,
node_type INTEGER NOT NULL,
diff BOOLEAN NOT NULL DEFAULT FALSE,
UNIQUE (header_id, state_path)
);
-- +goose Down
DROP TABLE eth.state_cids;

View File

@ -0,0 +1,15 @@
-- +goose Up
CREATE TABLE eth.storage_cids (
id BIGSERIAL PRIMARY KEY,
state_id BIGINT NOT NULL REFERENCES eth.state_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
storage_leaf_key VARCHAR(66),
cid TEXT NOT NULL,
mh_key TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
storage_path BYTEA,
node_type INTEGER NOT NULL,
diff BOOLEAN NOT NULL DEFAULT FALSE,
UNIQUE (state_id, storage_path)
);
-- +goose Down
DROP TABLE eth.storage_cids;

View File

@ -0,0 +1,13 @@
-- +goose Up
CREATE TABLE eth.state_accounts (
id SERIAL PRIMARY KEY,
state_id BIGINT NOT NULL REFERENCES eth.state_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
balance NUMERIC NOT NULL,
nonce INTEGER NOT NULL,
code_hash BYTEA NOT NULL,
storage_root VARCHAR(66) NOT NULL,
UNIQUE (state_id)
);
-- +goose Down
DROP TABLE eth.state_accounts;

View File

@ -0,0 +1,6 @@
-- +goose Up
COMMENT ON TABLE public.nodes IS E'@name NodeInfo';
COMMENT ON TABLE eth.transaction_cids IS E'@name EthTransactionCids';
COMMENT ON TABLE eth.header_cids IS E'@name EthHeaderCids';
COMMENT ON COLUMN public.nodes.node_id IS E'@name ChainNodeID';
COMMENT ON COLUMN eth.header_cids.node_id IS E'@name EthNodeID';

View File

@ -0,0 +1,69 @@
-- +goose Up
-- +goose StatementBegin
CREATE FUNCTION eth.graphql_subscription() returns TRIGGER as $$
declare
table_name text = TG_ARGV[0];
attribute text = TG_ARGV[1];
id text;
begin
execute 'select $1.' || quote_ident(attribute)
using new
into id;
perform pg_notify('postgraphile:' || table_name,
json_build_object(
'__node__', json_build_array(
table_name,
id
)
)::text
);
return new;
end;
$$ language plpgsql;
-- +goose StatementEnd
CREATE TRIGGER header_cids_ai
after INSERT ON eth.header_cids
for each row
execute procedure eth.graphql_subscription('header_cids', 'id');
CREATE TRIGGER receipt_cids_ai
after INSERT ON eth.receipt_cids
for each row
execute procedure eth.graphql_subscription('receipt_cids', 'id');
CREATE TRIGGER state_accounts_ai
after INSERT ON eth.state_accounts
for each row
execute procedure eth.graphql_subscription('state_accounts', 'id');
CREATE TRIGGER state_cids_ai
after INSERT ON eth.state_cids
for each row
execute procedure eth.graphql_subscription('state_cids', 'id');
CREATE TRIGGER storage_cids_ai
after INSERT ON eth.storage_cids
for each row
execute procedure eth.graphql_subscription('storage_cids', 'id');
CREATE TRIGGER transaction_cids_ai
after INSERT ON eth.transaction_cids
for each row
execute procedure eth.graphql_subscription('transaction_cids', 'id');
CREATE TRIGGER uncle_cids_ai
after INSERT ON eth.uncle_cids
for each row
execute procedure eth.graphql_subscription('uncle_cids', 'id');
-- +goose Down
DROP TRIGGER uncle_cids_ai ON eth.uncle_cids;
DROP TRIGGER transaction_cids_ai ON eth.transaction_cids;
DROP TRIGGER storage_cids_ai ON eth.storage_cids;
DROP TRIGGER state_cids_ai ON eth.state_cids;
DROP TRIGGER state_accounts_ai ON eth.state_accounts;
DROP TRIGGER receipt_cids_ai ON eth.receipt_cids;
DROP TRIGGER header_cids_ai ON eth.header_cids;
DROP FUNCTION eth.graphql_subscription();

View File

@ -0,0 +1,121 @@
-- +goose Up
-- header indexes
CREATE INDEX block_number_index ON eth.header_cids USING brin (block_number);
CREATE INDEX block_hash_index ON eth.header_cids USING btree (block_hash);
CREATE INDEX header_cid_index ON eth.header_cids USING btree (cid);
CREATE INDEX header_mh_index ON eth.header_cids USING btree (mh_key);
CREATE INDEX state_root_index ON eth.header_cids USING btree (state_root);
CREATE INDEX timestamp_index ON eth.header_cids USING brin (timestamp);
-- transaction indexes
CREATE INDEX tx_header_id_index ON eth.transaction_cids USING btree (header_id);
CREATE INDEX tx_hash_index ON eth.transaction_cids USING btree (tx_hash);
CREATE INDEX tx_cid_index ON eth.transaction_cids USING btree (cid);
CREATE INDEX tx_mh_index ON eth.transaction_cids USING btree (mh_key);
CREATE INDEX tx_dst_index ON eth.transaction_cids USING btree (dst);
CREATE INDEX tx_src_index ON eth.transaction_cids USING btree (src);
-- receipt indexes
CREATE INDEX rct_tx_id_index ON eth.receipt_cids USING btree (tx_id);
CREATE INDEX rct_cid_index ON eth.receipt_cids USING btree (cid);
CREATE INDEX rct_mh_index ON eth.receipt_cids USING btree (mh_key);
CREATE INDEX rct_contract_index ON eth.receipt_cids USING btree (contract);
CREATE INDEX rct_contract_hash_index ON eth.receipt_cids USING btree (contract_hash);
CREATE INDEX rct_topic0_index ON eth.receipt_cids USING gin (topic0s);
CREATE INDEX rct_topic1_index ON eth.receipt_cids USING gin (topic1s);
CREATE INDEX rct_topic2_index ON eth.receipt_cids USING gin (topic2s);
CREATE INDEX rct_topic3_index ON eth.receipt_cids USING gin (topic3s);
CREATE INDEX rct_log_contract_index ON eth.receipt_cids USING gin (log_contracts);
-- state node indexes
CREATE INDEX state_header_id_index ON eth.state_cids USING btree (header_id);
CREATE INDEX state_leaf_key_index ON eth.state_cids USING btree (state_leaf_key);
CREATE INDEX state_cid_index ON eth.state_cids USING btree (cid);
CREATE INDEX state_mh_index ON eth.state_cids USING btree (mh_key);
CREATE INDEX state_path_index ON eth.state_cids USING btree (state_path);
-- storage node indexes
CREATE INDEX storage_state_id_index ON eth.storage_cids USING btree (state_id);
CREATE INDEX storage_leaf_key_index ON eth.storage_cids USING btree (storage_leaf_key);
CREATE INDEX storage_cid_index ON eth.storage_cids USING btree (cid);
CREATE INDEX storage_mh_index ON eth.storage_cids USING btree (mh_key);
CREATE INDEX storage_path_index ON eth.storage_cids USING btree (storage_path);
-- state accounts indexes
CREATE INDEX account_state_id_index ON eth.state_accounts USING btree (state_id);
CREATE INDEX storage_root_index ON eth.state_accounts USING btree (storage_root);
-- +goose Down
-- state account indexes
DROP INDEX eth.storage_root_index;
DROP INDEX eth.account_state_id_index;
-- storage node indexes
DROP INDEX eth.storage_path_index;
DROP INDEX eth.storage_mh_index;
DROP INDEX eth.storage_cid_index;
DROP INDEX eth.storage_leaf_key_index;
DROP INDEX eth.storage_state_id_index;
-- state node indexes
DROP INDEX eth.state_path_index;
DROP INDEX eth.state_mh_index;
DROP INDEX eth.state_cid_index;
DROP INDEX eth.state_leaf_key_index;
DROP INDEX eth.state_header_id_index;
-- receipt indexes
DROP INDEX eth.rct_log_contract_index;
DROP INDEX eth.rct_topic3_index;
DROP INDEX eth.rct_topic2_index;
DROP INDEX eth.rct_topic1_index;
DROP INDEX eth.rct_topic0_index;
DROP INDEX eth.rct_contract_hash_index;
DROP INDEX eth.rct_contract_index;
DROP INDEX eth.rct_mh_index;
DROP INDEX eth.rct_cid_index;
DROP INDEX eth.rct_tx_id_index;
-- transaction indexes
DROP INDEX eth.tx_src_index;
DROP INDEX eth.tx_dst_index;
DROP INDEX eth.tx_mh_index;
DROP INDEX eth.tx_cid_index;
DROP INDEX eth.tx_hash_index;
DROP INDEX eth.tx_header_id_index;
-- header indexes
DROP INDEX eth.timestamp_index;
DROP INDEX eth.state_root_index;
DROP INDEX eth.header_mh_index;
DROP INDEX eth.header_cid_index;
DROP INDEX eth.block_hash_index;
DROP INDEX eth.block_number_index;

View File

@ -0,0 +1,158 @@
-- +goose Up
-- +goose StatementBegin
-- returns if a storage node at the provided path was removed in the range > the provided height and <= the provided block hash
CREATE OR REPLACE FUNCTION was_storage_removed(path BYTEA, height BIGINT, hash VARCHAR(66)) RETURNS BOOLEAN
AS $$
SELECT exists(SELECT 1
FROM eth.storage_cids
INNER JOIN eth.state_cids ON (storage_cids.state_id = state_cids.id)
INNER JOIN eth.header_cids ON (state_cids.header_id = header_cids.id)
WHERE storage_path = path
AND block_number > height
AND block_number <= (SELECT block_number
FROM eth.header_cids
WHERE block_hash = hash)
AND storage_cids.node_type = 3
LIMIT 1);
$$ LANGUAGE SQL;
-- +goose StatementEnd
-- +goose StatementBegin
-- returns if a state node at the provided path was removed in the range > the provided height and <= the provided block hash
CREATE OR REPLACE FUNCTION was_state_removed(path BYTEA, height BIGINT, hash VARCHAR(66)) RETURNS BOOLEAN
AS $$
SELECT exists(SELECT 1
FROM eth.state_cids
INNER JOIN eth.header_cids ON (state_cids.header_id = header_cids.id)
WHERE state_path = path
AND block_number > height
AND block_number <= (SELECT block_number
FROM eth.header_cids
WHERE block_hash = hash)
AND state_cids.node_type = 3
LIMIT 1);
$$ LANGUAGE SQL;
-- +goose StatementEnd
-- +goose StatementBegin
CREATE TYPE child_result AS (
has_child BOOLEAN,
children eth.header_cids[]
);
CREATE OR REPLACE FUNCTION has_child(hash VARCHAR(66), height BIGINT) RETURNS child_result AS
$BODY$
DECLARE
child_height INT;
temp_child eth.header_cids;
new_child_result child_result;
BEGIN
child_height = height + 1;
-- short circuit if there are no children
SELECT exists(SELECT 1
FROM eth.header_cids
WHERE parent_hash = hash
AND block_number = child_height
LIMIT 1)
INTO new_child_result.has_child;
-- collect all the children for this header
IF new_child_result.has_child THEN
FOR temp_child IN
SELECT * FROM eth.header_cids WHERE parent_hash = hash AND block_number = child_height
LOOP
new_child_result.children = array_append(new_child_result.children, temp_child);
END LOOP;
END IF;
RETURN new_child_result;
END
$BODY$
LANGUAGE 'plpgsql';
-- +goose StatementEnd
-- +goose StatementBegin
CREATE OR REPLACE FUNCTION canonical_header_from_array(headers eth.header_cids[]) RETURNS eth.header_cids AS
$BODY$
DECLARE
canonical_header eth.header_cids;
canonical_child eth.header_cids;
header eth.header_cids;
current_child_result child_result;
child_headers eth.header_cids[];
current_header_with_child eth.header_cids;
has_children_count INT DEFAULT 0;
BEGIN
-- for each header in the provided set
FOREACH header IN ARRAY headers
LOOP
-- check if it has any children
current_child_result = has_child(header.block_hash, header.block_number);
IF current_child_result.has_child THEN
-- if it does, take note
has_children_count = has_children_count + 1;
current_header_with_child = header;
-- and add the children to the growing set of child headers
child_headers = array_cat(child_headers, current_child_result.children);
END IF;
END LOOP;
-- if none of the headers had children, none is more canonical than the other
IF has_children_count = 0 THEN
-- return the first one selected
SELECT * INTO canonical_header FROM unnest(headers) LIMIT 1;
-- if only one header had children, it can be considered the heaviest/canonical header of the set
ELSIF has_children_count = 1 THEN
-- return the only header with a child
canonical_header = current_header_with_child;
-- if there are multiple headers with children
ELSE
-- find the canonical header from the child set
canonical_child = canonical_header_from_array(child_headers);
-- the header that is parent to this header, is the canonical header at this level
SELECT * INTO canonical_header FROM unnest(headers)
WHERE block_hash = canonical_child.parent_hash;
END IF;
RETURN canonical_header;
END
$BODY$
LANGUAGE 'plpgsql';
-- +goose StatementEnd
-- +goose StatementBegin
CREATE OR REPLACE FUNCTION canonical_header_id(height BIGINT) RETURNS INTEGER AS
$BODY$
DECLARE
canonical_header eth.header_cids;
headers eth.header_cids[];
header_count INT;
temp_header eth.header_cids;
BEGIN
-- collect all headers at this height
FOR temp_header IN
SELECT * FROM eth.header_cids WHERE block_number = height
LOOP
headers = array_append(headers, temp_header);
END LOOP;
-- count the number of headers collected
header_count = array_length(headers, 1);
-- if we have less than 1 header, return NULL
IF header_count IS NULL OR header_count < 1 THEN
RETURN NULL;
-- if we have one header, return its id
ELSIF header_count = 1 THEN
RETURN headers[1].id;
-- if we have multiple headers we need to determine which one is canonical
ELSE
canonical_header = canonical_header_from_array(headers);
RETURN canonical_header.id;
END IF;
END;
$BODY$
LANGUAGE 'plpgsql';
-- +goose StatementEnd
-- +goose Down
DROP FUNCTION was_storage_removed;
DROP FUNCTION was_state_removed;
DROP FUNCTION canonical_header_id;
DROP FUNCTION canonical_header_from_array;
DROP FUNCTION has_child;
DROP TYPE child_result;

1201
statediff/db/schema.sql Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,57 +0,0 @@
// Copyright 2019 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
/*
Package statediff provides an auxiliary service that processes state diff objects from incoming chain events,
relaying the objects to any rpc subscriptions.
This work is adapted from work by Charles Crain at https://github.com/jpmorganchase/quorum/blob/9b7fd9af8082795eeeb6863d9746f12b82dd5078/statediff/statediff.go
The service is spun up using the below CLI flags
--statediff: boolean flag, turns on the service
--statediff.streamblock: boolean flag, configures the service to associate and stream out the rest of the block data with the state diffs.
--statediff.intermediatenodes: boolean flag, tells service to include intermediate (branch and extension) nodes; default (false) processes leaf nodes only.
--statediff.watchedaddresses: string slice flag, used to limit the state diffing process to the given addresses. Usage: --statediff.watchedaddresses=addr1 --statediff.watchedaddresses=addr2 --statediff.watchedaddresses=addr3
If you wish to use the websocket endpoint to subscribe to the statediff service, be sure to open up the Websocket RPC server with the `--ws` flag. The IPC-RPC server is turned on by default.
The statediffing services works only with `--syncmode="full", but -importantly- does not require garbage collection to be turned off (does not require an archival node).
e.g.
$ ./geth --statediff --statediff.streamblock --ws --syncmode "full"
This starts up the geth node in full sync mode, starts up the statediffing service, and opens up the websocket endpoint to subscribe to the service.
Because the "streamblock" flag has been turned on, the service will strean out block data (headers, transactions, and receipts) along with the diffed state and storage leafs.
Rpc subscriptions to the service can be created using the rpc.Client.Subscribe() method,
with the "statediff" namespace, a statediff.Payload channel, and the name of the statediff api's rpc method- "stream".
e.g.
cli, _ := rpc.Dial("ipcPathOrWsURL")
stateDiffPayloadChan := make(chan statediff.Payload, 20000)
rpcSub, err := cli.Subscribe(context.Background(), "statediff", stateDiffPayloadChan, "stream"})
for {
select {
case stateDiffPayload := <- stateDiffPayloadChan:
processPayload(stateDiffPayload)
case err := <- rpcSub.Err():
log.Error(err)
}
}
*/
package statediff

215
statediff/doc.md Normal file
View File

@ -0,0 +1,215 @@
# Statediff
This package provides an auxiliary service that asynchronously processes state diff objects from chain events,
either relaying the state objects to RPC subscribers or writing them directly to Postgres as IPLD objects.
It also exposes RPC endpoints for fetching or writing to Postgres the state diff at a specific block height
or for a specific block hash, this operates on historical block and state data and so depends on a complete state archive.
Data is emitted in this differential format in order to make it feasible to IPLD-ize and index the *entire* Ethereum state
(including intermediate state and storage trie nodes). If this state diff process is ran continuously from genesis,
the entire state at any block can be materialized from the cumulative differentials up to that point.
## Statediff object
A state diff `StateObject` is the collection of all the state and storage trie nodes that have been updated in a given block.
For convenience, we also associate these nodes with the block number and hash, and optionally the set of code hashes and code for any
contracts deployed in this block.
A complete state diff `StateObject` will include all state and storage intermediate nodes, which is necessary for generating proofs and for
traversing the tries.
```go
// StateObject is a collection of state (and linked storage nodes) as well as the associated block number, block hash,
// and a set of code hashes and their code
type StateObject struct {
BlockNumber *big.Int `json:"blockNumber" gencodec:"required"`
BlockHash common.Hash `json:"blockHash" gencodec:"required"`
Nodes []StateNode `json:"nodes" gencodec:"required"`
CodeAndCodeHashes []CodeAndCodeHash `json:"codeMapping"`
}
// StateNode holds the data for a single state diff node
type StateNode struct {
NodeType NodeType `json:"nodeType" gencodec:"required"`
Path []byte `json:"path" gencodec:"required"`
NodeValue []byte `json:"value" gencodec:"required"`
StorageNodes []StorageNode `json:"storage"`
LeafKey []byte `json:"leafKey"`
}
// StorageNode holds the data for a single storage diff node
type StorageNode struct {
NodeType NodeType `json:"nodeType" gencodec:"required"`
Path []byte `json:"path" gencodec:"required"`
NodeValue []byte `json:"value" gencodec:"required"`
LeafKey []byte `json:"leafKey"`
}
// CodeAndCodeHash struct for holding codehash => code mappings
// we can't use an actual map because they are not rlp serializable
type CodeAndCodeHash struct {
Hash common.Hash `json:"codeHash"`
Code []byte `json:"code"`
}
```
These objects are packed into a `Payload` structure which can additionally associate the `StateObject`
with the block (header, uncles, and transactions), receipts, and total difficulty.
This `Payload` encapsulates all of the differential data at a given block, and allows us to index the entire Ethereum data structure
as hash-linked IPLD objects.
```go
// Payload packages the data to send to state diff subscriptions
type Payload struct {
BlockRlp []byte `json:"blockRlp"`
TotalDifficulty *big.Int `json:"totalDifficulty"`
ReceiptsRlp []byte `json:"receiptsRlp"`
StateObjectRlp []byte `json:"stateObjectRlp" gencodec:"required"`
encoded []byte
err error
}
```
## Usage
This state diffing service runs as an auxiliary service concurrent to the regular syncing process of the geth node.
### CLI configuration
This service introduces a CLI flag namespace `statediff`
`--statediff` flag is used to turn on the service
`--statediff.writing` is used to tell the service to write state diff objects it produces from synced ChainEvents directly to a configured Postgres database
`--statediff.db` is the connection string for the Postgres database to write to
`--statediff.dbnodeid` is the node id to use in the Postgres database
`--statediff.dbclientname` is the client name to use in the Postgres database
The service can only operate in full sync mode (`--syncmode=full`), but only the historical RPC endpoints require an archive node (`--gcmode=archive`)
e.g.
`
./build/bin/geth --syncmode=full --gcmode=archive --statediff --statediff.writing --statediff.db=postgres://localhost:5432/vulcanize_testing?sslmode=disable --statediff.dbnodeid={nodeId} --statediff.dbclientname={dbClientName}
`
### RPC endpoints
The state diffing service exposes both a WS subscription endpoint, and a number of HTTP unary endpoints.
Each of these endpoints requires a set of parameters provided by the caller
```go
// Params is used to carry in parameters from subscribing/requesting clients configuration
type Params struct {
IntermediateStateNodes bool
IntermediateStorageNodes bool
IncludeBlock bool
IncludeReceipts bool
IncludeTD bool
IncludeCode bool
WatchedAddresses []common.Address
WatchedStorageSlots []common.Hash
}
```
Using these params we can tell the service whether to include state and/or storage intermediate nodes; whether
to include the associated block (header, uncles, and transactions); whether to include the associated receipts;
whether to include the total difficulty for this block; whether to include the set of code hashes and code for
contracts deployed in this block; whether to limit the diffing process to a list of specific addresses; and/or
whether to limit the diffing process to a list of specific storage slot keys.
#### Subscription endpoint
A websocket supporting RPC endpoint is exposed for subscribing to state diff `StateObjects` that come off the head of the chain while the geth node syncs.
```go
// Stream is a subscription endpoint that fires off state diff payloads as they are created
Stream(ctx context.Context, params Params) (*rpc.Subscription, error)
```
To expose this endpoint the node needs to have the websocket server turned on (`--ws`),
and the `statediff` namespace exposed (`--ws.api=statediff`).
Go code subscriptions to this endpoint can be created using the `rpc.Client.Subscribe()` method,
with the "statediff" namespace, a `statediff.Payload` channel, and the name of the statediff api's rpc method: "stream".
e.g.
```go
cli, err := rpc.Dial("ipcPathOrWsURL")
if err != nil {
// handle error
}
stateDiffPayloadChan := make(chan statediff.Payload, 20000)
methodName := "stream"
params := statediff.Params{
IncludeBlock: true,
IncludeTD: true,
IncludeReceipts: true,
IntermediateStorageNodes: true,
IntermediateStateNodes: true,
}
rpcSub, err := cli.Subscribe(context.Background(), statediff.APIName, stateDiffPayloadChan, methodName, params)
if err != nil {
// handle error
}
for {
select {
case stateDiffPayload := <- stateDiffPayloadChan:
// process the payload
case err := <- rpcSub.Err():
// handle rpc subscription error
}
}
```
#### Unary endpoints
The service also exposes unary RPC endpoints for retrieving the state diff `StateObject` for a specific block height/hash.
```go
// StateDiffAt returns a state diff payload at the specific blockheight
StateDiffAt(ctx context.Context, blockNumber uint64, params Params) (*Payload, error)
// StateDiffFor returns a state diff payload for the specific blockhash
StateDiffFor(ctx context.Context, blockHash common.Hash, params Params) (*Payload, error)
```
To expose this endpoint the node needs to have the HTTP server turned on (`--http`),
and the `statediff` namespace exposed (`--http.api=statediff`).
### Direct indexing into Postgres
If `--statediff.writing` is set, the service will convert the state diff `StateObject` data into IPLD objects, persist them directly to Postgres,
and generate secondary indexes around the IPLD data.
The schema and migrations for this Postgres database are provided in `statediff/db/`.
#### Postgres setup
We use [pressly/goose](https://github.com/pressly/goose) as our Postgres migration manager.
You can also load the Postgres schema directly into a database using
`psql database_name < schema.sql`
This will only work on a version 12.4 Postgres database.
#### Schema overview
Our Postgres schemas are built around a single IPFS backing Postgres IPLD blockstore table (`public.blocks`) that conforms with [go-ds-sql](https://github.com/ipfs/go-ds-sql/blob/master/postgres/postgres.go).
All IPLD objects are stored in this table, where `key` is the blockstore-prefixed multihash key for the IPLD object and `data` contains
the bytes for the IPLD block (in the case of all Ethereum IPLDs, this is the RLP byte encoding of the Ethereum object).
The IPLD objects in this table can be traversed using an IPLD DAG interface, but since this table only maps multihash to raw IPLD object
it is not particularly useful for searching through the data by looking up Ethereum objects by their constituent fields
(e.g. by block number, tx source/recipient, state/storage trie node path). To improve the accessibility of these objects
we create an Ethereum [advanced data layout](https://github.com/ipld/specs#schemas-and-advanced-data-layouts) (ADL) by generating secondary
indexes on top of the raw IPLDs in other Postgres tables.
These secondary index tables fall under the `eth` schema and follow an `{objectType}_cids` naming convention.
These tables provide a view into individual fields of the underlying Ethereum IPLD objects, allowing lookups on these fields, and reference the raw IPLD objects stored in `public.blocks`
by foreign keys to their multihash keys.
Additionally, these tables maintain the hash-linked nature of Ethereum objects to one another. E.g. a storage trie node entry in the `storage_cids`
table contains a `state_id` foreign key which references the `id` for the `state_cids` entry that contains the state leaf node for the contract that storage node belongs to,
and in turn that `state_cids` entry contains a `header_id` foreign key which references the `id` of the `header_cids` entry that contains the header for the block these state and storage nodes were updated (diffed).
### Optimization
On mainnet this process is extremely IO intensive and requires significant resources to allow it to keep up with the head of the chain.
The state diff processing time for a specific block is dependent on the number and complexity of the state changes that occur in a block and
the number of updated state nodes that are available in the in-memory cache vs must be retrieved from disc.
If memory permits, one means of improving the efficiency of this process is to increase the in-memory trie cache allocation.
This can be done by increasing the overall `--cache` allocation and/or by increasing the % of the cache allocated to trie
usage with `--cache.trie`.