2021-04-09 02:32:04 +00:00
18 changed files with 1912 additions and 58 deletions
--- a/statediff/builder_test.go
+++ b/statediff/builder_test.go
@ -32,7 +32,6 @@ import (
 	sdtypes "github.com/ethereum/go-ethereum/statediff/types"
 )

-// TODO: add test that filters on address
 var (
 	contractLeafKey                                        []byte
 	emptyDiffs                                             = make([]sdtypes.StateNode, 0)
--- a/statediff/db/migrations/00001_create_ipfs_blocks_table.sql
+++ b/statediff/db/migrations/00001_create_ipfs_blocks_table.sql
@ -0,0 +1,8 @@
+-- +goose Up
+CREATE TABLE IF NOT EXISTS public.blocks ( -- IPLD blockstore table: every IPLD object is stored here
+  key TEXT UNIQUE NOT NULL, -- blockstore-prefixed multihash key of the IPLD object; target of the mh_key foreign keys in the eth tables
+  data BYTEA NOT NULL -- raw bytes of the IPLD block
+);
+
+-- +goose Down
+DROP TABLE public.blocks; -- unconditional, unlike the IF NOT EXISTS create above
--- a/statediff/db/migrations/00002_create_nodes_table.sql
+++ b/statediff/db/migrations/00002_create_nodes_table.sql
@ -0,0 +1,13 @@
+-- +goose Up
+CREATE TABLE nodes ( -- not schema-qualified here, while migration 00011 addresses this table as public.nodes
+  id            SERIAL PRIMARY KEY, -- referenced by eth.header_cids.node_id
+  client_name   VARCHAR,
+  genesis_block VARCHAR(66), -- presumably a 0x-prefixed 32-byte hash in hex (2 + 64 characters) - TODO confirm
+  network_id    VARCHAR,
+  node_id       VARCHAR(128),
+  chain_id      INTEGER DEFAULT 1,
+  CONSTRAINT node_uc UNIQUE (genesis_block, network_id, node_id, chain_id) -- one row per distinct node identity; client_name is not part of it
+);
+
+-- +goose Down
+DROP TABLE nodes;
--- a/statediff/db/migrations/00003_create_eth_schema.sql
+++ b/statediff/db/migrations/00003_create_eth_schema.sql
@ -0,0 +1,5 @@
+-- +goose Up
+CREATE SCHEMA eth; -- namespace for the secondary index tables created by the following migrations
+
+-- +goose Down
+DROP SCHEMA eth; -- no CASCADE, so this only succeeds once every object in the schema has been dropped
--- a/statediff/db/migrations/00004_create_eth_header_cids_table.sql
+++ b/statediff/db/migrations/00004_create_eth_header_cids_table.sql
@ -0,0 +1,23 @@
+-- +goose Up
+CREATE TABLE eth.header_cids ( -- secondary index over block headers; the raw IPLD block is stored in public.blocks
+  id                    SERIAL PRIMARY KEY, -- target of the header_id foreign keys in the uncle, transaction and state tables
+  block_number          BIGINT NOT NULL,
+  block_hash            VARCHAR(66) NOT NULL,
+  parent_hash           VARCHAR(66) NOT NULL, -- compared with a parent block_hash by has_child() in migration 00014
+  cid                   TEXT NOT NULL,
+  mh_key                TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- foreign key check is postponed to commit by default
+  td                    NUMERIC NOT NULL, -- presumably the total difficulty - TODO confirm
+  node_id               INTEGER NOT NULL REFERENCES nodes (id) ON DELETE CASCADE, -- not deferrable, unlike the other foreign keys in this table
+  reward                NUMERIC NOT NULL,
+  state_root            VARCHAR(66) NOT NULL,
+  tx_root               VARCHAR(66) NOT NULL,
+  receipt_root          VARCHAR(66) NOT NULL,
+  uncle_root            VARCHAR(66) NOT NULL,
+  bloom                 BYTEA NOT NULL,
+  timestamp             NUMERIC NOT NULL,
+  times_validated       INTEGER NOT NULL DEFAULT 1,
+  UNIQUE (block_number, block_hash) -- several headers may share a block_number as long as their hashes differ
+);
+
+-- +goose Down
+DROP TABLE eth.header_cids;
--- a/statediff/db/migrations/00005_create_eth_uncle_cids_table.sql
+++ b/statediff/db/migrations/00005_create_eth_uncle_cids_table.sql
@ -0,0 +1,14 @@
+-- +goose Up
+CREATE TABLE eth.uncle_cids ( -- secondary index over uncle headers
+  id                    SERIAL PRIMARY KEY,
+  header_id             INTEGER NOT NULL REFERENCES eth.header_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- header row this uncle is attached to; deleting that row deletes the uncle
+  block_hash            VARCHAR(66) NOT NULL,
+  parent_hash           VARCHAR(66) NOT NULL,
+  cid                   TEXT NOT NULL,
+  mh_key                TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- key of the raw IPLD block in public.blocks
+  reward                NUMERIC NOT NULL,
+  UNIQUE (header_id, block_hash) -- an uncle hash may appear at most once per header row
+);
+
+-- +goose Down
+DROP TABLE eth.uncle_cids;
--- a/statediff/db/migrations/00006_create_eth_transaction_cids_table.sql
+++ b/statediff/db/migrations/00006_create_eth_transaction_cids_table.sql
@ -0,0 +1,16 @@
+-- +goose Up
+CREATE TABLE eth.transaction_cids ( -- secondary index over transactions
+  id                    SERIAL PRIMARY KEY, -- referenced by eth.receipt_cids.tx_id
+  header_id             INTEGER NOT NULL REFERENCES eth.header_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- header row of the containing block; deleting that row deletes the transaction
+  tx_hash               VARCHAR(66) NOT NULL,
+  index                 INTEGER NOT NULL, -- presumably the position of the transaction within its block - TODO confirm
+  cid                   TEXT NOT NULL,
+  mh_key                TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- key of the raw IPLD block in public.blocks
+  dst                   VARCHAR(66) NOT NULL, -- presumably the recipient address - TODO confirm
+  src                   VARCHAR(66) NOT NULL, -- presumably the sender address - TODO confirm
+  tx_data               BYTEA, -- nullable
+  UNIQUE (header_id, tx_hash) -- a transaction hash may appear at most once per header row
+);
+
+-- +goose Down
+DROP TABLE eth.transaction_cids;
--- a/statediff/db/migrations/00007_create_eth_receipt_cids_table.sql
+++ b/statediff/db/migrations/00007_create_eth_receipt_cids_table.sql
@ -0,0 +1,20 @@
+-- +goose Up
+CREATE TABLE eth.receipt_cids ( -- secondary index over transaction receipts
+  id                    SERIAL PRIMARY KEY,
+  tx_id                 INTEGER NOT NULL REFERENCES eth.transaction_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- transaction this receipt belongs to; deleting that row deletes the receipt
+  cid                   TEXT NOT NULL,
+  mh_key                TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- key of the raw IPLD block in public.blocks
+  contract              VARCHAR(66), -- nullable
+  contract_hash         VARCHAR(66), -- nullable
+  topic0s               VARCHAR(66)[], -- the four topic arrays and log_contracts get GIN indexes in migration 00013
+  topic1s               VARCHAR(66)[],
+  topic2s               VARCHAR(66)[],
+  topic3s               VARCHAR(66)[],
+  log_contracts         VARCHAR(66)[],
+  post_state            VARCHAR(66), -- nullable
+  post_status           INTEGER, -- nullable
+  UNIQUE (tx_id) -- at most one receipt row per transaction row
+);
+
+-- +goose Down
+DROP TABLE eth.receipt_cids;
--- a/statediff/db/migrations/00008_create_eth_state_cids_table.sql
+++ b/statediff/db/migrations/00008_create_eth_state_cids_table.sql
@ -0,0 +1,15 @@
+-- +goose Up
+CREATE TABLE eth.state_cids ( -- secondary index over state trie nodes
+  id                    BIGSERIAL PRIMARY KEY, -- referenced by eth.storage_cids.state_id and eth.state_accounts.state_id
+  header_id             INTEGER NOT NULL REFERENCES eth.header_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- header row of the block this node was recorded for
+  state_leaf_key        VARCHAR(66), -- nullable
+  cid                   TEXT NOT NULL,
+  mh_key                TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- key of the raw IPLD block in public.blocks
+  state_path            BYTEA, -- nullable; compared with the path argument of was_state_removed() in migration 00014
+  node_type             INTEGER NOT NULL, -- was_state_removed() treats the value 3 as a removed node
+  diff                  BOOLEAN NOT NULL DEFAULT FALSE,
+  UNIQUE (header_id, state_path) -- at most one node per path per header row
+);
+
+-- +goose Down
+DROP TABLE eth.state_cids;
--- a/statediff/db/migrations/00009_create_eth_storage_cids_table.sql
+++ b/statediff/db/migrations/00009_create_eth_storage_cids_table.sql
@ -0,0 +1,15 @@
+-- +goose Up
+CREATE TABLE eth.storage_cids ( -- secondary index over storage trie nodes
+  id                    BIGSERIAL PRIMARY KEY,
+  state_id              BIGINT NOT NULL REFERENCES eth.state_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- state_cids row this storage node hangs off; deleting that row deletes the storage node
+  storage_leaf_key      VARCHAR(66), -- nullable
+  cid                   TEXT NOT NULL,
+  mh_key                TEXT NOT NULL REFERENCES public.blocks (key) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- key of the raw IPLD block in public.blocks
+  storage_path          BYTEA, -- nullable; compared with the path argument of was_storage_removed() in migration 00014
+  node_type             INTEGER NOT NULL, -- was_storage_removed() treats the value 3 as a removed node
+  diff                  BOOLEAN NOT NULL DEFAULT FALSE,
+  UNIQUE (state_id, storage_path) -- at most one node per path per state_cids row
+);
+
+-- +goose Down
+DROP TABLE eth.storage_cids;
--- a/statediff/db/migrations/00010_create_eth_state_accouts_table.sql
+++ b/statediff/db/migrations/00010_create_eth_state_accouts_table.sql
@ -0,0 +1,13 @@
+-- +goose Up
+CREATE TABLE eth.state_accounts ( -- account fields attached to a state_cids row
+  id                    SERIAL PRIMARY KEY, -- NOTE(review): 32-bit SERIAL, while the referenced eth.state_cids.id is BIGSERIAL - confirm this is wide enough
+  state_id              BIGINT NOT NULL REFERENCES eth.state_cids (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, -- state_cids row this account belongs to; deleting that row deletes the account
+  balance               NUMERIC NOT NULL,
+  nonce                 INTEGER NOT NULL, -- NOTE(review): 32-bit signed column - confirm the writer never supplies a larger nonce
+  code_hash             BYTEA NOT NULL, -- stored as raw bytes, unlike storage_root which is stored as text
+  storage_root          VARCHAR(66) NOT NULL,
+  UNIQUE (state_id) -- at most one account row per state_cids row
+);
+
+-- +goose Down
+DROP TABLE eth.state_accounts;
--- a/statediff/db/migrations/00011_create_postgraphile_comments.sql
+++ b/statediff/db/migrations/00011_create_postgraphile_comments.sql
@ -0,0 +1,6 @@
+-- +goose Up
+COMMENT ON TABLE public.nodes IS E'@name NodeInfo'; -- presumably a PostGraphile smart comment that renames the object in the generated API - TODO confirm
+COMMENT ON TABLE eth.transaction_cids IS E'@name EthTransactionCids';
+COMMENT ON TABLE eth.header_cids IS E'@name EthHeaderCids';
+COMMENT ON COLUMN public.nodes.node_id IS E'@name ChainNodeID'; -- both node_id columns are given distinct names
+COMMENT ON COLUMN eth.header_cids.node_id IS E'@name EthNodeID'; -- NOTE(review): this migration has no Down section, so rolling it back leaves the comments in place
--- a/statediff/db/migrations/00012_potgraphile_triggers.sql
+++ b/statediff/db/migrations/00012_potgraphile_triggers.sql
@ -0,0 +1,69 @@
+-- +goose Up
+-- +goose StatementBegin
+CREATE FUNCTION eth.graphql_subscription() returns TRIGGER as $$ -- trigger function: sends one pg_notify message for the row that fired the trigger
+declare
+    table_name text = TG_ARGV[0]; -- first trigger argument: used in the channel name and in the payload
+    attribute text = TG_ARGV[1]; -- second trigger argument: name of the column to read from the new row
+    id text;
+begin
+    execute 'select $1.' || quote_ident(attribute) -- dynamic SQL because the column name is only known at run time; quote_ident guards the identifier
+        using new
+        into id;
+    perform pg_notify('postgraphile:' || table_name, -- channel name is the literal prefix followed by the table name
+                      json_build_object(
+                              '__node__', json_build_array( -- payload is a JSON object whose __node__ key holds the array [table_name, id]
+                              table_name,
+                              id
+                          )
+                          )::text
+        );
+    return new; -- the triggers in this migration fire after insert, where the returned row is ignored
+end;
+$$ language plpgsql;
+-- +goose StatementEnd
+
+CREATE TRIGGER header_cids_ai
+    after INSERT ON eth.header_cids
+    for each row
+    execute procedure eth.graphql_subscription('header_cids', 'id'); -- arguments: table name to publish, then the column whose value is sent
+
+CREATE TRIGGER receipt_cids_ai
+    after INSERT ON eth.receipt_cids
+    for each row
+    execute procedure eth.graphql_subscription('receipt_cids', 'id');
+
+CREATE TRIGGER state_accounts_ai
+    after INSERT ON eth.state_accounts
+    for each row
+    execute procedure eth.graphql_subscription('state_accounts', 'id');
+
+CREATE TRIGGER state_cids_ai
+    after INSERT ON eth.state_cids
+    for each row
+    execute procedure eth.graphql_subscription('state_cids', 'id');
+
+CREATE TRIGGER storage_cids_ai
+    after INSERT ON eth.storage_cids
+    for each row
+    execute procedure eth.graphql_subscription('storage_cids', 'id');
+
+CREATE TRIGGER transaction_cids_ai
+    after INSERT ON eth.transaction_cids
+    for each row
+    execute procedure eth.graphql_subscription('transaction_cids', 'id');
+
+CREATE TRIGGER uncle_cids_ai
+    after INSERT ON eth.uncle_cids
+    for each row
+    execute procedure eth.graphql_subscription('uncle_cids', 'id');
+
+-- +goose Down
+DROP TRIGGER uncle_cids_ai ON eth.uncle_cids; -- triggers are dropped in reverse order of creation
+DROP TRIGGER transaction_cids_ai ON eth.transaction_cids;
+DROP TRIGGER storage_cids_ai ON eth.storage_cids;
+DROP TRIGGER state_cids_ai ON eth.state_cids;
+DROP TRIGGER state_accounts_ai ON eth.state_accounts;
+DROP TRIGGER receipt_cids_ai ON eth.receipt_cids;
+DROP TRIGGER header_cids_ai ON eth.header_cids;
+
+DROP FUNCTION eth.graphql_subscription(); -- dropped last, after every trigger that calls it is gone
--- a/statediff/db/migrations/00013_create_cid_indexes.sql
+++ b/statediff/db/migrations/00013_create_cid_indexes.sql
@ -0,0 +1,121 @@
+-- +goose Up
+-- header indexes
+CREATE INDEX block_number_index ON eth.header_cids USING brin (block_number); -- BRIN index; presumably chosen because rows arrive in roughly ascending block order - TODO confirm
+
+CREATE INDEX block_hash_index ON eth.header_cids USING btree (block_hash);
+
+CREATE INDEX header_cid_index ON eth.header_cids USING btree (cid);
+
+CREATE INDEX header_mh_index ON eth.header_cids USING btree (mh_key);
+
+CREATE INDEX state_root_index ON eth.header_cids USING btree (state_root);
+
+CREATE INDEX timestamp_index ON eth.header_cids USING brin (timestamp);
+
+-- transaction indexes
+CREATE INDEX tx_header_id_index ON eth.transaction_cids USING btree (header_id);
+
+CREATE INDEX tx_hash_index ON eth.transaction_cids USING btree (tx_hash);
+
+CREATE INDEX tx_cid_index ON eth.transaction_cids USING btree (cid);
+
+CREATE INDEX tx_mh_index ON eth.transaction_cids USING btree (mh_key);
+
+CREATE INDEX tx_dst_index ON eth.transaction_cids USING btree (dst);
+
+CREATE INDEX tx_src_index ON eth.transaction_cids USING btree (src);
+
+-- receipt indexes
+CREATE INDEX rct_tx_id_index ON eth.receipt_cids USING btree (tx_id); -- NOTE(review): UNIQUE (tx_id) on the table already creates an index on this column - confirm this one is needed
+
+CREATE INDEX rct_cid_index ON eth.receipt_cids USING btree (cid);
+
+CREATE INDEX rct_mh_index ON eth.receipt_cids USING btree (mh_key);
+
+CREATE INDEX rct_contract_index ON eth.receipt_cids USING btree (contract);
+
+CREATE INDEX rct_contract_hash_index ON eth.receipt_cids USING btree (contract_hash);
+
+CREATE INDEX rct_topic0_index ON eth.receipt_cids USING gin (topic0s); -- GIN because the indexed column is an array
+
+CREATE INDEX rct_topic1_index ON eth.receipt_cids USING gin (topic1s);
+
+CREATE INDEX rct_topic2_index ON eth.receipt_cids USING gin (topic2s);
+
+CREATE INDEX rct_topic3_index ON eth.receipt_cids USING gin (topic3s);
+
+CREATE INDEX rct_log_contract_index ON eth.receipt_cids USING gin (log_contracts);
+
+-- state node indexes
+CREATE INDEX state_header_id_index ON eth.state_cids USING btree (header_id);
+
+CREATE INDEX state_leaf_key_index ON eth.state_cids USING btree (state_leaf_key);
+
+CREATE INDEX state_cid_index ON eth.state_cids USING btree (cid);
+
+CREATE INDEX state_mh_index ON eth.state_cids USING btree (mh_key);
+
+CREATE INDEX state_path_index ON eth.state_cids USING btree (state_path);
+
+-- storage node indexes
+CREATE INDEX storage_state_id_index ON eth.storage_cids USING btree (state_id);
+
+CREATE INDEX storage_leaf_key_index ON eth.storage_cids USING btree (storage_leaf_key);
+
+CREATE INDEX storage_cid_index ON eth.storage_cids USING btree (cid);
+
+CREATE INDEX storage_mh_index ON eth.storage_cids USING btree (mh_key);
+
+CREATE INDEX storage_path_index ON eth.storage_cids USING btree (storage_path);
+
+-- state accounts indexes
+CREATE INDEX account_state_id_index ON eth.state_accounts USING btree (state_id); -- NOTE(review): UNIQUE (state_id) on the table already creates an index on this column - confirm this one is needed
+
+CREATE INDEX storage_root_index ON eth.state_accounts USING btree (storage_root);
+
+-- +goose Down
+-- state account indexes
+DROP INDEX eth.storage_root_index; -- an index lives in the schema of its table, hence the eth prefix although the names were created unqualified
+DROP INDEX eth.account_state_id_index;
+
+-- storage node indexes
+DROP INDEX eth.storage_path_index;
+DROP INDEX eth.storage_mh_index;
+DROP INDEX eth.storage_cid_index;
+DROP INDEX eth.storage_leaf_key_index;
+DROP INDEX eth.storage_state_id_index;
+
+-- state node indexes
+DROP INDEX eth.state_path_index;
+DROP INDEX eth.state_mh_index;
+DROP INDEX eth.state_cid_index;
+DROP INDEX eth.state_leaf_key_index;
+DROP INDEX eth.state_header_id_index;
+
+-- receipt indexes
+DROP INDEX eth.rct_log_contract_index;
+DROP INDEX eth.rct_topic3_index;
+DROP INDEX eth.rct_topic2_index;
+DROP INDEX eth.rct_topic1_index;
+DROP INDEX eth.rct_topic0_index;
+DROP INDEX eth.rct_contract_hash_index;
+DROP INDEX eth.rct_contract_index;
+DROP INDEX eth.rct_mh_index;
+DROP INDEX eth.rct_cid_index;
+DROP INDEX eth.rct_tx_id_index;
+
+-- transaction indexes
+DROP INDEX eth.tx_src_index;
+DROP INDEX eth.tx_dst_index;
+DROP INDEX eth.tx_mh_index;
+DROP INDEX eth.tx_cid_index;
+DROP INDEX eth.tx_hash_index;
+DROP INDEX eth.tx_header_id_index;
+
+-- header indexes
+DROP INDEX eth.timestamp_index;
+DROP INDEX eth.state_root_index;
+DROP INDEX eth.header_mh_index;
+DROP INDEX eth.header_cid_index;
+DROP INDEX eth.block_hash_index;
+DROP INDEX eth.block_number_index; -- last to go: the drops mirror the creation order in reverse
--- a/statediff/db/migrations/00014_create_stored_functions.sql
+++ b/statediff/db/migrations/00014_create_stored_functions.sql
@ -0,0 +1,158 @@
+-- +goose Up
+-- +goose StatementBegin
+-- was_storage_removed reports whether a removed storage node (node_type = 3) was recorded at the provided path
+-- in any block above the provided height, up to and including the height of the block with the provided hash
+CREATE OR REPLACE FUNCTION was_storage_removed(path BYTEA, height BIGINT, hash VARCHAR(66)) RETURNS BOOLEAN
+AS $$
+SELECT EXISTS (
+  SELECT 1
+  FROM eth.header_cids AS hdr
+    JOIN eth.state_cids AS st ON (st.header_id = hdr.id)
+    JOIN eth.storage_cids AS stg ON (stg.state_id = st.id)
+  WHERE stg.node_type = 3
+    AND stg.storage_path = path
+    AND hdr.block_number > height
+    -- upper bound: the height of the header identified by the provided hash
+    AND hdr.block_number <= (SELECT upper_hdr.block_number
+                             FROM eth.header_cids AS upper_hdr
+                             WHERE upper_hdr.block_hash = hash)
+  LIMIT 1
+);
+$$ LANGUAGE SQL;
+-- +goose StatementEnd
+
+-- +goose StatementBegin
+-- was_state_removed reports whether a removed state node (node_type = 3) was recorded at the provided path
+-- in any block above the provided height, up to and including the height of the block with the provided hash
+CREATE OR REPLACE FUNCTION was_state_removed(path BYTEA, height BIGINT, hash VARCHAR(66)) RETURNS BOOLEAN
+AS $$
+SELECT EXISTS (
+  SELECT 1
+  FROM eth.header_cids AS hdr
+    JOIN eth.state_cids AS st ON (st.header_id = hdr.id)
+  WHERE st.node_type = 3
+    AND st.state_path = path
+    AND hdr.block_number > height
+    -- upper bound: the height of the header identified by the provided hash
+    AND hdr.block_number <= (SELECT upper_hdr.block_number
+                             FROM eth.header_cids AS upper_hdr
+                             WHERE upper_hdr.block_hash = hash)
+  LIMIT 1
+);
+$$ LANGUAGE SQL;
+-- +goose StatementEnd
+
+-- +goose StatementBegin
+-- child_result is the return type of has_child()
+CREATE TYPE child_result AS (
+    has_child BOOLEAN,          -- true when at least one child header exists
+    children eth.header_cids[]  -- the child header rows; left NULL when has_child is false
+);
+
+-- has_child reports whether the header identified by (hash, height) has any child headers,
+-- i.e. rows in eth.header_cids at height + 1 whose parent_hash equals hash, and returns those rows
+CREATE OR REPLACE FUNCTION has_child(hash VARCHAR(66), height BIGINT) RETURNS child_result AS
+$BODY$
+DECLARE
+  -- BIGINT to match the height parameter and header_cids.block_number;
+  -- a 32-bit INT here makes the assignment below fail with "integer out of range" for large heights
+  child_height BIGINT;
+  temp_child eth.header_cids;
+  new_child_result child_result;
+BEGIN
+  child_height = height + 1;
+  -- short circuit if there are no children
+  SELECT exists(SELECT 1
+              FROM eth.header_cids
+              WHERE parent_hash = hash
+                AND block_number = child_height
+              LIMIT 1)
+  INTO new_child_result.has_child;
+  -- collect all the children for this header
+  IF new_child_result.has_child THEN
+    FOR temp_child IN
+    SELECT * FROM eth.header_cids WHERE parent_hash = hash AND block_number = child_height
+    LOOP
+      -- array_append on the initially NULL children field yields a one-element array
+      new_child_result.children = array_append(new_child_result.children, temp_child);
+    END LOOP;
+  END IF;
+RETURN new_child_result;
+END
+$BODY$
+LANGUAGE 'plpgsql';
+-- +goose StatementEnd
+
+-- +goose StatementBegin
+CREATE OR REPLACE FUNCTION canonical_header_from_array(headers eth.header_cids[]) RETURNS eth.header_cids AS
+$BODY$
+DECLARE
+  canonical_header eth.header_cids; -- the result; chosen by one of the three branches below
+  canonical_child eth.header_cids; -- canonical header among the children, found by the recursive call
+  header eth.header_cids; -- loop variable over the provided array
+  current_child_result child_result;
+  child_headers eth.header_cids[]; -- all children of all provided headers, passed to the recursive call
+  current_header_with_child eth.header_cids; -- last provided header seen with a child; only read when exactly one has children
+  has_children_count INT DEFAULT 0; -- number of provided headers that have at least one child
+BEGIN
+  -- for each header in the provided set
+  FOREACH header IN ARRAY headers
+  LOOP
+    -- check if it has any children
+    current_child_result = has_child(header.block_hash, header.block_number);
+    IF current_child_result.has_child THEN
+      -- if it does, take note
+      has_children_count = has_children_count + 1;
+      current_header_with_child = header;
+      -- and add the children to the growing set of child headers
+      child_headers = array_cat(child_headers, current_child_result.children);
+    END IF;
+  END LOOP;
+  -- if none of the headers had children, none is more canonical than the other
+  IF has_children_count = 0 THEN
+    -- return the first one selected
+    SELECT * INTO canonical_header FROM unnest(headers) LIMIT 1; -- no ORDER BY: takes whichever element unnest yields first
+  -- if only one header had children, it can be considered the heaviest/canonical header of the set
+  ELSIF has_children_count = 1 THEN
+    -- return the only header with a child
+    canonical_header = current_header_with_child;
+  -- if there are multiple headers with children
+  ELSE
+    -- find the canonical header from the child set
+    canonical_child = canonical_header_from_array(child_headers); -- each recursive call looks one block height further up and stops once fewer than two headers have children
+    -- the header that is parent to this header, is the canonical header at this level
+    SELECT * INTO canonical_header FROM unnest(headers)
+    WHERE block_hash = canonical_child.parent_hash;
+  END IF;
+  RETURN canonical_header;
+END
+$BODY$
+LANGUAGE 'plpgsql';
+-- +goose StatementEnd
+
+-- +goose StatementBegin
+CREATE OR REPLACE FUNCTION canonical_header_id(height BIGINT) RETURNS INTEGER AS
+$BODY$
+DECLARE
+  canonical_header eth.header_cids; -- only used when several headers exist at this height
+  headers eth.header_cids[]; -- starts out NULL; array_append on a NULL array yields a one-element array
+  header_count INT;
+  temp_header eth.header_cids; -- loop variable for the collection query
+BEGIN
+  -- collect all headers at this height
+  FOR temp_header IN
+  SELECT * FROM eth.header_cids WHERE block_number = height
+  LOOP
+    headers = array_append(headers, temp_header);
+  END LOOP;
+  -- count the number of headers collected
+  header_count = array_length(headers, 1); -- array_length returns NULL rather than 0 when no header was collected
+  -- if we have less than 1 header, return NULL
+  IF header_count IS NULL OR header_count < 1 THEN
+    RETURN NULL;
+  -- if we have one header, return its id
+  ELSIF header_count = 1 THEN
+    RETURN headers[1].id;
+  -- if we have multiple headers we need to determine which one is canonical
+  ELSE
+    canonical_header = canonical_header_from_array(headers); -- chooses between the competing headers by inspecting their children
+    RETURN canonical_header.id;
+  END IF;
+END;
+$BODY$
+LANGUAGE 'plpgsql';
+-- +goose StatementEnd
+
+-- +goose Down
+DROP FUNCTION was_storage_removed; -- no argument list given, so each function name here must be unique in its schema
+DROP FUNCTION was_state_removed;
+DROP FUNCTION canonical_header_id;
+DROP FUNCTION canonical_header_from_array;
+DROP FUNCTION has_child; -- must go before child_result, which is its return type
+DROP TYPE child_result;
--- a/statediff/db/schema.sql
+++ b/statediff/db/schema.sql
--- a/statediff/doc.go
+++ b/statediff/doc.go
@ -1,57 +0,0 @@
-// Copyright 2019 The go-ethereum Authors
-// This file is part of the go-ethereum library.
-//
-// The go-ethereum library is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Lesser General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// The go-ethereum library is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Lesser General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public License
-// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
-
-/*
-Package statediff provides an auxiliary service that processes state diff objects from incoming chain events,
-relaying the objects to any rpc subscriptions.
-
-This work is adapted from work by Charles Crain at https://github.com/jpmorganchase/quorum/blob/9b7fd9af8082795eeeb6863d9746f12b82dd5078/statediff/statediff.go
-
-The service is spun up using the below CLI flags
--statediff: boolean flag, turns on the service
--statediff.streamblock: boolean flag, configures the service to associate and stream out the rest of the block data with the state diffs.
--statediff.intermediatenodes: boolean flag, tells service to include intermediate (branch and extension) nodes; default (false) processes leaf nodes only.
--statediff.watchedaddresses: string slice flag, used to limit the state diffing process to the given addresses. Usage: --statediff.watchedaddresses=addr1 --statediff.watchedaddresses=addr2 --statediff.watchedaddresses=addr3
-
-If you wish to use the websocket endpoint to subscribe to the statediff service, be sure to open up the Websocket RPC server with the `--ws` flag. The IPC-RPC server is turned on by default.
-
-The statediffing services works only with `--syncmode="full", but -importantly- does not require garbage collection to be turned off (does not require an archival node).
-
-e.g.
-
-$ ./geth --statediff --statediff.streamblock --ws --syncmode "full"
-
-This starts up the geth node in full sync mode, starts up the statediffing service, and opens up the websocket endpoint to subscribe to the service.
-Because the "streamblock" flag has been turned on, the service will strean out block data (headers, transactions, and receipts) along with the diffed state and storage leafs.
-
-Rpc subscriptions to the service can be created using the rpc.Client.Subscribe() method,
-with the "statediff" namespace, a statediff.Payload channel, and the name of the statediff api's rpc method- "stream".
-
-e.g.
-
-cli, _ := rpc.Dial("ipcPathOrWsURL")
-stateDiffPayloadChan := make(chan statediff.Payload, 20000)
-rpcSub, err := cli.Subscribe(context.Background(), "statediff", stateDiffPayloadChan, "stream"})
-for {
-	select {
-	case stateDiffPayload := <- stateDiffPayloadChan:
-		processPayload(stateDiffPayload)
-	case err := <- rpcSub.Err():
-		log.Error(err)
-	}
-}
-*/
-package statediff
--- a/statediff/doc.md
+++ b/statediff/doc.md
@ -0,0 +1,215 @@
+# Statediff
+
+This package provides an auxiliary service that asynchronously processes state diff objects from chain events,
+either relaying the state objects to RPC subscribers or writing them directly to Postgres as IPLD objects.
+
+It also exposes RPC endpoints for fetching, or writing to Postgres, the state diff at a specific block height
+or for a specific block hash. These endpoints operate on historical block and state data and so depend on a complete state archive.
+
+Data is emitted in this differential format in order to make it feasible to IPLD-ize and index the *entire* Ethereum state
+(including intermediate state and storage trie nodes). If this state diff process is run continuously from genesis,
+the entire state at any block can be materialized from the cumulative differentials up to that point.
+
+## Statediff object
+A state diff `StateObject` is the collection of all the state and storage trie nodes that have been updated in a given block.
+For convenience, we also associate these nodes with the block number and hash, and optionally the set of code hashes and code for any
+contracts deployed in this block.
+
+A complete state diff `StateObject` will include all state and storage intermediate nodes, which is necessary for generating proofs and for
+traversing the tries.
+
+```go
+// StateObject is a collection of state (and linked storage nodes) as well as the associated block number, block hash,
+// and a set of code hashes and their code
+type StateObject struct {
+	BlockNumber       *big.Int                `json:"blockNumber"     gencodec:"required"`
+	BlockHash         common.Hash             `json:"blockHash"       gencodec:"required"`
+	Nodes             []StateNode             `json:"nodes"           gencodec:"required"`
+	CodeAndCodeHashes []CodeAndCodeHash       `json:"codeMapping"`
+}
+
+// StateNode holds the data for a single state diff node
+type StateNode struct {
+	NodeType     NodeType      `json:"nodeType"        gencodec:"required"`
+	Path         []byte        `json:"path"            gencodec:"required"`
+	NodeValue    []byte        `json:"value"           gencodec:"required"`
+	StorageNodes []StorageNode `json:"storage"`
+	LeafKey      []byte        `json:"leafKey"`
+}
+
+// StorageNode holds the data for a single storage diff node
+type StorageNode struct {
+	NodeType  NodeType `json:"nodeType"        gencodec:"required"`
+	Path      []byte   `json:"path"            gencodec:"required"`
+	NodeValue []byte   `json:"value"           gencodec:"required"`
+	LeafKey   []byte   `json:"leafKey"`
+}
+
+// CodeAndCodeHash struct for holding codehash => code mappings
+// we can't use an actual map because they are not rlp serializable
+type CodeAndCodeHash struct {
+	Hash common.Hash `json:"codeHash"`
+	Code []byte      `json:"code"`
+}
+```
+These objects are packed into a `Payload` structure which can additionally associate the `StateObject`
+with the block (header, uncles, and transactions), receipts, and total difficulty.
+This `Payload` encapsulates all of the differential data at a given block, and allows us to index the entire Ethereum data structure
+as hash-linked IPLD objects.
+
+```go
+// Payload packages the data to send to state diff subscriptions
+type Payload struct {
+	BlockRlp        []byte   `json:"blockRlp"`
+	TotalDifficulty *big.Int `json:"totalDifficulty"`
+	ReceiptsRlp     []byte   `json:"receiptsRlp"`
+	StateObjectRlp  []byte   `json:"stateObjectRlp"    gencodec:"required"`
+
+	encoded []byte
+	err     error
+}
+```
+
+## Usage
+This state diffing service runs as an auxiliary service concurrent to the regular syncing process of the geth node.
+
+
+### CLI configuration
+This service introduces a CLI flag namespace `statediff`
+
+- `--statediff` flag is used to turn on the service
+- `--statediff.writing` is used to tell the service to write state diff objects it produces from synced ChainEvents directly to a configured Postgres database
+- `--statediff.db` is the connection string for the Postgres database to write to
+- `--statediff.dbnodeid` is the node id to use in the Postgres database
+- `--statediff.dbclientname` is the client name to use in the Postgres database
+
+The service can only operate in full sync mode (`--syncmode=full`), but only the historical RPC endpoints require an archive node (`--gcmode=archive`)
+
+e.g.
+```bash
+./build/bin/geth --syncmode=full --gcmode=archive --statediff --statediff.writing --statediff.db=postgres://localhost:5432/vulcanize_testing?sslmode=disable --statediff.dbnodeid={nodeId} --statediff.dbclientname={dbClientName}
+```
+
+### RPC endpoints
+The state diffing service exposes both a WS subscription endpoint, and a number of HTTP unary endpoints.
+
+Each of these endpoints requires a set of parameters provided by the caller
+
+```go
+// Params is used to carry in parameters from subscribing/requesting clients configuration
+type Params struct {
+	IntermediateStateNodes   bool
+	IntermediateStorageNodes bool
+	IncludeBlock             bool
+	IncludeReceipts          bool
+	IncludeTD                bool
+	IncludeCode              bool
+	WatchedAddresses         []common.Address
+	WatchedStorageSlots      []common.Hash
+}
+```
+
+Using these params we can tell the service whether to include state and/or storage intermediate nodes; whether
+to include the associated block (header, uncles, and transactions); whether to include the associated receipts;
+whether to include the total difficulty for this block; whether to include the set of code hashes and code for
+contracts deployed in this block; whether to limit the diffing process to a list of specific addresses; and/or
+whether to limit the diffing process to a list of specific storage slot keys.
+
+#### Subscription endpoint
+A websocket supporting RPC endpoint is exposed for subscribing to state diff `StateObjects` that come off the head of the chain while the geth node syncs.
+
+```go
+// Stream is a subscription endpoint that fires off state diff payloads as they are created
+Stream(ctx context.Context, params Params) (*rpc.Subscription, error)
+```
+
+To expose this endpoint the node needs to have the websocket server turned on (`--ws`),
+and the `statediff` namespace exposed (`--ws.api=statediff`).
+
+Go code subscriptions to this endpoint can be created using the `rpc.Client.Subscribe()` method,
+with the "statediff" namespace, a `statediff.Payload` channel, and the name of the statediff api's rpc method: "stream".
+
+e.g.
+
+```go
+
+cli, err := rpc.Dial("ipcPathOrWsURL")
+if err != nil {
+	// handle error
+}
+stateDiffPayloadChan := make(chan statediff.Payload, 20000)
+methodName := "stream"
+params := statediff.Params{
+    IncludeBlock:             true,
+    IncludeTD:                true,
+    IncludeReceipts:          true,
+    IntermediateStorageNodes: true,
+    IntermediateStateNodes:   true,
+}
+rpcSub, err := cli.Subscribe(context.Background(), statediff.APIName, stateDiffPayloadChan, methodName, params)
+if err != nil {
+	// handle error
+}
+for {
+	select {
+	case stateDiffPayload := <- stateDiffPayloadChan:
+		// process the payload
+	case err := <- rpcSub.Err():
+		// handle rpc subscription error
+	}
+}
+```
+
+#### Unary endpoints
+The service also exposes unary RPC endpoints for retrieving the state diff `StateObject` for a specific block height/hash.
+```go
+// StateDiffAt returns a state diff payload at the specific blockheight
+StateDiffAt(ctx context.Context, blockNumber uint64, params Params) (*Payload, error)
+
+// StateDiffFor returns a state diff payload for the specific blockhash
+StateDiffFor(ctx context.Context, blockHash common.Hash, params Params) (*Payload, error)
+```
+
+To expose these endpoints the node needs to have the HTTP server turned on (`--http`),
+and the `statediff` namespace exposed (`--http.api=statediff`).
+
+### Direct indexing into Postgres
+If `--statediff.writing` is set, the service will convert the state diff `StateObject` data into IPLD objects, persist them directly to Postgres,
+and generate secondary indexes around the IPLD data.
+
+The schema and migrations for this Postgres database are provided in `statediff/db/`.
+
+#### Postgres setup
+We use [pressly/goose](https://github.com/pressly/goose) as our Postgres migration manager.
+You can also load the Postgres schema directly into a database using
+
+`psql database_name < schema.sql`
+
+This will only work on a version 12.4 Postgres database.
+
+#### Schema overview
+Our Postgres schemas are built around a single IPFS backing Postgres IPLD blockstore table (`public.blocks`) that conforms with [go-ds-sql](https://github.com/ipfs/go-ds-sql/blob/master/postgres/postgres.go).
+All IPLD objects are stored in this table, where `key` is the blockstore-prefixed multihash key for the IPLD object and `data` contains
+the bytes for the IPLD block (in the case of all Ethereum IPLDs, this is the RLP byte encoding of the Ethereum object).
+
+The IPLD objects in this table can be traversed using an IPLD DAG interface, but since this table only maps multihash to raw IPLD object
+it is not particularly useful for searching through the data by looking up Ethereum objects by their constituent fields
+(e.g. by block number, tx source/recipient, state/storage trie node path). To improve the accessibility of these objects
+we create an Ethereum [advanced data layout](https://github.com/ipld/specs#schemas-and-advanced-data-layouts) (ADL) by generating secondary
+indexes on top of the raw IPLDs in other Postgres tables.
+
+These secondary index tables fall under the `eth` schema and follow an `{objectType}_cids` naming convention.
+These tables provide a view into individual fields of the underlying Ethereum IPLD objects, allowing lookups on these fields, and reference the raw IPLD objects stored in `public.blocks`
+by foreign keys to their multihash keys.
+Additionally, these tables maintain the hash-linked nature of Ethereum objects to one another. E.g. a storage trie node entry in the `storage_cids`
+table contains a `state_id` foreign key which references the `id` for the `state_cids` entry that contains the state leaf node for the contract that storage node belongs to,
+and in turn that `state_cids` entry contains a `header_id` foreign key which references the `id` of the `header_cids` entry that contains the header for the block in which these state and storage nodes were updated (diffed).
+
+### Optimization
+On mainnet this process is extremely IO intensive and requires significant resources to allow it to keep up with the head of the chain.
+The state diff processing time for a specific block is dependent on the number and complexity of the state changes that occur in a block and
+the number of updated state nodes that are available in the in-memory cache versus the number that must be retrieved from disk.
+
+If memory permits, one means of improving the efficiency of this process is to increase the in-memory trie cache allocation.
+This can be done by increasing the overall `--cache` allocation and/or by increasing the % of the cache allocated to trie
+usage with `--cache.trie`.