From be544a34245b658c8d4a350e77f1916656c395f8 Mon Sep 17 00:00:00 2001
From: prathamesh0 <42446521+prathamesh0@users.noreply.github.com>
Date: Wed, 17 Aug 2022 15:14:14 +0530
Subject: [PATCH] Add helper scripts for data dump correction (#57)

* Add a script to find bad data in CSV file dumps

* Add a script to delete bad rows from CSV file dumps

* Add instructions to run the scripts

* Reorganize instructions
---
 README.md                  | 16 +++++++++
 scripts/README.md          | 73 ++++++++++++++++++++++++++++++++++++++
 scripts/filter-bad-rows.sh | 29 +++++++++++++++
 scripts/find-bad-rows.sh   | 43 ++++++++++++++++++++++
 4 files changed, 161 insertions(+)
 create mode 100644 scripts/README.md
 create mode 100755 scripts/filter-bad-rows.sh
 create mode 100755 scripts/find-bad-rows.sh

diff --git a/README.md b/README.md
index b354d55..9f5fb72 100644
--- a/README.md
+++ b/README.md
@@ -191,3 +191,19 @@ Config format:
 ```
 
 * NOTE: `COPY` command on CSVs inserts empty strings as `NULL` in the DB. Passing `FORCE_NOT_NULL <column-name>` forces it to insert empty strings instead. This is required to maintain compatibility of the imported snapshot data with the data generated by statediffing. Reference: https://www.postgresql.org/docs/14/sql-copy.html
+
+### Troubleshooting
+
+* Run the following command to find any rows (in data dumps in `file` mode) having unexpected number of columns:
+
+  ```bash
+  ./scripts/find-bad-rows.sh -i <input-file> -c <expected-columns> -o [output-file] -d true
+  ```
+
+* Run the following command to select rows (from data dumps in `file` mode) other than the ones having unexpected number of columns:
+
+  ```bash
+  ./scripts/filter-bad-rows.sh -i <input-file> -c <expected-columns> -o <output-file>
+  ```
+
+* See [scripts](./scripts) for more details.
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..a0f0f43
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,73 @@
+## Data Validation
+
+* For a given table in the `ipld-eth-db` schema, we know the number of columns to be expected in each row in the data dump:
+
+  | Table             | Expected columns |
+  | ----------------- |:----------------:|
+  | public.nodes      | 5                |
+  | public.blocks     | 3                |
+  | eth.header_cids   | 16               |
+  | eth.state_cids    | 8                |
+  | eth.storage_cids  | 9                |
+
+### Find Bad Data
+
+* Run the following command to find any rows having unexpected number of columns:
+
+  ```bash
+  ./scripts/find-bad-rows.sh -i <input-file> -c <expected-columns> -o [output-file] -d [include-data]
+  ```
+
+  * `input-file` `-i`: Input data file path
+  * `expected-columns` `-c`: Expected number of columns in each row of the input file
+  * `output-file` `-o`: Output destination file path (default: `STDOUT`)
+  * `include-data` `-d`: Whether to include the data row in the output (`true | false`) (default: `false`)
+  * The output is of format: row number, number of columns, the data row
+
+  Eg:
+
+  ```bash
+  ./scripts/find-bad-rows.sh -i eth.state_cids.csv -c 8 -o res.txt -d true
+  ```
+
+  Output:
+
+  ```
+  1 9 1500000,xxxxxxxx,0x83952d392f9b0059eea94b10d1a095eefb1943ea91595a16c6698757127d4e1c,,baglacgzasvqcntdahkxhufdnkm7a22s2eetj6mx6nzkarwxtkvy4x3bubdgq,\x0f,0,f,/blocks/,DMQJKYBGZRQDVLT2CRWVGPQNNJNCCJU7GL7G4VAI3LZVK4OL5Q2ARTI
+  ```
+
+  Eg:
+
+  ```bash
+  ./scripts/find-bad-rows.sh -i public.nodes.csv -c 5 -o res.txt -d true
+  ./scripts/find-bad-rows.sh -i public.blocks.csv -c 3 -o res.txt -d true
+  ./scripts/find-bad-rows.sh -i eth.header_cids.csv -c 16 -o res.txt -d true
+  ./scripts/find-bad-rows.sh -i eth.state_cids.csv -c 8 -o res.txt -d true
+  ./scripts/find-bad-rows.sh -i eth.storage_cids.csv -c 9 -o res.txt -d true
+  ```
+
+## Data Cleanup
+
+* In case of column count mismatch, data from `file` mode dumps can't be imported readily into `ipld-eth-db`.
+ +### Filter Bad Data + +* Run the following command to filter out rows having unexpected number of columns: + + ```bash + ./scripts/filter-bad-rows.sh -i -c -o + ``` + + * `input-file` `-i`: Input data file path + * `expected-columns` `-c`: Expected number of columns in each row of the input file + * `output-file` `-o`: Output destination file path + + Eg: + + ```bash + ./scripts/filter-bad-rows.sh -i public.nodes.csv -c 5 -o cleaned-public.nodes.csv + ./scripts/filter-bad-rows.sh -i public.blocks.csv -c 3 -o cleaned-public.blocks.csv + ./scripts/filter-bad-rows.sh -i eth.header_cids.csv -c 16 -o cleaned-eth.header_cids.csv + ./scripts/filter-bad-rows.sh -i eth.state_cids.csv -c 8 -o cleaned-eth.state_cids.csv + ./scripts/filter-bad-rows.sh -i eth.storage_cids.csv -c 9 -o cleaned-eth.storage_cids.csv + ``` diff --git a/scripts/filter-bad-rows.sh b/scripts/filter-bad-rows.sh new file mode 100755 index 0000000..5904a77 --- /dev/null +++ b/scripts/filter-bad-rows.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# flags +# -i : Input data file path +# -c : Expected number of columns in each row of the input file +# -o [output-file]: Output destination file path + +# eg: ./scripts/filter-bad-rows.sh -i eth.state_cids.csv -c 8 -o cleaned-eth.state_cids.csv + +while getopts i:c:o: OPTION +do + case "${OPTION}" in + i) inputFile=${OPTARG};; + c) expectedColumns=${OPTARG};; + o) outputFile=${OPTARG};; + esac +done + +timestamp=$(date +%s) + +# select only rows having expected number of columns +if [ -z "${outputFile}" ]; then + echo "Invalid destination file arg (-o) ${outputFile}" +else + awk -F"," "NF==${expectedColumns}" ${inputFile} > ${outputFile} +fi + +difference=$(($(date +%s)-timestamp)) +echo Time taken: $(date -d@${difference} -u +%H:%M:%S) diff --git a/scripts/find-bad-rows.sh b/scripts/find-bad-rows.sh new file mode 100755 index 0000000..a3a6b62 --- /dev/null +++ b/scripts/find-bad-rows.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# flags +# -i : Input data file path +# -c : 
Expected number of columns in each row of the input file +# -o [output-file]: Output destination file path (default: STDOUT) +# -d [include-data]: Whether to include the data row in output (true | false) (default: false) + +# eg: ./scripts/find-bad-rows.sh -i eth.state_cids.csv -c 8 -o res.txt -d true +# output: 1 9 1500000,xxxxxxxx,0x83952d392f9b0059eea94b10d1a095eefb1943ea91595a16c6698757127d4e1c,, +# baglacgzasvqcntdahkxhufdnkm7a22s2eetj6mx6nzkarwxtkvy4x3bubdgq,\x0f,0,f,/blocks/, +# DMQJKYBGZRQDVLT2CRWVGPQNNJNCCJU7GL7G4VAI3LZVK4OL5Q2ARTI + +while getopts i:c:o:d: OPTION +do + case "${OPTION}" in + i) inputFile=${OPTARG};; + c) expectedColumns=${OPTARG};; + o) outputFile=${OPTARG};; + d) data=${OPTARG};; + esac +done + +timestamp=$(date +%s) + +# if data requested, dump row number, number of columns and the row +if [ "${data}" = true ] ; then + if [ -z "${outputFile}" ]; then + awk -F"," "NF!=${expectedColumns} {print NR, NF, \$0}" < ${inputFile} + else + awk -F"," "NF!=${expectedColumns} {print NR, NF, \$0}" < ${inputFile} > ${outputFile} + fi +# else, dump only row number, number of columns +else + if [ -z "${outputFile}" ]; then + awk -F"," "NF!=${expectedColumns} {print NR, NF}" < ${inputFile} + else + awk -F"," "NF!=${expectedColumns} {print NR, NF}" < ${inputFile} > ${outputFile} + fi +fi + +difference=$(($(date +%s)-timestamp)) +echo Time taken: $(date -d@${difference} -u +%H:%M:%S)