From be544a34245b658c8d4a350e77f1916656c395f8 Mon Sep 17 00:00:00 2001
From: prathamesh0 <42446521+prathamesh0@users.noreply.github.com>
Date: Wed, 17 Aug 2022 15:14:14 +0530
Subject: [PATCH] Add helper scripts for data dump correction (#57)

* Add a script to find bad data in CSV file dumps

* Add a script to delete bad rows from CSV file dumps

* Add instructions to run the scripts

* Reorganize instructions
---
 README.md                  | 16 +++++++++
 scripts/README.md          | 73 ++++++++++++++++++++++++++++++++++++++
 scripts/filter-bad-rows.sh | 29 +++++++++++++++
 scripts/find-bad-rows.sh   | 43 ++++++++++++++++++++++
 4 files changed, 161 insertions(+)
 create mode 100644 scripts/README.md
 create mode 100755 scripts/filter-bad-rows.sh
 create mode 100755 scripts/find-bad-rows.sh

diff --git a/README.md b/README.md
index b354d55..9f5fb72 100644
--- a/README.md
+++ b/README.md
@@ -191,3 +191,19 @@ Config format:
 ```
 
 * NOTE: `COPY` command on CSVs inserts empty strings as `NULL` in the DB. Passing `FORCE_NOT_NULL <column-name>` forces it to insert empty strings instead. This is required to maintain compatibility of the imported snapshot data with the data generated by statediffing. Reference: https://www.postgresql.org/docs/14/sql-copy.html
+
+### Troubleshooting
+
+* Run the following command to find any rows (in data dumps in `file` mode) having unexpected number of columns:
+
+  ```bash
+  ./scripts/find-bad-rows.sh -i <input-file> -c <expected-columns> -o [output-file] -d true
+  ```
+
+* Run the following command to select rows (from data dumps in `file` mode) other than the ones having unexpected number of columns:
+
+  ```bash
+  ./scripts/filter-bad-rows.sh -i <input-file> -c <expected-columns> -o <output-file>
+  ```
+
+* See [scripts](./scripts) for more details.
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..a0f0f43
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,73 @@
+## Data Validation
+
+* For a given table in the `ipld-eth-db` schema, we know the number of columns to be expected in each row in the data dump:
+
+  | Table             | Expected columns |
+  | ----------------- |:----------------:|
+  | public.nodes      | 5                |
+  | public.blocks     | 3                |
+  | eth.header_cids   | 16               |
+  | eth.state_cids    | 8                |
+  | eth.storage_cids  | 9                |
+
+### Find Bad Data
+
+* Run the following command to find any rows having unexpected number of columns:
+
+  ```bash
+  ./scripts/find-bad-rows.sh -i <input-file> -c <expected-columns> -o [output-file] -d [include-data]
+  ```
+
+  * `input-file` `-i`: Input data file path
+  * `expected-columns` `-c`: Expected number of columns in each row of the input file
+  * `output-file` `-o`: Output destination file path (default: `STDOUT`)
+  * `include-data` `-d`: Whether to include the data row in the output (`true | false`) (default: `false`)
+  * The output is of format: row number, number of columns, the data row
+
+  Eg:
+
+  ```bash
+  ./scripts/find-bad-rows.sh -i eth.state_cids.csv -c 8 -o res.txt -d true
+  ```
+
+  Output:
+
+  ```
+  1 9 1500000,xxxxxxxx,0x83952d392f9b0059eea94b10d1a095eefb1943ea91595a16c6698757127d4e1c,,baglacgzasvqcntdahkxhufdnkm7a22s2eetj6mx6nzkarwxtkvy4x3bubdgq,\x0f,0,f,/blocks/,DMQJKYBGZRQDVLT2CRWVGPQNNJNCCJU7GL7G4VAI3LZVK4OL5Q2ARTI
+  ```
+
+  Eg:
+
+  ```bash
+  ./scripts/find-bad-rows.sh -i public.nodes.csv -c 5 -o res.txt -d true
+  ./scripts/find-bad-rows.sh -i public.blocks.csv -c 3 -o res.txt -d true
+  ./scripts/find-bad-rows.sh -i eth.header_cids.csv -c 16 -o res.txt -d true
+  ./scripts/find-bad-rows.sh -i eth.state_cids.csv -c 8 -o res.txt -d true
+  ./scripts/find-bad-rows.sh -i eth.storage_cids.csv -c 9 -o res.txt -d true
+  ```
+
+## Data Cleanup
+
+* In case of column count mismatch, data from `file` mode dumps can't be imported readily into `ipld-eth-db`.
+ +### Filter Bad Data + +* Run the following command to filter out rows having unexpected number of columns: + + ```bash + ./scripts/filter-bad-rows.sh -i -c -o + ``` + + * `input-file` `-i`: Input data file path + * `expected-columns` `-c`: Expected number of columns in each row of the input file + * `output-file` `-o`: Output destination file path + + Eg: + + ```bash + ./scripts/filter-bad-rows.sh -i public.nodes.csv -c 5 -o cleaned-public.nodes.csv + ./scripts/filter-bad-rows.sh -i public.blocks.csv -c 3 -o cleaned-public.blocks.csv + ./scripts/filter-bad-rows.sh -i eth.header_cids.csv -c 16 -o cleaned-eth.header_cids.csv + ./scripts/filter-bad-rows.sh -i eth.state_cids.csv -c 8 -o cleaned-eth.state_cids.csv + ./scripts/filter-bad-rows.sh -i eth.storage_cids.csv -c 9 -o cleaned-eth.storage_cids.csv + ``` diff --git a/scripts/filter-bad-rows.sh b/scripts/filter-bad-rows.sh new file mode 100755 index 0000000..5904a77 --- /dev/null +++ b/scripts/filter-bad-rows.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# flags +# -i : Input data file path +# -c : Expected number of columns in each row of the input file +# -o [output-file]: Output destination file path + +# eg: ./scripts/filter-bad-rows.sh -i eth.state_cids.csv -c 8 -o cleaned-eth.state_cids.csv + +while getopts i:c:o: OPTION +do + case "${OPTION}" in + i) inputFile=${OPTARG};; + c) expectedColumns=${OPTARG};; + o) outputFile=${OPTARG};; + esac +done + +timestamp=$(date +%s) + +# select only rows having expected number of columns +if [ -z "${outputFile}" ]; then + echo "Invalid destination file arg (-o) ${outputFile}" +else + awk -F"," "NF==${expectedColumns}" ${inputFile} > ${outputFile} +fi + +difference=$(($(date +%s)-timestamp)) +echo Time taken: $(date -d@${difference} -u +%H:%M:%S) diff --git a/scripts/find-bad-rows.sh b/scripts/find-bad-rows.sh new file mode 100755 index 0000000..a3a6b62 --- /dev/null +++ b/scripts/find-bad-rows.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# flags +# -i : Input data file path +# -c : 
Expected number of columns in each row of the input file +# -o [output-file]: Output destination file path (default: STDOUT) +# -d [include-data]: Whether to include the data row in output (true | false) (default: false) + +# eg: ./scripts/find-bad-rows.sh -i eth.state_cids.csv -c 8 -o res.txt -d true +# output: 1 9 1500000,xxxxxxxx,0x83952d392f9b0059eea94b10d1a095eefb1943ea91595a16c6698757127d4e1c,, +# baglacgzasvqcntdahkxhufdnkm7a22s2eetj6mx6nzkarwxtkvy4x3bubdgq,\x0f,0,f,/blocks/, +# DMQJKYBGZRQDVLT2CRWVGPQNNJNCCJU7GL7G4VAI3LZVK4OL5Q2ARTI + +while getopts i:c:o:d: OPTION +do + case "${OPTION}" in + i) inputFile=${OPTARG};; + c) expectedColumns=${OPTARG};; + o) outputFile=${OPTARG};; + d) data=${OPTARG};; + esac +done + +timestamp=$(date +%s) + +# if data requested, dump row number, number of columns and the row +if [ "${data}" = true ] ; then + if [ -z "${outputFile}" ]; then + awk -F"," "NF!=${expectedColumns} {print NR, NF, \$0}" < ${inputFile} + else + awk -F"," "NF!=${expectedColumns} {print NR, NF, \$0}" < ${inputFile} > ${outputFile} + fi +# else, dump only row number, number of columns +else + if [ -z "${outputFile}" ]; then + awk -F"," "NF!=${expectedColumns} {print NR, NF}" < ${inputFile} + else + awk -F"," "NF!=${expectedColumns} {print NR, NF}" < ${inputFile} > ${outputFile} + fi +fi + +difference=$(($(date +%s)-timestamp)) +echo Time taken: $(date -d@${difference} -u +%H:%M:%S)