From 3f79c2b81172f224513868980c72765d981dce8b Mon Sep 17 00:00:00 2001 From: prathamesh0 <42446521+prathamesh0@users.noreply.github.com> Date: Wed, 19 Apr 2023 12:22:13 +0530 Subject: [PATCH] [WIP] Handle restarts in fixturenet-eth stack (#324) * Use mounted volumes for data in geth nodes * Use mounted volumes for data in lighthouse nodes * Avoid resetting genesis time in a lighthouse node on restart * Mount parent datadir for lighthouse nodes * Trap signals on shutdown and clean up in lighthouse nodes * Allow stalled sync in lighthouse beacon nodes * Gracefully shutdown geth nodes * Add clean up instructions * Gracefully shutdown lighthouse boot node Former-commit-id: 3130af161539c701e46b4ef8a8af5b503ce0433b --- .../compose/docker-compose-fixturenet-eth.yml | 17 ++++++++++ .../cerc-fixturenet-eth-geth/Dockerfile | 2 +- .../genesis/accounts/import_keys.sh | 2 +- .../cerc-fixturenet-eth-geth/run-el.sh | 30 ++++++++++++++++-- .../genesis/cl/beacon_node.sh | 5 +-- .../genesis/cl/bootnode.sh | 18 ++++++++--- .../genesis/cl/reset_genesis_time.sh | 2 +- .../genesis/cl/validator_client.sh | 2 +- .../cerc-fixturenet-eth-lighthouse/run-cl.sh | 31 ++++++++++++++----- app/data/stacks/fixturenet-eth/README.md | 28 +++++++++++++++-- 10 files changed, 113 insertions(+), 24 deletions(-) diff --git a/app/data/compose/docker-compose-fixturenet-eth.yml b/app/data/compose/docker-compose-fixturenet-eth.yml index 508543e..2024cac 100644 --- a/app/data/compose/docker-compose-fixturenet-eth.yml +++ b/app/data/compose/docker-compose-fixturenet-eth.yml @@ -8,6 +8,8 @@ services: environment: RUN_BOOTNODE: "true" image: cerc/fixturenet-eth-geth:local + volumes: + - fixturenet_eth_bootnode_geth_data:/root/ethdata ports: - "9898" - "30303" @@ -26,6 +28,7 @@ services: image: cerc/fixturenet-eth-geth:local volumes: - fixturenet_geth_accounts:/opt/testnet/build/el + - fixturenet_eth_geth_1_data:/root/ethdata healthcheck: test: ["CMD", "nc", "-v", "localhost", "8545"] interval: 30s @@ -52,12 +55,16 @@ services: image: cerc/fixturenet-eth-geth:local depends_on: - fixturenet-eth-bootnode-geth + volumes: + - fixturenet_eth_geth_2_data:/root/ethdata fixturenet-eth-bootnode-lighthouse: hostname: fixturenet-eth-bootnode-lighthouse environment: RUN_BOOTNODE: "true" image: cerc/fixturenet-eth-lighthouse:local + volumes: + - fixturenet_eth_bootnode_lighthouse_data:/opt/testnet/build/cl fixturenet-eth-lighthouse-1: hostname: fixturenet-eth-lighthouse-1 @@ -74,6 +81,8 @@ services: ETH1_ENDPOINT: "http://fixturenet-eth-geth-1:8545" EXECUTION_ENDPOINT: "http://fixturenet-eth-geth-1:8551" image: cerc/fixturenet-eth-lighthouse:local + volumes: + - fixturenet_eth_lighthouse_1_data:/opt/testnet/build/cl depends_on: fixturenet-eth-bootnode-lighthouse: condition: service_started @@ -98,6 +107,8 @@ services: EXECUTION_ENDPOINT: "http://fixturenet-eth-geth-2:8551" LIGHTHOUSE_GENESIS_STATE_URL: "http://fixturenet-eth-lighthouse-1:8001/eth/v2/debug/beacon/states/0" image: cerc/fixturenet-eth-lighthouse:local + volumes: + - fixturenet_eth_lighthouse_2_data:/opt/testnet/build/cl depends_on: fixturenet-eth-bootnode-lighthouse: condition: service_started @@ -106,3 +117,9 @@ services: volumes: fixturenet_geth_accounts: + fixturenet_eth_bootnode_geth_data: + fixturenet_eth_geth_1_data: + fixturenet_eth_geth_2_data: + fixturenet_eth_bootnode_lighthouse_data: + fixturenet_eth_lighthouse_1_data: + fixturenet_eth_lighthouse_2_data: diff --git a/app/data/container-build/cerc-fixturenet-eth-geth/Dockerfile b/app/data/container-build/cerc-fixturenet-eth-geth/Dockerfile index 51dbbcd..63c3c0a 100644 --- a/app/data/container-build/cerc-fixturenet-eth-geth/Dockerfile +++ b/app/data/container-build/cerc-fixturenet-eth-geth/Dockerfile @@ -22,6 +22,6 @@ COPY run-el.sh /opt/testnet/run.sh RUN cd /opt/testnet && make genesis-el COPY --from=geth /usr/local/bin/geth /usr/local/bin/ -RUN geth init /opt/testnet/build/el/geth.json && rm -f ~/.ethereum/geth/nodekey +RUN geth --datadir ~/ethdata init /opt/testnet/build/el/geth.json && rm -f ~/ethdata/geth/nodekey ENTRYPOINT ["/opt/testnet/run.sh"] diff --git a/app/data/container-build/cerc-fixturenet-eth-geth/genesis/accounts/import_keys.sh b/app/data/container-build/cerc-fixturenet-eth-geth/genesis/accounts/import_keys.sh index 719b9f5..e8dce0e 100755 --- a/app/data/container-build/cerc-fixturenet-eth-geth/genesis/accounts/import_keys.sh +++ b/app/data/container-build/cerc-fixturenet-eth-geth/genesis/accounts/import_keys.sh @@ -12,6 +12,6 @@ for line in `cat ../build/el/accounts.csv`; do echo "" echo "$ADDRESS" - geth account import --password .pw.$$ .key.$$ + geth account import --datadir=~/ethdata --password .pw.$$ .key.$$ rm -f .pw.$$ .key.$$ done diff --git a/app/data/container-build/cerc-fixturenet-eth-geth/run-el.sh b/app/data/container-build/cerc-fixturenet-eth-geth/run-el.sh index 12e5946..526c76d 100755 --- a/app/data/container-build/cerc-fixturenet-eth-geth/run-el.sh +++ b/app/data/container-build/cerc-fixturenet-eth-geth/run-el.sh @@ -18,17 +18,35 @@ if [ "true" == "$CERC_REMOTE_DEBUG" ] && [ -x "/usr/local/bin/dlv" ]; then START_CMD="/usr/local/bin/dlv --listen=:40000 --headless=true --api-version=2 --accept-multiclient exec /usr/local/bin/geth --continue --" fi +# See https://linuxconfig.org/how-to-propagate-a-signal-to-child-processes-from-a-bash-script +cleanup() { + echo "Signal received, cleaning up..." + + # Kill the child process first (CERC_REMOTE_DEBUG=true uses dlv which starts geth as a child process) + pkill -P ${geth_pid} + sleep 2 + kill $(jobs -p) + + wait + echo "Done" +} +trap 'cleanup' SIGINT SIGTERM + if [ "true" == "$RUN_BOOTNODE" ]; then $START_CMD \ + --datadir=~/ethdata \ --nodekeyhex="${BOOTNODE_KEY}" \ --nodiscover \ --ipcdisable \ --networkid=${NETWORK_ID} \ - --netrestrict="${NETRESTRICT}" + --netrestrict="${NETRESTRICT}" \ + & + + geth_pid=$! else cd /opt/testnet/accounts ./import_keys.sh - + echo -n "$JWT" > /opt/testnet/build/el/jwtsecret if [ "$CERC_RUN_STATEDIFF" == "detect" ] && [ -n "$CERC_STATEDIFF_DB_HOST" ]; then @@ -74,6 +92,7 @@ else fi $START_CMD \ + --datadir=~/ethdata \ --bootnodes="${ENODE}" \ --allow-insecure-unlock \ --http \ @@ -101,5 +120,10 @@ else --metrics.addr="0.0.0.0" \ --verbosity=${CERC_GETH_VERBOSITY:-3} \ --vmodule="${CERC_GETH_VMODULE:-statediff/*=5}" \ - --miner.etherbase="${ETHERBASE}" ${STATEDIFF_OPTS} + --miner.etherbase="${ETHERBASE}" ${STATEDIFF_OPTS} \ + & + + geth_pid=$! fi + +wait $geth_pid diff --git a/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/beacon_node.sh b/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/beacon_node.sh index 6fa1d8d..1f90615 100755 --- a/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/beacon_node.sh +++ b/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/beacon_node.sh @@ -19,9 +19,9 @@ http_port=8001 authrpc_port=8551 exec lighthouse \ - --debug-level $DEBUG_LEVEL \ bn \ $SUBSCRIBE_ALL_SUBNETS \ + --debug-level $DEBUG_LEVEL \ --boot-nodes "$ENR" \ --datadir $data_dir \ --testnet-dir $TESTNET_DIR \ @@ -38,4 +38,5 @@ exec lighthouse \ --execution-jwt $JWTSECRET \ --terminal-total-difficulty-override $ETH1_TTD \ --suggested-fee-recipient $SUGGESTED_FEE_RECIPIENT \ - --target-peers $((BN_COUNT - 1)) + --target-peers $((BN_COUNT - 1)) \ + --http-allow-sync-stalled \ diff --git a/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/bootnode.sh b/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/bootnode.sh index f6f5cc7..a395f41 100755 --- a/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/bootnode.sh +++ b/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/bootnode.sh @@ -21,14 +21,22 @@ if [ ! -f "$DATADIR/bootnode/enr.dat" ]; then --udp-port $BOOTNODE_PORT \ --tcp-port $BOOTNODE_PORT \ --genesis-fork-version $GENESIS_FORK_VERSION \ - --output-dir $DATADIR/bootnode + --output-dir $DATADIR/bootnode-temp - bootnode_enr=`cat $DATADIR/bootnode/enr.dat` - echo "- $bootnode_enr" > $TESTNET_DIR/boot_enr.yaml - - echo "Generated bootnode enr and written to $TESTNET_DIR/boot_enr.yaml" + # Output ENR to a temp dir and mv as "lcli generate-bootnode-enr" will not overwrite an empty dir (mounted volume) + mkdir -p $DATADIR/bootnode + mv $DATADIR/bootnode-temp/* $DATADIR/bootnode + rm -r $DATADIR/bootnode-temp + + echo "Generated bootnode enr" +else + echo "Found existing bootnode enr" fi +bootnode_enr=`cat $DATADIR/bootnode/enr.dat` +echo "- $bootnode_enr" > $TESTNET_DIR/boot_enr.yaml +echo "Written bootnode enr to $TESTNET_DIR/boot_enr.yaml" + exec lighthouse boot_node \ --testnet-dir $TESTNET_DIR \ --port $BOOTNODE_PORT \ diff --git a/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/reset_genesis_time.sh b/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/reset_genesis_time.sh index a8cc2e2..f21a8ee 100755 --- a/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/reset_genesis_time.sh +++ b/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/reset_genesis_time.sh @@ -16,4 +16,4 @@ lcli \ $TESTNET_DIR/genesis.ssz \ $NOW -echo "Reset genesis time to now ($NOW)" +echo "Reset genesis time to ($NOW)" diff --git a/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/validator_client.sh b/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/validator_client.sh index c7f0dba..30168f8 100755 --- a/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/validator_client.sh +++ b/app/data/container-build/cerc-fixturenet-eth-lighthouse/genesis/cl/validator_client.sh @@ -21,9 +21,9 @@ while getopts "pd:" flag; do done exec lighthouse \ - --debug-level $DEBUG_LEVEL \ vc \ $BUILDER_PROPOSALS \ + --debug-level $DEBUG_LEVEL \ --validators-dir $DATADIR/node_$NODE_NUMBER/validators \ --secrets-dir $DATADIR/node_$NODE_NUMBER/secrets \ --testnet-dir $TESTNET_DIR \ diff --git a/app/data/container-build/cerc-fixturenet-eth-lighthouse/run-cl.sh b/app/data/container-build/cerc-fixturenet-eth-lighthouse/run-cl.sh index ee5fc55..ec0e9ae 100755 --- a/app/data/container-build/cerc-fixturenet-eth-lighthouse/run-cl.sh +++ b/app/data/container-build/cerc-fixturenet-eth-lighthouse/run-cl.sh @@ -1,12 +1,25 @@ #!/bin/bash -if [ "true" == "$RUN_BOOTNODE" ]; then +# See https://linuxconfig.org/how-to-propagate-a-signal-to-child-processes-from-a-bash-script +cleanup() { + echo "Signal received, cleaning up..." + kill $(jobs -p) + + wait + echo "Done" +} +trap 'cleanup' SIGINT SIGTERM + +if [ "true" == "$RUN_BOOTNODE" ]; then cd /opt/testnet/build/cl python3 -m http.server 3000 & cd /opt/testnet/cl - ./bootnode.sh 2>&1 | tee /var/log/lighthouse_bootnode.log + ./bootnode.sh 2>&1 | tee /var/log/lighthouse_bootnode.log & + bootnode_pid=$! + + wait $bootnode_pid else while [ 1 -eq 1 ]; do echo "Waiting on geth ..." @@ -25,7 +38,12 @@ else cd /opt/testnet/cl if [ -z "$LIGHTHOUSE_GENESIS_STATE_URL" ]; then - ./reset_genesis_time.sh + # Check if beacon node data exists to avoid resetting genesis time on a restart + if [ -d /opt/testnet/build/cl/node_"$NODE_NUMBER"/beacon ]; then + echo "Skipping genesis time reset" + else + ./reset_genesis_time.sh + fi else while [ 1 -eq 1 ]; do echo "Waiting on Genesis time ..." @@ -54,10 +72,9 @@ else echo -n "$JWT" > $JWTSECRET ./beacon_node.sh 2>&1 | tee /var/log/lighthouse_bn.log & - lpid=$! + beacon_pid=$! ./validator_client.sh 2>&1 | tee /var/log/lighthouse_vc.log & - vpid=$! + validator_pid=$! - wait $lpid $vpid + wait $beacon_pid $validator_pid fi - diff --git a/app/data/stacks/fixturenet-eth/README.md b/app/data/stacks/fixturenet-eth/README.md index 3aa41de..86404af 100644 --- a/app/data/stacks/fixturenet-eth/README.md +++ b/app/data/stacks/fixturenet-eth/README.md @@ -3,15 +3,18 @@ Instructions for deploying a local a geth + lighthouse blockchain "fixturenet" for development and testing purposes using laconic-stack-orchestrator (the installation of which is covered [here](https://github.com/cerc-io/stack-orchestrator#user-mode)): ## Clone required repositories + ``` $ laconic-so --stack fixturenet-eth setup-repositories ``` ## Build the fixturenet-eth containers + ``` $ laconic-so --stack fixturenet-eth build-containers ``` -This should create several container images in the local image registry: + +This should create several container images in the local image registry: * cerc/go-ethereum * cerc/lighthouse @@ -19,6 +22,7 @@ This should create several container images in the local image registry: * cerc/fixturenet-eth-lighthouse ## Deploy the stack + ``` $ laconic-so --stack fixturenet-eth deploy up ``` @@ -57,7 +61,7 @@ Several other containers can used with the basic `fixturenet-eth`: * `eth-probe` (captures eth1 tx gossip) * `keycloak` (nginx proxy with keycloak auth for API authentication) * `tx-spammer` (generates and sends automated transactions to the fixturenet) - + It is not necessary to use them all at once, but a complete example follows: ``` @@ -99,4 +103,22 @@ keycloak-db-1 0.0.0.0:55850->5432/tcp keycloak-nginx-1 0.0.0.0:55859->80/tcp migrations-1 tx-spammer-1 -``` \ No newline at end of file +``` + +## Clean up + +Stop all services running in the background: + +```bash +$ laconic-so --stack fixturenet-eth deploy down +``` + +Clear volumes created by this stack: + +```bash +# List all relevant volumes +$ docker volume ls -q --filter "name=.*fixturenet_eth_bootnode_geth_data|.*fixturenet_eth_bootnode_lighthouse_data|.*fixturenet_eth_geth_1_data|.*fixturenet_eth_geth_2_data|.*fixturenet_eth_lighthouse_1_data|.*fixturenet_eth_lighthouse_2_data|.*fixturenet_geth_accounts" + +# Remove all the listed volumes +$ docker volume rm $(docker volume ls -q --filter "name=.*fixturenet_eth_bootnode_geth_data|.*fixturenet_eth_bootnode_lighthouse_data|.*fixturenet_eth_geth_1_data|.*fixturenet_eth_geth_2_data|.*fixturenet_eth_lighthouse_1_data|.*fixturenet_eth_lighthouse_2_data|.*fixturenet_geth_accounts") +```