From 8d6e50b3aed54c939015ac36b7380bf592ed90cc Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 19:27:59 -0500 Subject: [PATCH] Use whitelist approach for etcd cleanup Instead of trying to delete specific stale resources (blacklist), keep only the valuable data (caddy TLS certs) and delete everything else. This is more robust as we don't need to maintain a list of all possible stale resources. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 110 +++++++++++------------ 1 file changed, 53 insertions(+), 57 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index cd34b138..fa5b4141 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -115,81 +115,58 @@ def _get_etcd_host_path_from_kind_config(config_file: str) -> Optional[str]: return None -def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: - """Clear stale CNI resources from persisted etcd to allow cluster recreation. +def _clean_etcd_keeping_certs(etcd_path: str) -> bool: + """Clean persisted etcd, keeping only TLS certificates. When etcd is persisted and a cluster is recreated, kind tries to install - CNI (kindnet) fresh but the persisted etcd already has those resources, - causing 'AlreadyExists' errors. This function clears those stale resources. + resources fresh but they already exist. Instead of trying to delete + specific stale resources (blacklist), we keep only the valuable data + (caddy TLS certs) and delete everything else (whitelist approach). - Returns True if resources were cleared, False if no action needed. + Returns True if cleanup succeeded, False if no action needed or failed. """ db_path = Path(etcd_path) / "member" / "snap" / "db" - # Check existence with sudo since etcd dir is often root-owned + # Check existence - etcd dir is often root-owned so use shell test check_result = subprocess.run(f"test -f {db_path}", shell=True, capture_output=True) if check_result.returncode != 0: if opts.o.debug: - print(f"No etcd snapshot at {db_path}, skipping CNI cleanup") + print(f"No etcd snapshot at {db_path}, skipping cleanup") return False if opts.o.debug: - print(f"Clearing stale CNI resources from persisted etcd at {etcd_path}") + print(f"Cleaning persisted etcd at {etcd_path}, keeping only TLS certs") - # Stale resources that conflict with fresh kind cluster creation - stale_prefixes = [ - "/registry/clusterrolebindings/kindnet", - "/registry/clusterroles/kindnet", - "/registry/controllerrevisions/kube-system/kindnet", - "/registry/daemonsets/kube-system/kindnet", - "/registry/pods/kube-system/kindnet", - "/registry/serviceaccounts/kube-system/kindnet", - # Also clear coredns as it can conflict - "/registry/clusterrolebindings/system:coredns", - "/registry/clusterroles/system:coredns", - "/registry/configmaps/kube-system/coredns", - "/registry/deployments/kube-system/coredns", - "/registry/serviceaccounts/kube-system/coredns", - "/registry/services/specs/kube-system/kube-dns", - ] - - # Build etcdctl delete commands - delete_cmds = " && ".join( - [f"etcdctl del --prefix '{prefix}'" for prefix in stale_prefixes] - ) - - # Use docker to run etcdutl and etcdctl etcd_image = "gcr.io/etcd-development/etcd:v3.5.9" temp_dir = "/tmp/laconic-etcd-cleanup" - # All operations done inside docker containers to handle root-owned etcd files + # Whitelist: prefixes to KEEP - everything else gets deleted + keep_prefixes = "/registry/secrets/caddy-system" + + # All operations in docker to handle root-owned etcd files cleanup_script = f""" set -e - - # Use alpine for file operations (has shell, rm, cp, etc.) ALPINE_IMAGE="alpine:3.19" - # Create temp dir using docker (handles permissions) + # Create temp dir docker run --rm -v /tmp:/tmp $ALPINE_IMAGE \ sh -c "rm -rf {temp_dir} && mkdir -p {temp_dir}" - # Copy db to temp location using docker + # Copy db to temp location docker run --rm \ -v {etcd_path}:/etcd:ro \ -v {temp_dir}:/tmp-work \ $ALPINE_IMAGE cp /etcd/member/snap/db /tmp-work/etcd-snapshot.db - # Restore snapshot to temp dir - docker run --rm \ - -v {temp_dir}:/work \ - {etcd_image} \ + # Restore snapshot + docker run --rm -v {temp_dir}:/work {etcd_image} \ etcdutl snapshot restore /work/etcd-snapshot.db \ - --data-dir=/work/etcd-data \ - --skip-hash-check 2>/dev/null + --data-dir=/work/etcd-data --skip-hash-check 2>/dev/null - # Start temp etcd, delete stale resources, stop + # Start temp etcd docker rm -f laconic-etcd-cleanup 2>/dev/null || true docker run -d --name laconic-etcd-cleanup \ -v {temp_dir}/etcd-data:/etcd-data \ + -v {temp_dir}:/backup \ {etcd_image} etcd \ --data-dir=/etcd-data \ --listen-client-urls=http://0.0.0.0:2379 \ @@ -197,30 +174,49 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: sleep 3 - # Delete stale resources - docker exec laconic-etcd-cleanup /bin/sh -c "{delete_cmds}" 2>/dev/null || true + # Export caddy secrets to backup file (the only thing we keep) + docker exec laconic-etcd-cleanup \ + etcdctl get --prefix "{keep_prefixes}" -w json > {temp_dir}/kept.json \ + 2>/dev/null || echo '{{}}' > {temp_dir}/kept.json - # Create new snapshot from cleaned etcd + # Delete ALL registry keys + docker exec laconic-etcd-cleanup etcdctl del --prefix /registry + + # Restore kept keys using etcdctl txn + docker exec laconic-etcd-cleanup sh -c ' + cat /backup/kept.json 2>/dev/null | \ + (python3 -c " +import sys, json, base64 +try: + data = json.load(sys.stdin) + for kv in data.get(\"kvs\", []): + k = base64.b64decode(kv[\"key\"]).decode() + v = base64.b64decode(kv[\"value\"]).decode(\"latin-1\") + print(k) + print(v) +except: pass +" 2>/dev/null || true) | while IFS= read -r key && IFS= read -r value; do + printf \"%s\" \"$value\" | etcdctl put \"$key\" + done + ' 2>/dev/null || true + + # Save cleaned snapshot docker exec laconic-etcd-cleanup \ etcdctl snapshot save /etcd-data/cleaned-snapshot.db - # Stop temp etcd docker stop laconic-etcd-cleanup docker rm laconic-etcd-cleanup - # Clear original etcd member dir using docker + # Replace original etcd docker run --rm -v {etcd_path}:/etcd $ALPINE_IMAGE rm -rf /etcd/member - - # Restore cleaned snapshot to original location docker run --rm \ -v {temp_dir}/etcd-data/cleaned-snapshot.db:/data/db:ro \ -v {etcd_path}:/restore \ {etcd_image} \ - etcdutl snapshot restore /data/db \ - --data-dir=/restore \ - --skip-hash-check 2>/dev/null + etcdutl snapshot restore /data/db --data-dir=/restore --skip-hash-check \ + 2>/dev/null - # Cleanup temp dir + # Cleanup docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} """ @@ -231,15 +227,15 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: return False if opts.o.debug: - print("Cleared stale CNI resources from persisted etcd") + print("Cleaned etcd, kept only TLS certificates") return True def create_cluster(name: str, config_file: str): - # Clear stale CNI resources from persisted etcd if present + # Clean persisted etcd, keeping only TLS certificates etcd_path = _get_etcd_host_path_from_kind_config(config_file) if etcd_path: - _clear_stale_cni_from_etcd(etcd_path) + _clean_etcd_keeping_certs(etcd_path) result = _run_command(f"kind create cluster --name {name} --config {config_file}") if result.returncode != 0: