From 8d6e50b3aed54c939015ac36b7380bf592ed90cc Mon Sep 17 00:00:00 2001
From: "A. F. Dudley" <a.frederick.dudley@gmail.com>
Date: Mon, 2 Feb 2026 19:27:59 -0500
Subject: [PATCH] Use whitelist approach for etcd cleanup

Instead of trying to delete specific stale resources (blacklist),
keep only the valuable data (caddy TLS certs stored under
/registry/secrets/caddy-system) and delete every other key under
/registry. This is more robust because we no longer need to maintain
a list of all possible stale resources.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 stack_orchestrator/deploy/k8s/helpers.py | 110 +++++++++++------------
 1 file changed, 53 insertions(+), 57 deletions(-)

diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py
index cd34b138..fa5b4141 100644
--- a/stack_orchestrator/deploy/k8s/helpers.py
+++ b/stack_orchestrator/deploy/k8s/helpers.py
@@ -115,81 +115,58 @@ def _get_etcd_host_path_from_kind_config(config_file: str) -> Optional[str]:
     return None
 
 
-def _clear_stale_cni_from_etcd(etcd_path: str) -> bool:
-    """Clear stale CNI resources from persisted etcd to allow cluster recreation.
+def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
+    """Clean persisted etcd, keeping only TLS certificates.
 
     When etcd is persisted and a cluster is recreated, kind tries to install
-    CNI (kindnet) fresh but the persisted etcd already has those resources,
-    causing 'AlreadyExists' errors. This function clears those stale resources.
+    resources fresh but they already exist. Instead of trying to delete
+    specific stale resources (blacklist), we keep only the valuable data
+    (caddy TLS certs) and delete everything else (whitelist approach).
 
-    Returns True if resources were cleared, False if no action needed.
+    Returns True if cleanup succeeded, False if no action needed or failed.
     """
     db_path = Path(etcd_path) / "member" / "snap" / "db"
-    # Check existence with sudo since etcd dir is often root-owned
+    # Check existence - etcd dir is often root-owned so use shell test
     check_result = subprocess.run(f"test -f {db_path}", shell=True, capture_output=True)
     if check_result.returncode != 0:
         if opts.o.debug:
-            print(f"No etcd snapshot at {db_path}, skipping CNI cleanup")
+            print(f"No etcd snapshot at {db_path}, skipping cleanup")
         return False
 
     if opts.o.debug:
-        print(f"Clearing stale CNI resources from persisted etcd at {etcd_path}")
+        print(f"Cleaning persisted etcd at {etcd_path}, keeping only TLS certs")
 
-    # Stale resources that conflict with fresh kind cluster creation
-    stale_prefixes = [
-        "/registry/clusterrolebindings/kindnet",
-        "/registry/clusterroles/kindnet",
-        "/registry/controllerrevisions/kube-system/kindnet",
-        "/registry/daemonsets/kube-system/kindnet",
-        "/registry/pods/kube-system/kindnet",
-        "/registry/serviceaccounts/kube-system/kindnet",
-        # Also clear coredns as it can conflict
-        "/registry/clusterrolebindings/system:coredns",
-        "/registry/clusterroles/system:coredns",
-        "/registry/configmaps/kube-system/coredns",
-        "/registry/deployments/kube-system/coredns",
-        "/registry/serviceaccounts/kube-system/coredns",
-        "/registry/services/specs/kube-system/kube-dns",
-    ]
-
-    # Build etcdctl delete commands
-    delete_cmds = " && ".join(
-        [f"etcdctl del --prefix '{prefix}'" for prefix in stale_prefixes]
-    )
-
-    # Use docker to run etcdutl and etcdctl
     etcd_image = "gcr.io/etcd-development/etcd:v3.5.9"
     temp_dir = "/tmp/laconic-etcd-cleanup"
 
-    # All operations done inside docker containers to handle root-owned etcd files
+    # Whitelist: prefixes to KEEP - everything else gets deleted
+    keep_prefixes = "/registry/secrets/caddy-system"
+
+    # All operations in docker to handle root-owned etcd files
     cleanup_script = f"""
         set -e
-
-        # Use alpine for file operations (has shell, rm, cp, etc.)
         ALPINE_IMAGE="alpine:3.19"
 
-        # Create temp dir using docker (handles permissions)
+        # Create temp dir
         docker run --rm -v /tmp:/tmp $ALPINE_IMAGE \
             sh -c "rm -rf {temp_dir} && mkdir -p {temp_dir}"
 
-        # Copy db to temp location using docker
+        # Copy db to temp location
         docker run --rm \
             -v {etcd_path}:/etcd:ro \
             -v {temp_dir}:/tmp-work \
             $ALPINE_IMAGE cp /etcd/member/snap/db /tmp-work/etcd-snapshot.db
 
-        # Restore snapshot to temp dir
-        docker run --rm \
-            -v {temp_dir}:/work \
-            {etcd_image} \
+        # Restore snapshot
+        docker run --rm -v {temp_dir}:/work {etcd_image} \
             etcdutl snapshot restore /work/etcd-snapshot.db \
-                --data-dir=/work/etcd-data \
-                --skip-hash-check 2>/dev/null
+                --data-dir=/work/etcd-data --skip-hash-check 2>/dev/null
 
-        # Start temp etcd, delete stale resources, stop
+        # Start temp etcd
         docker rm -f laconic-etcd-cleanup 2>/dev/null || true
         docker run -d --name laconic-etcd-cleanup \
             -v {temp_dir}/etcd-data:/etcd-data \
+            -v {temp_dir}:/backup \
             {etcd_image} etcd \
                 --data-dir=/etcd-data \
                 --listen-client-urls=http://0.0.0.0:2379 \
@@ -197,30 +174,49 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool:
 
         sleep 3
 
-        # Delete stale resources
-        docker exec laconic-etcd-cleanup /bin/sh -c "{delete_cmds}" 2>/dev/null || true
+        # Export caddy secrets to backup file (the only thing we keep)
+        docker exec laconic-etcd-cleanup \
+            etcdctl get --prefix "{keep_prefixes}" -w json > {temp_dir}/kept.json \
+            2>/dev/null || echo '{{}}' > {temp_dir}/kept.json
 
-        # Create new snapshot from cleaned etcd
+        # Delete ALL registry keys
+        docker exec laconic-etcd-cleanup etcdctl del --prefix /registry
+
+        # Restore kept keys one at a time using etcdctl put
+        docker exec laconic-etcd-cleanup sh -c '
+            cat /backup/kept.json 2>/dev/null | \
+            (python3 -c "
+import sys, json, base64
+try:
+    data = json.load(sys.stdin)
+    for kv in data.get(\"kvs\", []):
+        k = base64.b64decode(kv[\"key\"]).decode()
+        v = base64.b64decode(kv[\"value\"]).decode(\"latin-1\")
+        print(k)
+        print(v)
+except: pass
+" 2>/dev/null || true) | while IFS= read -r key && IFS= read -r value; do
+                printf \"%s\" \"$value\" | etcdctl put \"$key\"
+            done
+        ' 2>/dev/null || true
+
+        # Save cleaned snapshot
         docker exec laconic-etcd-cleanup \
             etcdctl snapshot save /etcd-data/cleaned-snapshot.db
 
-        # Stop temp etcd
         docker stop laconic-etcd-cleanup
         docker rm laconic-etcd-cleanup
 
-        # Clear original etcd member dir using docker
+        # Replace original etcd
         docker run --rm -v {etcd_path}:/etcd $ALPINE_IMAGE rm -rf /etcd/member
-
-        # Restore cleaned snapshot to original location
         docker run --rm \
             -v {temp_dir}/etcd-data/cleaned-snapshot.db:/data/db:ro \
             -v {etcd_path}:/restore \
             {etcd_image} \
-            etcdutl snapshot restore /data/db \
-                --data-dir=/restore \
-                --skip-hash-check 2>/dev/null
+            etcdutl snapshot restore /data/db --data-dir=/restore --skip-hash-check \
+            2>/dev/null
 
-        # Cleanup temp dir
+        # Cleanup
         docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir}
     """
 
@@ -231,15 +227,15 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool:
         return False
 
     if opts.o.debug:
-        print("Cleared stale CNI resources from persisted etcd")
+        print("Cleaned etcd, kept only TLS certificates")
     return True
 
 
 def create_cluster(name: str, config_file: str):
-    # Clear stale CNI resources from persisted etcd if present
+    # Clean persisted etcd, keeping only TLS certificates
     etcd_path = _get_etcd_host_path_from_kind_config(config_file)
     if etcd_path:
-        _clear_stale_cni_from_etcd(etcd_path)
+        _clean_etcd_keeping_certs(etcd_path)
 
     result = _run_command(f"kind create cluster --name {name} --config {config_file}")
     if result.returncode != 0: