Clear stale CNI resources from persisted etcd before cluster creation
All checks were successful
Lint Checks / Run linter (push) Successful in 14s
All checks were successful
Lint Checks / Run linter (push) Successful in 14s
When etcd is persisted (for certificate backup) and a cluster is recreated, kind tries to install the CNI (kindnet) from scratch, but the persisted etcd already contains those resources. This causes 'AlreadyExists' errors and cluster creation fails.

This fix:
- Detects the etcd mount path from the kind config
- Before cluster creation, clears stale CNI resources (kindnet, coredns)
- Preserves certificates and other important data

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
5214bc8c0c
commit
ba9f51116d
@@ -96,7 +96,135 @@ def _run_command(command: str):
|
||||
return result
|
||||
|
||||
|
||||
def _get_etcd_host_path_from_kind_config(config_file: str) -> Optional[str]:
    """Return the host path that a kind config mounts at /var/lib/etcd.

    Walks every entry under ``nodes`` and every entry under each node's
    ``extraMounts`` looking for a mount whose ``containerPath`` is
    ``/var/lib/etcd``.

    Args:
        config_file: Path to the kind cluster config YAML file.

    Returns:
        The ``hostPath`` of the first matching mount, or None when the file
        cannot be read or parsed, does not have the expected mapping/list
        shape, or declares no such mount.
    """
    import yaml

    try:
        with open(config_file, "r") as f:
            config = yaml.safe_load(f)
    except (OSError, ValueError, yaml.YAMLError):
        # Unreadable, undecodable or malformed config: treat it the same as
        # "no persisted etcd" so cluster creation proceeds normally.
        return None

    # safe_load yields None for an empty document and may yield a scalar or
    # list for unexpected content; only a mapping can carry "nodes".
    if not isinstance(config, dict):
        return None

    nodes = config.get("nodes")
    if not isinstance(nodes, list):
        # Key absent or explicitly null ("nodes:" with no value).
        return None

    for node in nodes:
        if not isinstance(node, dict):
            continue
        extra_mounts = node.get("extraMounts")
        if not isinstance(extra_mounts, list):
            continue
        for mount in extra_mounts:
            if not isinstance(mount, dict):
                continue
            if mount.get("containerPath") == "/var/lib/etcd":
                return mount.get("hostPath")
    return None
|
||||
|
||||
|
||||
def _clear_stale_cni_from_etcd(etcd_path: str) -> bool:
    """Clear stale CNI resources from persisted etcd to allow cluster recreation.

    When etcd is persisted and a cluster is recreated, kind tries to install
    CNI (kindnet) fresh but the persisted etcd already has those resources,
    causing 'AlreadyExists' errors. This function deletes the kindnet and
    coredns keys from the persisted data before kind runs.

    The work is done with throwaway docker containers running the etcd image:

    1. Restore ``<etcd_path>/member/snap/db`` into a scratch data directory.
    2. Start a temporary etcd on that directory and delete the stale prefixes.
    3. Snapshot the cleaned store and restore it into a staging directory
       inside ``etcd_path``.
    4. Replace the original ``member`` directory with the staged one.

    The original ``member`` directory is removed only after the cleaned copy
    has been fully restored, so a failure in steps 1-3 leaves the persisted
    data as it was. The helper container, the scratch directory and the
    staging directory are removed on every exit path (best effort).

    Args:
        etcd_path: Host directory that the kind config mounts at
            /var/lib/etcd.

    Returns:
        True if the cleaned data was put in place. False if there is no
        persisted database at ``member/snap/db`` or if any step failed.
    """
    import shutil
    import tempfile
    import time

    db_path = Path(etcd_path) / "member" / "snap" / "db"
    if not db_path.exists():
        if opts.o.debug:
            print(f"No etcd snapshot at {db_path}, skipping CNI cleanup")
        return False

    if opts.o.debug:
        print(f"Clearing stale CNI resources from persisted etcd at {etcd_path}")

    # Stale resources that conflict with fresh kind cluster creation
    stale_prefixes = [
        "/registry/clusterrolebindings/kindnet",
        "/registry/clusterroles/kindnet",
        "/registry/controllerrevisions/kube-system/kindnet",
        "/registry/daemonsets/kube-system/kindnet",
        "/registry/pods/kube-system/kindnet",
        "/registry/serviceaccounts/kube-system/kindnet",
        # Also clear coredns as it can conflict
        "/registry/clusterrolebindings/system:coredns",
        "/registry/clusterroles/system:coredns",
        "/registry/configmaps/kube-system/coredns",
        "/registry/deployments/kube-system/coredns",
        "/registry/serviceaccounts/kube-system/coredns",
        "/registry/services/specs/kube-system/kube-dns",
    ]

    etcd_image = "gcr.io/etcd-development/etcd:v3.5.9"
    container_name = "laconic-etcd-cleanup"
    staging_name = "laconic-cni-cleanup-restore"
    health_attempts = 30  # one second apart

    # docker -v only treats absolute paths as bind mounts, so resolve first.
    host_dir = Path(etcd_path).resolve()
    member_dir = host_dir / "member"
    staging_dir = host_dir / staging_name

    def run(*args: str) -> None:
        # Argument lists (no shell) keep paths with spaces or shell
        # metacharacters intact; check=True turns any failure into an
        # exception carrying the captured stderr.
        subprocess.run(list(args), capture_output=True, text=True, check=True)

    def restore(snapshot: Path, mount_dir: Path, data_dir: str) -> None:
        # Restore `snapshot` into `data_dir` (a path inside the container,
        # below /restore, which is `mount_dir` on the host).
        run(
            "docker", "run", "--rm",
            "-v", f"{snapshot}:/data/db:ro",
            "-v", f"{mount_dir}:/restore",
            etcd_image,
            "etcdutl", "snapshot", "restore", "/data/db",
            f"--data-dir={data_dir}",
            "--skip-hash-check",
        )

    def remove_container() -> None:
        # Best effort: the container may not exist, or docker may be missing.
        try:
            subprocess.run(
                ["docker", "rm", "-f", container_name], capture_output=True
            )
        except OSError:
            pass

    work_dir = None
    try:
        # Unique scratch directory instead of a fixed, predictable /tmp path.
        work_dir = Path(
            tempfile.mkdtemp(prefix="laconic-etcd-cleanup-", dir="/tmp")
        )
        scratch_data = work_dir / "etcd-data"

        # Restore the persisted db file to the scratch directory.
        restore(host_dir / "member" / "snap" / "db", work_dir, "/restore/etcd-data")

        # Start a temporary etcd on the scratch copy.
        remove_container()
        run(
            "docker", "run", "-d", "--name", container_name,
            "-v", f"{scratch_data}:/etcd-data",
            etcd_image, "etcd",
            "--data-dir=/etcd-data",
            "--listen-client-urls=http://0.0.0.0:2379",
            "--advertise-client-urls=http://localhost:2379",
        )

        # Poll until etcd answers instead of sleeping a fixed interval.
        for _ in range(health_attempts):
            health = subprocess.run(
                ["docker", "exec", container_name, "etcdctl", "endpoint", "health"],
                capture_output=True,
                text=True,
            )
            if health.returncode == 0:
                break
            time.sleep(1)
        else:
            raise subprocess.CalledProcessError(
                health.returncode, health.args, health.stdout, health.stderr
            )

        # Delete each prefix on its own so one failure cannot silently skip
        # the remaining prefixes; no shell is needed inside the image.
        for prefix in stale_prefixes:
            run("docker", "exec", container_name, "etcdctl", "del", "--prefix", prefix)

        # Create a new snapshot from the cleaned etcd, then stop it.
        run(
            "docker", "exec", container_name,
            "etcdctl", "snapshot", "save", "/etcd-data/cleaned-snapshot.db",
        )
        run("docker", "stop", container_name)
        run("docker", "rm", container_name)

        # Restore the cleaned snapshot next to the original data first ...
        if staging_dir.exists():
            shutil.rmtree(staging_dir)  # leftover from an interrupted run
        restore(
            scratch_data / "cleaned-snapshot.db",
            host_dir,
            f"/restore/{staging_name}",
        )

        # ... and only then drop the original and move the cleaned copy in.
        # Both directories are under host_dir, so the rename does not copy.
        shutil.rmtree(member_dir)
        (staging_dir / "member").rename(member_dir)
    except (subprocess.CalledProcessError, OSError) as err:
        if opts.o.debug:
            detail = getattr(err, "stderr", None) or err
            print(f"Warning: etcd cleanup failed: {detail}")
        return False
    finally:
        remove_container()
        shutil.rmtree(staging_dir, ignore_errors=True)
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)

    if opts.o.debug:
        print("Cleared stale CNI resources from persisted etcd")
    return True
|
||||
|
||||
|
||||
def create_cluster(name: str, config_file: str):
    """Create a kind cluster called ``name`` from the config at ``config_file``.

    If the kind config mounts a host directory at /var/lib/etcd, stale CNI
    resources are cleared from that persisted etcd first; the result of that
    cleanup is not checked here.

    Args:
        name: Cluster name passed to ``kind create cluster --name``.
        config_file: Path passed to ``kind create cluster --config``.

    Raises:
        DeployerException: if the kind command finishes with a non-zero
            return code.
    """
    persisted_etcd_dir = _get_etcd_host_path_from_kind_config(config_file)
    if persisted_etcd_dir:
        # Clear stale CNI resources from persisted etcd before kind runs.
        _clear_stale_cni_from_etcd(persisted_etcd_dir)

    kind_command = f"kind create cluster --name {name} --config {config_file}"
    outcome = _run_command(kind_command)
    if outcome.returncode == 0:
        return
    raise DeployerException(f"kind create cluster failed: {outcome}")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user