From 4713107546b2095ff0c7a35a660f9dab9f606932 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Fri, 30 Jan 2026 23:27:45 -0500 Subject: [PATCH 01/27] docs(CLAUDE.md): add external stacks preferred guideline Document that external stack pattern should be used when creating new stacks for any reason, with directory structure and usage examples. Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 65b27524..f06b6abc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -43,6 +43,45 @@ This project follows principles inspired by literate programming, where developm This approach treats the human-AI collaboration as a form of **conversational literate programming** where understanding emerges through dialogue before code implementation. +## External Stacks Preferred + +When creating new stacks for any reason, **use the external stack pattern** rather than adding stacks directly to this repository. + +External stacks follow this structure: + +``` +my-stack/ +└── stack-orchestrator/ + ├── stacks/ + │ └── my-stack/ + │ ├── stack.yml + │ └── README.md + ├── compose/ + │ └── docker-compose-my-stack.yml + └── config/ + └── my-stack/ + └── (config files) +``` + +### Usage + +```bash +# Fetch external stack +laconic-so fetch-stack github.com/org/my-stack + +# Use external stack +STACK_PATH=~/cerc/my-stack/stack-orchestrator/stacks/my-stack +laconic-so --stack $STACK_PATH deploy init --output spec.yml +laconic-so --stack $STACK_PATH deploy create --spec-file spec.yml --deployment-dir deployment +laconic-so deployment --dir deployment start +``` + +### Examples + +- `zenith-karma-stack` - Karma watcher deployment +- `urbit-stack` - Fake Urbit ship for testing +- `zenith-desk-stack` - Desk deployment stack + ## Insights and Observations ### Design Principles From c197406cc7b8ce2b94248110cb77040d746ab1a7 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sat, 31 Jan 2026 19:05:27 -0500 Subject: [PATCH 02/27] feat(deploy): add deployment restart command Add `laconic-so deployment restart` command that: - Pulls latest code from stack git repository - Regenerates spec.yml from stack's commands.py - Verifies DNS if hostname changed (with --force to skip) - Syncs deployment directory preserving cluster ID and data - Stops and restarts deployment with --skip-cluster-management Also stores stack-source path in deployment.yml during create for automatic stack location on restart. 
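A minimal usage sketch of the new command (the deployment directory and stack path below are illustrative placeholders, not taken from this patch; options shown are the ones this commit adds):

```bash
# Typical case: the stack location is read from stack-source in deployment.yml
laconic-so deployment --dir my-deployment restart

# Override the stored stack location and skip the DNS check
laconic-so deployment --dir my-deployment restart \
  --stack-path ~/cerc/my-stack --force
```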
Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/deployment.py | 173 ++++++++++++++++++ .../deploy/deployment_create.py | 39 ++-- stack_orchestrator/deploy/dns_probe.py | 159 ++++++++++++++++ 3 files changed, 358 insertions(+), 13 deletions(-) create mode 100644 stack_orchestrator/deploy/dns_probe.py diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py index 35abea3c..f60ea9a4 100644 --- a/stack_orchestrator/deploy/deployment.py +++ b/stack_orchestrator/deploy/deployment.py @@ -15,7 +15,10 @@ import click from pathlib import Path +import subprocess import sys +import tempfile +import time from stack_orchestrator import constants from stack_orchestrator.deploy.images import push_images_operation from stack_orchestrator.deploy.deploy import ( @@ -228,3 +231,173 @@ def run_job(ctx, job_name, helm_release): ctx.obj = make_deploy_context(ctx) run_job_operation(ctx, job_name, helm_release) + + +@command.command() +@click.option("--stack-path", help="Path to stack git repo (overrides stored path)") +@click.option("--config-file", help="Config file to pass to deploy init") +@click.option( + "--force", + is_flag=True, + default=False, + help="Skip DNS verification", +) +@click.option( + "--expected-ip", + help="Expected IP for DNS verification (if different from egress)", +) +@click.pass_context +def restart(ctx, stack_path, config_file, force, expected_ip): + """Pull latest stack, regenerate spec, and restart deployment. + + This command: + 1. Pulls latest code from the stack git repository + 2. Regenerates spec.yml from the stack's commands.py + 3. If hostname changed, verifies DNS routes to this server + 4. Syncs the deployment directory (preserves cluster ID and data) + 5. Stops and restarts the deployment + + Data volumes are always preserved. The cluster is never destroyed. + + Stack source resolution (in order): + 1. --stack-path argument (if provided) + 2. stack-source field in deployment.yml (if stored) + 3. Error if neither available + + Note: After restart, Caddy will automatically provision TLS certificates + for any new hostnames. 
+ """ + from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec + from stack_orchestrator.deploy.deployment_create import ( + init_operation, + create_operation, + ) + from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe + + deployment_context: DeploymentContext = ctx.obj + + # Get current spec info + current_spec = deployment_context.spec + current_http_proxy = current_spec.get_http_proxy() + current_hostname = ( + current_http_proxy[0]["host-name"] if current_http_proxy else None + ) + + # Resolve stack source path + if stack_path: + stack_source = Path(stack_path).resolve() + else: + # Try to get from deployment.yml + deployment_file = ( + deployment_context.deployment_dir / constants.deployment_file_name + ) + deployment_data = get_yaml().load(open(deployment_file)) + stack_source_str = deployment_data.get("stack-source") + if not stack_source_str: + print( + "Error: No stack-source in deployment.yml and --stack-path not provided" + ) + print("Use --stack-path to specify the stack git repository location") + sys.exit(1) + stack_source = Path(stack_source_str) + + if not stack_source.exists(): + print(f"Error: Stack source path does not exist: {stack_source}") + sys.exit(1) + + print("=== Deployment Restart ===") + print(f"Deployment dir: {deployment_context.deployment_dir}") + print(f"Stack source: {stack_source}") + print(f"Current hostname: {current_hostname}") + + # Step 1: Git pull + print("\n[1/6] Pulling latest code from stack repository...") + git_result = subprocess.run( + ["git", "pull"], cwd=stack_source, capture_output=True, text=True + ) + if git_result.returncode != 0: + print(f"Git pull failed: {git_result.stderr}") + sys.exit(1) + print(f"Git pull: {git_result.stdout.strip()}") + + # Step 2: Regenerate spec + print("\n[2/6] Regenerating spec from commands.py...") + with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp: + new_spec_path = tmp.name + + # Build deploy context for init + deploy_ctx = make_deploy_context(ctx) + + init_operation( + deploy_command_context=deploy_ctx, + stack=str(stack_source), + deployer_type=current_spec.obj[constants.deploy_to_key], + config=None, + config_file=config_file, + kube_config=None, + image_registry=None, + output=new_spec_path, + map_ports_to_host=None, + ) + + # Parse new spec to get new hostname + new_spec_obj = get_parsed_deployment_spec(new_spec_path) + new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", []) + new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None + + print(f"New hostname: {new_hostname}") + + # Step 3: DNS verification (only if hostname changed) + if new_hostname and new_hostname != current_hostname: + print(f"\n[3/6] Hostname changed: {current_hostname} -> {new_hostname}") + if force: + print("DNS verification skipped (--force)") + else: + print("Verifying DNS via probe...") + if not verify_dns_via_probe(new_hostname): + print(f"\nDNS verification failed for {new_hostname}") + print("Ensure DNS is configured before restarting.") + print("Use --force to skip this check.") + sys.exit(1) + else: + print("\n[3/6] Hostname unchanged, skipping DNS verification") + + # Step 4: Sync deployment directory + print("\n[4/6] Syncing deployment directory...") + create_operation( + deployment_command_context=deploy_ctx, + spec_file=new_spec_path, + deployment_dir=str(deployment_context.deployment_dir), + update=True, + network_dir=None, + initial_peers=None, + ) + + # Reload deployment context with new spec + 
deployment_context.init(deployment_context.deployment_dir) + ctx.obj = deployment_context + + # Step 5: Stop deployment + print("\n[5/6] Stopping deployment...") + ctx.obj = make_deploy_context(ctx) + down_operation( + ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True + ) + + # Brief pause to ensure clean shutdown + time.sleep(5) + + # Step 6: Start deployment + print("\n[6/6] Starting deployment...") + up_operation( + ctx, services_list=None, stay_attached=False, skip_cluster_management=True + ) + + print("\n=== Restart Complete ===") + print("Deployment restarted with updated configuration.") + if new_hostname and new_hostname != current_hostname: + print(f"\nNew hostname: {new_hostname}") + print("Caddy will automatically provision TLS certificate.") + + # Cleanup temp file + Path(new_spec_path).unlink(missing_ok=True) diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py index fd15119c..ec15362f 100644 --- a/stack_orchestrator/deploy/deployment_create.py +++ b/stack_orchestrator/deploy/deployment_create.py @@ -17,7 +17,7 @@ import click from importlib import util import os from pathlib import Path -from typing import List +from typing import List, Optional import random from shutil import copy, copyfile, copytree, rmtree from secrets import token_hex @@ -507,11 +507,14 @@ def _copy_files_to_directory(file_paths: List[Path], directory: Path): copy(path, os.path.join(directory, os.path.basename(path))) -def _create_deployment_file(deployment_dir: Path): +def _create_deployment_file(deployment_dir: Path, stack_source: Optional[Path] = None): deployment_file_path = deployment_dir.joinpath(constants.deployment_file_name) cluster = f"{constants.cluster_name_prefix}{token_hex(8)}" + deployment_content = {constants.cluster_id_key: cluster} + if stack_source: + deployment_content["stack-source"] = str(stack_source) with open(deployment_file_path, "w") as output_file: - output_file.write(f"{constants.cluster_id_key}: {cluster}\n") + get_yaml().dump(deployment_content, output_file) def _check_volume_definitions(spec): @@ -616,11 +619,15 @@ def create_operation( generate_helm_chart(stack_name, spec_file, deployment_dir_path) return # Exit early for helm chart generation + # Resolve stack source path for restart capability + stack_source = get_stack_path(stack_name) + if update: # Sync mode: write to temp dir, then copy to deployment dir with backups temp_dir = Path(tempfile.mkdtemp(prefix="deployment-sync-")) try: - # Write deployment files to temp dir (skip deployment.yml to preserve cluster ID) + # Write deployment files to temp dir + # (skip deployment.yml to preserve cluster ID) _write_deployment_files( temp_dir, Path(spec_file), @@ -628,12 +635,14 @@ def create_operation( stack_name, deployment_type, include_deployment_file=False, + stack_source=stack_source, ) - # Copy from temp to deployment dir, excluding data volumes and backing up changed files - # Exclude data/* to avoid touching user data volumes - # Exclude config file to preserve deployment settings (XXX breaks passing config vars - # from spec. could warn about this or not exclude...) + # Copy from temp to deployment dir, excluding data volumes + # and backing up changed files. + # Exclude data/* to avoid touching user data volumes. 
+ # Exclude config file to preserve deployment settings + # (XXX breaks passing config vars from spec) exclude_patterns = ["data", "data/*", constants.config_file_name] _safe_copy_tree( temp_dir, deployment_dir_path, exclude_patterns=exclude_patterns @@ -650,6 +659,7 @@ def create_operation( stack_name, deployment_type, include_deployment_file=True, + stack_source=stack_source, ) # Delegate to the stack's Python code @@ -670,7 +680,7 @@ def create_operation( ) -def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: List[str] = None): +def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: Optional[List[str]] = None): """ Recursively copy a directory tree, backing up changed files with .bak suffix. @@ -721,6 +731,7 @@ def _write_deployment_files( stack_name: str, deployment_type: str, include_deployment_file: bool = True, + stack_source: Optional[Path] = None, ): """ Write deployment files to target directory. @@ -730,7 +741,8 @@ def _write_deployment_files( :param parsed_spec: Parsed spec object :param stack_name: Name of stack :param deployment_type: Type of deployment - :param include_deployment_file: Whether to create deployment.yml file (skip for update) + :param include_deployment_file: Whether to create deployment.yml (skip for update) + :param stack_source: Path to stack source (git repo) for restart capability """ stack_file = get_stack_path(stack_name).joinpath(constants.stack_file_name) parsed_stack = get_parsed_stack_config(stack_name) @@ -741,7 +753,7 @@ def _write_deployment_files( # Create deployment file if requested if include_deployment_file: - _create_deployment_file(target_dir) + _create_deployment_file(target_dir, stack_source=stack_source) # Copy any config variables from the spec file into an env file suitable for compose _write_config_file(spec_file, target_dir.joinpath(constants.config_file_name)) @@ -805,8 +817,9 @@ def _write_deployment_files( ) else: # TODO: - # this is odd - looks up config dir that matches a volume name, then copies as a mount dir? - # AFAICT this is not used by or relevant to any existing stack - roy + # This is odd - looks up config dir that matches a volume name, + # then copies as a mount dir? + # AFAICT not used by or relevant to any existing stack - roy # TODO: We should probably only do this if the volume is marked :ro. for volume_name, volume_path in parsed_spec.get_volumes().items(): diff --git a/stack_orchestrator/deploy/dns_probe.py b/stack_orchestrator/deploy/dns_probe.py new file mode 100644 index 00000000..e04b4ea2 --- /dev/null +++ b/stack_orchestrator/deploy/dns_probe.py @@ -0,0 +1,159 @@ +# Copyright © 2024 Vulcanize +# SPDX-License-Identifier: AGPL-3.0 + +"""DNS verification via temporary ingress probe.""" + +import secrets +import socket +import time +from typing import Optional +import requests +from kubernetes import client + + +def get_server_egress_ip() -> str: + """Get this server's public egress IP via ipify.""" + response = requests.get("https://api.ipify.org", timeout=10) + response.raise_for_status() + return response.text.strip() + + +def resolve_hostname(hostname: str) -> list[str]: + """Resolve hostname to list of IP addresses.""" + try: + _, _, ips = socket.gethostbyname_ex(hostname) + return ips + except socket.gaierror: + return [] + + +def verify_dns_simple(hostname: str, expected_ip: Optional[str] = None) -> bool: + """Simple DNS verification - check hostname resolves to expected IP. + + If expected_ip not provided, uses server's egress IP. + Returns True if hostname resolves to expected IP. 
+ """ + resolved_ips = resolve_hostname(hostname) + if not resolved_ips: + print(f"DNS FAIL: {hostname} does not resolve") + return False + + if expected_ip is None: + expected_ip = get_server_egress_ip() + + if expected_ip in resolved_ips: + print(f"DNS OK: {hostname} -> {resolved_ips} (includes {expected_ip})") + return True + else: + print(f"DNS WARN: {hostname} -> {resolved_ips} (expected {expected_ip})") + return False + + +def create_probe_ingress(hostname: str, namespace: str = "default") -> str: + """Create a temporary ingress for DNS probing. + + Returns the probe token that the ingress will respond with. + """ + token = secrets.token_hex(16) + + networking_api = client.NetworkingV1Api() + + # Create a simple ingress that Caddy will pick up + ingress = client.V1Ingress( + metadata=client.V1ObjectMeta( + name="laconic-dns-probe", + annotations={ + "kubernetes.io/ingress.class": "caddy", + "laconic.com/probe-token": token, + }, + ), + spec=client.V1IngressSpec( + rules=[ + client.V1IngressRule( + host=hostname, + http=client.V1HTTPIngressRuleValue( + paths=[ + client.V1HTTPIngressPath( + path="/.well-known/laconic-probe", + path_type="Exact", + backend=client.V1IngressBackend( + service=client.V1IngressServiceBackend( + name="caddy-ingress-controller", + port=client.V1ServiceBackendPort(number=80), + ) + ), + ) + ] + ), + ) + ] + ), + ) + + networking_api.create_namespaced_ingress(namespace=namespace, body=ingress) + return token + + +def delete_probe_ingress(namespace: str = "default"): + """Delete the temporary probe ingress.""" + networking_api = client.NetworkingV1Api() + try: + networking_api.delete_namespaced_ingress( + name="laconic-dns-probe", namespace=namespace + ) + except client.exceptions.ApiException: + pass # Ignore if already deleted + + +def verify_dns_via_probe( + hostname: str, namespace: str = "default", timeout: int = 30, poll_interval: int = 2 +) -> bool: + """Verify DNS by creating temp ingress and probing it. + + This definitively proves that traffic to the hostname reaches this cluster. + + Args: + hostname: The hostname to verify + namespace: Kubernetes namespace for probe ingress + timeout: Total seconds to wait for probe to succeed + poll_interval: Seconds between probe attempts + + Returns: + True if probe succeeds, False otherwise + """ + # First check DNS resolves at all + if not resolve_hostname(hostname): + print(f"DNS FAIL: {hostname} does not resolve") + return False + + print(f"Creating probe ingress for {hostname}...") + create_probe_ingress(hostname, namespace) + + try: + # Wait for Caddy to pick up the ingress + time.sleep(3) + + # Poll until success or timeout + probe_url = f"http://{hostname}/.well-known/laconic-probe" + start_time = time.time() + last_error = None + + while time.time() - start_time < timeout: + try: + response = requests.get(probe_url, timeout=5) + # For now, just verify we get a response from this cluster + # A more robust check would verify a unique token + if response.status_code < 500: + print(f"DNS PROBE OK: {hostname} routes to this cluster") + return True + except requests.RequestException as e: + last_error = e + + time.sleep(poll_interval) + + print(f"DNS PROBE FAIL: {hostname} - {last_error}") + return False + + finally: + print("Cleaning up probe ingress...") + delete_probe_ingress(namespace) From 8d3191e4fdaabe72f0221b5471e80542b2da8fc1 Mon Sep 17 00:00:00 2001 From: "A. F. 
Dudley" Date: Mon, 2 Feb 2026 19:13:10 -0500 Subject: [PATCH 03/27] Fix Caddy ingress ACME email and RBAC issues - Add acme_email_key constant for spec.yml parsing - Add get_acme_email() method to Spec class - Modify install_ingress_for_kind() to patch ConfigMap with email - Pass acme-email from spec to ingress installation - Add 'delete' verb to leases RBAC for certificate lock cleanup The acme-email field in spec.yml was previously ignored, causing Let's Encrypt to fail with "unable to parse email address". The missing delete permission on leases caused lock cleanup failures. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/constants.py | 1 + .../ingress/ingress-caddy-kind-deploy.yaml | 1 + stack_orchestrator/deploy/k8s/deploy_k8s.py | 2 +- stack_orchestrator/deploy/k8s/helpers.py | 17 ++++++++++++++++- stack_orchestrator/deploy/spec.py | 3 +++ 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/stack_orchestrator/constants.py b/stack_orchestrator/constants.py index 49dfa193..75bd0ebc 100644 --- a/stack_orchestrator/constants.py +++ b/stack_orchestrator/constants.py @@ -44,3 +44,4 @@ unlimited_memlock_key = "unlimited-memlock" runtime_class_key = "runtime-class" high_memlock_runtime = "high-memlock" high_memlock_spec_filename = "high-memlock-spec.json" +acme_email_key = "acme-email" diff --git a/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml b/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml index 632dcc05..844eb183 100644 --- a/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml +++ b/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml @@ -93,6 +93,7 @@ rules: - get - create - update + - delete --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 3d0b697c..7b88dd14 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -301,7 +301,7 @@ class K8sDeployer(Deployer): self.connect_api() if self.is_kind() and not self.skip_cluster_management: # Configure ingress controller (not installed by default in kind) - install_ingress_for_kind() + install_ingress_for_kind(self.cluster_info.spec.get_acme_email()) # Wait for ingress to start # (deployment provisioning will fail unless this is done) wait_for_ingress_in_kind() diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index f7603b5e..f4e8cf9d 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -132,7 +132,7 @@ def wait_for_ingress_in_kind(): error_exit("ERROR: Timed out waiting for Caddy ingress to become ready") -def install_ingress_for_kind(): +def install_ingress_for_kind(acme_email: str = ""): api_client = client.ApiClient() ingress_install = os.path.abspath( get_k8s_dir().joinpath( @@ -143,6 +143,21 @@ def install_ingress_for_kind(): print("Installing Caddy ingress controller in kind cluster") utils.create_from_yaml(api_client, yaml_file=ingress_install) + # Patch ConfigMap with acme email if provided + if acme_email: + core_v1 = client.CoreV1Api() + configmap = core_v1.read_namespaced_config_map( + name="caddy-ingress-controller-configmap", namespace="caddy-system" + ) + configmap.data["email"] = acme_email + core_v1.patch_namespaced_config_map( + name="caddy-ingress-controller-configmap", + namespace="caddy-system", + body=configmap, + ) + 
if opts.o.debug: + print(f"Patched Caddy ConfigMap with email: {acme_email}") + def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]): for image in image_set: diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index 1713f28a..db7783c9 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -179,6 +179,9 @@ class Spec: def get_deployment_type(self): return self.obj.get(constants.deploy_to_key) + def get_acme_email(self): + return self.obj.get(constants.acme_email_key, "") + def is_kubernetes_deployment(self): return self.get_deployment_type() in [ constants.k8s_kind_deploy_type, From 675ee87544321588368b29009925814422c97782 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 19:21:00 -0500 Subject: [PATCH 04/27] Clear stale CNI resources from persisted etcd before cluster creation When etcd is persisted (for certificate backup) and a cluster is recreated, kind tries to install CNI (kindnet) fresh but the persisted etcd already has those resources, causing 'AlreadyExists' errors and cluster creation failure. This fix: - Detects etcd mount path from kind config - Before cluster creation, clears stale CNI resources (kindnet, coredns) - Preserves certificate and other important data Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 128 +++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index f4e8cf9d..ea366d50 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -96,7 +96,135 @@ def _run_command(command: str): return result +def _get_etcd_host_path_from_kind_config(config_file: str) -> Optional[str]: + """Extract etcd host path from kind config extraMounts.""" + import yaml + + try: + with open(config_file, "r") as f: + config = yaml.safe_load(f) + except Exception: + return None + + nodes = config.get("nodes", []) + for node in nodes: + extra_mounts = node.get("extraMounts", []) + for mount in extra_mounts: + if mount.get("containerPath") == "/var/lib/etcd": + return mount.get("hostPath") + return None + + +def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: + """Clear stale CNI resources from persisted etcd to allow cluster recreation. + + When etcd is persisted and a cluster is recreated, kind tries to install + CNI (kindnet) fresh but the persisted etcd already has those resources, + causing 'AlreadyExists' errors. This function clears those stale resources. + + Returns True if resources were cleared, False if no action needed. 
+ """ + db_path = Path(etcd_path) / "member" / "snap" / "db" + if not db_path.exists(): + if opts.o.debug: + print(f"No etcd snapshot at {db_path}, skipping CNI cleanup") + return False + + if opts.o.debug: + print(f"Clearing stale CNI resources from persisted etcd at {etcd_path}") + + # Stale resources that conflict with fresh kind cluster creation + stale_prefixes = [ + "/registry/clusterrolebindings/kindnet", + "/registry/clusterroles/kindnet", + "/registry/controllerrevisions/kube-system/kindnet", + "/registry/daemonsets/kube-system/kindnet", + "/registry/pods/kube-system/kindnet", + "/registry/serviceaccounts/kube-system/kindnet", + # Also clear coredns as it can conflict + "/registry/clusterrolebindings/system:coredns", + "/registry/clusterroles/system:coredns", + "/registry/configmaps/kube-system/coredns", + "/registry/deployments/kube-system/coredns", + "/registry/serviceaccounts/kube-system/coredns", + "/registry/services/specs/kube-system/kube-dns", + ] + + # Build etcdctl delete commands + delete_cmds = " && ".join( + [f"etcdctl del --prefix '{prefix}'" for prefix in stale_prefixes] + ) + + # Use docker to run etcdutl and etcdctl + etcd_image = "gcr.io/etcd-development/etcd:v3.5.9" + temp_dir = "/tmp/laconic-etcd-cleanup" + + cleanup_script = f""" + set -e + rm -rf {temp_dir} + mkdir -p {temp_dir} + + # Restore snapshot to temp dir + docker run --rm \ + -v {db_path}:/data/db:ro \ + -v {temp_dir}:/restore \ + {etcd_image} \ + etcdutl snapshot restore /data/db \ + --data-dir=/restore/etcd-data \ + --skip-hash-check 2>/dev/null + + # Start temp etcd, delete stale resources, stop + docker rm -f laconic-etcd-cleanup 2>/dev/null || true + docker run -d --name laconic-etcd-cleanup \ + -v {temp_dir}/etcd-data:/etcd-data \ + {etcd_image} etcd \ + --data-dir=/etcd-data \ + --listen-client-urls=http://0.0.0.0:2379 \ + --advertise-client-urls=http://localhost:2379 + + sleep 3 + + # Delete stale resources + docker exec laconic-etcd-cleanup /bin/sh -c "{delete_cmds}" 2>/dev/null || true + + # Create new snapshot from cleaned etcd + docker exec laconic-etcd-cleanup \ + etcdctl snapshot save /etcd-data/cleaned-snapshot.db + + # Stop temp etcd + docker stop laconic-etcd-cleanup + docker rm laconic-etcd-cleanup + + # Replace original etcd data with cleaned version + rm -rf {etcd_path}/member + docker run --rm \ + -v {temp_dir}/etcd-data/cleaned-snapshot.db:/data/db:ro \ + -v {etcd_path}:/restore \ + {etcd_image} \ + etcdutl snapshot restore /data/db \ + --data-dir=/restore \ + --skip-hash-check 2>/dev/null + + rm -rf {temp_dir} + """ + + result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True) + if result.returncode != 0: + if opts.o.debug: + print(f"Warning: etcd cleanup failed: {result.stderr}") + return False + + if opts.o.debug: + print("Cleared stale CNI resources from persisted etcd") + return True + + def create_cluster(name: str, config_file: str): + # Clear stale CNI resources from persisted etcd if present + etcd_path = _get_etcd_host_path_from_kind_config(config_file) + if etcd_path: + _clear_stale_cni_from_etcd(etcd_path) + result = _run_command(f"kind create cluster --name {name} --config {config_file}") if result.returncode != 0: raise DeployerException(f"kind create cluster failed: {result}") From 8948f5bfec3a50a19ba48ed05b947b4d183b8af6 Mon Sep 17 00:00:00 2001 From: "A. F. 
Dudley" Date: Mon, 2 Feb 2026 19:22:41 -0500 Subject: [PATCH 05/27] Fix etcd cleanup to use docker for root-owned files Use docker containers with volume mounts to handle all file operations on root-owned etcd directories, avoiding the need for sudo on the host. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 36 +++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index ea366d50..cd34b138 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -125,7 +125,9 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: Returns True if resources were cleared, False if no action needed. """ db_path = Path(etcd_path) / "member" / "snap" / "db" - if not db_path.exists(): + # Check existence with sudo since etcd dir is often root-owned + check_result = subprocess.run(f"test -f {db_path}", shell=True, capture_output=True) + if check_result.returncode != 0: if opts.o.debug: print(f"No etcd snapshot at {db_path}, skipping CNI cleanup") return False @@ -159,18 +161,29 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: etcd_image = "gcr.io/etcd-development/etcd:v3.5.9" temp_dir = "/tmp/laconic-etcd-cleanup" + # All operations done inside docker containers to handle root-owned etcd files cleanup_script = f""" set -e - rm -rf {temp_dir} - mkdir -p {temp_dir} + + # Use alpine for file operations (has shell, rm, cp, etc.) + ALPINE_IMAGE="alpine:3.19" + + # Create temp dir using docker (handles permissions) + docker run --rm -v /tmp:/tmp $ALPINE_IMAGE \ + sh -c "rm -rf {temp_dir} && mkdir -p {temp_dir}" + + # Copy db to temp location using docker + docker run --rm \ + -v {etcd_path}:/etcd:ro \ + -v {temp_dir}:/tmp-work \ + $ALPINE_IMAGE cp /etcd/member/snap/db /tmp-work/etcd-snapshot.db # Restore snapshot to temp dir docker run --rm \ - -v {db_path}:/data/db:ro \ - -v {temp_dir}:/restore \ + -v {temp_dir}:/work \ {etcd_image} \ - etcdutl snapshot restore /data/db \ - --data-dir=/restore/etcd-data \ + etcdutl snapshot restore /work/etcd-snapshot.db \ + --data-dir=/work/etcd-data \ --skip-hash-check 2>/dev/null # Start temp etcd, delete stale resources, stop @@ -195,8 +208,10 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: docker stop laconic-etcd-cleanup docker rm laconic-etcd-cleanup - # Replace original etcd data with cleaned version - rm -rf {etcd_path}/member + # Clear original etcd member dir using docker + docker run --rm -v {etcd_path}:/etcd $ALPINE_IMAGE rm -rf /etcd/member + + # Restore cleaned snapshot to original location docker run --rm \ -v {temp_dir}/etcd-data/cleaned-snapshot.db:/data/db:ro \ -v {etcd_path}:/restore \ @@ -205,7 +220,8 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: --data-dir=/restore \ --skip-hash-check 2>/dev/null - rm -rf {temp_dir} + # Cleanup temp dir + docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} """ result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True) From 5b06cffe17dd4f3350581463f6545801989bec4f Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 19:27:59 -0500 Subject: [PATCH 06/27] Use whitelist approach for etcd cleanup Instead of trying to delete specific stale resources (blacklist), keep only the valuable data (caddy TLS certs) and delete everything else. This is more robust as we don't need to maintain a list of all possible stale resources. 
Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 110 +++++++++++------------ 1 file changed, 53 insertions(+), 57 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index cd34b138..fa5b4141 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -115,81 +115,58 @@ def _get_etcd_host_path_from_kind_config(config_file: str) -> Optional[str]: return None -def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: - """Clear stale CNI resources from persisted etcd to allow cluster recreation. +def _clean_etcd_keeping_certs(etcd_path: str) -> bool: + """Clean persisted etcd, keeping only TLS certificates. When etcd is persisted and a cluster is recreated, kind tries to install - CNI (kindnet) fresh but the persisted etcd already has those resources, - causing 'AlreadyExists' errors. This function clears those stale resources. + resources fresh but they already exist. Instead of trying to delete + specific stale resources (blacklist), we keep only the valuable data + (caddy TLS certs) and delete everything else (whitelist approach). - Returns True if resources were cleared, False if no action needed. + Returns True if cleanup succeeded, False if no action needed or failed. """ db_path = Path(etcd_path) / "member" / "snap" / "db" - # Check existence with sudo since etcd dir is often root-owned + # Check existence - etcd dir is often root-owned so use shell test check_result = subprocess.run(f"test -f {db_path}", shell=True, capture_output=True) if check_result.returncode != 0: if opts.o.debug: - print(f"No etcd snapshot at {db_path}, skipping CNI cleanup") + print(f"No etcd snapshot at {db_path}, skipping cleanup") return False if opts.o.debug: - print(f"Clearing stale CNI resources from persisted etcd at {etcd_path}") + print(f"Cleaning persisted etcd at {etcd_path}, keeping only TLS certs") - # Stale resources that conflict with fresh kind cluster creation - stale_prefixes = [ - "/registry/clusterrolebindings/kindnet", - "/registry/clusterroles/kindnet", - "/registry/controllerrevisions/kube-system/kindnet", - "/registry/daemonsets/kube-system/kindnet", - "/registry/pods/kube-system/kindnet", - "/registry/serviceaccounts/kube-system/kindnet", - # Also clear coredns as it can conflict - "/registry/clusterrolebindings/system:coredns", - "/registry/clusterroles/system:coredns", - "/registry/configmaps/kube-system/coredns", - "/registry/deployments/kube-system/coredns", - "/registry/serviceaccounts/kube-system/coredns", - "/registry/services/specs/kube-system/kube-dns", - ] - - # Build etcdctl delete commands - delete_cmds = " && ".join( - [f"etcdctl del --prefix '{prefix}'" for prefix in stale_prefixes] - ) - - # Use docker to run etcdutl and etcdctl etcd_image = "gcr.io/etcd-development/etcd:v3.5.9" temp_dir = "/tmp/laconic-etcd-cleanup" - # All operations done inside docker containers to handle root-owned etcd files + # Whitelist: prefixes to KEEP - everything else gets deleted + keep_prefixes = "/registry/secrets/caddy-system" + + # All operations in docker to handle root-owned etcd files cleanup_script = f""" set -e - - # Use alpine for file operations (has shell, rm, cp, etc.) 
ALPINE_IMAGE="alpine:3.19" - # Create temp dir using docker (handles permissions) + # Create temp dir docker run --rm -v /tmp:/tmp $ALPINE_IMAGE \ sh -c "rm -rf {temp_dir} && mkdir -p {temp_dir}" - # Copy db to temp location using docker + # Copy db to temp location docker run --rm \ -v {etcd_path}:/etcd:ro \ -v {temp_dir}:/tmp-work \ $ALPINE_IMAGE cp /etcd/member/snap/db /tmp-work/etcd-snapshot.db - # Restore snapshot to temp dir - docker run --rm \ - -v {temp_dir}:/work \ - {etcd_image} \ + # Restore snapshot + docker run --rm -v {temp_dir}:/work {etcd_image} \ etcdutl snapshot restore /work/etcd-snapshot.db \ - --data-dir=/work/etcd-data \ - --skip-hash-check 2>/dev/null + --data-dir=/work/etcd-data --skip-hash-check 2>/dev/null - # Start temp etcd, delete stale resources, stop + # Start temp etcd docker rm -f laconic-etcd-cleanup 2>/dev/null || true docker run -d --name laconic-etcd-cleanup \ -v {temp_dir}/etcd-data:/etcd-data \ + -v {temp_dir}:/backup \ {etcd_image} etcd \ --data-dir=/etcd-data \ --listen-client-urls=http://0.0.0.0:2379 \ @@ -197,30 +174,49 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: sleep 3 - # Delete stale resources - docker exec laconic-etcd-cleanup /bin/sh -c "{delete_cmds}" 2>/dev/null || true + # Export caddy secrets to backup file (the only thing we keep) + docker exec laconic-etcd-cleanup \ + etcdctl get --prefix "{keep_prefixes}" -w json > {temp_dir}/kept.json \ + 2>/dev/null || echo '{{}}' > {temp_dir}/kept.json - # Create new snapshot from cleaned etcd + # Delete ALL registry keys + docker exec laconic-etcd-cleanup etcdctl del --prefix /registry + + # Restore kept keys using etcdctl txn + docker exec laconic-etcd-cleanup sh -c ' + cat /backup/kept.json 2>/dev/null | \ + (python3 -c " +import sys, json, base64 +try: + data = json.load(sys.stdin) + for kv in data.get(\"kvs\", []): + k = base64.b64decode(kv[\"key\"]).decode() + v = base64.b64decode(kv[\"value\"]).decode(\"latin-1\") + print(k) + print(v) +except: pass +" 2>/dev/null || true) | while IFS= read -r key && IFS= read -r value; do + printf \"%s\" \"$value\" | etcdctl put \"$key\" + done + ' 2>/dev/null || true + + # Save cleaned snapshot docker exec laconic-etcd-cleanup \ etcdctl snapshot save /etcd-data/cleaned-snapshot.db - # Stop temp etcd docker stop laconic-etcd-cleanup docker rm laconic-etcd-cleanup - # Clear original etcd member dir using docker + # Replace original etcd docker run --rm -v {etcd_path}:/etcd $ALPINE_IMAGE rm -rf /etcd/member - - # Restore cleaned snapshot to original location docker run --rm \ -v {temp_dir}/etcd-data/cleaned-snapshot.db:/data/db:ro \ -v {etcd_path}:/restore \ {etcd_image} \ - etcdutl snapshot restore /data/db \ - --data-dir=/restore \ - --skip-hash-check 2>/dev/null + etcdutl snapshot restore /data/db --data-dir=/restore --skip-hash-check \ + 2>/dev/null - # Cleanup temp dir + # Cleanup docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} """ @@ -231,15 +227,15 @@ def _clear_stale_cni_from_etcd(etcd_path: str) -> bool: return False if opts.o.debug: - print("Cleared stale CNI resources from persisted etcd") + print("Cleaned etcd, kept only TLS certificates") return True def create_cluster(name: str, config_file: str): - # Clear stale CNI resources from persisted etcd if present + # Clean persisted etcd, keeping only TLS certificates etcd_path = _get_etcd_host_path_from_kind_config(config_file) if etcd_path: - _clear_stale_cni_from_etcd(etcd_path) + _clean_etcd_keeping_certs(etcd_path) result = _run_command(f"kind create cluster --name 
{name} --config {config_file}") if result.returncode != 0: From 720e01fc752a7f843f90fdd873893a6457ed833a Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 19:28:53 -0500 Subject: [PATCH 07/27] Preserve original etcd backup until restore is verified Move original to .bak, move new into place, then delete bak. If anything fails before the swap, original remains intact. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index fa5b4141..76a216cb 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -207,14 +207,19 @@ except: pass docker stop laconic-etcd-cleanup docker rm laconic-etcd-cleanup - # Replace original etcd - docker run --rm -v {etcd_path}:/etcd $ALPINE_IMAGE rm -rf /etcd/member + # Restore to temp location first to verify it works docker run --rm \ -v {temp_dir}/etcd-data/cleaned-snapshot.db:/data/db:ro \ - -v {etcd_path}:/restore \ + -v {temp_dir}:/restore \ {etcd_image} \ - etcdutl snapshot restore /data/db --data-dir=/restore --skip-hash-check \ - 2>/dev/null + etcdutl snapshot restore /data/db --data-dir=/restore/new-etcd \ + --skip-hash-check 2>/dev/null + + # Only after successful restore, swap directories + docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \ + sh -c "mv /etcd/member /etcd/member.bak && \ + mv /tmp-work/new-etcd/member /etcd/member && \ + rm -rf /etcd/member.bak" # Cleanup docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} From e2d3c44321cbac03e7c07e12f197e057b91f7266 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 19:30:13 -0500 Subject: [PATCH 08/27] Keep timestamped backup of etcd forever Create member.backup-YYYYMMDD-HHMMSS before cleaning. Each cluster recreation creates a new backup, preserving history. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 76a216cb..1d265edf 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -215,13 +215,16 @@ except: pass etcdutl snapshot restore /data/db --data-dir=/restore/new-etcd \ --skip-hash-check 2>/dev/null - # Only after successful restore, swap directories - docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \ - sh -c "mv /etcd/member /etcd/member.bak && \ - mv /tmp-work/new-etcd/member /etcd/member && \ - rm -rf /etcd/member.bak" + # Create timestamped backup of original (kept forever) + TIMESTAMP=$(date +%Y%m%d-%H%M%S) + docker run --rm -v {etcd_path}:/etcd $ALPINE_IMAGE \ + cp -a /etcd/member /etcd/member.backup-$TIMESTAMP - # Cleanup + # Replace original with cleaned version + docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \ + sh -c "rm -rf /etcd/member && mv /tmp-work/new-etcd/member /etcd/member" + + # Cleanup temp (but NOT the backup) docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} """ From 3fbd854b8c7b9153d1898e0b1ac62fc8bf31a188 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 19:31:45 -0500 Subject: [PATCH 09/27] Use docker for etcd existence check (root-owned dir) The etcd directory is root-owned, so shell test -f fails. 
Use docker with volume mount to check file existence. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 1d265edf..18be7832 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -126,8 +126,12 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: Returns True if cleanup succeeded, False if no action needed or failed. """ db_path = Path(etcd_path) / "member" / "snap" / "db" - # Check existence - etcd dir is often root-owned so use shell test - check_result = subprocess.run(f"test -f {db_path}", shell=True, capture_output=True) + # Check existence using docker since etcd dir is root-owned + check_cmd = ( + f"docker run --rm -v {etcd_path}:/etcd:ro alpine:3.19 " + "test -f /etcd/member/snap/db" + ) + check_result = subprocess.run(check_cmd, shell=True, capture_output=True) if check_result.returncode != 0: if opts.o.debug: print(f"No etcd snapshot at {db_path}, skipping cleanup") From 14258500bc84a4f9676da5b2c914fa071acaaf2e Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 22:18:19 -0500 Subject: [PATCH 10/27] Fix restart command for GitOps deployments - Remove init_operation() from restart - don't regenerate spec from commands.py defaults, use existing git-tracked spec.yml instead - Add docs/deployment_patterns.md documenting GitOps workflow - Add pre-commit rule to CLAUDE.md - Fix line length issues in helpers.py Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 1 + docs/deployment_patterns.md | 77 +++++++++++++++++++++ stack_orchestrator/deploy/deployment.py | 85 +++++++++--------------- stack_orchestrator/deploy/k8s/helpers.py | 74 +++++++++++++-------- stack_orchestrator/deploy/spec.py | 2 +- 5 files changed, 158 insertions(+), 81 deletions(-) create mode 100644 docs/deployment_patterns.md diff --git a/CLAUDE.md b/CLAUDE.md index f06b6abc..0626ac93 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -8,6 +8,7 @@ NEVER assume your hypotheses are true without evidence ALWAYS clearly state when something is a hypothesis ALWAYS use evidence from the systems your interacting with to support your claims and hypotheses +ALWAYS run `pre-commit run --all-files` before committing changes ## Key Principles diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md new file mode 100644 index 00000000..fb2e0063 --- /dev/null +++ b/docs/deployment_patterns.md @@ -0,0 +1,77 @@ +# Deployment Patterns + +## GitOps Pattern + +For production deployments, we recommend a GitOps approach where your deployment configuration is tracked in version control. + +### Overview + +- **spec.yml is your source of truth**: Maintain it in your operator repository +- **Don't regenerate on every restart**: Run `deploy init` once, then customize and commit +- **Use restart for updates**: The restart command respects your git-tracked spec.yml + +### Workflow + +1. **Initial setup**: Run `deploy init` once to generate a spec.yml template +2. **Customize and commit**: Edit spec.yml with your configuration (hostnames, resources, etc.) and commit to your operator repo +3. **Deploy from git**: Use the committed spec.yml for deployments +4. 
**Update via git**: Make changes in git, then restart to apply + +```bash +# Initial setup (run once) +laconic-so --stack my-stack deploy init --output spec.yml + +# Customize for your environment +vim spec.yml # Set hostname, resources, etc. + +# Commit to your operator repository +git add spec.yml +git commit -m "Add my-stack deployment configuration" +git push + +# On deployment server: deploy from git-tracked spec +laconic-so deploy create \ + --spec-file /path/to/operator-repo/spec.yml \ + --deployment-dir my-deployment + +laconic-so deployment --dir my-deployment start +``` + +### Updating Deployments + +When you need to update a deployment: + +```bash +# 1. Make changes in your operator repo +vim /path/to/operator-repo/spec.yml +git commit -am "Update configuration" +git push + +# 2. On deployment server: pull and restart +cd /path/to/operator-repo && git pull +laconic-so deployment --dir my-deployment restart +``` + +The `restart` command: +- Pulls latest code from the stack repository +- Uses your git-tracked spec.yml (does NOT regenerate from defaults) +- Syncs the deployment directory +- Restarts services + +### Anti-patterns + +**Don't do this:** +```bash +# BAD: Regenerating spec on every deployment +laconic-so --stack my-stack deploy init --output spec.yml +laconic-so deploy create --spec-file spec.yml ... +``` + +This overwrites your customizations with defaults from the stack's `commands.py`. + +**Do this instead:** +```bash +# GOOD: Use your git-tracked spec +git pull # Get latest spec.yml from your operator repo +laconic-so deployment --dir my-deployment restart +``` diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py index f60ea9a4..2500d0d5 100644 --- a/stack_orchestrator/deploy/deployment.py +++ b/stack_orchestrator/deploy/deployment.py @@ -17,7 +17,6 @@ import click from pathlib import Path import subprocess import sys -import tempfile import time from stack_orchestrator import constants from stack_orchestrator.deploy.images import push_images_operation @@ -248,13 +247,13 @@ def run_job(ctx, job_name, helm_release): ) @click.pass_context def restart(ctx, stack_path, config_file, force, expected_ip): - """Pull latest stack, regenerate spec, and restart deployment. + """Pull latest code and restart deployment using git-tracked spec. - This command: - 1. Pulls latest code from the stack git repository - 2. Regenerates spec.yml from the stack's commands.py + GitOps workflow: + 1. Operator maintains spec.yml in their git repository + 2. This command pulls latest code (including updated spec.yml) 3. If hostname changed, verifies DNS routes to this server - 4. Syncs the deployment directory (preserves cluster ID and data) + 4. Syncs deployment directory with the git-tracked spec 5. Stops and restarts the deployment Data volumes are always preserved. The cluster is never destroyed. @@ -264,19 +263,17 @@ def restart(ctx, stack_path, config_file, force, expected_ip): 2. stack-source field in deployment.yml (if stored) 3. Error if neither available - Note: After restart, Caddy will automatically provision TLS certificates - for any new hostnames. + Note: spec.yml should be maintained in git, not regenerated from + commands.py on each restart. Use 'deploy init' only for initial + spec generation, then customize and commit to your operator repo. 
""" from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec - from stack_orchestrator.deploy.deployment_create import ( - init_operation, - create_operation, - ) + from stack_orchestrator.deploy.deployment_create import create_operation from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe deployment_context: DeploymentContext = ctx.obj - # Get current spec info + # Get current spec info (before git pull) current_spec = deployment_context.spec current_http_proxy = current_spec.get_http_proxy() current_hostname = ( @@ -310,8 +307,8 @@ def restart(ctx, stack_path, config_file, force, expected_ip): print(f"Stack source: {stack_source}") print(f"Current hostname: {current_hostname}") - # Step 1: Git pull - print("\n[1/6] Pulling latest code from stack repository...") + # Step 1: Git pull (brings in updated spec.yml from operator's repo) + print("\n[1/4] Pulling latest code from stack repository...") git_result = subprocess.run( ["git", "pull"], cwd=stack_source, capture_output=True, text=True ) @@ -320,36 +317,23 @@ def restart(ctx, stack_path, config_file, force, expected_ip): sys.exit(1) print(f"Git pull: {git_result.stdout.strip()}") - # Step 2: Regenerate spec - print("\n[2/6] Regenerating spec from commands.py...") - with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp: - new_spec_path = tmp.name + # Use the spec.yml from the deployment directory (updated by git pull if tracked) + spec_file_path = deployment_context.deployment_dir / "spec.yml" + if not spec_file_path.exists(): + print(f"Error: spec.yml not found at {spec_file_path}") + print("Ensure spec.yml exists in the deployment directory.") + sys.exit(1) - # Build deploy context for init - deploy_ctx = make_deploy_context(ctx) - - init_operation( - deploy_command_context=deploy_ctx, - stack=str(stack_source), - deployer_type=current_spec.obj[constants.deploy_to_key], - config=None, - config_file=config_file, - kube_config=None, - image_registry=None, - output=new_spec_path, - map_ports_to_host=None, - ) - - # Parse new spec to get new hostname - new_spec_obj = get_parsed_deployment_spec(new_spec_path) + # Parse spec to check for hostname changes + new_spec_obj = get_parsed_deployment_spec(str(spec_file_path)) new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", []) new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None - print(f"New hostname: {new_hostname}") + print(f"Spec hostname: {new_hostname}") - # Step 3: DNS verification (only if hostname changed) + # Step 2: DNS verification (only if hostname changed) if new_hostname and new_hostname != current_hostname: - print(f"\n[3/6] Hostname changed: {current_hostname} -> {new_hostname}") + print(f"\n[2/4] Hostname changed: {current_hostname} -> {new_hostname}") if force: print("DNS verification skipped (--force)") else: @@ -360,25 +344,26 @@ def restart(ctx, stack_path, config_file, force, expected_ip): print("Use --force to skip this check.") sys.exit(1) else: - print("\n[3/6] Hostname unchanged, skipping DNS verification") + print("\n[2/4] Hostname unchanged, skipping DNS verification") - # Step 4: Sync deployment directory - print("\n[4/6] Syncing deployment directory...") + # Step 3: Sync deployment directory with spec + print("\n[3/4] Syncing deployment directory...") + deploy_ctx = make_deploy_context(ctx) create_operation( deployment_command_context=deploy_ctx, - spec_file=new_spec_path, + spec_file=str(spec_file_path), deployment_dir=str(deployment_context.deployment_dir), update=True, 
network_dir=None, initial_peers=None, ) - # Reload deployment context with new spec + # Reload deployment context with updated spec deployment_context.init(deployment_context.deployment_dir) ctx.obj = deployment_context - # Step 5: Stop deployment - print("\n[5/6] Stopping deployment...") + # Stop deployment + print("\n[4/4] Restarting deployment...") ctx.obj = make_deploy_context(ctx) down_operation( ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True @@ -387,17 +372,13 @@ def restart(ctx, stack_path, config_file, force, expected_ip): # Brief pause to ensure clean shutdown time.sleep(5) - # Step 6: Start deployment - print("\n[6/6] Starting deployment...") + # Start deployment up_operation( ctx, services_list=None, stay_attached=False, skip_cluster_management=True ) print("\n=== Restart Complete ===") - print("Deployment restarted with updated configuration.") + print("Deployment restarted with git-tracked configuration.") if new_hostname and new_hostname != current_hostname: print(f"\nNew hostname: {new_hostname}") print("Caddy will automatically provision TLS certificate.") - - # Cleanup temp file - Path(new_spec_path).unlink(missing_ok=True) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 18be7832..613c870a 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -123,6 +123,9 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: specific stale resources (blacklist), we keep only the valuable data (caddy TLS certs) and delete everything else (whitelist approach). + The etcd image is distroless (no shell), so we extract the statically-linked + etcdctl binary and run it from alpine which has shell support. + Returns True if cleanup succeeded, False if no action needed or failed. """ db_path = Path(etcd_path) / "member" / "snap" / "db" @@ -146,14 +149,26 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: # Whitelist: prefixes to KEEP - everything else gets deleted keep_prefixes = "/registry/secrets/caddy-system" - # All operations in docker to handle root-owned etcd files + # The etcd image is distroless (no shell). We extract the statically-linked + # etcdctl binary and run it from alpine which has shell + jq support. 
cleanup_script = f""" set -e ALPINE_IMAGE="alpine:3.19" + # Cleanup previous runs + docker rm -f laconic-etcd-cleanup 2>/dev/null || true + docker rm -f etcd-extract 2>/dev/null || true + docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} + # Create temp dir - docker run --rm -v /tmp:/tmp $ALPINE_IMAGE \ - sh -c "rm -rf {temp_dir} && mkdir -p {temp_dir}" + docker run --rm -v /tmp:/tmp $ALPINE_IMAGE mkdir -p {temp_dir} + + # Extract etcdctl binary (it's statically linked) + docker create --name etcd-extract {etcd_image} + docker cp etcd-extract:/usr/local/bin/etcdctl /tmp/etcdctl-bin + docker rm etcd-extract + docker run --rm -v /tmp/etcdctl-bin:/src:ro -v {temp_dir}:/dst $ALPINE_IMAGE \ + sh -c "cp /src /dst/etcdctl && chmod +x /dst/etcdctl" # Copy db to temp location docker run --rm \ @@ -166,8 +181,7 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: etcdutl snapshot restore /work/etcd-snapshot.db \ --data-dir=/work/etcd-data --skip-hash-check 2>/dev/null - # Start temp etcd - docker rm -f laconic-etcd-cleanup 2>/dev/null || true + # Start temp etcd (runs the etcd binary, no shell needed) docker run -d --name laconic-etcd-cleanup \ -v {temp_dir}/etcd-data:/etcd-data \ -v {temp_dir}:/backup \ @@ -178,31 +192,34 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: sleep 3 - # Export caddy secrets to backup file (the only thing we keep) - docker exec laconic-etcd-cleanup \ - etcdctl get --prefix "{keep_prefixes}" -w json > {temp_dir}/kept.json \ - 2>/dev/null || echo '{{}}' > {temp_dir}/kept.json + # Use alpine with extracted etcdctl to run commands (alpine has shell + jq) + # Export caddy secrets + docker run --rm \ + -v {temp_dir}:/backup \ + --network container:laconic-etcd-cleanup \ + $ALPINE_IMAGE sh -c \ + '/backup/etcdctl get --prefix "{keep_prefixes}" -w json \ + > /backup/kept.json 2>/dev/null || echo "{{}}" > /backup/kept.json' # Delete ALL registry keys - docker exec laconic-etcd-cleanup etcdctl del --prefix /registry + docker run --rm \ + -v {temp_dir}:/backup \ + --network container:laconic-etcd-cleanup \ + $ALPINE_IMAGE /backup/etcdctl del --prefix /registry - # Restore kept keys using etcdctl txn - docker exec laconic-etcd-cleanup sh -c ' - cat /backup/kept.json 2>/dev/null | \ - (python3 -c " -import sys, json, base64 -try: - data = json.load(sys.stdin) - for kv in data.get(\"kvs\", []): - k = base64.b64decode(kv[\"key\"]).decode() - v = base64.b64decode(kv[\"value\"]).decode(\"latin-1\") - print(k) - print(v) -except: pass -" 2>/dev/null || true) | while IFS= read -r key && IFS= read -r value; do - printf \"%s\" \"$value\" | etcdctl put \"$key\" - done - ' 2>/dev/null || true + # Restore kept keys using jq + docker run --rm \ + -v {temp_dir}:/backup \ + --network container:laconic-etcd-cleanup \ + $ALPINE_IMAGE sh -c ' + apk add --no-cache jq >/dev/null 2>&1 + jq -r ".kvs[] | @base64" /backup/kept.json 2>/dev/null | \ + while read encoded; do + key=$(echo $encoded | base64 -d | jq -r ".key" | base64 -d) + val=$(echo $encoded | base64 -d | jq -r ".value" | base64 -d) + echo "$val" | /backup/etcdctl put "$key" + done + ' || true # Save cleaned snapshot docker exec laconic-etcd-cleanup \ @@ -228,8 +245,9 @@ except: pass docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \ sh -c "rm -rf /etcd/member && mv /tmp-work/new-etcd/member /etcd/member" - # Cleanup temp (but NOT the backup) + # Cleanup temp files (but NOT the timestamped backup in etcd_path) docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} + rm -f 
/tmp/etcdctl-bin """ result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True) diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index db7783c9..a870ef60 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -180,7 +180,7 @@ class Spec: return self.obj.get(constants.deploy_to_key) def get_acme_email(self): - return self.obj.get(constants.acme_email_key, "") + return self.obj.get(constants.network_key, {}).get(constants.acme_email_key, "") def is_kubernetes_deployment(self): return self.get_deployment_type() in [ From 22d64f1e97ad8c413dc8b38f57b88fe731eeffb0 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 22:48:19 -0500 Subject: [PATCH 11/27] Add --spec-file option to restart and auto-detect GitOps spec - Add --spec-file option to specify spec location in repo - Auto-detect deployment/spec.yml in repo as GitOps location - Fall back to deployment dir if no repo spec found Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/deployment.py | 28 +++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py index 2500d0d5..54f8377a 100644 --- a/stack_orchestrator/deploy/deployment.py +++ b/stack_orchestrator/deploy/deployment.py @@ -234,6 +234,9 @@ def run_job(ctx, job_name, helm_release): @command.command() @click.option("--stack-path", help="Path to stack git repo (overrides stored path)") +@click.option( + "--spec-file", help="Path to GitOps spec.yml in repo (e.g., deployment/spec.yml)" +) @click.option("--config-file", help="Config file to pass to deploy init") @click.option( "--force", @@ -246,7 +249,7 @@ def run_job(ctx, job_name, helm_release): help="Expected IP for DNS verification (if different from egress)", ) @click.pass_context -def restart(ctx, stack_path, config_file, force, expected_ip): +def restart(ctx, stack_path, spec_file, config_file, force, expected_ip): """Pull latest code and restart deployment using git-tracked spec. 
GitOps workflow: @@ -317,13 +320,30 @@ def restart(ctx, stack_path, config_file, force, expected_ip): sys.exit(1) print(f"Git pull: {git_result.stdout.strip()}") - # Use the spec.yml from the deployment directory (updated by git pull if tracked) - spec_file_path = deployment_context.deployment_dir / "spec.yml" + # Determine spec file location + # Priority: --spec-file argument > repo's deployment/spec.yml > deployment dir + if spec_file: + # Spec file relative to repo root + repo_root = stack_source.parent.parent.parent # Go up from stack path + spec_file_path = repo_root / spec_file + else: + # Try standard GitOps location in repo + repo_root = stack_source.parent.parent.parent + gitops_spec = repo_root / "deployment" / "spec.yml" + if gitops_spec.exists(): + spec_file_path = gitops_spec + else: + # Fall back to deployment directory + spec_file_path = deployment_context.deployment_dir / "spec.yml" + if not spec_file_path.exists(): print(f"Error: spec.yml not found at {spec_file_path}") - print("Ensure spec.yml exists in the deployment directory.") + print("For GitOps, add spec.yml to your repo at deployment/spec.yml") + print("Or specify --spec-file with path relative to repo root") sys.exit(1) + print(f"Using spec: {spec_file_path}") + # Parse spec to check for hostname changes new_spec_obj = get_parsed_deployment_spec(str(spec_file_path)) new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", []) From 4408725b086301e8e65f7a2b61ac8e77ab9a9f3d Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 22:49:19 -0500 Subject: [PATCH 12/27] Fix repo root path calculation (4 parents from stack path) --- stack_orchestrator/deploy/deployment.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py index 54f8377a..b76e6486 100644 --- a/stack_orchestrator/deploy/deployment.py +++ b/stack_orchestrator/deploy/deployment.py @@ -322,13 +322,14 @@ def restart(ctx, stack_path, spec_file, config_file, force, expected_ip): # Determine spec file location # Priority: --spec-file argument > repo's deployment/spec.yml > deployment dir + # Stack path is like: repo/stack_orchestrator/data/stacks/stack-name + # So repo root is 4 parents up + repo_root = stack_source.parent.parent.parent.parent if spec_file: # Spec file relative to repo root - repo_root = stack_source.parent.parent.parent # Go up from stack path spec_file_path = repo_root / spec_file else: # Try standard GitOps location in repo - repo_root = stack_source.parent.parent.parent gitops_spec = repo_root / "deployment" / "spec.yml" if gitops_spec.exists(): spec_file_path = gitops_spec From 2d3721efa4808f70a436c6f6107e9224c6ff9ac5 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Feb 2026 00:05:05 -0500 Subject: [PATCH 13/27] Add cluster reuse for multi-stack k8s-kind deployments When deploying a second stack to k8s-kind, automatically reuse an existing kind cluster instead of trying to create a new one (which would fail due to port 80/443 conflicts). Changes: - helpers.py: create_cluster() now checks for existing cluster first - deploy_k8s.py: up() captures returned cluster name and updates self This enables deploying multiple stacks (e.g., gorbagana-rpc + trashscan-explorer) to the same kind cluster. 
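From the operator's point of view, the intended flow looks roughly like this (stack and directory names are illustrative, not taken from this patch):

```bash
# First stack: creates the kind cluster and binds host ports 80/443
laconic-so deployment --dir gorbagana-rpc-deployment start

# Second stack: create_cluster() finds the existing cluster and reuses it
# instead of attempting a second `kind create cluster`, which would fail
# on the port 80/443 conflict
laconic-so deployment --dir trashscan-explorer-deployment start
# prints: "Using existing cluster: <existing-cluster-name>"

# Only one kind cluster exists on the host
kind get clusters
```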
Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/deploy_k8s.py | 11 +++++++---- stack_orchestrator/deploy/k8s/helpers.py | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 7b88dd14..3f073407 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -289,11 +289,14 @@ class K8sDeployer(Deployer): self.skip_cluster_management = skip_cluster_management if not opts.o.dry_run: if self.is_kind() and not self.skip_cluster_management: - # Create the kind cluster - create_cluster( - self.kind_cluster_name, - str(self.deployment_dir.joinpath(constants.kind_config_filename)), + # Create the kind cluster (or reuse existing one) + kind_config = str( + self.deployment_dir.joinpath(constants.kind_config_filename) ) + actual_cluster = create_cluster(self.kind_cluster_name, kind_config) + if actual_cluster != self.kind_cluster_name: + # An existing cluster was found, use it instead + self.kind_cluster_name = actual_cluster # Ensure the referenced containers are copied into kind load_images_into_kind( self.kind_cluster_name, self.cluster_info.image_set diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 613c870a..5fc90832 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -262,14 +262,34 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: def create_cluster(name: str, config_file: str): + """Create a kind cluster, or reuse an existing one. + + Checks if any kind cluster already exists. If so, uses that cluster + instead of creating a new one. This allows multiple deployments to + share the same kind cluster. + + Args: + name: The desired cluster name (used only if creating new) + config_file: Path to kind config file (used only if creating new) + + Returns: + The name of the cluster being used (either existing or newly created) + """ + existing = get_kind_cluster() + if existing: + print(f"Using existing cluster: {existing}") + return existing + # Clean persisted etcd, keeping only TLS certificates etcd_path = _get_etcd_host_path_from_kind_config(config_file) if etcd_path: _clean_etcd_keeping_certs(etcd_path) + print(f"Creating new cluster: {name}") result = _run_command(f"kind create cluster --name {name} --config {config_file}") if result.returncode != 0: raise DeployerException(f"kind create cluster failed: {result}") + return name def destroy_cluster(name: str): From ca090d2cd540926aac86129b4067b89b77d8f79a Mon Sep 17 00:00:00 2001 From: "A. F. 
Dudley" Date: Tue, 3 Feb 2026 00:55:14 -0500 Subject: [PATCH 14/27] Add $generate:type:length$ token support for K8s secrets - Add GENERATE_TOKEN_PATTERN to detect $generate:hex:N$ and $generate:base64:N$ tokens - Add _generate_and_store_secrets() to create K8s Secrets from spec.yml config - Modify _write_config_file() to separate secrets from regular config - Add env_from with secretRef to container spec in cluster_info.py - Secrets are injected directly into containers via K8s native mechanism This enables declarative secret generation in spec.yml: config: SESSION_SECRET: $generate:hex:32$ DB_PASSWORD: $generate:hex:16$ Co-Authored-By: Claude Opus 4.5 --- .../deploy/deployment_create.py | 106 ++++++++++++++++-- stack_orchestrator/deploy/k8s/cluster_info.py | 11 ++ 2 files changed, 109 insertions(+), 8 deletions(-) diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py index ec15362f..7f5f5d73 100644 --- a/stack_orchestrator/deploy/deployment_create.py +++ b/stack_orchestrator/deploy/deployment_create.py @@ -16,6 +16,8 @@ import click from importlib import util import os +import re +import base64 from pathlib import Path from typing import List, Optional import random @@ -484,15 +486,99 @@ def init_operation( get_yaml().dump(spec_file_content, output_file) -def _write_config_file(spec_file: Path, config_env_file: Path): +# Token pattern: $generate:hex:32$ or $generate:base64:16$ +GENERATE_TOKEN_PATTERN = re.compile(r"\$generate:(\w+):(\d+)\$") + + +def _generate_and_store_secrets(config_vars: dict, deployment_name: str): + """Generate secrets for $generate:...$ tokens and store in K8s Secret. + + Called by `deploy create` - generates fresh secrets and stores them. + Returns the generated secrets dict for reference. 
+ """ + from kubernetes import client, config as k8s_config + + secrets = {} + for name, value in config_vars.items(): + if not isinstance(value, str): + continue + match = GENERATE_TOKEN_PATTERN.search(value) + if not match: + continue + + secret_type, length = match.group(1), int(match.group(2)) + if secret_type == "hex": + secrets[name] = token_hex(length) + elif secret_type == "base64": + secrets[name] = base64.b64encode(os.urandom(length)).decode() + else: + secrets[name] = token_hex(length) + + if not secrets: + return secrets + + # Store in K8s Secret + try: + k8s_config.load_kube_config() + except Exception: + # Fall back to in-cluster config if available + try: + k8s_config.load_incluster_config() + except Exception: + print( + "Warning: Could not load kube config, secrets will not be stored in K8s" + ) + return secrets + + v1 = client.CoreV1Api() + secret_name = f"{deployment_name}-generated-secrets" + namespace = "default" + + secret_data = {k: base64.b64encode(v.encode()).decode() for k, v in secrets.items()} + k8s_secret = client.V1Secret( + metadata=client.V1ObjectMeta(name=secret_name), data=secret_data, type="Opaque" + ) + + try: + v1.create_namespaced_secret(namespace, k8s_secret) + num_secrets = len(secrets) + print(f"Created K8s Secret '{secret_name}' with {num_secrets} secret(s)") + except client.exceptions.ApiException as e: + if e.status == 409: # Already exists + v1.replace_namespaced_secret(secret_name, namespace, k8s_secret) + num_secrets = len(secrets) + print(f"Updated K8s Secret '{secret_name}' with {num_secrets} secret(s)") + else: + raise + + return secrets + + +def _write_config_file( + spec_file: Path, config_env_file: Path, deployment_name: Optional[str] = None +): spec_content = get_parsed_deployment_spec(spec_file) - # Note: we want to write an empty file even if we have no config variables + config_vars = spec_content.get("config", {}) or {} + + # Generate and store secrets in K8s if deployment_name provided and tokens exist + if deployment_name and config_vars: + has_generate_tokens = any( + isinstance(v, str) and GENERATE_TOKEN_PATTERN.search(v) + for v in config_vars.values() + ) + if has_generate_tokens: + _generate_and_store_secrets(config_vars, deployment_name) + + # Write non-secret config to config.env (exclude $generate:...$ tokens) with open(config_env_file, "w") as output_file: - if "config" in spec_content and spec_content["config"]: - config_vars = spec_content["config"] - if config_vars: - for variable_name, variable_value in config_vars.items(): - output_file.write(f"{variable_name}={variable_value}\n") + if config_vars: + for variable_name, variable_value in config_vars.items(): + # Skip variables with generate tokens - they go to K8s Secret + if isinstance(variable_value, str) and GENERATE_TOKEN_PATTERN.search( + variable_value + ): + continue + output_file.write(f"{variable_name}={variable_value}\n") def _write_kube_config_file(external_path: Path, internal_path: Path): @@ -756,7 +842,11 @@ def _write_deployment_files( _create_deployment_file(target_dir, stack_source=stack_source) # Copy any config variables from the spec file into an env file suitable for compose - _write_config_file(spec_file, target_dir.joinpath(constants.config_file_name)) + # Use stack_name as deployment_name for K8s secret naming + deployment_name = stack_name.replace("_", "-") + _write_config_file( + spec_file, target_dir.joinpath(constants.config_file_name), deployment_name + ) # Copy any k8s config file into the target dir if deployment_type == "k8s": diff 
--git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index 42c41b4b..0d9ac2ed 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -453,6 +453,16 @@ class ClusterInfo: if "command" in service_info: cmd = service_info["command"] container_args = cmd if isinstance(cmd, list) else cmd.split() + # Add env_from to pull secrets from K8s Secret + secret_name = f"{self.app_name}-generated-secrets" + env_from = [ + client.V1EnvFromSource( + secret_ref=client.V1SecretEnvSource( + name=secret_name, + optional=True, # Don't fail if no secrets + ) + ) + ] container = client.V1Container( name=container_name, image=image_to_use, @@ -460,6 +470,7 @@ class ClusterInfo: command=container_command, args=container_args, env=envs, + env_from=env_from, ports=container_ports if container_ports else None, volume_mounts=volume_mounts, security_context=client.V1SecurityContext( From b057969ddda19b6745032db88f380228b1f1e07c Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Feb 2026 01:10:29 -0500 Subject: [PATCH 15/27] Clarify create_cluster docstring: one cluster per host by design Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 5fc90832..5ee7e062 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -262,18 +262,17 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: def create_cluster(name: str, config_file: str): - """Create a kind cluster, or reuse an existing one. + """Create or reuse the single kind cluster for this host. - Checks if any kind cluster already exists. If so, uses that cluster - instead of creating a new one. This allows multiple deployments to - share the same kind cluster. + There is only one kind cluster per host by design. Multiple deployments + share this cluster. If a cluster already exists, it is reused. Args: - name: The desired cluster name (used only if creating new) - config_file: Path to kind config file (used only if creating new) + name: Cluster name (used only when creating the first cluster) + config_file: Path to kind config file (used only when creating) Returns: - The name of the cluster being used (either existing or newly created) + The name of the cluster being used """ existing = get_kind_cluster() if existing: From d292e7c48d141f6f4a4dc3225af364b5c06b4bb5 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Feb 2026 01:14:25 -0500 Subject: [PATCH 16/27] Add k8s-kind architecture documentation to CLAUDE.md Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 0626ac93..50776741 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -83,6 +83,47 @@ laconic-so deployment --dir deployment start - `urbit-stack` - Fake Urbit ship for testing - `zenith-desk-stack` - Desk deployment stack +## Architecture: k8s-kind Deployments + +### One Cluster Per Host +There is **one Kind cluster per host by design**. Multiple deployments (stacks) share this cluster. Never request or expect separate clusters for different deployments on the same host. 
+ +- `create_cluster()` in `helpers.py` reuses any existing cluster +- Cluster name in deployment.yml is an identifier, not a cluster request +- All deployments share ingress controller, etcd, certificates + +### External Stacks +External stacks are detected by filesystem path existence (`Path(stack).exists()`). Required structure: + +``` +/ + stack_orchestrator/data/ + stacks//stack.yml + compose/docker-compose-.yml + deployment/spec.yml +``` + +Config/compose resolution: external path first, then internal fallback. + +### Deployment Lifecycle +- `deploy create`: Initializes deployment dir, generates cluster-id, processes spec.yml +- `deployment start`: Creates/reuses cluster, deploys K8s resources +- `deployment restart`: Git pulls stack repo, syncs spec, redeploys (preserves data) +- `deployment stop`: Removes K8s resources (cluster persists) + +### Secret Generation +`$generate:type:length$` tokens in spec.yml config section: +- Processed during `deploy create` +- Stored in K8s Secret `-generated-secrets` +- Injected via `envFrom` with `secretRef` +- Non-secret config goes to `config.env` + +### Key Files +- `spec.yml`: Deployment specification (in stack repo) +- `deployment.yml`: Cluster-id, stack-source path (generated) +- `config.env`: Non-secret environment variables (generated) +- `kind-config.yml`: Kind cluster configuration (generated) + ## Insights and Observations ### Design Principles From 1128c959692a11149cf60e43752390ce7bd33c49 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Feb 2026 01:17:10 -0500 Subject: [PATCH 17/27] Split documentation: README for users, CLAUDE.md for agents README.md: deployment types, external stacks, commands, spec.yml reference CLAUDE.md: implementation details, code locations, codebase navigation Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 50 ++++++++++++++++++-------------------------------- README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 32 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 50776741..dcf503b2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -86,43 +86,29 @@ laconic-so deployment --dir deployment start ## Architecture: k8s-kind Deployments ### One Cluster Per Host -There is **one Kind cluster per host by design**. Multiple deployments (stacks) share this cluster. Never request or expect separate clusters for different deployments on the same host. +One Kind cluster per host by design. Never request or expect separate clusters. - `create_cluster()` in `helpers.py` reuses any existing cluster -- Cluster name in deployment.yml is an identifier, not a cluster request -- All deployments share ingress controller, etcd, certificates +- `cluster-id` in deployment.yml is an identifier, not a cluster request +- All deployments share: ingress controller, etcd, certificates -### External Stacks -External stacks are detected by filesystem path existence (`Path(stack).exists()`). 
Required structure: +### Stack Resolution +- External stacks detected via `Path(stack).exists()` in `util.py` +- Config/compose resolution: external path first, then internal fallback +- External path structure: `stack_orchestrator/data/stacks//stack.yml` -``` -/ - stack_orchestrator/data/ - stacks//stack.yml - compose/docker-compose-.yml - deployment/spec.yml -``` +### Secret Generation Implementation +- `GENERATE_TOKEN_PATTERN` in `deployment_create.py` matches `$generate:type:length$` +- `_generate_and_store_secrets()` creates K8s Secret +- `cluster_info.py` adds `envFrom` with `secretRef` to containers +- Non-secret config written to `config.env` -Config/compose resolution: external path first, then internal fallback. - -### Deployment Lifecycle -- `deploy create`: Initializes deployment dir, generates cluster-id, processes spec.yml -- `deployment start`: Creates/reuses cluster, deploys K8s resources -- `deployment restart`: Git pulls stack repo, syncs spec, redeploys (preserves data) -- `deployment stop`: Removes K8s resources (cluster persists) - -### Secret Generation -`$generate:type:length$` tokens in spec.yml config section: -- Processed during `deploy create` -- Stored in K8s Secret `-generated-secrets` -- Injected via `envFrom` with `secretRef` -- Non-secret config goes to `config.env` - -### Key Files -- `spec.yml`: Deployment specification (in stack repo) -- `deployment.yml`: Cluster-id, stack-source path (generated) -- `config.env`: Non-secret environment variables (generated) -- `kind-config.yml`: Kind cluster configuration (generated) +### Key Files (for codebase navigation) +- `deployment_create.py`: `deploy create` command, secret generation +- `deployment.py`: `deployment start/stop/restart` commands +- `deploy_k8s.py`: K8s deployer, cluster management calls +- `helpers.py`: `create_cluster()`, etcd cleanup, kind operations +- `cluster_info.py`: K8s resource generation (Deployment, Service, Ingress) ## Insights and Observations diff --git a/README.md b/README.md index 375491bf..594ef9b9 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,59 @@ The various [stacks](/stack_orchestrator/data/stacks) each contain instructions - [laconicd with console and CLI](stack_orchestrator/data/stacks/fixturenet-laconic-loaded) - [kubo (IPFS)](stack_orchestrator/data/stacks/kubo) +## Deployment Types + +- **compose**: Docker Compose on local machine +- **k8s**: External Kubernetes cluster (requires kubeconfig) +- **k8s-kind**: Local Kubernetes via Kind - one cluster per host, shared by all deployments + +## External Stacks + +Stacks can live in external git repositories. Required structure: + +``` +/ + stack_orchestrator/data/ + stacks//stack.yml + compose/docker-compose-.yml + deployment/spec.yml +``` + +## Deployment Commands + +```bash +# Create deployment from spec +laconic-so --stack deploy create --spec-file --deployment-dir + +# Start (creates cluster on first run) +laconic-so deployment --dir start + +# GitOps restart (git pull + redeploy, preserves data) +laconic-so deployment --dir restart + +# Stop +laconic-so deployment --dir stop +``` + +## spec.yml Reference + +```yaml +stack: stack-name-or-path +deploy-to: k8s-kind +network: + http-proxy: + - host-name: app.example.com + routes: + - path: / + proxy-to: service-name:port + acme-email: admin@example.com +config: + ENV_VAR: value + SECRET_VAR: $generate:hex:32$ # Auto-generated, stored in K8s Secret +volumes: + volume-name: +``` + ## Contributing See the [CONTRIBUTING.md](/docs/CONTRIBUTING.md) for developer mode install. 
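Filled in with illustrative values (the stack path and directory names are assumptions, following the conventions used elsewhere in this series), the README's deployment commands read as:

```bash
# Create a deployment from an external stack and its spec
laconic-so --stack ~/cerc/my-stack/stack_orchestrator/data/stacks/my-stack \
  deploy create --spec-file spec.yml --deployment-dir my-deployment

# Start it (the first start on a host creates the shared kind cluster)
laconic-so deployment --dir my-deployment start

# GitOps restart: git pull + redeploy, data volumes preserved
laconic-so deployment --dir my-deployment restart

# Stop it
laconic-so deployment --dir my-deployment stop
```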
From a75138093bfdaa1b0d15505d07e41f0d09d586d7 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Feb 2026 01:24:47 -0500 Subject: [PATCH 18/27] Add setup-repositories to key files list Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index dcf503b2..845cbd22 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -103,7 +103,11 @@ One Kind cluster per host by design. Never request or expect separate clusters. - `cluster_info.py` adds `envFrom` with `secretRef` to containers - Non-secret config written to `config.env` +### Repository Cloning +`setup-repositories --git-ssh` clones repos defined in stack.yml's `repos:` field. Requires SSH agent. + ### Key Files (for codebase navigation) +- `repos/setup_repositories.py`: `setup-repositories` command (git clone) - `deployment_create.py`: `deploy create` command, secret generation - `deployment.py`: `deployment start/stop/restart` commands - `deploy_k8s.py`: K8s deployer, cluster management calls From 3bc7832d8cec77b1260fda070fab4add207a113c Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Feb 2026 01:40:37 -0500 Subject: [PATCH 19/27] Fix deployment name extraction from path When stack: field in spec.yml contains a path (e.g., stack_orchestrator/data/stacks/name), extract just the final name component for K8s secret naming. K8s resource names must be valid RFC 1123 subdomains and cannot contain slashes. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/deployment_create.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py index 7f5f5d73..bfa5b3d6 100644 --- a/stack_orchestrator/deploy/deployment_create.py +++ b/stack_orchestrator/deploy/deployment_create.py @@ -843,7 +843,8 @@ def _write_deployment_files( # Copy any config variables from the spec file into an env file suitable for compose # Use stack_name as deployment_name for K8s secret naming - deployment_name = stack_name.replace("_", "-") + # Extract just the name part if stack_name is a path ("path/to/stack" -> "stack") + deployment_name = Path(stack_name).name.replace("_", "-") _write_config_file( spec_file, target_dir.joinpath(constants.config_file_name), deployment_name ) From d82b3fb8814d2cb1fe2d6008aca5a703b5dde60c Mon Sep 17 00:00:00 2001 From: "A. F. 
Dudley" Date: Tue, 3 Feb 2026 11:32:22 -0500 Subject: [PATCH 20/27] Only load locally-built images into kind, auto-detect ingress - Check stack.yml containers: field to determine which images are local builds - Only load local images via kind load; let k8s pull registry images directly - Add is_ingress_running() to skip ingress installation if already running - Fixes deployment failures when public registry images aren't in local Docker Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/deploy_k8s.py | 28 +++++++++++++++------ stack_orchestrator/deploy/k8s/helpers.py | 23 +++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 3f073407..2d44fe23 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -29,6 +29,7 @@ from stack_orchestrator.deploy.k8s.helpers import ( from stack_orchestrator.deploy.k8s.helpers import ( install_ingress_for_kind, wait_for_ingress_in_kind, + is_ingress_running, ) from stack_orchestrator.deploy.k8s.helpers import ( pods_in_deployment, @@ -297,17 +298,30 @@ class K8sDeployer(Deployer): if actual_cluster != self.kind_cluster_name: # An existing cluster was found, use it instead self.kind_cluster_name = actual_cluster - # Ensure the referenced containers are copied into kind - load_images_into_kind( - self.kind_cluster_name, self.cluster_info.image_set + # Only load locally-built images into kind + # Registry images (docker.io, ghcr.io, etc.) will be pulled by k8s + local_containers = self.deployment_context.stack.obj.get( + "containers", [] ) + if local_containers: + # Filter image_set to only images matching local containers + local_images = { + img + for img in self.cluster_info.image_set + if any(c in img for c in local_containers) + } + if local_images: + load_images_into_kind(self.kind_cluster_name, local_images) + # Note: if no local containers defined, all images come from registries self.connect_api() if self.is_kind() and not self.skip_cluster_management: # Configure ingress controller (not installed by default in kind) - install_ingress_for_kind(self.cluster_info.spec.get_acme_email()) - # Wait for ingress to start - # (deployment provisioning will fail unless this is done) - wait_for_ingress_in_kind() + # Skip if already running + if not is_ingress_running(): + install_ingress_for_kind(self.cluster_info.spec.get_acme_email()) + # Wait for ingress to start + # (deployment provisioning will fail unless this is done) + wait_for_ingress_in_kind() # Create RuntimeClass if unlimited_memlock is enabled if self.cluster_info.spec.get_unlimited_memlock(): _create_runtime_class( diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 5ee7e062..e88501e9 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -14,6 +14,7 @@ # along with this program. If not, see . 
from kubernetes import client, utils, watch +from kubernetes.client.exceptions import ApiException import os from pathlib import Path import subprocess @@ -295,6 +296,28 @@ def destroy_cluster(name: str): _run_command(f"kind delete cluster --name {name}") +def is_ingress_running() -> bool: + """Check if the Caddy ingress controller is already running in the cluster.""" + try: + core_v1 = client.CoreV1Api() + pods = core_v1.list_namespaced_pod( + namespace="caddy-system", + label_selector=( + "app.kubernetes.io/name=caddy-ingress-controller," + "app.kubernetes.io/component=controller" + ), + ) + for pod in pods.items: + if pod.status and pod.status.container_statuses: + if pod.status.container_statuses[0].ready is True: + if opts.o.debug: + print("Caddy ingress controller already running") + return True + return False + except ApiException: + return False + + def wait_for_ingress_in_kind(): core_v1 = client.CoreV1Api() for i in range(20): From 73ba13aaa5ed6801cc2f32ef58907df0d121260e Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Feb 2026 12:25:33 -0500 Subject: [PATCH 21/27] Add private registry authentication support Add ability to configure private container registry credentials in spec.yml for deployments using images from registries like GHCR. - Add get_image_registry_config() to spec.py for parsing image-registry config - Add create_registry_secret() to create K8s docker-registry secrets - Update cluster_info.py to use dynamic {deployment}-registry secret names - Update deploy_k8s.py to create registry secret before deployment - Document feature in deployment_patterns.md The token-env pattern keeps credentials out of git - the spec references an environment variable name, and the actual token is passed at runtime. Co-Authored-By: Claude Opus 4.5 --- docs/deployment_patterns.md | 49 +++++++++++ .../deploy/deployment_create.py | 82 +++++++++++++++++++ stack_orchestrator/deploy/k8s/cluster_info.py | 7 +- stack_orchestrator/deploy/k8s/deploy_k8s.py | 5 ++ stack_orchestrator/deploy/spec.py | 8 ++ 5 files changed, 150 insertions(+), 1 deletion(-) diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md index fb2e0063..2ec82dca 100644 --- a/docs/deployment_patterns.md +++ b/docs/deployment_patterns.md @@ -75,3 +75,52 @@ This overwrites your customizations with defaults from the stack's `commands.py` git pull # Get latest spec.yml from your operator repo laconic-so deployment --dir my-deployment restart ``` + +## Private Registry Authentication + +For deployments using images from private container registries (e.g., GitHub Container Registry), configure authentication in your spec.yml: + +### Configuration + +Add an `image-registry` section to your spec.yml: + +```yaml +image-registry: + server: ghcr.io + username: your-org-or-username + token-env: REGISTRY_TOKEN +``` + +**Fields:** +- `server`: The registry hostname (e.g., `ghcr.io`, `docker.io`, `gcr.io`) +- `username`: Registry username (for GHCR, use your GitHub username or org name) +- `token-env`: Name of the environment variable containing your API token/PAT + +### Token Environment Variable + +The `token-env` pattern keeps credentials out of version control. Set the environment variable when running `deployment start`: + +```bash +export REGISTRY_TOKEN="your-personal-access-token" +laconic-so deployment --dir my-deployment start +``` + +For GHCR, create a Personal Access Token (PAT) with `read:packages` scope. 
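A quick way to confirm the wiring end to end (the deployment name here is illustrative; the secret name, namespace, and type follow the `create_registry_secret()` convention introduced in this series):

```bash
# The token is read from the environment when the deployment is started
export REGISTRY_TOKEN="<personal-access-token>"
laconic-so deployment --dir my-deployment start

# The generated image-pull secret can then be inspected
kubectl get secret my-deployment-registry -n default -o jsonpath='{.type}'
# expected output: kubernetes.io/dockerconfigjson
```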
+ +### Ansible Integration + +When using Ansible for deployments, pass the token from a credentials file: + +```yaml +- name: Start deployment + ansible.builtin.command: + cmd: laconic-so deployment --dir {{ deployment_dir }} start + environment: + REGISTRY_TOKEN: "{{ lookup('file', '~/.credentials/ghcr_token') }}" +``` + +### How It Works + +1. laconic-so reads the `image-registry` config from spec.yml +2. Creates a Kubernetes `docker-registry` secret named `{deployment}-registry` +3. The deployment's pods reference this secret for image pulls diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py index bfa5b3d6..870410f8 100644 --- a/stack_orchestrator/deploy/deployment_create.py +++ b/stack_orchestrator/deploy/deployment_create.py @@ -15,6 +15,7 @@ import click from importlib import util +import json import os import re import base64 @@ -554,6 +555,87 @@ def _generate_and_store_secrets(config_vars: dict, deployment_name: str): return secrets +def create_registry_secret(spec: Spec, deployment_name: str) -> Optional[str]: + """Create K8s docker-registry secret from spec + environment. + + Reads registry configuration from spec.yml and creates a Kubernetes + secret of type kubernetes.io/dockerconfigjson for image pulls. + + Args: + spec: The deployment spec containing image-registry config + deployment_name: Name of the deployment (used for secret naming) + + Returns: + The secret name if created, None if no registry config + """ + from kubernetes import client, config as k8s_config + + registry_config = spec.get_image_registry_config() + if not registry_config: + return None + + server = registry_config.get("server") + username = registry_config.get("username") + token_env = registry_config.get("token-env") + + if not all([server, username, token_env]): + return None + + # Type narrowing for pyright - we've validated these aren't None above + assert token_env is not None + token = os.environ.get(token_env) + if not token: + print( + f"Warning: Registry token env var '{token_env}' not set, " + "skipping registry secret" + ) + return None + + # Create dockerconfigjson format (Docker API uses "password" field for tokens) + auth = base64.b64encode(f"{username}:{token}".encode()).decode() + docker_config = { + "auths": {server: {"username": username, "password": token, "auth": auth}} + } + + # Secret name derived from deployment name + secret_name = f"{deployment_name}-registry" + + # Load kube config + try: + k8s_config.load_kube_config() + except Exception: + try: + k8s_config.load_incluster_config() + except Exception: + print("Warning: Could not load kube config, registry secret not created") + return None + + v1 = client.CoreV1Api() + namespace = "default" + + k8s_secret = client.V1Secret( + metadata=client.V1ObjectMeta(name=secret_name), + data={ + ".dockerconfigjson": base64.b64encode( + json.dumps(docker_config).encode() + ).decode() + }, + type="kubernetes.io/dockerconfigjson", + ) + + try: + v1.create_namespaced_secret(namespace, k8s_secret) + print(f"Created registry secret '{secret_name}' for {server}") + except client.exceptions.ApiException as e: + if e.status == 409: # Already exists + v1.replace_namespaced_secret(secret_name, namespace, k8s_secret) + print(f"Updated registry secret '{secret_name}' for {server}") + else: + raise + + return secret_name + + def _write_config_file( spec_file: Path, config_env_file: Path, deployment_name: Optional[str] = None ): diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py 
b/stack_orchestrator/deploy/k8s/cluster_info.py index 0d9ac2ed..22c3ccf4 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -487,7 +487,12 @@ class ClusterInfo: volumes = volumes_for_pod_files( self.parsed_pod_yaml_map, self.spec, self.app_name ) - image_pull_secrets = [client.V1LocalObjectReference(name="laconic-registry")] + registry_config = self.spec.get_image_registry_config() + if registry_config: + secret_name = f"{self.app_name}-registry" + image_pull_secrets = [client.V1LocalObjectReference(name=secret_name)] + else: + image_pull_secrets = [] annotations = None labels = {"app": self.app_name} diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 2d44fe23..326cb6ab 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -332,6 +332,11 @@ class K8sDeployer(Deployer): else: print("Dry run mode enabled, skipping k8s API connect") + # Create registry secret if configured + from stack_orchestrator.deploy.deployment_create import create_registry_secret + + create_registry_secret(self.cluster_info.spec, self.cluster_info.app_name) + self._create_volume_data() self._create_deployment() diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index a870ef60..07b220cd 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -98,6 +98,14 @@ class Spec: def get_image_registry(self): return self.obj.get(constants.image_registry_key) + def get_image_registry_config(self) -> typing.Optional[typing.Dict]: + """Returns registry auth config: {server, username, token-env}. + + Used for private container registries like GHCR. The token-env field + specifies an environment variable containing the API token/PAT. + """ + return self.obj.get("image-registry") + def get_volumes(self): return self.obj.get(constants.volumes_key, {}) From cb6fdb77a6b705bc16bd2731c5e9867f997fd8e0 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Feb 2026 12:30:35 -0500 Subject: [PATCH 22/27] Rename image-registry to registry-credentials to avoid collision The existing 'image-registry' key is used for pushing images to a remote registry (URL string). Rename the new auth config to 'registry-credentials' to avoid collision. Co-Authored-By: Claude Opus 4.5 --- docs/deployment_patterns.md | 6 +++--- stack_orchestrator/deploy/spec.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md index 2ec82dca..68484f13 100644 --- a/docs/deployment_patterns.md +++ b/docs/deployment_patterns.md @@ -82,10 +82,10 @@ For deployments using images from private container registries (e.g., GitHub Con ### Configuration -Add an `image-registry` section to your spec.yml: +Add a `registry-credentials` section to your spec.yml: ```yaml -image-registry: +registry-credentials: server: ghcr.io username: your-org-or-username token-env: REGISTRY_TOKEN @@ -121,6 +121,6 @@ When using Ansible for deployments, pass the token from a credentials file: ### How It Works -1. laconic-so reads the `image-registry` config from spec.yml +1. laconic-so reads the `registry-credentials` config from spec.yml 2. Creates a Kubernetes `docker-registry` secret named `{deployment}-registry` 3. 
The deployment's pods reference this secret for image pulls diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index 07b220cd..e5647b04 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -103,8 +103,11 @@ class Spec: Used for private container registries like GHCR. The token-env field specifies an environment variable containing the API token/PAT. + + Note: Uses 'registry-credentials' key to avoid collision with + 'image-registry' key which is for pushing images. """ - return self.obj.get("image-registry") + return self.obj.get("registry-credentials") def get_volumes(self): return self.obj.get(constants.volumes_key, {}) From 7cecf2caa600c534a54267076b801900653115f7 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Feb 2026 13:45:50 -0500 Subject: [PATCH 23/27] Fix Caddy ACME email race condition by templating YAML Previously, install_ingress_for_kind() applied the YAML (which starts the Caddy pod with email: ""), then patched the ConfigMap afterward. The pod had already read the empty email and Caddy doesn't hot-reload. Now template the email into the YAML before applying, so the pod starts with the correct email from the beginning. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index e88501e9..888e59ca 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -20,6 +20,7 @@ from pathlib import Path import subprocess import re from typing import Set, Mapping, List, Optional, cast +import yaml from stack_orchestrator.util import get_k8s_dir, error_exit from stack_orchestrator.opts import opts @@ -353,22 +354,19 @@ def install_ingress_for_kind(acme_email: str = ""): ) if opts.o.debug: print("Installing Caddy ingress controller in kind cluster") - utils.create_from_yaml(api_client, yaml_file=ingress_install) - # Patch ConfigMap with acme email if provided + # Template the YAML with email before applying + with open(ingress_install) as f: + yaml_content = f.read() + if acme_email: - core_v1 = client.CoreV1Api() - configmap = core_v1.read_namespaced_config_map( - name="caddy-ingress-controller-configmap", namespace="caddy-system" - ) - configmap.data["email"] = acme_email - core_v1.patch_namespaced_config_map( - name="caddy-ingress-controller-configmap", - namespace="caddy-system", - body=configmap, - ) + yaml_content = yaml_content.replace('email: ""', f'email: "{acme_email}"') if opts.o.debug: - print(f"Patched Caddy ConfigMap with email: {acme_email}") + print(f"Configured Caddy with ACME email: {acme_email}") + + # Apply templated YAML + yaml_objects = list(yaml.safe_load_all(yaml_content)) + utils.create_from_yaml(api_client, yaml_objects=yaml_objects) def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]): From 581ceaea948bdf0884efb741067e67935a5f9c6f Mon Sep 17 00:00:00 2001 From: "A. F. 
Dudley" Date: Tue, 3 Feb 2026 13:52:11 -0500 Subject: [PATCH 24/27] docs: Add cluster and volume management section Document that: - Volumes persist across cluster deletion by design - Only use --delete-volumes when explicitly requested - Multiple deployments share one kind cluster - Use --skip-cluster-management to stop single deployment Co-Authored-By: Claude Opus 4.5 --- docs/deployment_patterns.md | 39 +++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md index 68484f13..cbb8cdca 100644 --- a/docs/deployment_patterns.md +++ b/docs/deployment_patterns.md @@ -124,3 +124,42 @@ When using Ansible for deployments, pass the token from a credentials file: 1. laconic-so reads the `registry-credentials` config from spec.yml 2. Creates a Kubernetes `docker-registry` secret named `{deployment}-registry` 3. The deployment's pods reference this secret for image pulls + +## Cluster and Volume Management + +### Stopping Deployments + +The `deployment stop` command has two important flags: + +```bash +# Default: stops deployment, deletes cluster, PRESERVES volumes +laconic-so deployment --dir my-deployment stop + +# Explicitly delete volumes (USE WITH CAUTION) +laconic-so deployment --dir my-deployment stop --delete-volumes +``` + +### Volume Persistence + +Volumes persist across cluster deletion by design. This is important because: +- **Data survives cluster recreation**: Ledger data, databases, and other state are preserved +- **Faster recovery**: No need to re-sync or rebuild data after cluster issues +- **Safe cluster upgrades**: Delete and recreate cluster without data loss + +**Only use `--delete-volumes` when:** +- You explicitly want to start fresh with no data +- The user specifically requests volume deletion +- You're cleaning up a test/dev environment completely + +### Shared Cluster Architecture + +In kind deployments, multiple stacks share a single cluster: +- First `deployment start` creates the cluster +- Subsequent deployments reuse the existing cluster +- `deployment stop` on ANY deployment deletes the shared cluster +- Other deployments will fail until cluster is recreated + +To stop a single deployment without affecting the cluster: +```bash +laconic-so deployment --dir my-deployment stop --skip-cluster-management +``` From ee59918082683c00035a2e7061795cd0bbbf0c02 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 2 Feb 2026 23:26:13 -0500 Subject: [PATCH 25/27] Allow relative volume paths for k8s-kind deployments For k8s-kind, relative paths (e.g., ./data/rpc-config) are resolved to $DEPLOYMENT_DIR/path by _make_absolute_host_path() during kind config generation. This provides Docker Host persistence that survives cluster restarts. Previously, validation threw an exception before paths could be resolved, making it impossible to use relative paths for persistent storage. 
Changes: - deployment_create.py: Skip relative path check for k8s-kind - cluster_info.py: Allow relative paths to reach PV generation - docs/deployment_patterns.md: Document volume persistence patterns Co-Authored-By: Claude Opus 4.5 --- docs/deployment_patterns.md | 37 +++++++++++++++++++ .../deploy/deployment_create.py | 12 ++++-- stack_orchestrator/deploy/k8s/cluster_info.py | 14 ++++--- 3 files changed, 54 insertions(+), 9 deletions(-) diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md index cbb8cdca..fdb930d8 100644 --- a/docs/deployment_patterns.md +++ b/docs/deployment_patterns.md @@ -163,3 +163,40 @@ To stop a single deployment without affecting the cluster: ```bash laconic-so deployment --dir my-deployment stop --skip-cluster-management ``` + +## Volume Persistence in k8s-kind + +k8s-kind has 3 storage layers: + +- **Docker Host**: The physical server running Docker +- **Kind Node**: A Docker container simulating a k8s node +- **Pod Container**: Your workload + +For k8s-kind, volumes with paths are mounted from Docker Host → Kind Node → Pod via extraMounts. + +| spec.yml volume | Storage Location | Survives Pod Restart | Survives Cluster Restart | +|-----------------|------------------|---------------------|-------------------------| +| `vol:` (empty) | Kind Node PVC | ✅ | ❌ | +| `vol: ./data/x` | Docker Host | ✅ | ✅ | +| `vol: /abs/path`| Docker Host | ✅ | ✅ | + +**Recommendation**: Always use paths for data you want to keep. Relative paths +(e.g., `./data/rpc-config`) resolve to `$DEPLOYMENT_DIR/data/rpc-config` on the +Docker Host. + +### Example + +```yaml +# In spec.yml +volumes: + rpc-config: ./data/rpc-config # Persists to $DEPLOYMENT_DIR/data/rpc-config + chain-data: ./data/chain # Persists to $DEPLOYMENT_DIR/data/chain + temp-cache: # Empty = Kind Node PVC (lost on cluster delete) +``` + +### The Antipattern + +Empty-path volumes appear persistent because they survive pod restarts (data lives +in Kind Node container). However, this data is lost when the kind cluster is +recreated. This "false persistence" has caused data loss when operators assumed +their data was safe. diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py index 870410f8..511445be 100644 --- a/stack_orchestrator/deploy/deployment_create.py +++ b/stack_orchestrator/deploy/deployment_create.py @@ -690,10 +690,14 @@ def _check_volume_definitions(spec): for volume_name, volume_path in spec.get_volumes().items(): if volume_path: if not os.path.isabs(volume_path): - raise Exception( - f"Relative path {volume_path} for volume {volume_name} not " - f"supported for deployment type {spec.get_deployment_type()}" - ) + # For k8s-kind: allow relative paths, they'll be resolved + # by _make_absolute_host_path() during kind config generation + if not spec.is_kind_deployment(): + deploy_type = spec.get_deployment_type() + raise Exception( + f"Relative path {volume_path} for volume " + f"{volume_name} not supported for {deploy_type}" + ) @click.command() diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index 22c3ccf4..7c706125 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -352,11 +352,15 @@ class ClusterInfo: continue if not os.path.isabs(volume_path): - print( - f"WARNING: {volume_name}:{volume_path} is not absolute, " - "cannot bind volume." 
- ) - continue + # For k8s-kind, allow relative paths: + # - PV uses /mnt/{volume_name} (path inside kind node) + # - extraMounts resolve the relative path to Docker Host + if not self.spec.is_kind_deployment(): + print( + f"WARNING: {volume_name}:{volume_path} is not absolute, " + "cannot bind volume." + ) + continue if self.spec.is_kind_deployment(): host_path = client.V1HostPathVolumeSource( From 5bc6c978ac59bb36ff1db1dde7c00179f8f85b05 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sun, 25 Jan 2026 17:35:53 -0500 Subject: [PATCH 26/27] feat(k8s): support acme-email config for Caddy ingress Adds support for configuring ACME email for Let's Encrypt certificates in kind deployments. The email can be specified in the spec under network.acme-email and will be used to configure the Caddy ingress controller ConfigMap. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/deploy_k8s.py | 2 +- stack_orchestrator/deploy/k8s/helpers.py | 15 +++++++++++++++ stack_orchestrator/deploy/spec.py | 3 +++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 326cb6ab..8c0d4bd2 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -316,7 +316,7 @@ class K8sDeployer(Deployer): self.connect_api() if self.is_kind() and not self.skip_cluster_management: # Configure ingress controller (not installed by default in kind) - # Skip if already running + # Skip if already running (idempotent for shared cluster) if not is_ingress_running(): install_ingress_for_kind(self.cluster_info.spec.get_acme_email()) # Wait for ingress to start diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 888e59ca..6cdc930d 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -368,6 +368,21 @@ def install_ingress_for_kind(acme_email: str = ""): yaml_objects = list(yaml.safe_load_all(yaml_content)) utils.create_from_yaml(api_client, yaml_objects=yaml_objects) + # Patch ConfigMap with ACME email if provided + if acme_email: + if opts.o.debug: + print(f"Configuring ACME email: {acme_email}") + core_api = client.CoreV1Api() + configmap = core_api.read_namespaced_config_map( + name="caddy-ingress-controller-configmap", namespace="caddy-system" + ) + configmap.data["email"] = acme_email + core_api.patch_namespaced_config_map( + name="caddy-ingress-controller-configmap", + namespace="caddy-system", + body=configmap, + ) + def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]): for image in image_set: diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index e5647b04..060f67ea 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -128,6 +128,9 @@ class Spec: def get_http_proxy(self): return self.obj.get(constants.network_key, {}).get(constants.http_proxy_key, []) + def get_acme_email(self): + return self.obj.get(constants.network_key, {}).get("acme-email", "") + def get_annotations(self): return self.obj.get(constants.annotations_key, {}) From f70e87b848c0d2dc2130a3de42fb10c363763a1b Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sun, 25 Jan 2026 19:12:44 -0500 Subject: [PATCH 27/27] Add etcd + PKI extraMounts for offline data recovery Mount /var/lib/etcd and /etc/kubernetes/pki to host filesystem so cluster state is preserved for offline recovery. 
Each deployment gets its own backup directory keyed by deployment ID. Directory structure: data/cluster-backups/{deployment_id}/etcd/ data/cluster-backups/{deployment_id}/pki/ This enables extracting secrets from etcd backups using etcdctl with the preserved PKI certificates. Co-Authored-By: Claude Opus 4.5 --- stack_orchestrator/deploy/k8s/helpers.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 6cdc930d..ddba30bc 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -564,6 +564,25 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): volume_host_path_map = _get_host_paths_for_volumes(deployment_context) seen_host_path_mounts = set() # Track to avoid duplicate mounts + # Cluster state backup for offline data recovery (unique per deployment) + # etcd contains all k8s state; PKI certs needed to decrypt etcd offline + deployment_id = deployment_context.id + backup_subdir = f"cluster-backups/{deployment_id}" + + etcd_host_path = _make_absolute_host_path( + Path(f"./data/{backup_subdir}/etcd"), deployment_dir + ) + volume_definitions.append( + f" - hostPath: {etcd_host_path}\n" f" containerPath: /var/lib/etcd\n" + ) + + pki_host_path = _make_absolute_host_path( + Path(f"./data/{backup_subdir}/pki"), deployment_dir + ) + volume_definitions.append( + f" - hostPath: {pki_host_path}\n" f" containerPath: /etc/kubernetes/pki\n" + ) + # Note these paths are relative to the location of the pod files (at present) # So we need to fix up to make them correct and absolute because kind assumes # relative to the cwd.
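To make the recovery path concrete, here is one possible way to read keys back out of such a backup once the cluster is gone; the etcd image tag, deployment id, and container name are illustrative, and the throwaway-etcd approach simply mirrors what `_clean_etcd_keeping_certs()` already does earlier in this series:

```bash
# Work on a copy so the original backup stays untouched
# (may need sudo: the files copied out of the kind node are root-owned)
BACKUP=data/cluster-backups/<deployment-id>        # hypothetical deployment id
cp -r "$BACKUP/etcd" /tmp/etcd-recovery-data

# Serve the copied data dir with a throwaway etcd; --force-new-cluster resets
# membership so the copy starts as a standalone single-member instance
docker run -d --name etcd-recovery \
  -v /tmp/etcd-recovery-data:/etcd-data \
  gcr.io/etcd-development/etcd:v3.5.9 \
  etcd --data-dir /etcd-data --force-new-cluster \
       --listen-client-urls http://0.0.0.0:2379 \
       --advertise-client-urls http://0.0.0.0:2379

# List the secret keys held in the recovered state
docker exec etcd-recovery etcdctl get --prefix /registry/secrets/ --keys-only

docker rm -f etcd-recovery
```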