From 14258500bc84a4f9676da5b2c914fa071acaaf2e Mon Sep 17 00:00:00 2001
From: "A. F. Dudley"
Date: Mon, 2 Feb 2026 22:18:19 -0500
Subject: [PATCH] Fix restart command for GitOps deployments

- Remove init_operation() from restart: don't regenerate the spec from
  commands.py defaults; use the existing git-tracked spec.yml instead
- Add docs/deployment_patterns.md documenting GitOps workflow
- Add pre-commit rule to CLAUDE.md
- Fix line length issues in helpers.py

Co-Authored-By: Claude Opus 4.5
---
 CLAUDE.md                                |  1 +
 docs/deployment_patterns.md              | 77 +++++++++++++++++++++
 stack_orchestrator/deploy/deployment.py  | 85 +++++++++---------------
 stack_orchestrator/deploy/k8s/helpers.py | 74 +++++++++++++--------
 stack_orchestrator/deploy/spec.py        |  2 +-
 5 files changed, 158 insertions(+), 81 deletions(-)
 create mode 100644 docs/deployment_patterns.md

diff --git a/CLAUDE.md b/CLAUDE.md
index f06b6abc..0626ac93 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -8,6 +8,7 @@
 NEVER assume your hypotheses are true without evidence
 ALWAYS clearly state when something is a hypothesis
 ALWAYS use evidence from the systems you're interacting with to support your claims and hypotheses
+ALWAYS run `pre-commit run --all-files` before committing changes
 
 ## Key Principles
 
diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md
new file mode 100644
index 00000000..fb2e0063
--- /dev/null
+++ b/docs/deployment_patterns.md
@@ -0,0 +1,77 @@
+# Deployment Patterns
+
+## GitOps Pattern
+
+For production deployments, we recommend a GitOps approach where your deployment configuration is tracked in version control.
+
+### Overview
+
+- **spec.yml is your source of truth**: Maintain it in your operator repository
+- **Don't regenerate on every restart**: Run `deploy init` once, then customize and commit
+- **Use restart for updates**: The restart command respects your git-tracked spec.yml
+
+### Workflow
+
+1. **Initial setup**: Run `deploy init` once to generate a spec.yml template
+2. **Customize and commit**: Edit spec.yml with your configuration (hostnames, resources, etc.) and commit to your operator repo
+3. **Deploy from git**: Use the committed spec.yml for deployments
+4. **Update via git**: Make changes in git, then restart to apply
+
+```bash
+# Initial setup (run once)
+laconic-so --stack my-stack deploy init --output spec.yml
+
+# Customize for your environment
+vim spec.yml # Set hostname, resources, etc.
+
+# Commit to your operator repository
+git add spec.yml
+git commit -m "Add my-stack deployment configuration"
+git push
+
+# On deployment server: deploy from git-tracked spec
+laconic-so deploy create \
+  --spec-file /path/to/operator-repo/spec.yml \
+  --deployment-dir my-deployment
+
+laconic-so deployment --dir my-deployment start
+```
+
+### Updating Deployments
+
+When you need to update a deployment:
+
+```bash
+# 1. Make changes in your operator repo
+vim /path/to/operator-repo/spec.yml
+git commit -am "Update configuration"
+git push
+
+# 2. On deployment server: pull and restart
+cd /path/to/operator-repo && git pull
+laconic-so deployment --dir my-deployment restart
+```
+
+The `restart` command:
+- Pulls latest code from the stack repository
+- Uses your git-tracked spec.yml (does NOT regenerate from defaults)
+- Syncs the deployment directory
+- Restarts services
+
+### Anti-patterns
+
+**Don't do this:**
+```bash
+# BAD: Regenerating spec on every deployment
+laconic-so --stack my-stack deploy init --output spec.yml
+laconic-so deploy create --spec-file spec.yml ...
+``` + +This overwrites your customizations with defaults from the stack's `commands.py`. + +**Do this instead:** +```bash +# GOOD: Use your git-tracked spec +git pull # Get latest spec.yml from your operator repo +laconic-so deployment --dir my-deployment restart +``` diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py index f60ea9a4..2500d0d5 100644 --- a/stack_orchestrator/deploy/deployment.py +++ b/stack_orchestrator/deploy/deployment.py @@ -17,7 +17,6 @@ import click from pathlib import Path import subprocess import sys -import tempfile import time from stack_orchestrator import constants from stack_orchestrator.deploy.images import push_images_operation @@ -248,13 +247,13 @@ def run_job(ctx, job_name, helm_release): ) @click.pass_context def restart(ctx, stack_path, config_file, force, expected_ip): - """Pull latest stack, regenerate spec, and restart deployment. + """Pull latest code and restart deployment using git-tracked spec. - This command: - 1. Pulls latest code from the stack git repository - 2. Regenerates spec.yml from the stack's commands.py + GitOps workflow: + 1. Operator maintains spec.yml in their git repository + 2. This command pulls latest code (including updated spec.yml) 3. If hostname changed, verifies DNS routes to this server - 4. Syncs the deployment directory (preserves cluster ID and data) + 4. Syncs deployment directory with the git-tracked spec 5. Stops and restarts the deployment Data volumes are always preserved. The cluster is never destroyed. @@ -264,19 +263,17 @@ def restart(ctx, stack_path, config_file, force, expected_ip): 2. stack-source field in deployment.yml (if stored) 3. Error if neither available - Note: After restart, Caddy will automatically provision TLS certificates - for any new hostnames. + Note: spec.yml should be maintained in git, not regenerated from + commands.py on each restart. Use 'deploy init' only for initial + spec generation, then customize and commit to your operator repo. 
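+
+    Typical invocation (the deployment directory name is illustrative):
+
+        laconic-so deployment --dir my-deployment restart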
""" from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec - from stack_orchestrator.deploy.deployment_create import ( - init_operation, - create_operation, - ) + from stack_orchestrator.deploy.deployment_create import create_operation from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe deployment_context: DeploymentContext = ctx.obj - # Get current spec info + # Get current spec info (before git pull) current_spec = deployment_context.spec current_http_proxy = current_spec.get_http_proxy() current_hostname = ( @@ -310,8 +307,8 @@ def restart(ctx, stack_path, config_file, force, expected_ip): print(f"Stack source: {stack_source}") print(f"Current hostname: {current_hostname}") - # Step 1: Git pull - print("\n[1/6] Pulling latest code from stack repository...") + # Step 1: Git pull (brings in updated spec.yml from operator's repo) + print("\n[1/4] Pulling latest code from stack repository...") git_result = subprocess.run( ["git", "pull"], cwd=stack_source, capture_output=True, text=True ) @@ -320,36 +317,23 @@ def restart(ctx, stack_path, config_file, force, expected_ip): sys.exit(1) print(f"Git pull: {git_result.stdout.strip()}") - # Step 2: Regenerate spec - print("\n[2/6] Regenerating spec from commands.py...") - with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp: - new_spec_path = tmp.name + # Use the spec.yml from the deployment directory (updated by git pull if tracked) + spec_file_path = deployment_context.deployment_dir / "spec.yml" + if not spec_file_path.exists(): + print(f"Error: spec.yml not found at {spec_file_path}") + print("Ensure spec.yml exists in the deployment directory.") + sys.exit(1) - # Build deploy context for init - deploy_ctx = make_deploy_context(ctx) - - init_operation( - deploy_command_context=deploy_ctx, - stack=str(stack_source), - deployer_type=current_spec.obj[constants.deploy_to_key], - config=None, - config_file=config_file, - kube_config=None, - image_registry=None, - output=new_spec_path, - map_ports_to_host=None, - ) - - # Parse new spec to get new hostname - new_spec_obj = get_parsed_deployment_spec(new_spec_path) + # Parse spec to check for hostname changes + new_spec_obj = get_parsed_deployment_spec(str(spec_file_path)) new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", []) new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None - print(f"New hostname: {new_hostname}") + print(f"Spec hostname: {new_hostname}") - # Step 3: DNS verification (only if hostname changed) + # Step 2: DNS verification (only if hostname changed) if new_hostname and new_hostname != current_hostname: - print(f"\n[3/6] Hostname changed: {current_hostname} -> {new_hostname}") + print(f"\n[2/4] Hostname changed: {current_hostname} -> {new_hostname}") if force: print("DNS verification skipped (--force)") else: @@ -360,25 +344,26 @@ def restart(ctx, stack_path, config_file, force, expected_ip): print("Use --force to skip this check.") sys.exit(1) else: - print("\n[3/6] Hostname unchanged, skipping DNS verification") + print("\n[2/4] Hostname unchanged, skipping DNS verification") - # Step 4: Sync deployment directory - print("\n[4/6] Syncing deployment directory...") + # Step 3: Sync deployment directory with spec + print("\n[3/4] Syncing deployment directory...") + deploy_ctx = make_deploy_context(ctx) create_operation( deployment_command_context=deploy_ctx, - spec_file=new_spec_path, + spec_file=str(spec_file_path), deployment_dir=str(deployment_context.deployment_dir), update=True, 
network_dir=None, initial_peers=None, ) - # Reload deployment context with new spec + # Reload deployment context with updated spec deployment_context.init(deployment_context.deployment_dir) ctx.obj = deployment_context - # Step 5: Stop deployment - print("\n[5/6] Stopping deployment...") + # Stop deployment + print("\n[4/4] Restarting deployment...") ctx.obj = make_deploy_context(ctx) down_operation( ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True @@ -387,17 +372,13 @@ def restart(ctx, stack_path, config_file, force, expected_ip): # Brief pause to ensure clean shutdown time.sleep(5) - # Step 6: Start deployment - print("\n[6/6] Starting deployment...") + # Start deployment up_operation( ctx, services_list=None, stay_attached=False, skip_cluster_management=True ) print("\n=== Restart Complete ===") - print("Deployment restarted with updated configuration.") + print("Deployment restarted with git-tracked configuration.") if new_hostname and new_hostname != current_hostname: print(f"\nNew hostname: {new_hostname}") print("Caddy will automatically provision TLS certificate.") - - # Cleanup temp file - Path(new_spec_path).unlink(missing_ok=True) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 18be7832..613c870a 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -123,6 +123,9 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: specific stale resources (blacklist), we keep only the valuable data (caddy TLS certs) and delete everything else (whitelist approach). + The etcd image is distroless (no shell), so we extract the statically-linked + etcdctl binary and run it from alpine which has shell support. + Returns True if cleanup succeeded, False if no action needed or failed. """ db_path = Path(etcd_path) / "member" / "snap" / "db" @@ -146,14 +149,26 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: # Whitelist: prefixes to KEEP - everything else gets deleted keep_prefixes = "/registry/secrets/caddy-system" - # All operations in docker to handle root-owned etcd files + # The etcd image is distroless (no shell). We extract the statically-linked + # etcdctl binary and run it from alpine which has shell + jq support. 
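+    # Note: the live data under etcd_path is only replaced by the final swap
+    # step in the script below; every earlier step operates on copies under a
+    # temp dir, and a timestamped backup is kept in etcd_path, so a failure
+    # partway through leaves the original member/ directory intact.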
cleanup_script = f""" set -e ALPINE_IMAGE="alpine:3.19" + # Cleanup previous runs + docker rm -f laconic-etcd-cleanup 2>/dev/null || true + docker rm -f etcd-extract 2>/dev/null || true + docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} + # Create temp dir - docker run --rm -v /tmp:/tmp $ALPINE_IMAGE \ - sh -c "rm -rf {temp_dir} && mkdir -p {temp_dir}" + docker run --rm -v /tmp:/tmp $ALPINE_IMAGE mkdir -p {temp_dir} + + # Extract etcdctl binary (it's statically linked) + docker create --name etcd-extract {etcd_image} + docker cp etcd-extract:/usr/local/bin/etcdctl /tmp/etcdctl-bin + docker rm etcd-extract + docker run --rm -v /tmp/etcdctl-bin:/src:ro -v {temp_dir}:/dst $ALPINE_IMAGE \ + sh -c "cp /src /dst/etcdctl && chmod +x /dst/etcdctl" # Copy db to temp location docker run --rm \ @@ -166,8 +181,7 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: etcdutl snapshot restore /work/etcd-snapshot.db \ --data-dir=/work/etcd-data --skip-hash-check 2>/dev/null - # Start temp etcd - docker rm -f laconic-etcd-cleanup 2>/dev/null || true + # Start temp etcd (runs the etcd binary, no shell needed) docker run -d --name laconic-etcd-cleanup \ -v {temp_dir}/etcd-data:/etcd-data \ -v {temp_dir}:/backup \ @@ -178,31 +192,34 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: sleep 3 - # Export caddy secrets to backup file (the only thing we keep) - docker exec laconic-etcd-cleanup \ - etcdctl get --prefix "{keep_prefixes}" -w json > {temp_dir}/kept.json \ - 2>/dev/null || echo '{{}}' > {temp_dir}/kept.json + # Use alpine with extracted etcdctl to run commands (alpine has shell + jq) + # Export caddy secrets + docker run --rm \ + -v {temp_dir}:/backup \ + --network container:laconic-etcd-cleanup \ + $ALPINE_IMAGE sh -c \ + '/backup/etcdctl get --prefix "{keep_prefixes}" -w json \ + > /backup/kept.json 2>/dev/null || echo "{{}}" > /backup/kept.json' # Delete ALL registry keys - docker exec laconic-etcd-cleanup etcdctl del --prefix /registry + docker run --rm \ + -v {temp_dir}:/backup \ + --network container:laconic-etcd-cleanup \ + $ALPINE_IMAGE /backup/etcdctl del --prefix /registry - # Restore kept keys using etcdctl txn - docker exec laconic-etcd-cleanup sh -c ' - cat /backup/kept.json 2>/dev/null | \ - (python3 -c " -import sys, json, base64 -try: - data = json.load(sys.stdin) - for kv in data.get(\"kvs\", []): - k = base64.b64decode(kv[\"key\"]).decode() - v = base64.b64decode(kv[\"value\"]).decode(\"latin-1\") - print(k) - print(v) -except: pass -" 2>/dev/null || true) | while IFS= read -r key && IFS= read -r value; do - printf \"%s\" \"$value\" | etcdctl put \"$key\" - done - ' 2>/dev/null || true + # Restore kept keys using jq + docker run --rm \ + -v {temp_dir}:/backup \ + --network container:laconic-etcd-cleanup \ + $ALPINE_IMAGE sh -c ' + apk add --no-cache jq >/dev/null 2>&1 + jq -r ".kvs[] | @base64" /backup/kept.json 2>/dev/null | \ + while read encoded; do + key=$(echo $encoded | base64 -d | jq -r ".key" | base64 -d) + val=$(echo $encoded | base64 -d | jq -r ".value" | base64 -d) + echo "$val" | /backup/etcdctl put "$key" + done + ' || true # Save cleaned snapshot docker exec laconic-etcd-cleanup \ @@ -228,8 +245,9 @@ except: pass docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \ sh -c "rm -rf /etcd/member && mv /tmp-work/new-etcd/member /etcd/member" - # Cleanup temp (but NOT the backup) + # Cleanup temp files (but NOT the timestamped backup in etcd_path) docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} + rm -f 
/tmp/etcdctl-bin """ result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True) diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index db7783c9..a870ef60 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -180,7 +180,7 @@ class Spec: return self.obj.get(constants.deploy_to_key) def get_acme_email(self): - return self.obj.get(constants.acme_email_key, "") + return self.obj.get(constants.network_key, {}).get(constants.acme_email_key, "") def is_kubernetes_deployment(self): return self.get_deployment_type() in [
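
Note on the spec.py change: get_acme_email() now reads the ACME email from
the network section of spec.yml rather than the top level, matching where
http-proxy already lives. A minimal sketch of the expected layout (the
"acme-email" key string is assumed from the constants.acme_email_key name;
hostname and email values are illustrative):

    network:
      http-proxy:
        - host-name: my-stack.example.com
      acme-email: ops@example.com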