Fix restart command for GitOps deployments
- Remove init_operation() from restart: don't regenerate the spec from commands.py defaults; use the existing git-tracked spec.yml instead
- Add docs/deployment_patterns.md documenting the GitOps workflow
- Add pre-commit rule to CLAUDE.md
- Fix line length issues in helpers.py

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit 14258500bc (parent 3fbd854b8c)
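
The operator-facing flow this change enables (a sketch; the commands and the `my-deployment` / operator-repo paths are the placeholder examples used in the new doc added below):

```bash
# Update the git-tracked spec in the operator repo, then restart;
# no re-run of `deploy init` is needed, so local customizations survive.
cd /path/to/operator-repo && git pull
laconic-so deployment --dir my-deployment restart
```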

CLAUDE.md
@@ -8,6 +8,7 @@ NEVER assume your hypotheses are true without evidence
 ALWAYS clearly state when something is a hypothesis
 ALWAYS use evidence from the systems you're interacting with to support your claims and hypotheses
+ALWAYS run `pre-commit run --all-files` before committing changes
 
 ## Key Principles
 
docs/deployment_patterns.md (new file, 77 lines)
@@ -0,0 +1,77 @@
# Deployment Patterns

## GitOps Pattern

For production deployments, we recommend a GitOps approach where your deployment configuration is tracked in version control.

### Overview

- **spec.yml is your source of truth**: Maintain it in your operator repository
- **Don't regenerate on every restart**: Run `deploy init` once, then customize and commit
- **Use restart for updates**: The restart command respects your git-tracked spec.yml

### Workflow

1. **Initial setup**: Run `deploy init` once to generate a spec.yml template
2. **Customize and commit**: Edit spec.yml with your configuration (hostnames, resources, etc.) and commit to your operator repo
3. **Deploy from git**: Use the committed spec.yml for deployments
4. **Update via git**: Make changes in git, then restart to apply

```bash
# Initial setup (run once)
laconic-so --stack my-stack deploy init --output spec.yml

# Customize for your environment
vim spec.yml  # Set hostname, resources, etc.

# Commit to your operator repository
git add spec.yml
git commit -m "Add my-stack deployment configuration"
git push

# On deployment server: deploy from git-tracked spec
laconic-so deploy create \
  --spec-file /path/to/operator-repo/spec.yml \
  --deployment-dir my-deployment

laconic-so deployment --dir my-deployment start
```
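
After `deploy create`, the deployment directory is expected to contain its own `spec.yml` (the `restart` command reads it from there). A quick sanity check that it still matches the committed version (a sketch, reusing the placeholder paths from the example above):

```bash
# Prints nothing if the deployed spec matches the git-tracked one
diff /path/to/operator-repo/spec.yml my-deployment/spec.yml
```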

### Updating Deployments

When you need to update a deployment:

```bash
# 1. Make changes in your operator repo
vim /path/to/operator-repo/spec.yml
git commit -am "Update configuration"
git push

# 2. On deployment server: pull and restart
cd /path/to/operator-repo && git pull
laconic-so deployment --dir my-deployment restart
```
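
Before restarting, it can help to review exactly what changed in the committed spec (a sketch using plain git commands and the paths from above):

```bash
# Show the most recent change to the spec before applying it
cd /path/to/operator-repo
git log -1 --stat
git diff HEAD~1 -- spec.yml
```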

The `restart` command:

- Pulls latest code from the stack repository
- Uses your git-tracked spec.yml (does NOT regenerate from defaults)
- Syncs the deployment directory
- Restarts services

### Anti-patterns

**Don't do this:**
```bash
# BAD: Regenerating spec on every deployment
laconic-so --stack my-stack deploy init --output spec.yml
laconic-so deploy create --spec-file spec.yml ...
```

This overwrites your customizations with defaults from the stack's `commands.py`.
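
If you need to see what the current defaults would be (for example, after a stack upgrade), a non-destructive approach is to generate them to a scratch path and diff against your committed spec rather than overwriting it (a sketch; the `/tmp` path is arbitrary):

```bash
# Compare freshly generated defaults against the committed spec without touching it
laconic-so --stack my-stack deploy init --output /tmp/spec-defaults.yml
diff /tmp/spec-defaults.yml /path/to/operator-repo/spec.yml
```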

**Do this instead:**
```bash
# GOOD: Use your git-tracked spec
git pull  # Get latest spec.yml from your operator repo
laconic-so deployment --dir my-deployment restart
```
@@ -17,7 +17,6 @@ import click
 from pathlib import Path
 import subprocess
 import sys
-import tempfile
 import time
 from stack_orchestrator import constants
 from stack_orchestrator.deploy.images import push_images_operation
@@ -248,13 +247,13 @@ def run_job(ctx, job_name, helm_release):
 )
 @click.pass_context
 def restart(ctx, stack_path, config_file, force, expected_ip):
-    """Pull latest stack, regenerate spec, and restart deployment.
+    """Pull latest code and restart deployment using git-tracked spec.
 
-    This command:
-    1. Pulls latest code from the stack git repository
-    2. Regenerates spec.yml from the stack's commands.py
+    GitOps workflow:
+    1. Operator maintains spec.yml in their git repository
+    2. This command pulls latest code (including updated spec.yml)
     3. If hostname changed, verifies DNS routes to this server
-    4. Syncs the deployment directory (preserves cluster ID and data)
+    4. Syncs deployment directory with the git-tracked spec
     5. Stops and restarts the deployment
 
     Data volumes are always preserved. The cluster is never destroyed.
@@ -264,19 +263,17 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
     2. stack-source field in deployment.yml (if stored)
     3. Error if neither available
 
-    Note: After restart, Caddy will automatically provision TLS certificates
-    for any new hostnames.
+    Note: spec.yml should be maintained in git, not regenerated from
+    commands.py on each restart. Use 'deploy init' only for initial
+    spec generation, then customize and commit to your operator repo.
     """
     from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec
-    from stack_orchestrator.deploy.deployment_create import (
-        init_operation,
-        create_operation,
-    )
+    from stack_orchestrator.deploy.deployment_create import create_operation
     from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe
 
     deployment_context: DeploymentContext = ctx.obj
 
-    # Get current spec info
+    # Get current spec info (before git pull)
     current_spec = deployment_context.spec
     current_http_proxy = current_spec.get_http_proxy()
     current_hostname = (
@@ -310,8 +307,8 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
     print(f"Stack source: {stack_source}")
     print(f"Current hostname: {current_hostname}")
 
-    # Step 1: Git pull
-    print("\n[1/6] Pulling latest code from stack repository...")
+    # Step 1: Git pull (brings in updated spec.yml from operator's repo)
+    print("\n[1/4] Pulling latest code from stack repository...")
     git_result = subprocess.run(
         ["git", "pull"], cwd=stack_source, capture_output=True, text=True
     )
@@ -320,36 +317,23 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
         sys.exit(1)
     print(f"Git pull: {git_result.stdout.strip()}")
 
-    # Step 2: Regenerate spec
-    print("\n[2/6] Regenerating spec from commands.py...")
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp:
-        new_spec_path = tmp.name
+    # Use the spec.yml from the deployment directory (updated by git pull if tracked)
+    spec_file_path = deployment_context.deployment_dir / "spec.yml"
+    if not spec_file_path.exists():
+        print(f"Error: spec.yml not found at {spec_file_path}")
+        print("Ensure spec.yml exists in the deployment directory.")
+        sys.exit(1)
 
-    # Build deploy context for init
-    deploy_ctx = make_deploy_context(ctx)
-
-    init_operation(
-        deploy_command_context=deploy_ctx,
-        stack=str(stack_source),
-        deployer_type=current_spec.obj[constants.deploy_to_key],
-        config=None,
-        config_file=config_file,
-        kube_config=None,
-        image_registry=None,
-        output=new_spec_path,
-        map_ports_to_host=None,
-    )
-
-    # Parse new spec to get new hostname
-    new_spec_obj = get_parsed_deployment_spec(new_spec_path)
+    # Parse spec to check for hostname changes
+    new_spec_obj = get_parsed_deployment_spec(str(spec_file_path))
     new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", [])
     new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None
 
-    print(f"New hostname: {new_hostname}")
+    print(f"Spec hostname: {new_hostname}")
 
-    # Step 3: DNS verification (only if hostname changed)
+    # Step 2: DNS verification (only if hostname changed)
     if new_hostname and new_hostname != current_hostname:
-        print(f"\n[3/6] Hostname changed: {current_hostname} -> {new_hostname}")
+        print(f"\n[2/4] Hostname changed: {current_hostname} -> {new_hostname}")
         if force:
             print("DNS verification skipped (--force)")
         else:
@@ -360,25 +344,26 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
                 print("Use --force to skip this check.")
                 sys.exit(1)
     else:
-        print("\n[3/6] Hostname unchanged, skipping DNS verification")
+        print("\n[2/4] Hostname unchanged, skipping DNS verification")
 
-    # Step 4: Sync deployment directory
-    print("\n[4/6] Syncing deployment directory...")
+    # Step 3: Sync deployment directory with spec
+    print("\n[3/4] Syncing deployment directory...")
+    deploy_ctx = make_deploy_context(ctx)
     create_operation(
         deployment_command_context=deploy_ctx,
-        spec_file=new_spec_path,
+        spec_file=str(spec_file_path),
         deployment_dir=str(deployment_context.deployment_dir),
         update=True,
         network_dir=None,
         initial_peers=None,
     )
 
-    # Reload deployment context with new spec
+    # Reload deployment context with updated spec
     deployment_context.init(deployment_context.deployment_dir)
     ctx.obj = deployment_context
 
-    # Step 5: Stop deployment
-    print("\n[5/6] Stopping deployment...")
+    # Stop deployment
+    print("\n[4/4] Restarting deployment...")
     ctx.obj = make_deploy_context(ctx)
     down_operation(
         ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True
@@ -387,17 +372,13 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
     # Brief pause to ensure clean shutdown
     time.sleep(5)
 
-    # Step 6: Start deployment
-    print("\n[6/6] Starting deployment...")
+    # Start deployment
     up_operation(
         ctx, services_list=None, stay_attached=False, skip_cluster_management=True
     )
 
     print("\n=== Restart Complete ===")
-    print("Deployment restarted with updated configuration.")
+    print("Deployment restarted with git-tracked configuration.")
     if new_hostname and new_hostname != current_hostname:
         print(f"\nNew hostname: {new_hostname}")
         print("Caddy will automatically provision TLS certificate.")
-
-    # Cleanup temp file
-    Path(new_spec_path).unlink(missing_ok=True)
@@ -123,6 +123,9 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
     specific stale resources (blacklist), we keep only the valuable data
     (caddy TLS certs) and delete everything else (whitelist approach).
 
+    The etcd image is distroless (no shell), so we extract the statically-linked
+    etcdctl binary and run it from alpine which has shell support.
+
     Returns True if cleanup succeeded, False if no action needed or failed.
     """
     db_path = Path(etcd_path) / "member" / "snap" / "db"
@@ -146,14 +149,26 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
     # Whitelist: prefixes to KEEP - everything else gets deleted
     keep_prefixes = "/registry/secrets/caddy-system"
 
     # All operations in docker to handle root-owned etcd files
+    # The etcd image is distroless (no shell). We extract the statically-linked
+    # etcdctl binary and run it from alpine which has shell + jq support.
     cleanup_script = f"""
 set -e
 ALPINE_IMAGE="alpine:3.19"
 
+# Cleanup previous runs
+docker rm -f laconic-etcd-cleanup 2>/dev/null || true
+docker rm -f etcd-extract 2>/dev/null || true
+docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir}
+
 # Create temp dir
-docker run --rm -v /tmp:/tmp $ALPINE_IMAGE \
-    sh -c "rm -rf {temp_dir} && mkdir -p {temp_dir}"
+docker run --rm -v /tmp:/tmp $ALPINE_IMAGE mkdir -p {temp_dir}
+
+# Extract etcdctl binary (it's statically linked)
+docker create --name etcd-extract {etcd_image}
+docker cp etcd-extract:/usr/local/bin/etcdctl /tmp/etcdctl-bin
+docker rm etcd-extract
+docker run --rm -v /tmp/etcdctl-bin:/src:ro -v {temp_dir}:/dst $ALPINE_IMAGE \
+    sh -c "cp /src /dst/etcdctl && chmod +x /dst/etcdctl"
 
 # Copy db to temp location
 docker run --rm \
@@ -166,8 +181,7 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
 etcdutl snapshot restore /work/etcd-snapshot.db \
     --data-dir=/work/etcd-data --skip-hash-check 2>/dev/null
 
-# Start temp etcd
-docker rm -f laconic-etcd-cleanup 2>/dev/null || true
+# Start temp etcd (runs the etcd binary, no shell needed)
 docker run -d --name laconic-etcd-cleanup \
     -v {temp_dir}/etcd-data:/etcd-data \
     -v {temp_dir}:/backup \
@@ -178,31 +192,34 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
 
 sleep 3
 
-# Export caddy secrets to backup file (the only thing we keep)
-docker exec laconic-etcd-cleanup \
-    etcdctl get --prefix "{keep_prefixes}" -w json > {temp_dir}/kept.json \
-    2>/dev/null || echo '{{}}' > {temp_dir}/kept.json
+# Use alpine with extracted etcdctl to run commands (alpine has shell + jq)
+# Export caddy secrets
+docker run --rm \
+    -v {temp_dir}:/backup \
+    --network container:laconic-etcd-cleanup \
+    $ALPINE_IMAGE sh -c \
+    '/backup/etcdctl get --prefix "{keep_prefixes}" -w json \
+    > /backup/kept.json 2>/dev/null || echo "{{}}" > /backup/kept.json'
 
 # Delete ALL registry keys
-docker exec laconic-etcd-cleanup etcdctl del --prefix /registry
+docker run --rm \
+    -v {temp_dir}:/backup \
+    --network container:laconic-etcd-cleanup \
+    $ALPINE_IMAGE /backup/etcdctl del --prefix /registry
 
-# Restore kept keys using etcdctl txn
-docker exec laconic-etcd-cleanup sh -c '
-cat /backup/kept.json 2>/dev/null | \
-(python3 -c "
-import sys, json, base64
-try:
-    data = json.load(sys.stdin)
-    for kv in data.get(\"kvs\", []):
-        k = base64.b64decode(kv[\"key\"]).decode()
-        v = base64.b64decode(kv[\"value\"]).decode(\"latin-1\")
-        print(k)
-        print(v)
-except: pass
-" 2>/dev/null || true) | while IFS= read -r key && IFS= read -r value; do
-    printf \"%s\" \"$value\" | etcdctl put \"$key\"
-done
-' 2>/dev/null || true
+# Restore kept keys using jq
+docker run --rm \
+    -v {temp_dir}:/backup \
+    --network container:laconic-etcd-cleanup \
+    $ALPINE_IMAGE sh -c '
+    apk add --no-cache jq >/dev/null 2>&1
+    jq -r ".kvs[] | @base64" /backup/kept.json 2>/dev/null | \
+    while read encoded; do
+        key=$(echo $encoded | base64 -d | jq -r ".key" | base64 -d)
+        val=$(echo $encoded | base64 -d | jq -r ".value" | base64 -d)
+        echo "$val" | /backup/etcdctl put "$key"
+    done
+    ' || true
 
 # Save cleaned snapshot
 docker exec laconic-etcd-cleanup \
@@ -228,8 +245,9 @@ except: pass
 docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \
     sh -c "rm -rf /etcd/member && mv /tmp-work/new-etcd/member /etcd/member"
 
-# Cleanup temp (but NOT the backup)
+# Cleanup temp files (but NOT the timestamped backup in etcd_path)
 docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir}
+rm -f /tmp/etcdctl-bin
 """
 
 result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True)
@@ -180,7 +180,7 @@ class Spec:
         return self.obj.get(constants.deploy_to_key)
 
     def get_acme_email(self):
-        return self.obj.get(constants.acme_email_key, "")
+        return self.obj.get(constants.network_key, {}).get(constants.acme_email_key, "")
 
     def is_kubernetes_deployment(self):
         return self.get_deployment_type() in [