feat(deploy): add deployment restart command

Add `laconic-so deployment restart` command that:
- Pulls latest code from stack git repository
- Regenerates spec.yml from stack's commands.py
- Verifies DNS if hostname changed (with --force to skip)
- Syncs deployment directory preserving cluster ID and data
- Stops and restarts deployment with --skip-cluster-management

Also stores stack-source path in deployment.yml during create
for automatic stack location on restart.
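
For reference, the stored field is what lets restart locate the stack without
arguments; the lookup amounts to (path hypothetical, using the project's
get_yaml helper as in the restart code below):

    from pathlib import Path
    from stack_orchestrator.util import get_yaml

    deployment_data = get_yaml().load(open("/srv/deployments/my-app/deployment.yml"))
    stack_source = Path(deployment_data["stack-source"])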

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
A. F. Dudley 2026-01-31 19:05:27 -05:00
parent 4713107546
commit c197406cc7
3 changed files with 358 additions and 13 deletions


@@ -15,7 +15,10 @@
import click
from pathlib import Path
import subprocess
import sys
import tempfile
import time
from stack_orchestrator import constants
from stack_orchestrator.deploy.images import push_images_operation
from stack_orchestrator.deploy.deploy import (
@@ -228,3 +231,173 @@ def run_job(ctx, job_name, helm_release):
    ctx.obj = make_deploy_context(ctx)
    run_job_operation(ctx, job_name, helm_release)


@command.command()
@click.option("--stack-path", help="Path to stack git repo (overrides stored path)")
@click.option("--config-file", help="Config file to pass to deploy init")
@click.option(
    "--force",
    is_flag=True,
    default=False,
    help="Skip DNS verification",
)
@click.option(
    "--expected-ip",
    help="Expected IP for DNS verification (if different from egress)",
)
@click.pass_context
def restart(ctx, stack_path, config_file, force, expected_ip):
    """Pull latest stack, regenerate spec, and restart deployment.

    This command:
    1. Pulls latest code from the stack git repository
    2. Regenerates spec.yml from the stack's commands.py
    3. If hostname changed, verifies DNS routes to this server
    4. Syncs the deployment directory (preserves cluster ID and data)
    5. Stops and restarts the deployment

    Data volumes are always preserved. The cluster is never destroyed.

    Stack source resolution (in order):
    1. --stack-path argument (if provided)
    2. stack-source field in deployment.yml (if stored)
    3. Error if neither is available

    Note: After restart, Caddy will automatically provision TLS certificates
    for any new hostnames.
    """
    from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec
    from stack_orchestrator.deploy.deployment_create import (
        init_operation,
        create_operation,
    )
    from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe

    deployment_context: DeploymentContext = ctx.obj

    # Get current spec info
    current_spec = deployment_context.spec
    current_http_proxy = current_spec.get_http_proxy()
    current_hostname = (
        current_http_proxy[0]["host-name"] if current_http_proxy else None
    )

    # Resolve stack source path
    if stack_path:
        stack_source = Path(stack_path).resolve()
    else:
        # Try to get from deployment.yml
        deployment_file = (
            deployment_context.deployment_dir / constants.deployment_file_name
        )
        deployment_data = get_yaml().load(open(deployment_file))
        stack_source_str = deployment_data.get("stack-source")
        if not stack_source_str:
            print(
                "Error: No stack-source in deployment.yml and --stack-path not provided"
            )
            print("Use --stack-path to specify the stack git repository location")
            sys.exit(1)
        stack_source = Path(stack_source_str)

    if not stack_source.exists():
        print(f"Error: Stack source path does not exist: {stack_source}")
        sys.exit(1)

    print("=== Deployment Restart ===")
    print(f"Deployment dir: {deployment_context.deployment_dir}")
    print(f"Stack source: {stack_source}")
    print(f"Current hostname: {current_hostname}")

    # Step 1: Git pull
    print("\n[1/6] Pulling latest code from stack repository...")
    git_result = subprocess.run(
        ["git", "pull"], cwd=stack_source, capture_output=True, text=True
    )
    if git_result.returncode != 0:
        print(f"Git pull failed: {git_result.stderr}")
        sys.exit(1)
    print(f"Git pull: {git_result.stdout.strip()}")

    # Step 2: Regenerate spec
    print("\n[2/6] Regenerating spec from commands.py...")
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp:
        new_spec_path = tmp.name

    # Build deploy context for init
    deploy_ctx = make_deploy_context(ctx)
    init_operation(
        deploy_command_context=deploy_ctx,
        stack=str(stack_source),
        deployer_type=current_spec.obj[constants.deploy_to_key],
        config=None,
        config_file=config_file,
        kube_config=None,
        image_registry=None,
        output=new_spec_path,
        map_ports_to_host=None,
    )

    # Parse new spec to get new hostname
    new_spec_obj = get_parsed_deployment_spec(new_spec_path)
    new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", [])
    new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None
    print(f"New hostname: {new_hostname}")

    # Step 3: DNS verification (only if hostname changed)
    if new_hostname and new_hostname != current_hostname:
        print(f"\n[3/6] Hostname changed: {current_hostname} -> {new_hostname}")
        if force:
            print("DNS verification skipped (--force)")
        else:
            print("Verifying DNS via probe...")
            if not verify_dns_via_probe(new_hostname):
                print(f"\nDNS verification failed for {new_hostname}")
                print("Ensure DNS is configured before restarting.")
                print("Use --force to skip this check.")
                sys.exit(1)
    else:
        print("\n[3/6] Hostname unchanged, skipping DNS verification")

    # Step 4: Sync deployment directory
    print("\n[4/6] Syncing deployment directory...")
    create_operation(
        deployment_command_context=deploy_ctx,
        spec_file=new_spec_path,
        deployment_dir=str(deployment_context.deployment_dir),
        update=True,
        network_dir=None,
        initial_peers=None,
    )

    # Reload deployment context with new spec
    deployment_context.init(deployment_context.deployment_dir)
    ctx.obj = deployment_context

    # Step 5: Stop deployment
    print("\n[5/6] Stopping deployment...")
    ctx.obj = make_deploy_context(ctx)
    down_operation(
        ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True
    )
    # Brief pause to ensure clean shutdown
    time.sleep(5)

    # Step 6: Start deployment
    print("\n[6/6] Starting deployment...")
    up_operation(
        ctx, services_list=None, stay_attached=False, skip_cluster_management=True
    )

    print("\n=== Restart Complete ===")
    print("Deployment restarted with updated configuration.")
    if new_hostname and new_hostname != current_hostname:
        print(f"\nNew hostname: {new_hostname}")
        print("Caddy will automatically provision TLS certificate.")

    # Cleanup temp file
    Path(new_spec_path).unlink(missing_ok=True)
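
A typical invocation once this lands (paths hypothetical; --dir is the
existing deployment-directory option on the deployment command group):

    laconic-so deployment --dir /srv/deployments/my-app restart

    # For deployments created before this change (no stored stack-source):
    laconic-so deployment --dir /srv/deployments/my-app restart \
        --stack-path /srv/stacks/my-stack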


@@ -17,7 +17,7 @@ import click
from importlib import util
import os
from pathlib import Path
from typing import List
from typing import List, Optional
import random
from shutil import copy, copyfile, copytree, rmtree
from secrets import token_hex
@@ -507,11 +507,14 @@ def _copy_files_to_directory(file_paths: List[Path], directory: Path):
        copy(path, os.path.join(directory, os.path.basename(path)))


def _create_deployment_file(deployment_dir: Path):
def _create_deployment_file(deployment_dir: Path, stack_source: Optional[Path] = None):
    deployment_file_path = deployment_dir.joinpath(constants.deployment_file_name)
    cluster = f"{constants.cluster_name_prefix}{token_hex(8)}"
    deployment_content = {constants.cluster_id_key: cluster}
    if stack_source:
        deployment_content["stack-source"] = str(stack_source)
    with open(deployment_file_path, "w") as output_file:
        output_file.write(f"{constants.cluster_id_key}: {cluster}\n")
        get_yaml().dump(deployment_content, output_file)


def _check_volume_definitions(spec):
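
With this change a freshly created deployment.yml carries both keys; roughly
(values invented, key spellings assumed from the constants referenced above):

    cluster-id: laconic-1a2b3c4d5e6f7a8b
    stack-source: /srv/stacks/my-stack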
@@ -616,11 +619,15 @@ def create_operation(
        generate_helm_chart(stack_name, spec_file, deployment_dir_path)
        return  # Exit early for helm chart generation

    # Resolve stack source path for restart capability
    stack_source = get_stack_path(stack_name)

    if update:
        # Sync mode: write to temp dir, then copy to deployment dir with backups
        temp_dir = Path(tempfile.mkdtemp(prefix="deployment-sync-"))
        try:
            # Write deployment files to temp dir (skip deployment.yml to preserve cluster ID)
            # Write deployment files to temp dir
            # (skip deployment.yml to preserve cluster ID)
            _write_deployment_files(
                temp_dir,
                Path(spec_file),
@@ -628,12 +635,14 @@
                stack_name,
                deployment_type,
                include_deployment_file=False,
                stack_source=stack_source,
            )
            # Copy from temp to deployment dir, excluding data volumes and backing up changed files
            # Exclude data/* to avoid touching user data volumes
            # Exclude config file to preserve deployment settings (XXX breaks passing config vars
            # from spec. could warn about this or not exclude...)
            # Copy from temp to deployment dir, excluding data volumes
            # and backing up changed files.
            # Exclude data/* to avoid touching user data volumes.
            # Exclude config file to preserve deployment settings
            # (XXX breaks passing config vars from spec)
            exclude_patterns = ["data", "data/*", constants.config_file_name]
            _safe_copy_tree(
                temp_dir, deployment_dir_path, exclude_patterns=exclude_patterns
@@ -650,6 +659,7 @@
            stack_name,
            deployment_type,
            include_deployment_file=True,
            stack_source=stack_source,
        )

    # Delegate to the stack's Python code
@@ -670,7 +680,7 @@
    )


def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: List[str] = None):
def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: Optional[List[str]] = None):
    """
    Recursively copy a directory tree, backing up changed files with .bak suffix.
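
The backup-on-change behaviour can be pictured with a simplified stand-in (not
the committed implementation; assumes fnmatch-style patterns matched against
paths relative to the source root):

    import shutil
    from fnmatch import fnmatch
    from pathlib import Path
    from typing import List, Optional

    def safe_copy_tree_sketch(
        src: Path, dst: Path, exclude_patterns: Optional[List[str]] = None
    ):
        """Copy src into dst, saving a .bak copy of any file that would change."""
        for item in src.rglob("*"):
            rel = item.relative_to(src)
            if any(fnmatch(str(rel), pat) for pat in exclude_patterns or []):
                continue  # excluded, e.g. "data/*" or the config file
            target = dst / rel
            if item.is_dir():
                target.mkdir(parents=True, exist_ok=True)
            elif target.exists() and target.read_bytes() != item.read_bytes():
                shutil.copy2(target, target.with_name(target.name + ".bak"))
                shutil.copy2(item, target)
            else:
                target.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(item, target)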
@@ -721,6 +731,7 @@ def _write_deployment_files(
    stack_name: str,
    deployment_type: str,
    include_deployment_file: bool = True,
    stack_source: Optional[Path] = None,
):
    """
    Write deployment files to target directory.
@@ -730,7 +741,8 @@
    :param parsed_spec: Parsed spec object
    :param stack_name: Name of stack
    :param deployment_type: Type of deployment
    :param include_deployment_file: Whether to create deployment.yml file (skip for update)
    :param include_deployment_file: Whether to create deployment.yml (skip for update)
    :param stack_source: Path to stack source (git repo) for restart capability
    """
    stack_file = get_stack_path(stack_name).joinpath(constants.stack_file_name)
    parsed_stack = get_parsed_stack_config(stack_name)
@@ -741,7 +753,7 @@
    # Create deployment file if requested
    if include_deployment_file:
        _create_deployment_file(target_dir)
        _create_deployment_file(target_dir, stack_source=stack_source)

    # Copy any config variables from the spec file into an env file suitable for compose
    _write_config_file(spec_file, target_dir.joinpath(constants.config_file_name))
@@ -805,8 +817,9 @@
        )
    else:
        # TODO:
        # this is odd - looks up config dir that matches a volume name, then copies as a mount dir?
        # AFAICT this is not used by or relevant to any existing stack - roy
        # This is odd - looks up config dir that matches a volume name,
        # then copies as a mount dir?
        # AFAICT not used by or relevant to any existing stack - roy
        # TODO: We should probably only do this if the volume is marked :ro.
        for volume_name, volume_path in parsed_spec.get_volumes().items():


@@ -0,0 +1,159 @@
# Copyright © 2024 Vulcanize
# SPDX-License-Identifier: AGPL-3.0

"""DNS verification via temporary ingress probe."""

import secrets
import socket
import time
from typing import Optional

import requests
from kubernetes import client


def get_server_egress_ip() -> str:
    """Get this server's public egress IP via ipify."""
    response = requests.get("https://api.ipify.org", timeout=10)
    response.raise_for_status()
    return response.text.strip()


def resolve_hostname(hostname: str) -> list[str]:
    """Resolve hostname to list of IP addresses."""
    try:
        _, _, ips = socket.gethostbyname_ex(hostname)
        return ips
    except socket.gaierror:
        return []
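

# Illustration (hypothetical values): socket.gethostbyname_ex("app.example.com")
# returns a (canonical_name, aliases, ips) tuple, e.g.
# ("app.example.com", [], ["203.0.113.7"]); resolve_hostname() keeps only the
# final IP list.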


def verify_dns_simple(hostname: str, expected_ip: Optional[str] = None) -> bool:
    """Simple DNS verification - check hostname resolves to expected IP.

    If expected_ip not provided, uses server's egress IP.
    Returns True if hostname resolves to expected IP.
    """
    resolved_ips = resolve_hostname(hostname)
    if not resolved_ips:
        print(f"DNS FAIL: {hostname} does not resolve")
        return False

    if expected_ip is None:
        expected_ip = get_server_egress_ip()

    if expected_ip in resolved_ips:
        print(f"DNS OK: {hostname} -> {resolved_ips} (includes {expected_ip})")
        return True
    else:
        print(f"DNS WARN: {hostname} -> {resolved_ips} (expected {expected_ip})")
        return False
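

# Example (hypothetical address): behind NAT the server's egress IP can differ
# from its ingress IP, so pass the expected ingress address explicitly:
#
#     verify_dns_simple("app.example.com", expected_ip="203.0.113.7")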


def create_probe_ingress(hostname: str, namespace: str = "default") -> str:
    """Create a temporary ingress for DNS probing.

    Returns the probe token that the ingress will respond with.
    """
    token = secrets.token_hex(16)
    networking_api = client.NetworkingV1Api()

    # Create a simple ingress that Caddy will pick up
    ingress = client.V1Ingress(
        metadata=client.V1ObjectMeta(
            name="laconic-dns-probe",
            annotations={
                "kubernetes.io/ingress.class": "caddy",
                "laconic.com/probe-token": token,
            },
        ),
        spec=client.V1IngressSpec(
            rules=[
                client.V1IngressRule(
                    host=hostname,
                    http=client.V1HTTPIngressRuleValue(
                        paths=[
                            client.V1HTTPIngressPath(
                                path="/.well-known/laconic-probe",
                                path_type="Exact",
                                backend=client.V1IngressBackend(
                                    service=client.V1IngressServiceBackend(
                                        name="caddy-ingress-controller",
                                        port=client.V1ServiceBackendPort(number=80),
                                    )
                                ),
                            )
                        ]
                    ),
                )
            ]
        ),
    )

    networking_api.create_namespaced_ingress(namespace=namespace, body=ingress)
    return token
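

# Note: the token above is recorded as an annotation on the probe ingress, but
# verify_dns_via_probe() below does not yet check it (see the comment there);
# a stricter variant would serve the token back and compare it on each probe.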


def delete_probe_ingress(namespace: str = "default"):
    """Delete the temporary probe ingress."""
    networking_api = client.NetworkingV1Api()
    try:
        networking_api.delete_namespaced_ingress(
            name="laconic-dns-probe", namespace=namespace
        )
    except client.exceptions.ApiException:
        pass  # Ignore if already deleted


def verify_dns_via_probe(
    hostname: str, namespace: str = "default", timeout: int = 30, poll_interval: int = 2
) -> bool:
    """Verify DNS by creating temp ingress and probing it.

    This definitively proves that traffic to the hostname reaches this cluster.

    Args:
        hostname: The hostname to verify
        namespace: Kubernetes namespace for probe ingress
        timeout: Total seconds to wait for probe to succeed
        poll_interval: Seconds between probe attempts

    Returns:
        True if probe succeeds, False otherwise
    """
    # First check DNS resolves at all
    if not resolve_hostname(hostname):
        print(f"DNS FAIL: {hostname} does not resolve")
        return False

    print(f"Creating probe ingress for {hostname}...")
    create_probe_ingress(hostname, namespace)
    try:
        # Wait for Caddy to pick up the ingress
        time.sleep(3)

        # Poll until success or timeout
        probe_url = f"http://{hostname}/.well-known/laconic-probe"
        start_time = time.time()
        last_error = None
        while time.time() - start_time < timeout:
            try:
                response = requests.get(probe_url, timeout=5)
                # For now, just verify we get a response from this cluster
                # A more robust check would verify a unique token
                if response.status_code < 500:
                    print(f"DNS PROBE OK: {hostname} routes to this cluster")
                    return True
            except requests.RequestException as e:
                last_error = e
            time.sleep(poll_interval)

        print(f"DNS PROBE FAIL: {hostname} - {last_error}")
        return False
    finally:
        print("Cleaning up probe ingress...")
        delete_probe_ingress(namespace)
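
A caller outside the restart flow can exercise the probe directly. A minimal
sketch (hostname hypothetical; assumes kubeconfig access to the target
cluster, since the module uses the kubernetes client's active configuration):

    from kubernetes import config
    from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe

    config.load_kube_config()  # or load_incluster_config() inside a pod
    if verify_dns_via_probe("app.example.com", timeout=60):
        print("Hostname routes to this cluster; safe to restart")
    else:
        print("Fix DNS (or use --force) before restarting")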