feat(deploy): add deployment restart command

Add `laconic-so deployment restart` command that:
- Pulls latest code from stack git repository
- Regenerates spec.yml from stack's commands.py
- Verifies DNS if hostname changed (with --force to skip)
- Syncs deployment directory preserving cluster ID and data
- Stops and restarts deployment with --skip-cluster-management

Also stores stack-source path in deployment.yml during create
for automatic stack location on restart.
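
For reference, the stored field is what lets restart locate the stack without
arguments; the lookup amounts to (path hypothetical, using the project's
get_yaml helper as in the restart code below):

    from pathlib import Path
    from stack_orchestrator.util import get_yaml

    deployment_data = get_yaml().load(open("/srv/deployments/my-app/deployment.yml"))
    stack_source = Path(deployment_data["stack-source"])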

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
A. F. Dudley 2026-01-31 19:05:27 -05:00
parent 4713107546
commit c197406cc7
3 changed files with 358 additions and 13 deletions


@@ -15,7 +15,10 @@
import click
from pathlib import Path
import subprocess
import sys
import tempfile
import time
from stack_orchestrator import constants
from stack_orchestrator.deploy.images import push_images_operation
from stack_orchestrator.deploy.deploy import (
@@ -228,3 +231,173 @@ def run_job(ctx, job_name, helm_release):
    ctx.obj = make_deploy_context(ctx)
    run_job_operation(ctx, job_name, helm_release)


@command.command()
@click.option("--stack-path", help="Path to stack git repo (overrides stored path)")
@click.option("--config-file", help="Config file to pass to deploy init")
@click.option(
    "--force",
    is_flag=True,
    default=False,
    help="Skip DNS verification",
)
@click.option(
    "--expected-ip",
    help="Expected IP for DNS verification (if different from egress)",
)
@click.pass_context
def restart(ctx, stack_path, config_file, force, expected_ip):
    """Pull latest stack, regenerate spec, and restart deployment.

    This command:
    1. Pulls latest code from the stack git repository
    2. Regenerates spec.yml from the stack's commands.py
    3. If hostname changed, verifies DNS routes to this server
    4. Syncs the deployment directory (preserves cluster ID and data)
    5. Stops and restarts the deployment

    Data volumes are always preserved. The cluster is never destroyed.

    Stack source resolution (in order):
    1. --stack-path argument (if provided)
    2. stack-source field in deployment.yml (if stored)
    3. Error if neither is available

    Note: After restart, Caddy will automatically provision TLS certificates
    for any new hostnames.
    """
    from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec
    from stack_orchestrator.deploy.deployment_create import (
        init_operation,
        create_operation,
    )
    from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe

    deployment_context: DeploymentContext = ctx.obj

    # Get current spec info
    current_spec = deployment_context.spec
    current_http_proxy = current_spec.get_http_proxy()
    current_hostname = (
        current_http_proxy[0]["host-name"] if current_http_proxy else None
    )

    # Resolve stack source path
    if stack_path:
        stack_source = Path(stack_path).resolve()
    else:
        # Try to get from deployment.yml
        deployment_file = (
            deployment_context.deployment_dir / constants.deployment_file_name
        )
        deployment_data = get_yaml().load(open(deployment_file))
        stack_source_str = deployment_data.get("stack-source")
        if not stack_source_str:
            print(
                "Error: No stack-source in deployment.yml and --stack-path not provided"
            )
            print("Use --stack-path to specify the stack git repository location")
            sys.exit(1)
        stack_source = Path(stack_source_str)

    if not stack_source.exists():
        print(f"Error: Stack source path does not exist: {stack_source}")
        sys.exit(1)

    print("=== Deployment Restart ===")
    print(f"Deployment dir: {deployment_context.deployment_dir}")
    print(f"Stack source: {stack_source}")
    print(f"Current hostname: {current_hostname}")

    # Step 1: Git pull
    print("\n[1/6] Pulling latest code from stack repository...")
    git_result = subprocess.run(
        ["git", "pull"], cwd=stack_source, capture_output=True, text=True
    )
    if git_result.returncode != 0:
        print(f"Git pull failed: {git_result.stderr}")
        sys.exit(1)
    print(f"Git pull: {git_result.stdout.strip()}")

    # Step 2: Regenerate spec
    print("\n[2/6] Regenerating spec from commands.py...")
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp:
        new_spec_path = tmp.name

    # Build deploy context for init
    deploy_ctx = make_deploy_context(ctx)
    init_operation(
        deploy_command_context=deploy_ctx,
        stack=str(stack_source),
        deployer_type=current_spec.obj[constants.deploy_to_key],
        config=None,
        config_file=config_file,
        kube_config=None,
        image_registry=None,
        output=new_spec_path,
        map_ports_to_host=None,
    )

    # Parse new spec to get new hostname
    new_spec_obj = get_parsed_deployment_spec(new_spec_path)
    new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", [])
    new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None
    print(f"New hostname: {new_hostname}")

    # Step 3: DNS verification (only if hostname changed)
    if new_hostname and new_hostname != current_hostname:
        print(f"\n[3/6] Hostname changed: {current_hostname} -> {new_hostname}")
        if force:
            print("DNS verification skipped (--force)")
        else:
            print("Verifying DNS via probe...")
            if not verify_dns_via_probe(new_hostname):
                print(f"\nDNS verification failed for {new_hostname}")
                print("Ensure DNS is configured before restarting.")
                print("Use --force to skip this check.")
                sys.exit(1)
    else:
        print("\n[3/6] Hostname unchanged, skipping DNS verification")

    # Step 4: Sync deployment directory
    print("\n[4/6] Syncing deployment directory...")
    create_operation(
        deployment_command_context=deploy_ctx,
        spec_file=new_spec_path,
        deployment_dir=str(deployment_context.deployment_dir),
        update=True,
        network_dir=None,
        initial_peers=None,
    )

    # Reload deployment context with new spec
    deployment_context.init(deployment_context.deployment_dir)
    ctx.obj = deployment_context

    # Step 5: Stop deployment
    print("\n[5/6] Stopping deployment...")
    ctx.obj = make_deploy_context(ctx)
    down_operation(
        ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True
    )
    # Brief pause to ensure clean shutdown
    time.sleep(5)

    # Step 6: Start deployment
    print("\n[6/6] Starting deployment...")
    up_operation(
        ctx, services_list=None, stay_attached=False, skip_cluster_management=True
    )

    print("\n=== Restart Complete ===")
    print("Deployment restarted with updated configuration.")
    if new_hostname and new_hostname != current_hostname:
        print(f"\nNew hostname: {new_hostname}")
        print("Caddy will automatically provision TLS certificate.")

    # Cleanup temp file
    Path(new_spec_path).unlink(missing_ok=True)
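
A typical invocation once this lands (paths hypothetical; --dir is the
existing deployment-directory option on the deployment command group):

    laconic-so deployment --dir /srv/deployments/my-app restart

    # For deployments created before this change (no stored stack-source):
    laconic-so deployment --dir /srv/deployments/my-app restart \
        --stack-path /srv/stacks/my-stack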


@@ -17,7 +17,7 @@ import click
from importlib import util
import os
from pathlib import Path
from typing import List
from typing import List, Optional
import random
from shutil import copy, copyfile, copytree, rmtree
from secrets import token_hex
@@ -507,11 +507,14 @@ def _copy_files_to_directory(file_paths: List[Path], directory: Path):
        copy(path, os.path.join(directory, os.path.basename(path)))


def _create_deployment_file(deployment_dir: Path):
def _create_deployment_file(deployment_dir: Path, stack_source: Optional[Path] = None):
    deployment_file_path = deployment_dir.joinpath(constants.deployment_file_name)
    cluster = f"{constants.cluster_name_prefix}{token_hex(8)}"
    deployment_content = {constants.cluster_id_key: cluster}
    if stack_source:
        deployment_content["stack-source"] = str(stack_source)
    with open(deployment_file_path, "w") as output_file:
        output_file.write(f"{constants.cluster_id_key}: {cluster}\n")
        get_yaml().dump(deployment_content, output_file)


def _check_volume_definitions(spec):
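
With this change a freshly created deployment.yml carries both keys; roughly
(values invented, key spellings assumed from the constants referenced above):

    cluster-id: laconic-1a2b3c4d5e6f7a8b
    stack-source: /srv/stacks/my-stack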
@@ -616,11 +619,15 @@ def create_operation(
        generate_helm_chart(stack_name, spec_file, deployment_dir_path)
        return  # Exit early for helm chart generation

    # Resolve stack source path for restart capability
    stack_source = get_stack_path(stack_name)

    if update:
        # Sync mode: write to temp dir, then copy to deployment dir with backups
        temp_dir = Path(tempfile.mkdtemp(prefix="deployment-sync-"))
        try:
            # Write deployment files to temp dir (skip deployment.yml to preserve cluster ID)
            # Write deployment files to temp dir
            # (skip deployment.yml to preserve cluster ID)
            _write_deployment_files(
                temp_dir,
                Path(spec_file),
@@ -628,12 +635,14 @@
                stack_name,
                deployment_type,
                include_deployment_file=False,
                stack_source=stack_source,
            )
            # Copy from temp to deployment dir, excluding data volumes and backing up changed files
            # Exclude data/* to avoid touching user data volumes
            # Exclude config file to preserve deployment settings (XXX breaks passing config vars
            # from spec. could warn about this or not exclude...)
            # Copy from temp to deployment dir, excluding data volumes
            # and backing up changed files.
            # Exclude data/* to avoid touching user data volumes.
            # Exclude config file to preserve deployment settings
            # (XXX breaks passing config vars from spec)
            exclude_patterns = ["data", "data/*", constants.config_file_name]
            _safe_copy_tree(
                temp_dir, deployment_dir_path, exclude_patterns=exclude_patterns
@@ -650,6 +659,7 @@
            stack_name,
            deployment_type,
            include_deployment_file=True,
            stack_source=stack_source,
        )

    # Delegate to the stack's Python code
@@ -670,7 +680,7 @@
    )


def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: List[str] = None):
def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: Optional[List[str]] = None):
    """
    Recursively copy a directory tree, backing up changed files with .bak suffix.
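
The backup-on-change behaviour can be pictured with a simplified stand-in (not
the committed implementation; assumes fnmatch-style patterns matched against
paths relative to the source root):

    import shutil
    from fnmatch import fnmatch
    from pathlib import Path
    from typing import List, Optional

    def safe_copy_tree_sketch(
        src: Path, dst: Path, exclude_patterns: Optional[List[str]] = None
    ):
        """Copy src into dst, saving a .bak copy of any file that would change."""
        for item in src.rglob("*"):
            rel = item.relative_to(src)
            if any(fnmatch(str(rel), pat) for pat in exclude_patterns or []):
                continue  # excluded, e.g. "data/*" or the config file
            target = dst / rel
            if item.is_dir():
                target.mkdir(parents=True, exist_ok=True)
            elif target.exists() and target.read_bytes() != item.read_bytes():
                shutil.copy2(target, target.with_name(target.name + ".bak"))
                shutil.copy2(item, target)
            else:
                target.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(item, target)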
@@ -721,6 +731,7 @@ def _write_deployment_files(
    stack_name: str,
    deployment_type: str,
    include_deployment_file: bool = True,
    stack_source: Optional[Path] = None,
):
    """
    Write deployment files to target directory.
@@ -730,7 +741,8 @@
    :param parsed_spec: Parsed spec object
    :param stack_name: Name of stack
    :param deployment_type: Type of deployment
    :param include_deployment_file: Whether to create deployment.yml file (skip for update)
    :param include_deployment_file: Whether to create deployment.yml (skip for update)
    :param stack_source: Path to stack source (git repo) for restart capability
    """
    stack_file = get_stack_path(stack_name).joinpath(constants.stack_file_name)
    parsed_stack = get_parsed_stack_config(stack_name)
@@ -741,7 +753,7 @@
    # Create deployment file if requested
    if include_deployment_file:
        _create_deployment_file(target_dir)
        _create_deployment_file(target_dir, stack_source=stack_source)

    # Copy any config variables from the spec file into an env file suitable for compose
    _write_config_file(spec_file, target_dir.joinpath(constants.config_file_name))
@@ -805,8 +817,9 @@
        )
    else:
        # TODO:
        # this is odd - looks up config dir that matches a volume name, then copies as a mount dir?
        # AFAICT this is not used by or relevant to any existing stack - roy
        # This is odd - looks up config dir that matches a volume name,
        # then copies as a mount dir?
        # AFAICT not used by or relevant to any existing stack - roy
        # TODO: We should probably only do this if the volume is marked :ro.
        for volume_name, volume_path in parsed_spec.get_volumes().items():


@@ -0,0 +1,159 @@
# Copyright © 2024 Vulcanize
# SPDX-License-Identifier: AGPL-3.0

"""DNS verification via temporary ingress probe."""

import secrets
import socket
import time
from typing import Optional

import requests
from kubernetes import client


def get_server_egress_ip() -> str:
    """Get this server's public egress IP via ipify."""
    response = requests.get("https://api.ipify.org", timeout=10)
    response.raise_for_status()
    return response.text.strip()


def resolve_hostname(hostname: str) -> list[str]:
    """Resolve hostname to list of IP addresses."""
    try:
        _, _, ips = socket.gethostbyname_ex(hostname)
        return ips
    except socket.gaierror:
        return []
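

# Illustration (hypothetical values): socket.gethostbyname_ex("app.example.com")
# returns a (canonical_name, aliases, ips) tuple, e.g.
# ("app.example.com", [], ["203.0.113.7"]); resolve_hostname() keeps only the
# final IP list.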


def verify_dns_simple(hostname: str, expected_ip: Optional[str] = None) -> bool:
    """Simple DNS verification - check hostname resolves to expected IP.

    If expected_ip not provided, uses server's egress IP.
    Returns True if hostname resolves to expected IP.
    """
    resolved_ips = resolve_hostname(hostname)
    if not resolved_ips:
        print(f"DNS FAIL: {hostname} does not resolve")
        return False

    if expected_ip is None:
        expected_ip = get_server_egress_ip()

    if expected_ip in resolved_ips:
        print(f"DNS OK: {hostname} -> {resolved_ips} (includes {expected_ip})")
        return True
    else:
        print(f"DNS WARN: {hostname} -> {resolved_ips} (expected {expected_ip})")
        return False
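

# Example (hypothetical address): behind NAT the server's egress IP can differ
# from its ingress IP, so pass the expected ingress address explicitly:
#
#     verify_dns_simple("app.example.com", expected_ip="203.0.113.7")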


def create_probe_ingress(hostname: str, namespace: str = "default") -> str:
    """Create a temporary ingress for DNS probing.

    Returns the probe token that the ingress will respond with.
    """
    token = secrets.token_hex(16)
    networking_api = client.NetworkingV1Api()

    # Create a simple ingress that Caddy will pick up
    ingress = client.V1Ingress(
        metadata=client.V1ObjectMeta(
            name="laconic-dns-probe",
            annotations={
                "kubernetes.io/ingress.class": "caddy",
                "laconic.com/probe-token": token,
            },
        ),
        spec=client.V1IngressSpec(
            rules=[
                client.V1IngressRule(
                    host=hostname,
                    http=client.V1HTTPIngressRuleValue(
                        paths=[
                            client.V1HTTPIngressPath(
                                path="/.well-known/laconic-probe",
                                path_type="Exact",
                                backend=client.V1IngressBackend(
                                    service=client.V1IngressServiceBackend(
                                        name="caddy-ingress-controller",
                                        port=client.V1ServiceBackendPort(number=80),
                                    )
                                ),
                            )
                        ]
                    ),
                )
            ]
        ),
    )

    networking_api.create_namespaced_ingress(namespace=namespace, body=ingress)
    return token
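

# Note: the token above is recorded as an annotation on the probe ingress, but
# verify_dns_via_probe() below does not yet check it (see the comment there);
# a stricter variant would serve the token back and compare it on each probe.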


def delete_probe_ingress(namespace: str = "default"):
    """Delete the temporary probe ingress."""
    networking_api = client.NetworkingV1Api()
    try:
        networking_api.delete_namespaced_ingress(
            name="laconic-dns-probe", namespace=namespace
        )
    except client.exceptions.ApiException:
        pass  # Ignore if already deleted


def verify_dns_via_probe(
    hostname: str, namespace: str = "default", timeout: int = 30, poll_interval: int = 2
) -> bool:
    """Verify DNS by creating temp ingress and probing it.

    This definitively proves that traffic to the hostname reaches this cluster.

    Args:
        hostname: The hostname to verify
        namespace: Kubernetes namespace for probe ingress
        timeout: Total seconds to wait for probe to succeed
        poll_interval: Seconds between probe attempts

    Returns:
        True if probe succeeds, False otherwise
    """
    # First check DNS resolves at all
    if not resolve_hostname(hostname):
        print(f"DNS FAIL: {hostname} does not resolve")
        return False

    print(f"Creating probe ingress for {hostname}...")
    create_probe_ingress(hostname, namespace)
    try:
        # Wait for Caddy to pick up the ingress
        time.sleep(3)

        # Poll until success or timeout
        probe_url = f"http://{hostname}/.well-known/laconic-probe"
        start_time = time.time()
        last_error = None
        while time.time() - start_time < timeout:
            try:
                response = requests.get(probe_url, timeout=5)
                # For now, just verify we get a response from this cluster
                # A more robust check would verify a unique token
                if response.status_code < 500:
                    print(f"DNS PROBE OK: {hostname} routes to this cluster")
                    return True
            except requests.RequestException as e:
                last_error = e
            time.sleep(poll_interval)

        print(f"DNS PROBE FAIL: {hostname} - {last_error}")
        return False
    finally:
        print("Cleaning up probe ingress...")
        delete_probe_ingress(namespace)
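
A caller outside the restart flow can exercise the probe directly. A minimal
sketch (hostname hypothetical; assumes kubeconfig access to the target
cluster, since the module uses the kubernetes client's active configuration):

    from kubernetes import config
    from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe

    config.load_kube_config()  # or load_incluster_config() inside a pod
    if verify_dns_via_probe("app.example.com", timeout=60):
        print("Hostname routes to this cluster; safe to restart")
    else:
        print("Fix DNS (or use --force) before restarting")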