diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py
index 35abea3c..f60ea9a4 100644
--- a/stack_orchestrator/deploy/deployment.py
+++ b/stack_orchestrator/deploy/deployment.py
@@ -15,7 +15,10 @@
 import click
 from pathlib import Path
+import subprocess
 import sys
+import tempfile
+import time
 
 from stack_orchestrator import constants
 from stack_orchestrator.deploy.images import push_images_operation
 from stack_orchestrator.deploy.deploy import (
@@ -228,3 +231,173 @@ def run_job(ctx, job_name, helm_release):
     ctx.obj = make_deploy_context(ctx)
     run_job_operation(ctx, job_name, helm_release)
+
+
+@command.command()
+@click.option("--stack-path", help="Path to stack git repo (overrides stored path)")
+@click.option("--config-file", help="Config file to pass to deploy init")
+@click.option(
+    "--force",
+    is_flag=True,
+    default=False,
+    help="Skip DNS verification",
+)
+@click.option(
+    "--expected-ip",
+    help="Expected IP for DNS verification (if different from egress)",
+)
+@click.pass_context
+def restart(ctx, stack_path, config_file, force, expected_ip):
+    """Pull the latest stack, regenerate the spec, and restart the deployment.
+
+    This command:
+    1. Pulls the latest code from the stack git repository
+    2. Regenerates spec.yml from the stack's commands.py
+    3. If the hostname changed, verifies that DNS routes to this server
+    4. Syncs the deployment directory (preserving cluster ID and data)
+    5. Stops the deployment
+    6. Starts the deployment with the updated configuration
+
+    Data volumes are always preserved. The cluster is never destroyed.
+
+    Stack source resolution (in order):
+    1. --stack-path argument (if provided)
+    2. stack-source field in deployment.yml (if stored)
+    3. Error if neither is available
+
+    Note: After restart, Caddy will automatically provision TLS certificates
+    for any new hostnames.
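+
+    Example (illustrative invocation; adjust the paths to your layout):
+
+        laconic-so deployment --dir my-deployment restart --stack-path ~/src/my-stack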
+ """ + from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec + from stack_orchestrator.deploy.deployment_create import ( + init_operation, + create_operation, + ) + from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe + + deployment_context: DeploymentContext = ctx.obj + + # Get current spec info + current_spec = deployment_context.spec + current_http_proxy = current_spec.get_http_proxy() + current_hostname = ( + current_http_proxy[0]["host-name"] if current_http_proxy else None + ) + + # Resolve stack source path + if stack_path: + stack_source = Path(stack_path).resolve() + else: + # Try to get from deployment.yml + deployment_file = ( + deployment_context.deployment_dir / constants.deployment_file_name + ) + deployment_data = get_yaml().load(open(deployment_file)) + stack_source_str = deployment_data.get("stack-source") + if not stack_source_str: + print( + "Error: No stack-source in deployment.yml and --stack-path not provided" + ) + print("Use --stack-path to specify the stack git repository location") + sys.exit(1) + stack_source = Path(stack_source_str) + + if not stack_source.exists(): + print(f"Error: Stack source path does not exist: {stack_source}") + sys.exit(1) + + print("=== Deployment Restart ===") + print(f"Deployment dir: {deployment_context.deployment_dir}") + print(f"Stack source: {stack_source}") + print(f"Current hostname: {current_hostname}") + + # Step 1: Git pull + print("\n[1/6] Pulling latest code from stack repository...") + git_result = subprocess.run( + ["git", "pull"], cwd=stack_source, capture_output=True, text=True + ) + if git_result.returncode != 0: + print(f"Git pull failed: {git_result.stderr}") + sys.exit(1) + print(f"Git pull: {git_result.stdout.strip()}") + + # Step 2: Regenerate spec + print("\n[2/6] Regenerating spec from commands.py...") + with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp: + new_spec_path = tmp.name + + # Build deploy context for init + deploy_ctx = make_deploy_context(ctx) + + init_operation( + deploy_command_context=deploy_ctx, + stack=str(stack_source), + deployer_type=current_spec.obj[constants.deploy_to_key], + config=None, + config_file=config_file, + kube_config=None, + image_registry=None, + output=new_spec_path, + map_ports_to_host=None, + ) + + # Parse new spec to get new hostname + new_spec_obj = get_parsed_deployment_spec(new_spec_path) + new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", []) + new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None + + print(f"New hostname: {new_hostname}") + + # Step 3: DNS verification (only if hostname changed) + if new_hostname and new_hostname != current_hostname: + print(f"\n[3/6] Hostname changed: {current_hostname} -> {new_hostname}") + if force: + print("DNS verification skipped (--force)") + else: + print("Verifying DNS via probe...") + if not verify_dns_via_probe(new_hostname): + print(f"\nDNS verification failed for {new_hostname}") + print("Ensure DNS is configured before restarting.") + print("Use --force to skip this check.") + sys.exit(1) + else: + print("\n[3/6] Hostname unchanged, skipping DNS verification") + + # Step 4: Sync deployment directory + print("\n[4/6] Syncing deployment directory...") + create_operation( + deployment_command_context=deploy_ctx, + spec_file=new_spec_path, + deployment_dir=str(deployment_context.deployment_dir), + update=True, + network_dir=None, + initial_peers=None, + ) + + # Reload deployment context with new spec + 
+    # Step 5: Stop deployment
+    print("\n[5/6] Stopping deployment...")
+    ctx.obj = make_deploy_context(ctx)
+    down_operation(
+        ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True
+    )
+
+    # Brief pause to ensure a clean shutdown
+    time.sleep(5)
+
+    # Step 6: Start deployment
+    print("\n[6/6] Starting deployment...")
+    up_operation(
+        ctx, services_list=None, stay_attached=False, skip_cluster_management=True
+    )
+
+    print("\n=== Restart Complete ===")
+    print("Deployment restarted with updated configuration.")
+    if new_hostname and new_hostname != current_hostname:
+        print(f"\nNew hostname: {new_hostname}")
+        print("Caddy will automatically provision a TLS certificate.")
+
+    # Clean up the temp spec file
+    Path(new_spec_path).unlink(missing_ok=True)
diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py
index fd15119c..ec15362f 100644
--- a/stack_orchestrator/deploy/deployment_create.py
+++ b/stack_orchestrator/deploy/deployment_create.py
@@ -17,7 +17,7 @@ import click
 from importlib import util
 import os
 from pathlib import Path
-from typing import List
+from typing import List, Optional
 import random
 from shutil import copy, copyfile, copytree, rmtree
 from secrets import token_hex
@@ -507,11 +507,14 @@ def _copy_files_to_directory(file_paths: List[Path], directory: Path):
         copy(path, os.path.join(directory, os.path.basename(path)))
 
 
-def _create_deployment_file(deployment_dir: Path):
+def _create_deployment_file(deployment_dir: Path, stack_source: Optional[Path] = None):
     deployment_file_path = deployment_dir.joinpath(constants.deployment_file_name)
     cluster = f"{constants.cluster_name_prefix}{token_hex(8)}"
+    deployment_content = {constants.cluster_id_key: cluster}
+    if stack_source:
+        deployment_content["stack-source"] = str(stack_source)
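+    # The file written below then looks roughly like this (key names come
+    # from constants; values are illustrative):
+    #   cluster-id: laconic-1a2b3c4d5e6f7a8b
+    #   stack-source: /home/user/src/my-stack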
     with open(deployment_file_path, "w") as output_file:
-        output_file.write(f"{constants.cluster_id_key}: {cluster}\n")
+        get_yaml().dump(deployment_content, output_file)
 
 
 def _check_volume_definitions(spec):
@@ -616,11 +619,15 @@ def create_operation(
         generate_helm_chart(stack_name, spec_file, deployment_dir_path)
         return  # Exit early for helm chart generation
 
+    # Resolve the stack source path for restart capability
+    stack_source = get_stack_path(stack_name)
+
     if update:
         # Sync mode: write to temp dir, then copy to deployment dir with backups
         temp_dir = Path(tempfile.mkdtemp(prefix="deployment-sync-"))
         try:
-            # Write deployment files to temp dir (skip deployment.yml to preserve cluster ID)
+            # Write deployment files to temp dir
+            # (skip deployment.yml to preserve cluster ID)
             _write_deployment_files(
                 temp_dir,
                 Path(spec_file),
@@ -628,12 +635,14 @@
                 stack_name,
                 deployment_type,
                 include_deployment_file=False,
+                stack_source=stack_source,
             )
-            # Copy from temp to deployment dir, excluding data volumes and backing up changed files
-            # Exclude data/* to avoid touching user data volumes
-            # Exclude config file to preserve deployment settings (XXX breaks passing config vars
-            # from spec. could warn about this or not exclude...)
+            # Copy from temp to deployment dir, excluding data volumes
+            # and backing up changed files.
+            # Exclude data/* to avoid touching user data volumes.
+            # Exclude the config file to preserve deployment settings
+            # (XXX this breaks passing config vars from the spec)
             exclude_patterns = ["data", "data/*", constants.config_file_name]
             _safe_copy_tree(
                 temp_dir, deployment_dir_path, exclude_patterns=exclude_patterns
             )
@@ -650,6 +659,7 @@
             stack_name,
             deployment_type,
             include_deployment_file=True,
+            stack_source=stack_source,
         )
 
     # Delegate to the stack's Python code
@@ -670,7 +680,7 @@
     )
 
 
-def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: List[str] = None):
+def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: Optional[List[str]] = None):
     """
     Recursively copy a directory tree, backing up changed files with .bak suffix.
@@ -721,6 +731,7 @@ def _write_deployment_files(
     stack_name: str,
     deployment_type: str,
     include_deployment_file: bool = True,
+    stack_source: Optional[Path] = None,
 ):
     """
     Write deployment files to target directory.
@@ -730,7 +741,8 @@
     :param parsed_spec: Parsed spec object
     :param stack_name: Name of stack
     :param deployment_type: Type of deployment
-    :param include_deployment_file: Whether to create deployment.yml file (skip for update)
+    :param include_deployment_file: Whether to create deployment.yml (skip for update)
+    :param stack_source: Path to the stack source (git repo) for restart capability
     """
     stack_file = get_stack_path(stack_name).joinpath(constants.stack_file_name)
     parsed_stack = get_parsed_stack_config(stack_name)
@@ -741,7 +753,7 @@
     # Create deployment file if requested
     if include_deployment_file:
-        _create_deployment_file(target_dir)
+        _create_deployment_file(target_dir, stack_source=stack_source)
 
     # Copy any config variables from the spec file into an env file suitable for compose
     _write_config_file(spec_file, target_dir.joinpath(constants.config_file_name))
@@ -805,8 +817,9 @@
             )
     else:
         # TODO:
-        # this is odd - looks up config dir that matches a volume name, then copies as a mount dir?
-        # AFAICT this is not used by or relevant to any existing stack - roy
+        # This is odd - it looks up a config dir that matches a volume name,
+        # then copies it as a mount dir?
+        # AFAICT not used by or relevant to any existing stack - roy
         # TODO: We should probably only do this if the volume is marked :ro.
 
     for volume_name, volume_path in parsed_spec.get_volumes().items():
diff --git a/stack_orchestrator/deploy/dns_probe.py b/stack_orchestrator/deploy/dns_probe.py
new file mode 100644
index 00000000..e04b4ea2
--- /dev/null
+++ b/stack_orchestrator/deploy/dns_probe.py
@@ -0,0 +1,159 @@
+# Copyright © 2024 Vulcanize
+# SPDX-License-Identifier: AGPL-3.0
+
+"""DNS verification via a temporary ingress probe."""
+
+import secrets
+import socket
+import time
+from typing import Optional
+
+import requests
+from kubernetes import client, config
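+
+
+def _ensure_kube_config():
+    """Best-effort kubernetes client configuration.
+
+    Assumption: the caller may not have configured the kubernetes client
+    before the probe functions run. Try in-cluster config first, then fall
+    back to the default kubeconfig.
+    """
+    try:
+        config.load_incluster_config()
+    except config.ConfigException:
+        config.load_kube_config()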
+ """ + resolved_ips = resolve_hostname(hostname) + if not resolved_ips: + print(f"DNS FAIL: {hostname} does not resolve") + return False + + if expected_ip is None: + expected_ip = get_server_egress_ip() + + if expected_ip in resolved_ips: + print(f"DNS OK: {hostname} -> {resolved_ips} (includes {expected_ip})") + return True + else: + print(f"DNS WARN: {hostname} -> {resolved_ips} (expected {expected_ip})") + return False + + +def create_probe_ingress(hostname: str, namespace: str = "default") -> str: + """Create a temporary ingress for DNS probing. + + Returns the probe token that the ingress will respond with. + """ + token = secrets.token_hex(16) + + networking_api = client.NetworkingV1Api() + + # Create a simple ingress that Caddy will pick up + ingress = client.V1Ingress( + metadata=client.V1ObjectMeta( + name="laconic-dns-probe", + annotations={ + "kubernetes.io/ingress.class": "caddy", + "laconic.com/probe-token": token, + }, + ), + spec=client.V1IngressSpec( + rules=[ + client.V1IngressRule( + host=hostname, + http=client.V1HTTPIngressRuleValue( + paths=[ + client.V1HTTPIngressPath( + path="/.well-known/laconic-probe", + path_type="Exact", + backend=client.V1IngressBackend( + service=client.V1IngressServiceBackend( + name="caddy-ingress-controller", + port=client.V1ServiceBackendPort(number=80), + ) + ), + ) + ] + ), + ) + ] + ), + ) + + networking_api.create_namespaced_ingress(namespace=namespace, body=ingress) + return token + + +def delete_probe_ingress(namespace: str = "default"): + """Delete the temporary probe ingress.""" + networking_api = client.NetworkingV1Api() + try: + networking_api.delete_namespaced_ingress( + name="laconic-dns-probe", namespace=namespace + ) + except client.exceptions.ApiException: + pass # Ignore if already deleted + + +def verify_dns_via_probe( + hostname: str, namespace: str = "default", timeout: int = 30, poll_interval: int = 2 +) -> bool: + """Verify DNS by creating temp ingress and probing it. + + This definitively proves that traffic to the hostname reaches this cluster. + + Args: + hostname: The hostname to verify + namespace: Kubernetes namespace for probe ingress + timeout: Total seconds to wait for probe to succeed + poll_interval: Seconds between probe attempts + + Returns: + True if probe succeeds, False otherwise + """ + # First check DNS resolves at all + if not resolve_hostname(hostname): + print(f"DNS FAIL: {hostname} does not resolve") + return False + + print(f"Creating probe ingress for {hostname}...") + create_probe_ingress(hostname, namespace) + + try: + # Wait for Caddy to pick up the ingress + time.sleep(3) + + # Poll until success or timeout + probe_url = f"http://{hostname}/.well-known/laconic-probe" + start_time = time.time() + last_error = None + + while time.time() - start_time < timeout: + try: + response = requests.get(probe_url, timeout=5) + # For now, just verify we get a response from this cluster + # A more robust check would verify a unique token + if response.status_code < 500: + print(f"DNS PROBE OK: {hostname} routes to this cluster") + return True + except requests.RequestException as e: + last_error = e + + time.sleep(poll_interval) + + print(f"DNS PROBE FAIL: {hostname} - {last_error}") + return False + + finally: + print("Cleaning up probe ingress...") + delete_probe_ingress(namespace)