# Copyright © 2022, 2023 Vulcanize

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import click
from pathlib import Path
import subprocess
import sys
from stack_orchestrator import constants
from stack_orchestrator.deploy.images import push_images_operation
from stack_orchestrator.deploy.deploy import (
    up_operation,
    down_operation,
    ps_operation,
    port_operation,
    status_operation,
)
from stack_orchestrator.deploy.deploy import (
    exec_operation,
    logs_operation,
    create_deploy_context,
    update_operation,
)
from stack_orchestrator.deploy.deploy_types import DeployCommandContext
from stack_orchestrator.deploy.deployment_context import DeploymentContext


@click.group()
@click.option("--dir", required=True, help="path to deployment directory")
@click.pass_context
def command(ctx, dir):
    """manage a deployment"""

    # Check that --stack wasn't supplied
    if ctx.parent.obj.stack:
        print("Error: --stack can't be supplied with the deployment command")
        sys.exit(1)
    # Check dir is valid
    dir_path = Path(dir)
    if not dir_path.exists():
        print(f"Error: deployment directory {dir} does not exist")
        sys.exit(1)
    if not dir_path.is_dir():
        print(
            f"Error: supplied deployment directory path {dir} exists but is a "
            "file not a directory"
        )
        sys.exit(1)
    # Store the deployment context for subcommands
    deployment_context = DeploymentContext()
    deployment_context.init(dir_path)
    ctx.obj = deployment_context


def make_deploy_context(ctx) -> DeployCommandContext:
    """Build a DeployCommandContext from the DeploymentContext in ``ctx.obj``.

    The deployment type is read from the spec under
    ``constants.deploy_to_key`` and defaults to
    ``constants.compose_deploy_type`` when that key is absent. The deployment
    directory is passed to ``create_deploy_context`` as the stack, together
    with the grandparent click context's ``obj``.

    ``ctx.obj`` must still hold the DeploymentContext when this is called;
    callers typically overwrite ``ctx.obj`` with the returned value afterwards.
    """
    context: DeploymentContext = ctx.obj
    env_file = context.get_env_file()
    cluster_name = context.get_cluster_id()
    deployment_type = (
        context.spec.obj[constants.deploy_to_key]
        if constants.deploy_to_key in context.spec.obj
        else constants.compose_deploy_type
    )
    stack = context.deployment_dir
    return create_deploy_context(
        ctx.parent.parent.obj,
        context,
        stack,
        None,
        None,
        cluster_name,
        env_file,
        deployment_type,
    )


# NOTE: the subcommands below are documented with comments rather than
# docstrings on purpose: click renders a command's docstring as its --help
# text, so adding one would change user-visible output.


# TODO: remove legacy up command since it's an alias for start
@command.command()
@click.option(
    "--stay-attached/--detatch-terminal",
    default=False,
    help="detatch or not to see container stdout",
)
@click.option(
    "--skip-cluster-management/--perform-cluster-management",
    default=False,
    help="Skip cluster initialization/tear-down (only for kind-k8s deployments)",
)
@click.argument("extra_args", nargs=-1)  # help: command: up <service1> <service2>
@click.pass_context
def up(ctx, stay_attached, skip_cluster_management, extra_args):
    ctx.obj = make_deploy_context(ctx)
    # An empty argument tuple means "all services", signalled by None.
    services_list = list(extra_args) or None
    up_operation(ctx, services_list, stay_attached, skip_cluster_management)


# start is the preferred alias for up
@command.command()
@click.option(
    "--stay-attached/--detatch-terminal",
    default=False,
    help="detatch or not to see container stdout",
)
@click.option(
    "--skip-cluster-management/--perform-cluster-management",
    default=False,
    help="Skip cluster initialization/tear-down (only for kind-k8s deployments)",
)
@click.argument("extra_args", nargs=-1)  # help: command: up <service1> <service2>
@click.pass_context
def start(ctx, stay_attached, skip_cluster_management, extra_args):
    ctx.obj = make_deploy_context(ctx)
    # An empty argument tuple means "all services", signalled by None.
    services_list = list(extra_args) or None
    up_operation(ctx, services_list, stay_attached, skip_cluster_management)


# TODO: remove legacy down command since it's an alias for stop
@command.command()
@click.option(
    "--delete-volumes/--preserve-volumes", default=False, help="delete data volumes"
)
@click.option(
    "--skip-cluster-management/--perform-cluster-management",
    default=False,
    help="Skip cluster initialization/tear-down (only for kind-k8s deployments)",
)
@click.argument("extra_args", nargs=-1)  # help: command: down <service1> <service2>
@click.pass_context
def down(ctx, delete_volumes, skip_cluster_management, extra_args):
    # TODO: add cluster name and env file here
    ctx.obj = make_deploy_context(ctx)
    down_operation(ctx, delete_volumes, extra_args, skip_cluster_management)


# stop is the preferred alias for down
@command.command()
@click.option(
    "--delete-volumes/--preserve-volumes", default=False, help="delete data volumes"
)
@click.option(
    "--skip-cluster-management/--perform-cluster-management",
    default=False,
    help="Skip cluster initialization/tear-down (only for kind-k8s deployments)",
)
@click.argument("extra_args", nargs=-1)  # help: command: down <service1> <service2>
@click.pass_context
def stop(ctx, delete_volumes, skip_cluster_management, extra_args):
    # TODO: add cluster name and env file here
    ctx.obj = make_deploy_context(ctx)
    down_operation(ctx, delete_volumes, extra_args, skip_cluster_management)


@command.command()
@click.pass_context
def ps(ctx):
    ctx.obj = make_deploy_context(ctx)
    ps_operation(ctx)


@command.command()
@click.pass_context
def push_images(ctx):
    # Unlike the other subcommands, ctx.obj is left as the DeploymentContext
    # because push_images_operation takes both contexts explicitly.
    deploy_command_context: DeployCommandContext = make_deploy_context(ctx)
    deployment_context: DeploymentContext = ctx.obj
    push_images_operation(deploy_command_context, deployment_context)


@command.command()
@click.argument("extra_args", nargs=-1)  # help: command: port <service1> <service2>
@click.pass_context
def port(ctx, extra_args):
    ctx.obj = make_deploy_context(ctx)
    port_operation(ctx, extra_args)


@command.command()
@click.argument("extra_args", nargs=-1)  # help: command: exec <service> <command>
@click.pass_context
def exec(ctx, extra_args):
    ctx.obj = make_deploy_context(ctx)
    exec_operation(ctx, extra_args)


@command.command()
@click.option("--tail", "-n", default=None, help="number of lines to display")
@click.option("--follow", "-f", is_flag=True, default=False, help="follow log output")
@click.argument("extra_args", nargs=-1)  # help: command: logs <service1> <service2>
@click.pass_context
def logs(ctx, tail, follow, extra_args):
    ctx.obj = make_deploy_context(ctx)
    logs_operation(ctx, tail, follow, extra_args)


@command.command()
@click.pass_context
def status(ctx):
    ctx.obj = make_deploy_context(ctx)
    status_operation(ctx)


@command.command()
@click.pass_context
def update(ctx):
    ctx.obj = make_deploy_context(ctx)
    update_operation(ctx)


@command.command()
@click.argument("job_name")
@click.option(
    "--helm-release",
    help="Helm release name (for k8s helm chart deployments, defaults to chart name)",
)
@click.pass_context
def run_job(ctx, job_name, helm_release):
    """run a one-time job from the stack"""
    from stack_orchestrator.deploy.deploy import run_job_operation

    ctx.obj = make_deploy_context(ctx)
    run_job_operation(ctx, job_name, helm_release)


def _parse_image_overrides(image):
    """Turn repeated ``--image container=image`` values into a dict.

    Splits each entry on the first ``=`` only, so the image reference itself
    may contain ``=``.

    Raises:
        click.BadParameter: if an entry contains no ``=``.
    """
    image_overrides = {}
    for entry in image:
        if "=" not in entry:
            raise click.BadParameter(
                f"Invalid --image format '{entry}', expected container=image",
                param_hint="'--image'",
            )
        container_name, image_ref = entry.split("=", 1)
        image_overrides[container_name] = image_ref
    return image_overrides


def _find_repo_root(stack_source):
    """Return the git repository root that contains ``stack_source``.

    Asks git first (``git rev-parse --show-toplevel``). If that fails, walks
    up from ``stack_source`` until a directory containing ``.git`` is found;
    if none is found the walk ends at the topmost ancestor and that is
    returned.
    """
    git_root_result = subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        cwd=stack_source,
        capture_output=True,
        text=True,
    )
    if git_root_result.returncode == 0:
        return Path(git_root_result.stdout.strip())

    # Fallback: walk up from stack_source looking for .git
    repo_root = stack_source
    while repo_root != repo_root.parent:
        if (repo_root / ".git").exists():
            break
        repo_root = repo_root.parent
    return repo_root


@command.command()
@click.option("--stack-path", help="Path to stack git repo (overrides stored path)")
@click.option(
    "--spec-file", help="Path to GitOps spec.yml in repo (e.g., deployment/spec.yml)"
)
@click.option("--config-file", help="Config file to pass to deploy init")
@click.option(
    "--force",
    is_flag=True,
    default=False,
    help="Skip DNS verification",
)
@click.option(
    "--expected-ip",
    help="Expected IP for DNS verification (if different from egress)",
)
@click.option(
    "--image",
    multiple=True,
    help="Override container image: container=image",
)
@click.pass_context
def restart(ctx, stack_path, spec_file, config_file, force, expected_ip, image):
    """Pull latest code and restart deployment using git-tracked spec.

    GitOps workflow:
    1. Operator maintains spec.yml in their git repository
    2. This command pulls latest code (including updated spec.yml)
    3. If hostname changed, verifies DNS routes to this server
    4. Syncs deployment directory with the git-tracked spec
    5. Stops and restarts the deployment

    Data volumes are always preserved. The cluster is never destroyed.

    Stack source resolution (in order):
    1. --stack-path argument (if provided)
    2. stack-source field in deployment.yml (if stored)
    3. Error if neither available

    Note: spec.yml should be maintained in git, not regenerated from
    commands.py on each restart. Use 'deploy init' only for initial
    spec generation, then customize and commit to your operator repo.
    """
    import os

    from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec
    from stack_orchestrator.deploy.deployment_create import create_operation
    from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe

    # NOTE(review): config_file and expected_ip are accepted as options but
    # are not used anywhere in this function — confirm whether they should be
    # forwarded (e.g. to DNS verification) or removed.

    deployment_context: DeploymentContext = ctx.obj

    # Parse --image flags into a dict of container_name -> image
    image_overrides = _parse_image_overrides(image)

    # Get current spec info (before git pull)
    current_spec = deployment_context.spec
    current_http_proxy = current_spec.get_http_proxy()
    current_hostname = (
        current_http_proxy[0]["host-name"] if current_http_proxy else None
    )

    # Resolve stack source path
    if stack_path:
        stack_source = Path(stack_path).resolve()
    else:
        # Try to get from deployment.yml
        deployment_file = (
            deployment_context.deployment_dir / constants.deployment_file_name
        )
        # Close the file deterministically; an empty YAML document loads as
        # None, which is treated the same as "no stack-source recorded".
        with open(deployment_file) as deployment_fp:
            deployment_data = get_yaml().load(deployment_fp) or {}
        stack_source_str = deployment_data.get("stack-source")
        if not stack_source_str:
            print(
                "Error: No stack-source in deployment.yml and --stack-path not provided"
            )
            print("Use --stack-path to specify the stack git repository location")
            sys.exit(1)
        stack_source = Path(stack_source_str)

    if not stack_source.exists():
        print(f"Error: Stack source path does not exist: {stack_source}")
        sys.exit(1)

    print("=== Deployment Restart ===")
    print(f"Deployment dir: {deployment_context.deployment_dir}")
    print(f"Stack source: {stack_source}")
    print(f"Current hostname: {current_hostname}")

    # Step 1: Git pull (brings in updated spec.yml from operator's repo)
    print("\n[1/4] Pulling latest code from stack repository...")
    git_result = subprocess.run(
        ["git", "pull"], cwd=stack_source, capture_output=True, text=True
    )
    if git_result.returncode != 0:
        print(f"Git pull failed: {git_result.stderr}")
        sys.exit(1)
    print(f"Git pull: {git_result.stdout.strip()}")

    # Determine spec file location
    # Priority: --spec-file argument > repo's deployment/spec.yml > deployment dir
    # Find repo root via git rather than assuming a fixed directory depth.
    repo_root = _find_repo_root(stack_source)

    if spec_file:
        # Spec file relative to repo root
        spec_file_path = repo_root / spec_file
    else:
        # Try standard GitOps location in repo
        gitops_spec = repo_root / "deployment" / "spec.yml"
        if gitops_spec.exists():
            spec_file_path = gitops_spec
        else:
            # Fall back to deployment directory
            spec_file_path = deployment_context.deployment_dir / "spec.yml"

    if not spec_file_path.exists():
        print(f"Error: spec.yml not found at {spec_file_path}")
        print("For GitOps, add spec.yml to your repo at deployment/spec.yml")
        print("Or specify --spec-file with path relative to repo root")
        sys.exit(1)

    print(f"Using spec: {spec_file_path}")

    # Parse spec to check for hostname changes
    new_spec_obj = get_parsed_deployment_spec(str(spec_file_path))
    new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", [])
    new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None
    hostname_changed = bool(new_hostname) and new_hostname != current_hostname

    print(f"Spec hostname: {new_hostname}")

    # Step 2: DNS verification (only if hostname changed)
    if hostname_changed:
        print(f"\n[2/4] Hostname changed: {current_hostname} -> {new_hostname}")
        if force:
            print("DNS verification skipped (--force)")
        else:
            print("Verifying DNS via probe...")
            if not verify_dns_via_probe(new_hostname):
                print(f"\nDNS verification failed for {new_hostname}")
                print("Ensure DNS is configured before restarting.")
                print("Use --force to skip this check.")
                sys.exit(1)
    else:
        print("\n[2/4] Hostname unchanged, skipping DNS verification")

    # Step 3: Sync deployment directory with spec
    # The spec's "stack:" value is often a relative path (e.g.
    # "stack-orchestrator/stacks/dumpster") that must resolve from the
    # repo root. Change cwd so stack_is_external() sees it correctly.
    print("\n[3/4] Syncing deployment directory...")
    prev_cwd = os.getcwd()
    os.chdir(repo_root)
    try:
        deploy_ctx = make_deploy_context(ctx)
        create_operation(
            deployment_command_context=deploy_ctx,
            spec_file=str(spec_file_path),
            deployment_dir=str(deployment_context.deployment_dir),
            update=True,
            network_dir=None,
            initial_peers=None,
        )

        # Reload deployment context with updated spec
        deployment_context.init(deployment_context.deployment_dir)
        ctx.obj = deployment_context

        # Apply updated deployment.
        # If maintenance-service is configured, swap Ingress to maintenance
        # backend during the Recreate window so users see a branded page
        # instead of bare 502s.
        print("\n[4/4] Applying deployment update...")
        ctx.obj = make_deploy_context(ctx)

        # Check for maintenance service in the (reloaded) spec
        maintenance_svc = deployment_context.spec.get_maintenance_service()
        if maintenance_svc:
            print(f"Maintenance service configured: {maintenance_svc}")
            _restart_with_maintenance(
                ctx, deployment_context, maintenance_svc, image_overrides
            )
        else:
            up_operation(
                ctx,
                services_list=None,
                stay_attached=False,
                skip_cluster_management=True,
                image_overrides=image_overrides or None,
            )
    finally:
        # Restore cwd after both create_operation and up_operation have run
        # (both need the relative stack path to resolve from repo_root), and
        # also when either of them raises or exits.
        os.chdir(prev_cwd)

    print("\n=== Restart Complete ===")
    print("Deployment updated via rolling update.")
    if hostname_changed:
        print(f"\nNew hostname: {new_hostname}")
        print("Caddy will automatically provision TLS certificate.")


def _parse_maintenance_service(maintenance_svc):
    """Split a ``"container-name:port"`` spec into ``(container, port)``.

    Only the first two ``:``-separated fields are used.

    Raises:
        ValueError: if the container name or port is missing, or the port is
            not an integer.
    """
    parts = maintenance_svc.split(":")
    if len(parts) < 2 or not parts[0]:
        raise ValueError(
            f"Invalid maintenance-service '{maintenance_svc}', "
            "expected container-name:port"
        )
    try:
        port = int(parts[1])
    except ValueError:
        raise ValueError(
            f"Invalid maintenance-service port in '{maintenance_svc}', "
            "expected container-name:port"
        ) from None
    return parts[0], port


def _iter_ingress_paths(ingress):
    """Yield ``(rule_index, path_index, path)`` for every HTTP path of an Ingress.

    Tolerates an Ingress without rules and rules without an ``http`` section
    (both fields are optional in the Kubernetes API and surface as ``None``).
    """
    for rule_index, rule in enumerate(ingress.spec.rules or []):
        paths = rule.http.paths if rule.http else None
        for path_index, path in enumerate(paths or []):
            yield rule_index, path_index, path


def _deployment_ready(apps_api, dep_name, namespace):
    """Return True when the Deployment's ready replicas reach its desired count."""
    dep = apps_api.read_namespaced_deployment(name=dep_name, namespace=namespace)
    ready = dep.status.ready_replicas or 0
    # Only an unset replica count defaults to 1; an explicit 0 must stay 0,
    # otherwise the caller would wait for a pod that will never be created.
    desired = dep.spec.replicas if dep.spec.replicas is not None else 1
    return ready >= desired


def _restart_with_maintenance(
    ctx, deployment_context, maintenance_svc, image_overrides
):
    """Restart with Ingress swap to maintenance service during Recreate.

    Flow:
    1. Deploy all pods (including maintenance pod) with up_operation
    2. Patch Ingress: swap all route backends to maintenance service
    3. Scale main (non-maintenance) Deployments to 0
    4. Scale main Deployments back up (triggers Recreate with new spec)
    5. Wait for readiness
    6. Patch Ingress: restore original backends

    This ensures the maintenance pod is already running before we touch
    the Ingress, and the main pods get a clean Recreate.

    Args:
        ctx: click context whose ``obj`` is the deploy command context.
        deployment_context: reloaded DeploymentContext (used for the replica
            count to scale back up to).
        maintenance_svc: maintenance service spec, ``"container-name:port"``.
        image_overrides: dict of container name -> image, may be empty.

    A malformed ``maintenance_svc`` prints an error and exits with status 1
    before anything is applied to the cluster. If the Ingress cannot be read,
    a warning is printed and the swap/Recreate steps are skipped.
    """
    import time

    from kubernetes.client.exceptions import ApiException

    # Parse maintenance service spec up front so a malformed value fails
    # before any change is applied to the cluster.
    try:
        maint_container, maint_port = _parse_maintenance_service(maintenance_svc)
    except ValueError as err:
        print(f"Error: {err}")
        sys.exit(1)

    # Step 1: Apply the full deployment (creates/updates all pods + services)
    # This ensures maintenance pod exists before we swap Ingress to it.
    up_operation(
        ctx,
        services_list=None,
        stay_attached=False,
        skip_cluster_management=True,
        image_overrides=image_overrides or None,
    )

    # Connect to k8s API
    deploy_ctx = ctx.obj
    deployer = deploy_ctx.deployer
    deployer.connect_api()
    namespace = deployer.k8s_namespace
    app_name = deployer.cluster_info.app_name
    networking_api = deployer.networking_api
    apps_api = deployer.apps_api

    ingress_name = f"{app_name}-ingress"

    # Step 2: Read current Ingress and save original backends
    try:
        ingress = networking_api.read_namespaced_ingress(
            name=ingress_name, namespace=namespace
        )
    except ApiException as err:
        # Best-effort: the swap is skipped either way, but only a 404 really
        # means "no Ingress"; report anything else as what it is.
        if err.status == 404:
            print("Warning: No Ingress found, skipping maintenance swap")
        else:
            print(
                f"Warning: Could not read Ingress ({err.status} {err.reason}), "
                "skipping maintenance swap"
            )
        return

    # Resolve which service the maintenance container belongs to
    maint_service_name = deployer.cluster_info._resolve_service_name_for_container(
        maint_container
    )

    # Save original backends for restoration, keyed by (rule, path) position
    original_backends = {
        (rule_index, path_index): {
            "name": path.backend.service.name,
            "port": path.backend.service.port.number,
        }
        for rule_index, path_index, path in _iter_ingress_paths(ingress)
    }

    # Patch all Ingress backends to point to maintenance service
    print("Swapping Ingress to maintenance service...")
    for _, _, path in _iter_ingress_paths(ingress):
        path.backend.service.name = maint_service_name
        path.backend.service.port.number = maint_port

    networking_api.replace_namespaced_ingress(
        name=ingress_name, namespace=namespace, body=ingress
    )
    print("Ingress now points to maintenance service")

    # Step 3: Find main (non-maintenance) Deployments and scale to 0
    # then back up to trigger a clean Recreate
    deployments_resp = apps_api.list_namespaced_deployment(
        namespace=namespace, label_selector=f"app={app_name}"
    )
    # A Deployment counts as "maintenance" when the maintenance container
    # name appears in its app.kubernetes.io/component label.
    main_deployments = [
        dep.metadata.name
        for dep in deployments_resp.items
        if maint_container
        not in (dep.metadata.labels or {}).get("app.kubernetes.io/component", "")
    ]

    if main_deployments:
        # Scale down main deployments
        for dep_name in main_deployments:
            print(f"Scaling down {dep_name}...")
            apps_api.patch_namespaced_deployment_scale(
                name=dep_name,
                namespace=namespace,
                body={"spec": {"replicas": 0}},
            )

        # Wait for pods to terminate
        print("Waiting for main pods to terminate...")
        deadline = time.monotonic() + 120
        while time.monotonic() < deadline:
            pods = deployer.core_api.list_namespaced_pod(
                namespace=namespace,
                label_selector=f"app={app_name}",
            )
            # Count non-maintenance pods that are not already being deleted
            active = sum(
                1
                for p in pods.items
                if p.metadata
                and p.metadata.deletion_timestamp is None
                and not any(
                    maint_container in (c.name or "")
                    for c in (p.spec.containers or [])
                )
            )
            if active == 0:
                break
            time.sleep(2)
        else:
            # Loop ended by timeout rather than break.
            print("Warning: Timed out waiting for main pods to terminate")

        # Step 4: Scale back up
        replicas = deployment_context.spec.get_replicas()
        for dep_name in main_deployments:
            print(f"Scaling up {dep_name} to {replicas} replicas...")
            apps_api.patch_namespaced_deployment_scale(
                name=dep_name,
                namespace=namespace,
                body={"spec": {"replicas": replicas}},
            )

        # Step 5: Wait for readiness
        print("Waiting for main pods to become ready...")
        deadline = time.monotonic() + 300
        while time.monotonic() < deadline:
            if all(
                _deployment_ready(apps_api, dep_name, namespace)
                for dep_name in main_deployments
            ):
                break
            time.sleep(5)
        else:
            # Loop ended by timeout rather than break; the Ingress is still
            # restored below, as before.
            print("Warning: Timed out waiting for main pods to become ready")

    # Step 6: Restore original Ingress backends
    print("Restoring original Ingress backends...")
    # Re-read so the replace is applied to the current version of the object.
    ingress = networking_api.read_namespaced_ingress(
        name=ingress_name, namespace=namespace
    )
    for rule_index, path_index, path in _iter_ingress_paths(ingress):
        saved = original_backends.get((rule_index, path_index))
        if saved is not None:
            path.backend.service.name = saved["name"]
            path.backend.service.port.number = saved["port"]

    networking_api.replace_namespaced_ingress(
        name=ingress_name, namespace=namespace, body=ingress
    )
    print("Ingress restored to original backends")