diff --git a/.gitignore b/.gitignore index 3aaa220b..6abbf941 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ __pycache__ package stack_orchestrator/data/build_tag.txt /build +.worktrees diff --git a/.pebbles/events.jsonl b/.pebbles/events.jsonl index 9e07f970..ed5117b7 100644 --- a/.pebbles/events.jsonl +++ b/.pebbles/events.jsonl @@ -1,2 +1,7 @@ {"type": "create", "timestamp": "2026-03-18T14:45:07.038870Z", "issue_id": "so-a1a", "payload": {"title": "deploy create should support external credential injection", "type": "feature", "priority": "2", "description": "deploy create generates config.env but provides no mechanism to inject external credentials (API keys, tokens, etc.) at creation time. Operators must append to config.env after the fact, which mutates a build artifact. deploy create should accept --credentials-file or similar to include secrets in the generated config.env."}} {"type": "create", "timestamp": "2026-03-18T14:45:07.038942Z", "issue_id": "so-b2b", "payload": {"title": "REGISTRY_TOKEN / imagePullSecret flow undocumented", "type": "bug", "priority": "2", "description": "create_registry_secret() exists in deployment_create.py and is called during up(), but REGISTRY_TOKEN is not documented in spec.yml or any user-facing docs. The restart command warns \"Registry token env var REGISTRY_TOKEN not set, skipping registry secret\" but doesn't explain how to set it. For GHCR private images, this is required and the flow from spec.yml -> config.env -> imagePullSecret needs documentation."}} +{"type": "create", "timestamp": "2026-03-18T19:10:00.000000Z", "issue_id": "so-k1k", "payload": {"title": "Stack path resolution differs between deploy create and deployment restart", "type": "bug", "priority": "2", "description": "deploy create resolves --stack as a relative path from cwd. deployment restart resolves --stack-path as absolute, then computes repo_root as 4 parents up (assuming stack_orchestrator/data/stacks/name structure). External stacks with different nesting depths (e.g. stack-orchestrator/stacks/name = 3 levels) get wrong repo_root, causing --spec-file resolution to fail. The two commands should use the same path resolution logic."}} +{"type": "create", "timestamp": "2026-03-18T19:25:00.000000Z", "issue_id": "so-l2l", "payload": {"title": "deployment restart should update in place, not delete/recreate", "type": "bug", "priority": "1", "description": "deployment restart deletes the entire namespace then recreates everything from scratch. This causes:\n\n1. **Downtime** — nothing serves traffic between delete and successful recreate\n2. **No rollback** — deleting the namespace destroys ReplicaSet revision history\n3. **Race conditions** — namespace may still be terminating when up() tries to create\n4. **Cascading failures** — if ANY container fails to start, the entire site is down with no fallback\n\nFix: three changes needed.\n\n**A. up() should create-or-update, not just create.** Use patch/apply semantics for Deployments, Services, Ingresses. When the pod spec changes (new env vars, new image), k8s creates a new ReplicaSet, scales it up, waits for readiness probes, then scales the old one down. Old pods serve traffic until new pods are healthy.\n\n**B. down() should never delete the namespace on restart.** Only on explicit teardown. The namespace owns the revision history. Current code: _delete_namespace() on every down(). Should: delete individual resources by label for teardown, do nothing for restart (let update-in-place handle it).\n\n**C. All containers need readiness probes.** Without them k8s considers pods ready immediately, defeating rolling update safety. laconic-so should generate readiness probes from the http-proxy routes in spec.yml (if a container has an http route, probe that port).\n\nWith these changes, k8s native rolling updates provide zero-downtime deploys and automatic rollback (if new pods fail readiness, rollout stalls, old pods keep serving).\n\nSource files:\n- deploy_k8s.py: up(), down(), _create_deployment(), _delete_namespace()\n- cluster_info.py: pod spec generation (needs readiness probes)\n- deployment.py: restart() orchestration"}} +{"type": "create", "timestamp": "2026-03-18T20:15:03.000000Z", "issue_id": "so-m3m", "payload": {"title": "Add credentials-files spec key for on-disk credential injection", "type": "feature", "priority": "1", "description": "deployment restart regenerates config.env from spec.yml, wiping credentials that were appended from on-disk files (e.g. ~/.credentials/*.env). Operators must append credentials after deploy create, which is fragile and breaks on restart.\n\nFix: New top-level spec key credentials-files. _write_config_file() reads each file and appends its contents to config.env after writing config vars. Files are read at deploy time from the deployment host.\n\nSpec syntax:\n credentials-files:\n - ~/.credentials/dumpster-secrets.env\n - ~/.credentials/dumpster-r2.env\n\nFiles:\n- deploy/spec.py: add get_credentials_files() returning list of paths\n- deploy/deployment_create.py: in _write_config_file(), after writing config vars, read and append each credentials file (expand ~ to home dir)\n\nAlso update dumpster-stack spec.yml to use the new key and remove the ansible credential append workaround from woodburn_deployer (group_vars/all.yml credentials_env_files, stack_deploy role append tasks, restart_dumpster.yml credential steps). Those cleanups are in the woodburn_deployer repo."}} +{"type":"status_update","timestamp":"2026-03-18T21:54:12.59148256Z","issue_id":"so-m3m","payload":{"status":"in_progress"}} +{"type":"close","timestamp":"2026-03-18T21:55:31.6035544Z","issue_id":"so-m3m","payload":{}} diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py index 0804b5a6..2c140f30 100644 --- a/stack_orchestrator/deploy/deployment.py +++ b/stack_orchestrator/deploy/deployment.py @@ -399,11 +399,60 @@ def restart(ctx, stack_path, spec_file, config_file, force, expected_ip, image): deployment_context.init(deployment_context.deployment_dir) ctx.obj = deployment_context - # Apply updated deployment (create-or-update triggers rolling update). - # No down() — k8s rolling update keeps old pods serving traffic until - # new pods pass readiness checks. + # Apply updated deployment. + # If maintenance-service is configured, swap Ingress to maintenance + # backend during the Recreate window so users see a branded page + # instead of bare 502s. print("\n[4/4] Applying deployment update...") ctx.obj = make_deploy_context(ctx) + + # Check for maintenance service in the (reloaded) spec + maintenance_svc = deployment_context.spec.get_maintenance_service() + if maintenance_svc: + print(f"Maintenance service configured: {maintenance_svc}") + _restart_with_maintenance( + ctx, deployment_context, maintenance_svc, image_overrides + ) + else: + up_operation( + ctx, + services_list=None, + stay_attached=False, + skip_cluster_management=True, + image_overrides=image_overrides or None, + ) + + print("\n=== Restart Complete ===") + print("Deployment updated via rolling update.") + if new_hostname and new_hostname != current_hostname: + print(f"\nNew hostname: {new_hostname}") + print("Caddy will automatically provision TLS certificate.") + + +def _restart_with_maintenance( + ctx, deployment_context, maintenance_svc, image_overrides +): + """Restart with Ingress swap to maintenance service during Recreate. + + Flow: + 1. Deploy all pods (including maintenance pod) with up_operation + 2. Patch Ingress: swap all route backends to maintenance service + 3. Scale main (non-maintenance) Deployments to 0 + 4. Scale main Deployments back up (triggers Recreate with new spec) + 5. Wait for readiness + 6. Patch Ingress: restore original backends + + This ensures the maintenance pod is already running before we touch + the Ingress, and the main pods get a clean Recreate. + """ + import time + + from kubernetes.client.exceptions import ApiException + + from stack_orchestrator.deploy.deploy import up_operation + + # Step 1: Apply the full deployment (creates/updates all pods + services) + # This ensures maintenance pod exists before we swap Ingress to it. up_operation( ctx, services_list=None, @@ -412,8 +461,146 @@ def restart(ctx, stack_path, spec_file, config_file, force, expected_ip, image): image_overrides=image_overrides or None, ) - print("\n=== Restart Complete ===") - print("Deployment updated via rolling update.") - if new_hostname and new_hostname != current_hostname: - print(f"\nNew hostname: {new_hostname}") - print("Caddy will automatically provision TLS certificate.") + # Parse maintenance service spec: "container-name:port" + maint_container = maintenance_svc.split(":")[0] + maint_port = int(maintenance_svc.split(":")[1]) + + # Connect to k8s API + deploy_ctx = ctx.obj + deployer = deploy_ctx.deployer + deployer.connect_api() + namespace = deployer.k8s_namespace + app_name = deployer.cluster_info.app_name + networking_api = deployer.networking_api + apps_api = deployer.apps_api + + ingress_name = f"{app_name}-ingress" + + # Step 2: Read current Ingress and save original backends + try: + ingress = networking_api.read_namespaced_ingress( + name=ingress_name, namespace=namespace + ) + except ApiException: + print("Warning: No Ingress found, skipping maintenance swap") + return + + # Resolve which service the maintenance container belongs to + maint_service_name = deployer.cluster_info._resolve_service_name_for_container( + maint_container + ) + + # Save original backends for restoration + original_backends = [] + for rule in ingress.spec.rules: + rule_backends = [] + for path in rule.http.paths: + rule_backends.append( + { + "name": path.backend.service.name, + "port": path.backend.service.port.number, + } + ) + original_backends.append(rule_backends) + + # Patch all Ingress backends to point to maintenance service + print("Swapping Ingress to maintenance service...") + for rule in ingress.spec.rules: + for path in rule.http.paths: + path.backend.service.name = maint_service_name + path.backend.service.port.number = maint_port + + networking_api.replace_namespaced_ingress( + name=ingress_name, namespace=namespace, body=ingress + ) + print("Ingress now points to maintenance service") + + # Step 3: Find main (non-maintenance) Deployments and scale to 0 + # then back up to trigger a clean Recreate + deployments_resp = apps_api.list_namespaced_deployment( + namespace=namespace, label_selector=f"app={app_name}" + ) + main_deployments = [] + for dep in deployments_resp.items: + dep_name = dep.metadata.name + # Skip maintenance deployments + component = (dep.metadata.labels or {}).get("app.kubernetes.io/component", "") + is_maintenance = maint_container in component + if not is_maintenance: + main_deployments.append(dep_name) + + if main_deployments: + # Scale down main deployments + for dep_name in main_deployments: + print(f"Scaling down {dep_name}...") + apps_api.patch_namespaced_deployment_scale( + name=dep_name, + namespace=namespace, + body={"spec": {"replicas": 0}}, + ) + + # Wait for pods to terminate + print("Waiting for main pods to terminate...") + deadline = time.monotonic() + 120 + while time.monotonic() < deadline: + pods = deployer.core_api.list_namespaced_pod( + namespace=namespace, + label_selector=f"app={app_name}", + ) + # Count non-maintenance pods + active = sum( + 1 + for p in pods.items + if p.metadata + and p.metadata.deletion_timestamp is None + and not any( + maint_container in (c.name or "") for c in (p.spec.containers or []) + ) + ) + if active == 0: + break + time.sleep(2) + + # Scale back up + replicas = deployment_context.spec.get_replicas() + for dep_name in main_deployments: + print(f"Scaling up {dep_name} to {replicas} replicas...") + apps_api.patch_namespaced_deployment_scale( + name=dep_name, + namespace=namespace, + body={"spec": {"replicas": replicas}}, + ) + + # Step 5: Wait for readiness + print("Waiting for main pods to become ready...") + deadline = time.monotonic() + 300 + while time.monotonic() < deadline: + all_ready = True + for dep_name in main_deployments: + dep = apps_api.read_namespaced_deployment( + name=dep_name, namespace=namespace + ) + ready = dep.status.ready_replicas or 0 + desired = dep.spec.replicas or 1 + if ready < desired: + all_ready = False + break + if all_ready: + break + time.sleep(5) + + # Step 6: Restore original Ingress backends + print("Restoring original Ingress backends...") + ingress = networking_api.read_namespaced_ingress( + name=ingress_name, namespace=namespace + ) + for i, rule in enumerate(ingress.spec.rules): + for j, path in enumerate(rule.http.paths): + if i < len(original_backends) and j < len(original_backends[i]): + path.backend.service.name = original_backends[i][j]["name"] + path.backend.service.port.number = original_backends[i][j]["port"] + + networking_api.replace_namespaced_ingress( + name=ingress_name, namespace=namespace, body=ingress + ) + print("Ingress restored to original backends") diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index d9ede7f1..6890b430 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -167,6 +167,28 @@ class ClusterInfo: nodeports.append(service) return nodeports + def _resolve_service_name_for_container(self, container_name: str) -> str: + """Resolve the k8s Service name that routes to a given container. + + For multi-pod stacks, each pod has its own Service. We find which + pod file contains this container and return the corresponding + service name. For single-pod stacks, returns the legacy service name. + """ + pod_files = list(self.parsed_pod_yaml_map.keys()) + multi_pod = len(pod_files) > 1 + + if not multi_pod: + return f"{self.app_name}-service" + + for pod_file in pod_files: + pod = self.parsed_pod_yaml_map[pod_file] + if container_name in pod.get("services", {}): + pod_name = self._pod_name_from_file(pod_file) + return f"{self.app_name}-{pod_name}-service" + + # Fallback: container not found in any pod file + return f"{self.app_name}-service" + def get_ingress( self, use_tls=False, certificates=None, cluster_issuer="letsencrypt-prod" ): @@ -186,12 +208,16 @@ class ClusterInfo: if use_tls: tls.append( client.V1IngressTLS( - hosts=certificate["spec"]["dnsNames"] - if certificate - else [host_name], - secret_name=certificate["spec"]["secretName"] - if certificate - else f"{self.app_name}-{host_name}-tls", + hosts=( + certificate["spec"]["dnsNames"] + if certificate + else [host_name] + ), + secret_name=( + certificate["spec"]["secretName"] + if certificate + else f"{self.app_name}-{host_name}-tls" + ), ) ) @@ -202,16 +228,18 @@ class ClusterInfo: if opts.o.debug: print(f"proxy config: {path} -> {proxy_to}") # proxy_to has the form : + container_name = proxy_to.split(":")[0] proxy_to_port = int(proxy_to.split(":")[1]) + service_name = self._resolve_service_name_for_container( + container_name + ) paths.append( client.V1HTTPIngressPath( path_type="Prefix", path=path, backend=client.V1IngressBackend( service=client.V1IngressServiceBackend( - # TODO: this looks wrong - name=f"{self.app_name}-service", - # TODO: pull port number from the service + name=service_name, port=client.V1ServiceBackendPort( number=proxy_to_port ), @@ -618,14 +646,16 @@ class ClusterInfo: readiness_probe=readiness_probe, security_context=client.V1SecurityContext( privileged=self.spec.get_privileged(), - run_as_user=int(service_info["user"]) - if "user" in service_info - else None, - capabilities=client.V1Capabilities( - add=self.spec.get_capabilities() - ) - if self.spec.get_capabilities() - else None, + run_as_user=( + int(service_info["user"]) + if "user" in service_info + else None + ), + capabilities=( + client.V1Capabilities(add=self.spec.get_capabilities()) + if self.spec.get_capabilities() + else None + ), ), resources=to_k8s_resource_requirements(container_resources), ) @@ -647,18 +677,34 @@ class ClusterInfo: volumes = volumes_for_pod_files(parsed_yaml_map, self.spec, self.app_name) return containers, init_containers, services, volumes - # TODO: put things like image pull policy into an object-scope struct - def get_deployment(self, image_pull_policy: Optional[str] = None): - containers, init_containers, services, volumes = self._build_containers( - self.parsed_pod_yaml_map, image_pull_policy - ) - registry_config = self.spec.get_image_registry_config() - if registry_config: - secret_name = f"{self.app_name}-image-pull-secret" - image_pull_secrets = [client.V1LocalObjectReference(name=secret_name)] - else: - image_pull_secrets = [] + def _pod_name_from_file(self, pod_file: str) -> str: + """Extract pod name from compose file path. + docker-compose-dumpster.yml -> dumpster + docker-compose-dumpster-maintenance.yml -> dumpster-maintenance + """ + import os + + base = os.path.basename(pod_file) + name = base + if name.startswith("docker-compose-"): + name = name[len("docker-compose-") :] + if name.endswith(".yml"): + name = name[: -len(".yml")] + elif name.endswith(".yaml"): + name = name[: -len(".yaml")] + return name + + def _pod_has_pvcs(self, parsed_pod_file: Any) -> bool: + """Check if a parsed compose file declares named volumes (PVCs).""" + volumes = parsed_pod_file.get("volumes", {}) + return len(volumes) > 0 + + def _build_common_pod_metadata(self, services: dict) -> tuple: + """Build shared annotations, labels, affinity, tolerations for pods. + + Returns (annotations, labels, affinity, tolerations). + """ annotations = None labels = {"app": self.app_name} if self.stack_name: @@ -680,7 +726,6 @@ class ClusterInfo: if self.spec.get_node_affinities(): affinities = [] for rule in self.spec.get_node_affinities(): - # TODO add some input validation here label_name = rule["label"] label_value = rule["value"] affinities.append( @@ -703,7 +748,6 @@ class ClusterInfo: if self.spec.get_node_tolerations(): tolerations = [] for toleration in self.spec.get_node_tolerations(): - # TODO add some input validation here toleration_key = toleration["key"] toleration_value = toleration["value"] tolerations.append( @@ -715,44 +759,210 @@ class ClusterInfo: ) ) - use_host_network = self._any_service_has_host_network() - template = client.V1PodTemplateSpec( - metadata=client.V1ObjectMeta(annotations=annotations, labels=labels), - spec=client.V1PodSpec( - containers=containers, - init_containers=init_containers or None, - image_pull_secrets=image_pull_secrets, - volumes=volumes, - affinity=affinity, - tolerations=tolerations, - runtime_class_name=self.spec.get_runtime_class(), - host_network=use_host_network or None, - dns_policy=("ClusterFirstWithHostNet" if use_host_network else None), - ), - ) - spec = client.V1DeploymentSpec( - replicas=self.spec.get_replicas(), - template=template, - selector={"matchLabels": {"app": self.app_name}}, - ) + return annotations, labels, affinity, tolerations - deployment = client.V1Deployment( - api_version="apps/v1", - kind="Deployment", - metadata=client.V1ObjectMeta( - name=f"{self.app_name}-deployment", - labels={ - "app": self.app_name, - **( - {"app.kubernetes.io/stack": self.stack_name} - if self.stack_name - else {} + # TODO: put things like image pull policy into an object-scope struct + def get_deployment(self, image_pull_policy: Optional[str] = None): + """Build a single k8s Deployment from all pod files (legacy behavior). + + When only one pod is defined in the stack, this is equivalent to + get_deployments()[0]. Kept for backward compatibility. + """ + deployments = self.get_deployments(image_pull_policy) + if not deployments: + return None + # Legacy: return the first (and usually only) deployment + return deployments[0] + + def get_deployments( + self, image_pull_policy: Optional[str] = None + ) -> List[client.V1Deployment]: + """Build one k8s Deployment per pod file. + + Each pod file (docker-compose-.yml) becomes its own Deployment + with independent lifecycle and update strategy: + - Pods with PVCs get strategy=Recreate (can't do rolling updates + with ReadWriteOnce volumes) + - Pods without PVCs get strategy=RollingUpdate + + This enables maintenance services to survive main pod restarts. + """ + if not self.parsed_pod_yaml_map: + return [] + + registry_config = self.spec.get_image_registry_config() + if registry_config: + secret_name = f"{self.app_name}-image-pull-secret" + image_pull_secrets = [client.V1LocalObjectReference(name=secret_name)] + else: + image_pull_secrets = [] + + use_host_network = self._any_service_has_host_network() + pod_files = list(self.parsed_pod_yaml_map.keys()) + + # Single pod file: preserve legacy naming ({app_name}-deployment) + # Multiple pod files: use {app_name}-{pod_name}-deployment + multi_pod = len(pod_files) > 1 + + deployments = [] + for pod_file in pod_files: + pod_name = self._pod_name_from_file(pod_file) + single_pod_map = {pod_file: self.parsed_pod_yaml_map[pod_file]} + containers, init_containers, services, volumes = self._build_containers( + single_pod_map, image_pull_policy + ) + annotations, labels, affinity, tolerations = ( + self._build_common_pod_metadata(services) + ) + + # Add pod-name label so Services can target specific pods + if multi_pod: + labels["app.kubernetes.io/component"] = pod_name + + has_pvcs = self._pod_has_pvcs(self.parsed_pod_yaml_map[pod_file]) + if has_pvcs: + strategy = client.V1DeploymentStrategy(type="Recreate") + else: + strategy = client.V1DeploymentStrategy( + type="RollingUpdate", + rolling_update=client.V1RollingUpdateDeployment( + max_unavailable=0, max_surge=1 ), - }, - ), - spec=spec, - ) - return deployment + ) + + # Pod selector: for multi-pod, select by both app and component + selector_labels = {"app": self.app_name} + if multi_pod: + selector_labels["app.kubernetes.io/component"] = pod_name + + template = client.V1PodTemplateSpec( + metadata=client.V1ObjectMeta(annotations=annotations, labels=labels), + spec=client.V1PodSpec( + containers=containers, + init_containers=init_containers or None, + image_pull_secrets=image_pull_secrets, + volumes=volumes, + affinity=affinity, + tolerations=tolerations, + runtime_class_name=self.spec.get_runtime_class(), + host_network=use_host_network or None, + dns_policy=( + "ClusterFirstWithHostNet" if use_host_network else None + ), + ), + ) + + if multi_pod: + deployment_name = f"{self.app_name}-{pod_name}-deployment" + else: + deployment_name = f"{self.app_name}-deployment" + + spec = client.V1DeploymentSpec( + replicas=self.spec.get_replicas(), + template=template, + selector={"matchLabels": selector_labels}, + strategy=strategy, + ) + + deployment = client.V1Deployment( + api_version="apps/v1", + kind="Deployment", + metadata=client.V1ObjectMeta( + name=deployment_name, + labels={ + "app": self.app_name, + **( + { + "app.kubernetes.io/stack": self.stack_name, + } + if self.stack_name + else {} + ), + **( + {"app.kubernetes.io/component": pod_name} + if multi_pod + else {} + ), + }, + ), + spec=spec, + ) + deployments.append(deployment) + + return deployments + + def get_services(self) -> List[client.V1Service]: + """Build per-pod ClusterIP Services for multi-pod stacks. + + Each pod's containers get their own Service so Ingress can route + to specific pods. For single-pod stacks, returns a list with one + service matching the legacy get_service() behavior. + """ + pod_files = list(self.parsed_pod_yaml_map.keys()) + multi_pod = len(pod_files) > 1 + + if not multi_pod: + # Legacy: single service for all pods + svc = self.get_service() + return [svc] if svc else [] + + # Multi-pod: one service per pod, only for pods that have + # ports referenced by http-proxy routes + http_proxy_list = self.spec.get_http_proxy() + if not http_proxy_list: + return [] + + # Build map: container_name -> port from http-proxy routes + container_ports: dict = {} + for http_proxy in http_proxy_list: + for route in http_proxy.get("routes", []): + proxy_to = route.get("proxy-to", "") + if ":" in proxy_to: + container, port_str = proxy_to.rsplit(":", 1) + port = int(port_str) + if container not in container_ports: + container_ports[container] = set() + container_ports[container].add(port) + + # Build map: pod_file -> set of service names in that pod + pod_services_map: dict = {} + for pod_file in pod_files: + pod = self.parsed_pod_yaml_map[pod_file] + pod_services_map[pod_file] = set(pod.get("services", {}).keys()) + + services = [] + for pod_file in pod_files: + pod_name = self._pod_name_from_file(pod_file) + svc_names = pod_services_map[pod_file] + # Collect ports from http-proxy that belong to this pod's containers + ports_set: Set[int] = set() + for svc_name in svc_names: + if svc_name in container_ports: + ports_set.update(container_ports[svc_name]) + + if not ports_set: + continue + + service_ports = [ + client.V1ServicePort(port=p, target_port=p, name=f"port-{p}") + for p in sorted(ports_set) + ] + service = client.V1Service( + metadata=client.V1ObjectMeta( + name=f"{self.app_name}-{pod_name}-service", + labels={"app": self.app_name}, + ), + spec=client.V1ServiceSpec( + type="ClusterIP", + ports=service_ports, + selector={ + "app": self.app_name, + "app.kubernetes.io/component": pod_name, + }, + ), + ) + services.append(service) + return services def get_jobs(self, image_pull_policy: Optional[str] = None) -> List[client.V1Job]: """Build k8s Job objects from parsed job compose files. diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 787f20fd..eb257ef6 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -411,91 +411,102 @@ class K8sDeployer(Deployer): if opts.o.debug: print("No pods defined, skipping Deployment creation") return - # Process compose files into a Deployment - deployment = self.cluster_info.get_deployment(image_pull_policy="Always") - # Apply image overrides if provided - if self.image_overrides: - for container in deployment.spec.template.spec.containers: - if container.name in self.image_overrides: - container.image = self.image_overrides[container.name] - if opts.o.debug: - print( - f"Overriding image for {container.name}: {container.image}" - ) - # Create or update the k8s Deployment - if opts.o.debug: - print(f"Sending this deployment: {deployment}") - if not opts.o.dry_run: - name = deployment.metadata.name - try: - deployment_resp = cast( - client.V1Deployment, - self.apps_api.create_namespaced_deployment( - body=deployment, namespace=self.k8s_namespace - ), - ) - print(f"Created Deployment {name}") - except ApiException as e: - if e.status == 409: - # Already exists — replace to ensure removed fields - # (volumes, mounts, env vars) are actually deleted. - # Patch uses strategic merge which preserves old fields. - existing = self.apps_api.read_namespaced_deployment( - name=name, namespace=self.k8s_namespace - ) - deployment.metadata.resource_version = ( - existing.metadata.resource_version - ) + # Process compose files into Deployments (one per pod file) + deployments = self.cluster_info.get_deployments(image_pull_policy="Always") + for deployment in deployments: + # Apply image overrides if provided + if self.image_overrides: + for container in deployment.spec.template.spec.containers: + if container.name in self.image_overrides: + container.image = self.image_overrides[container.name] + if opts.o.debug: + print( + f"Overriding image for {container.name}:" + f" {container.image}" + ) + # Create or update the k8s Deployment + if opts.o.debug: + print(f"Sending this deployment: {deployment}") + if not opts.o.dry_run: + name = deployment.metadata.name + try: deployment_resp = cast( client.V1Deployment, - self.apps_api.replace_namespaced_deployment( - name=name, - namespace=self.k8s_namespace, - body=deployment, + self.apps_api.create_namespaced_deployment( + body=deployment, namespace=self.k8s_namespace ), ) - print(f"Updated Deployment {name} (rolling update)") - else: - raise - if opts.o.debug: - meta = deployment_resp.metadata - spec = deployment_resp.spec - if meta and spec and spec.template.spec: - containers = spec.template.spec.containers - img = containers[0].image if containers else None - print(f" {meta.namespace} {meta.name} gen={meta.generation} {img}") + strategy = ( + deployment.spec.strategy.type + if deployment.spec.strategy + else "default" + ) + print(f"Created Deployment {name} (strategy: {strategy})") + except ApiException as e: + if e.status == 409: + # Already exists — replace to ensure removed fields + # (volumes, mounts, env vars) are actually deleted. + existing = self.apps_api.read_namespaced_deployment( + name=name, namespace=self.k8s_namespace + ) + deployment.metadata.resource_version = ( + existing.metadata.resource_version + ) + deployment_resp = cast( + client.V1Deployment, + self.apps_api.replace_namespaced_deployment( + name=name, + namespace=self.k8s_namespace, + body=deployment, + ), + ) + print(f"Updated Deployment {name} (rolling update)") + else: + raise + if opts.o.debug: + meta = deployment_resp.metadata + spec = deployment_resp.spec + if meta and spec and spec.template.spec: + containers = spec.template.spec.containers + img = containers[0].image if containers else None + print( + f" {meta.namespace} {meta.name}" + f" gen={meta.generation} {img}" + ) - service = self.cluster_info.get_service() - if opts.o.debug: - print(f"Sending this service: {service}") - if service and not opts.o.dry_run: - svc_name = service.metadata.name - try: - service_resp = self.core_api.create_namespaced_service( - namespace=self.k8s_namespace, body=service - ) - print(f"Created Service {svc_name}") - except ApiException as e: - if e.status == 409: - # Replace to ensure removed ports are deleted. - # Must preserve clusterIP (immutable) and resourceVersion. - existing = self.core_api.read_namespaced_service( - name=svc_name, namespace=self.k8s_namespace - ) - service.metadata.resource_version = ( - existing.metadata.resource_version - ) - service.spec.cluster_ip = existing.spec.cluster_ip - service_resp = self.core_api.replace_namespaced_service( - name=svc_name, - namespace=self.k8s_namespace, - body=service, - ) - print(f"Updated Service {svc_name}") - else: - raise + # Create Services (one per pod for multi-pod, or one for single-pod) + services = self.cluster_info.get_services() + for service in services: if opts.o.debug: - print(f" {service_resp}") + print(f"Sending this service: {service}") + if service and not opts.o.dry_run: + svc_name = service.metadata.name + try: + service_resp = self.core_api.create_namespaced_service( + namespace=self.k8s_namespace, body=service + ) + print(f"Created Service {svc_name}") + except ApiException as e: + if e.status == 409: + # Replace to ensure removed ports are deleted. + # Must preserve clusterIP (immutable) and resourceVersion. + existing = self.core_api.read_namespaced_service( + name=svc_name, namespace=self.k8s_namespace + ) + service.metadata.resource_version = ( + existing.metadata.resource_version + ) + service.spec.cluster_ip = existing.spec.cluster_ip + service_resp = self.core_api.replace_namespaced_service( + name=svc_name, + namespace=self.k8s_namespace, + body=service, + ) + print(f"Updated Service {svc_name}") + else: + raise + if opts.o.debug: + print(f" {service_resp}") def _create_jobs(self): # Process job compose files into k8s Jobs @@ -880,48 +891,49 @@ class K8sDeployer(Deployer): print("No pods defined, skipping update") return self.connect_api() - ref_deployment = self.cluster_info.get_deployment() - if not ref_deployment or not ref_deployment.metadata: - return - ref_name = ref_deployment.metadata.name - if not ref_name: - return + ref_deployments = self.cluster_info.get_deployments() + for ref_deployment in ref_deployments: + if not ref_deployment or not ref_deployment.metadata: + continue + ref_name = ref_deployment.metadata.name + if not ref_name: + continue - deployment = cast( - client.V1Deployment, - self.apps_api.read_namespaced_deployment( - name=ref_name, namespace=self.k8s_namespace - ), - ) - if not deployment.spec or not deployment.spec.template: - return - template_spec = deployment.spec.template.spec - if not template_spec or not template_spec.containers: - return + deployment = cast( + client.V1Deployment, + self.apps_api.read_namespaced_deployment( + name=ref_name, namespace=self.k8s_namespace + ), + ) + if not deployment.spec or not deployment.spec.template: + continue + template_spec = deployment.spec.template.spec + if not template_spec or not template_spec.containers: + continue - ref_spec = ref_deployment.spec - if ref_spec and ref_spec.template and ref_spec.template.spec: - ref_containers = ref_spec.template.spec.containers - if ref_containers: - new_env = ref_containers[0].env - for container in template_spec.containers: - old_env = container.env - if old_env != new_env: - container.env = new_env + ref_spec = ref_deployment.spec + if ref_spec and ref_spec.template and ref_spec.template.spec: + ref_containers = ref_spec.template.spec.containers + if ref_containers: + new_env = ref_containers[0].env + for container in template_spec.containers: + old_env = container.env + if old_env != new_env: + container.env = new_env - template_meta = deployment.spec.template.metadata - if template_meta: - template_meta.annotations = { - "kubectl.kubernetes.io/restartedAt": datetime.utcnow() - .replace(tzinfo=timezone.utc) - .isoformat() - } + template_meta = deployment.spec.template.metadata + if template_meta: + template_meta.annotations = { + "kubectl.kubernetes.io/restartedAt": datetime.utcnow() + .replace(tzinfo=timezone.utc) + .isoformat() + } - self.apps_api.patch_namespaced_deployment( - name=ref_name, - namespace=self.k8s_namespace, - body=deployment, - ) + self.apps_api.patch_namespaced_deployment( + name=ref_name, + namespace=self.k8s_namespace, + body=deployment, + ) def run( self, diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index 2cef0e4a..ef37bc3c 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -264,5 +264,14 @@ class Spec: def is_kind_deployment(self): return self.get_deployment_type() in [constants.k8s_kind_deploy_type] + def get_maintenance_service(self) -> typing.Optional[str]: + """Return maintenance-service value (e.g. 'dumpster-maintenance:8000') or None. + + When set, the restart command swaps Ingress backends to this service + during the main pod Recreate, so users see a branded maintenance page + instead of a bare 502. + """ + return self.obj.get("maintenance-service") + def is_docker_deployment(self): return self.get_deployment_type() in [constants.compose_deploy_type]