feat: update-in-place deployments with rolling updates

Replace the destroy-and-recreate deployment model with in-place updates.

deploy_k8s.py: All resource creation (Deployment, Service, Ingress,
NodePort, ConfigMap) now uses create-or-update semantics. If a resource
already exists (409 Conflict), it patches instead of failing. For
Deployments, this triggers a k8s rolling update — old pods serve traffic
until new pods pass readiness checks.

deployment.py: restart() no longer calls down(). It just calls up()
which patches existing resources. No namespace deletion, no downtime
gap, no race conditions. k8s handles the rollout.

This gives:
- Zero-downtime deploys (old pods serve during rollout)
- Safe failure mode (if new pods fail readiness, the rollout stalls while old pods keep serving traffic)
- Manual rollback via kubectl rollout undo

Closes so-l2l (parts A and B).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
A. F. Dudley 2026-03-18 19:40:20 +00:00
parent ba39c991f1
commit 2d11ca7bb0
2 changed files with 93 additions and 47 deletions

View File

@@ -17,7 +17,7 @@ import click
from pathlib import Path
import subprocess
import sys
import time
from stack_orchestrator import constants
from stack_orchestrator.deploy.images import push_images_operation
from stack_orchestrator.deploy.deploy import (
@@ -383,23 +383,17 @@ def restart(ctx, stack_path, spec_file, config_file, force, expected_ip):
deployment_context.init(deployment_context.deployment_dir)
ctx.obj = deployment_context
# Stop deployment
print("\n[4/4] Restarting deployment...")
# Apply updated deployment (create-or-update triggers rolling update).
# No down() — k8s rolling update keeps old pods serving traffic until
# new pods pass readiness checks.
print("\n[4/4] Applying deployment update...")
ctx.obj = make_deploy_context(ctx)
down_operation(
ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True
)
# Brief pause to ensure clean shutdown
time.sleep(5)
# Start deployment
up_operation(
ctx, services_list=None, stay_attached=False, skip_cluster_management=True
)
print("\n=== Restart Complete ===")
print("Deployment restarted with git-tracked configuration.")
print("Deployment updated via rolling update.")
if new_hostname and new_hostname != current_hostname:
print(f"\nNew hostname: {new_hostname}")
print("Caddy will automatically provision TLS certificate.")

View File

@@ -384,12 +384,20 @@ class K8sDeployer(Deployer):
if opts.o.debug:
print(f"Sending this ConfigMap: {cfg_map}")
if not opts.o.dry_run:
cfg_rsp = self.core_api.create_namespaced_config_map(
body=cfg_map, namespace=self.k8s_namespace
)
if opts.o.debug:
print("ConfigMap created:")
print(f"{cfg_rsp}")
cm_name = cfg_map.metadata.name
try:
self.core_api.create_namespaced_config_map(
body=cfg_map, namespace=self.k8s_namespace
)
except ApiException as e:
if e.status == 409:
self.core_api.patch_namespaced_config_map(
name=cm_name,
namespace=self.k8s_namespace,
body=cfg_map,
)
else:
raise
def _create_deployment(self):
# Skip if there are no pods to deploy (e.g. jobs-only stacks)
@@ -401,38 +409,64 @@ class K8sDeployer(Deployer):
deployment = self.cluster_info.get_deployment(
image_pull_policy="Always"
)
# Create the k8s objects
# Create or update the k8s Deployment
if opts.o.debug:
print(f"Sending this deployment: {deployment}")
if not opts.o.dry_run:
deployment_resp = cast(
client.V1Deployment,
self.apps_api.create_namespaced_deployment(
body=deployment, namespace=self.k8s_namespace
),
)
name = deployment.metadata.name
try:
deployment_resp = cast(
client.V1Deployment,
self.apps_api.create_namespaced_deployment(
body=deployment, namespace=self.k8s_namespace
),
)
print(f"Created Deployment {name}")
except ApiException as e:
if e.status == 409:
# Already exists — patch to trigger rolling update
deployment_resp = cast(
client.V1Deployment,
self.apps_api.patch_namespaced_deployment(
name=name,
namespace=self.k8s_namespace,
body=deployment,
),
)
print(f"Updated Deployment {name} (rolling update)")
else:
raise
if opts.o.debug:
print("Deployment created:")
meta = deployment_resp.metadata
spec = deployment_resp.spec
if meta and spec and spec.template.spec:
ns = meta.namespace
name = meta.name
gen = meta.generation
containers = spec.template.spec.containers
img = containers[0].image if containers else None
print(f"{ns} {name} {gen} {img}")
print(f" {meta.namespace} {meta.name} gen={meta.generation} {img}")
service = self.cluster_info.get_service()
if opts.o.debug:
print(f"Sending this service: {service}")
if service and not opts.o.dry_run:
service_resp = self.core_api.create_namespaced_service(
namespace=self.k8s_namespace, body=service
)
svc_name = service.metadata.name
try:
service_resp = self.core_api.create_namespaced_service(
namespace=self.k8s_namespace, body=service
)
print(f"Created Service {svc_name}")
except ApiException as e:
if e.status == 409:
# Service exists — patch it (preserves clusterIP)
service_resp = self.core_api.patch_namespaced_service(
name=svc_name,
namespace=self.k8s_namespace,
body=service,
)
print(f"Updated Service {svc_name}")
else:
raise
if opts.o.debug:
print("Service created:")
print(f"{service_resp}")
print(f" {service_resp}")
def _create_jobs(self):
# Process job compose files into k8s Jobs
@@ -570,12 +604,22 @@ class K8sDeployer(Deployer):
if opts.o.debug:
print(f"Sending this ingress: {ingress}")
if not opts.o.dry_run:
ingress_resp = self.networking_api.create_namespaced_ingress(
namespace=self.k8s_namespace, body=ingress
)
if opts.o.debug:
print("Ingress created:")
print(f"{ingress_resp}")
ing_name = ingress.metadata.name
try:
self.networking_api.create_namespaced_ingress(
namespace=self.k8s_namespace, body=ingress
)
print(f"Created Ingress {ing_name}")
except ApiException as e:
if e.status == 409:
self.networking_api.patch_namespaced_ingress(
name=ing_name,
namespace=self.k8s_namespace,
body=ingress,
)
print(f"Updated Ingress {ing_name}")
else:
raise
else:
if opts.o.debug:
print("No ingress configured")
@@ -585,12 +629,20 @@ class K8sDeployer(Deployer):
if opts.o.debug:
print(f"Sending this nodeport: {nodeport}")
if not opts.o.dry_run:
nodeport_resp = self.core_api.create_namespaced_service(
namespace=self.k8s_namespace, body=nodeport
)
if opts.o.debug:
print("NodePort created:")
print(f"{nodeport_resp}")
np_name = nodeport.metadata.name
try:
self.core_api.create_namespaced_service(
namespace=self.k8s_namespace, body=nodeport
)
except ApiException as e:
if e.status == 409:
self.core_api.patch_namespaced_service(
name=np_name,
namespace=self.k8s_namespace,
body=nodeport,
)
else:
raise
# Call start() hooks — stacks can create additional k8s resources
if self.deployment_context: