Add support for k8s pod to node affinity and taint toleration (#917)

Reviewed-on: #917 Reviewed-by: Thomas E Lackey <telackey@noreply.git.vdb.to> Co-authored-by: David Boreham <david@bozemanpass.com> Co-committed-by: David Boreham <david@bozemanpass.com>
2024-08-15 20:32:58 +00:00 · 2024-08-15 20:32:58 +00:00 · e56da7dcc1
commit e56da7dcc1
parent 60d34217f8
7 changed files with 369 additions and 1 deletions
--- a/.gitea/workflows/test-k8s-deployment-control.yml
+++ b/.gitea/workflows/test-k8s-deployment-control.yml
@ -0,0 +1,69 @@
+name: K8s Deployment Control Test
+
+on:
+  pull_request:
+    branches: '*'
+  push:
+    branches: '*'
+    paths:
+      - '!**'
+      - '.gitea/workflows/triggers/test-k8s-deployment-control'
+      - '.gitea/workflows/test-k8s-deployment-control.yml'
+      - 'tests/k8s-deployment-control/run-test.sh'
+  schedule: # Note: coordinate with other tests to not overload runners at the same time of day
+    # cron fields are: minute hour day-of-month month day-of-week (hour must be 0-23)
+    - cron: '30 3 * * *'
+
+jobs:
+  test:
+    name: "Run deployment control suite on kind/k8s"
+    runs-on: ubuntu-22.04
+    steps:
+      - name: "Clone project repository"
+        uses: actions/checkout@v3
+      # At present the stock setup-python action fails on Linux/aarch64
+      # Conditional steps below work around this by using deadsnakes for that case only
+      - name: "Install Python for ARM on Linux"
+        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
+        uses: deadsnakes/action@v3.0.1
+        with:
+          python-version: '3.8'
+      - name: "Install Python cases other than ARM on Linux"
+        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'
+      - name: "Print Python version"
+        run: python3 --version
+      - name: "Install shiv"
+        run: pip install shiv
+      - name: "Generate build version file"
+        run: ./scripts/create_build_tag_file.sh
+      - name: "Build local shiv package"
+        run: ./scripts/build_shiv_package.sh
+      - name: "Check cgroups version"
+        run: mount | grep cgroup
+      - name: "Install kind"
+        run: ./tests/scripts/install-kind.sh
+      - name: "Install Kubectl"
+        run: ./tests/scripts/install-kubectl.sh
+      - name: "Run k8s deployment control test"
+        run: |
+          source /opt/bash-utils/cgroup-helper.sh
+          join_cgroup
+          ./tests/k8s-deployment-control/run-test.sh
+      - name: Notify Vulcanize Slack on CI failure
+        if: ${{ always() && github.ref_name == 'main' }}
+        uses: ravsamhq/notify-slack-action@v2
+        with:
+          status: ${{ job.status }}
+          notify_when: 'failure'
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
+      - name: Notify DeepStack Slack on CI failure
+        if: ${{ always() && github.ref_name == 'main' }}
+        uses: ravsamhq/notify-slack-action@v2
+        with:
+          status: ${{ job.status }}
+          notify_when: 'failure'
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
--- a/.gitea/workflows/triggers/test-k8s-deployment-control
+++ b/.gitea/workflows/triggers/test-k8s-deployment-control
--- a/docs/k8s-deployment-enhancements.md
+++ b/docs/k8s-deployment-enhancements.md
@ -0,0 +1,27 @@
+# K8S Deployment Enhancements
+## Controlling pod placement
+The placement of pods created as part of a stack deployment can be controlled to either avoid certain nodes, or require certain nodes.
+### Pod/Node Affinity
+Node affinity rules applied to pods target node labels. The effect is that a pod can only be placed on a node having the specified label value. Note that other pods that do not have any node affinity rules can also be placed on those same nodes. Thus node affinity for a pod controls where that pod can be placed, but does not control where other pods are placed.
+
+Node affinity for stack pods is specified in the deployment's `spec.yml` file as follows:
+```
+node-affinities:
+  - label: nodetype
+    value: typeb
+```
+This example denotes that the stack's pods should only be placed on nodes that have the label `nodetype` with value `typeb`.
+### Node Taint Toleration
+K8s nodes can be given one or more "taints". These are special fields (distinct from labels) with a name (key) and optional value.
+When placing pods, the k8s scheduler will only assign a pod to a tainted node if the pod possesses a corresponding "toleration".
+This is metadata associated with the pod that specifies that the pod "tolerates" a given taint.
+Therefore taint toleration provides a mechanism by which only certain pods can be placed on specific nodes, and provides a complementary mechanism to node affinity.
+
+Taint toleration for stack pods is specified in the deployment's `spec.yml` file as follows:
+```
+node-tolerations:
+  - key: nodetype
+    value: typeb
+```
+This example denotes that the stack's pods will tolerate the taint `nodetype=typeb` (with effect `NoSchedule`).
+
--- a/stack_orchestrator/constants.py
+++ b/stack_orchestrator/constants.py
@ -35,5 +35,7 @@ security_key = "security"
 annotations_key = "annotations"
 labels_key = "labels"
 replicas_key = "replicas"
+node_affinities_key = "node-affinities"
+node_tolerations_key = "node-tolerations"
 kind_config_filename = "kind-config.yml"
 kube_config_filename = "kubeconfig.yml"
--- a/stack_orchestrator/deploy/k8s/cluster_info.py
+++ b/stack_orchestrator/deploy/k8s/cluster_info.py
@ -365,6 +365,8 @@ class ClusterInfo:

        annotations = None
        labels = {"app": self.app_name}
+        affinity = None
+        tolerations = None

        if self.spec.get_annotations():
            annotations = {}
@ -377,12 +379,52 @@ class ClusterInfo:
                for service_name in services:
                    labels[key.replace("{name}", service_name)] = value

+        if self.spec.get_node_affinities():
+            affinities = []
+            for rule in self.spec.get_node_affinities():
+                # TODO add some input validation here
+                label_name = rule['label']
+                label_value = rule['value']
+                affinities.append(client.V1NodeSelectorTerm(
+                            match_expressions=[client.V1NodeSelectorRequirement(
+                                key=label_name,
+                                operator="In",
+                                values=[label_value]
+                            )]
+                        )
+                    )
+            affinity = client.V1Affinity(
+                node_affinity=client.V1NodeAffinity(
+                    required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+                        node_selector_terms=affinities
+                    ))
+                )
+
+        if self.spec.get_node_tolerations():
+            tolerations = []
+            for toleration in self.spec.get_node_tolerations():
+                # TODO add some input validation here
+                toleration_key = toleration['key']
+                toleration_value = toleration['value']
+                tolerations.append(client.V1Toleration(
+                    effect="NoSchedule",
+                    key=toleration_key,
+                    operator="Equal",
+                    value=toleration_value
+                ))
+
        template = client.V1PodTemplateSpec(
            metadata=client.V1ObjectMeta(
                annotations=annotations,
                labels=labels
            ),
-            spec=client.V1PodSpec(containers=containers, image_pull_secrets=image_pull_secrets, volumes=volumes),
+            spec=client.V1PodSpec(
+                containers=containers,
+                image_pull_secrets=image_pull_secrets,
+                volumes=volumes,
+                affinity=affinity,
+                tolerations=tolerations
+                ),
        )
        spec = client.V1DeploymentSpec(
            replicas=self.spec.get_replicas(),
--- a/stack_orchestrator/deploy/spec.py
+++ b/stack_orchestrator/deploy/spec.py
@ -120,6 +120,12 @@ class Spec:
    def get_replicas(self):
        return self.obj.get(constants.replicas_key, 1)

+    def get_node_affinities(self):
+        """Return the spec's `node-affinities` list, or an empty list when the key is absent."""
+        return self.obj.get(constants.node_affinities_key, [])
+
+    def get_node_tolerations(self):
+        """Return the spec's `node-tolerations` list, or an empty list when the key is absent."""
+        return self.obj.get(constants.node_tolerations_key, [])
+
    def get_labels(self):
        return self.obj.get(constants.labels_key, {})

--- a/tests/k8s-deployment-control/run-test.sh
+++ b/tests/k8s-deployment-control/run-test.sh
@ -0,0 +1,222 @@
+#!/usr/bin/env bash
+set -e
+if [ -n "$CERC_SCRIPT_DEBUG" ]; then
+    set -x
+    # Dump environment variables for debugging
+    echo "Environment variables:"
+    env
+fi
+
+if [ "$1" == "from-path" ]; then
+    TEST_TARGET_SO="laconic-so"
+else
+    TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 )
+fi
+
+# Helper functions: TODO move into a separate file
+# Poll `deployment ps` until its output contains "Running containers:".
+# Makes up to 50 attempts, sleeping 5 seconds after each unsuccessful one.
+# On timeout, prints a failure message and calls delete_cluster_exit.
+# Uses the script-level variables TEST_TARGET_SO and test_deployment_dir.
+wait_for_pods_started () {
+    local i
+    for i in {1..50}
+    do
+        # Declaring and assigning on one line means `local` masks a non-zero exit
+        # status from the command, so `set -e` does not abort the poll loop
+        local ps_output=$( "$TEST_TARGET_SO" deployment --dir "$test_deployment_dir" ps )
+
+        if [[ "$ps_output" == *"Running containers:"* ]]; then
+            # if ready, return
+            return
+        else
+            # if not ready, wait
+            sleep 5
+        fi
+    done
+    # Timed out, error exit
+    echo "waiting for pods to start: FAILED"
+    delete_cluster_exit
+}
+
+# Poll `deployment logs` until it produces any output at all.
+# Makes up to 50 attempts, sleeping 5 seconds after each unsuccessful one.
+# On timeout, prints a failure message and calls delete_cluster_exit.
+# Uses the script-level variables TEST_TARGET_SO and test_deployment_dir.
+wait_for_log_output () {
+    local i
+    for i in {1..50}
+    do
+
+        # Declaring and assigning on one line means `local` masks a non-zero exit
+        # status from the command, so `set -e` does not abort the poll loop
+        local log_output=$( "$TEST_TARGET_SO" deployment --dir "$test_deployment_dir" logs )
+
+        if [[ -n "$log_output" ]]; then
+            # if ready, return
+            return
+        else
+            # if not ready, wait
+            sleep 5
+        fi
+    done
+    # Timed out, error exit
+    echo "waiting for pods log content: FAILED"
+    delete_cluster_exit
+}
+
+delete_cluster_exit () {
+    $TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes
+    exit 1
+}
+
+# Set a non-default repo dir
+export CERC_REPO_BASE_DIR=~/stack-orchestrator-test/repo-base-dir
+echo "Testing this package: $TEST_TARGET_SO"
+echo "Test version command"
+reported_version_string=$( $TEST_TARGET_SO version )
+echo "Version reported is: ${reported_version_string}"
+echo "Cloning repositories into: $CERC_REPO_BASE_DIR"
+rm -rf $CERC_REPO_BASE_DIR
+mkdir -p $CERC_REPO_BASE_DIR
+$TEST_TARGET_SO --stack test setup-repositories
+$TEST_TARGET_SO --stack test build-containers
+# Test basic stack-orchestrator deploy to k8s
+test_deployment_dir=$CERC_REPO_BASE_DIR/test-deployment-dir
+test_deployment_spec=$CERC_REPO_BASE_DIR/test-deployment-spec.yml
+
+# Create a deployment that we can use to check our test cases
+$TEST_TARGET_SO --stack test deploy --deploy-to k8s-kind init --output $test_deployment_spec
+# Check the file now exists
+if [ ! -f "$test_deployment_spec" ]; then
+    echo "deploy init test: spec file not present"
+    echo "deploy init test: FAILED"
+    exit 1
+fi
+echo "deploy init test: passed"
+
+$TEST_TARGET_SO --stack test deploy create --spec-file $test_deployment_spec --deployment-dir $test_deployment_dir
+# Check the deployment dir exists
+if [ ! -d "$test_deployment_dir" ]; then
+    echo "deploy create test: deployment directory not present"
+    echo "deploy create test: FAILED"
+    exit 1
+fi
+echo "deploy create test: passed"
+# Check the file written by the create command in the stack now exists
+if [ ! -f "$test_deployment_dir/create-file" ]; then
+    echo "deploy create test: create output file not present"
+    echo "deploy create test: FAILED"
+    exit 1
+fi
+echo "deploy create output file test: passed"
+
+# At this point the deployment's kind-config.yml will look like this:
+# kind: Cluster
+# apiVersion: kind.x-k8s.io/v1alpha4
+# nodes:
+# - role: control-plane
+#   kubeadmConfigPatches:
+#     - |
+#       kind: InitConfiguration
+#       nodeRegistration:
+#         kubeletExtraArgs:
+#           node-labels: "ingress-ready=true"
+#   extraPortMappings:
+#   - containerPort: 80
+#     hostPort: 80
+
+# We need to change it to this:
+# Note we also turn up the log level on the scheduler in order to diagnose placement errors
+# See logs like: kubectl -n kube-system logs kube-scheduler-laconic-f185cd245d8dba98-control-plane
+kind_config_file=${test_deployment_dir}/kind-config.yml
+cat << EOF > ${kind_config_file} 
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+kubeadmConfigPatches:
+- |
+  kind: ClusterConfiguration
+  scheduler:
+    extraArgs:
+      v: "3"
+nodes:
+- role: control-plane
+  kubeadmConfigPatches:
+    - |
+      kind: InitConfiguration
+      nodeRegistration:
+        kubeletExtraArgs:
+          node-labels: "ingress-ready=true"
+  extraPortMappings:
+  - containerPort: 80
+    hostPort: 80
+- role: worker
+  labels:
+    nodetype: a
+- role: worker
+  labels:
+    nodetype: b
+- role: worker
+  labels:
+    nodetype: c
+  kubeadmConfigPatches:
+  - |
+    kind: JoinConfiguration
+    nodeRegistration:
+      taints:
+        - key: "nodeavoid"
+          value: "c"
+          effect: "NoSchedule"
+EOF
+
+# At this point we should have 4 nodes, three labeled like this:
+# $ kubectl get nodes --show-labels=true
+# NAME                                     STATUS   ROLES           AGE     VERSION   LABELS
+# laconic-3af549a3ba0e3a3c-control-plane   Ready    control-plane   2m37s   v1.30.0   ...,ingress-ready=true
+# laconic-3af549a3ba0e3a3c-worker          Ready    <none>          2m18s   v1.30.0   ...,nodetype=a
+# laconic-3af549a3ba0e3a3c-worker2         Ready    <none>          2m18s   v1.30.0   ...,nodetype=b
+# laconic-3af549a3ba0e3a3c-worker3         Ready    <none>          2m18s   v1.30.0   ...,nodetype=c
+
+# And with taints like this:
+# $ kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints --no-headers
+# laconic-3af549a3ba0e3a3c-control-plane   [map[effect:NoSchedule key:node-role.kubernetes.io/control-plane]]
+# laconic-3af549a3ba0e3a3c-worker          <none>
+# laconic-3af549a3ba0e3a3c-worker2         <none>
+# laconic-3af549a3ba0e3a3c-worker3         [map[effect:NoSchedule key:nodeavoid value:c]]
+
+# We can now modify the deployment spec file to require a set of affinity and/or taint combinations
+# then bring up the deployment and check that the pod is scheduled to an expected node.
+
+# Add a requirement to schedule on a node labeled nodetype=c, plus a toleration for
+# that node's nodeavoid=c taint (the taint is what keeps other pods off that node)
+deployment_spec_file=${test_deployment_dir}/spec.yml
+cat << EOF >> ${deployment_spec_file}
+node-affinities:
+  - label: nodetype
+    value: c
+node-tolerations:
+  - key: nodeavoid
+    value: c
+EOF
+
+# Get the deployment ID so we can generate low level kubectl commands later
+deployment_id=$(cat ${test_deployment_dir}/deployment.yml | cut -d ' ' -f 2)
+
+# Try to start the deployment
+$TEST_TARGET_SO deployment --dir $test_deployment_dir start
+wait_for_pods_started
+# Check logs command works
+wait_for_log_output
+sleep 1
+log_output_1=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir logs )
+if [[ "$log_output_1" == *"filesystem is fresh"* ]]; then
+    echo "deployment of pod test: passed"
+else
+    echo "deployment pod test: FAILED"
+    echo $log_output_1
+    delete_cluster_exit
+fi
+
+# The deployment's pod should be scheduled onto node: worker3
+# Check that's what happened
+# Get the node onto which the stack pod has been deployed
+deployment_node=$(kubectl get pods -l app=${deployment_id} -o=jsonpath='{.items..spec.nodeName}')
+expected_node=${deployment_id}-worker3
+echo "Stack pod deployed to node: ${deployment_node}"
+if [[ ${deployment_node} == ${expected_node} ]]; then
+    echo "deployment of pod test: passed"
+else
+    echo "deployment pod test: FAILED"
+    echo "Stack pod deployed to node: ${deployment_node}, expected node: ${expected_node}"
+    delete_cluster_exit
+fi
+
+# Stop and clean up
+$TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes
+echo "Test passed"