From e56da7dcc11119e2a188b3366fc5b54a84471df9 Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 15 Aug 2024 20:32:58 +0000
Subject: [PATCH] Add support for k8s pod to node affinity and taint toleration (#917)

Reviewed-on: https://git.vdb.to/cerc-io/stack-orchestrator/pulls/917
Reviewed-by: Thomas E Lackey
Co-authored-by: David Boreham
Co-committed-by: David Boreham
---
 .../workflows/test-k8s-deployment-control.yml |  69 ++++++
 .../triggers/test-k8s-deployment-control     |   0
 docs/k8s-deployment-enhancements.md           |  27 +++
 stack_orchestrator/constants.py               |   2 +
 stack_orchestrator/deploy/k8s/cluster_info.py |  44 +++-
 stack_orchestrator/deploy/spec.py             |   6 +
 tests/k8s-deployment-control/run-test.sh      | 222 ++++++++++++++++++
 7 files changed, 369 insertions(+), 1 deletion(-)
 create mode 100644 .gitea/workflows/test-k8s-deployment-control.yml
 create mode 100644 .gitea/workflows/triggers/test-k8s-deployment-control
 create mode 100644 docs/k8s-deployment-enhancements.md
 create mode 100755 tests/k8s-deployment-control/run-test.sh

diff --git a/.gitea/workflows/test-k8s-deployment-control.yml b/.gitea/workflows/test-k8s-deployment-control.yml
new file mode 100644
index 00000000..f4848a6a
--- /dev/null
+++ b/.gitea/workflows/test-k8s-deployment-control.yml
@@ -0,0 +1,69 @@
+name: K8s Deployment Control Test
+
+on:
+  pull_request:
+    branches: '*'
+  push:
+    branches: '*'
+    paths:
+      - '!**'
+      - '.gitea/workflows/triggers/test-k8s-deployment-control'
+      - '.gitea/workflows/test-k8s-deployment-control.yml'
+      - 'tests/k8s-deployment-control/run-test.sh'
+  schedule: # Note: coordinate with other tests to not overload runners at the same time of day
+    - cron: '3 30 * * *'
+
+jobs:
+  test:
+    name: "Run deployment control suite on kind/k8s"
+    runs-on: ubuntu-22.04
+    steps:
+      - name: "Clone project repository"
+        uses: actions/checkout@v3
+      # At present the stock setup-python action fails on Linux/aarch64
+      # Conditional steps below work around this by using deadsnakes for that case only
+      - name: "Install Python for ARM on Linux"
+        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
+        uses: deadsnakes/action@v3.0.1
+        with:
+          python-version: '3.8'
+      - name: "Install Python cases other than ARM on Linux"
+        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'
+      - name: "Print Python version"
+        run: python3 --version
+      - name: "Install shiv"
+        run: pip install shiv
+      - name: "Generate build version file"
+        run: ./scripts/create_build_tag_file.sh
+      - name: "Build local shiv package"
+        run: ./scripts/build_shiv_package.sh
+      - name: "Check cgroups version"
+        run: mount | grep cgroup
+      - name: "Install kind"
+        run: ./tests/scripts/install-kind.sh
+      - name: "Install Kubectl"
+        run: ./tests/scripts/install-kubectl.sh
+      - name: "Run k8s deployment control test"
+        run: |
+          source /opt/bash-utils/cgroup-helper.sh
+          join_cgroup
+          ./tests/k8s-deployment-control/run-test.sh
+      - name: Notify Vulcanize Slack on CI failure
+        if: ${{ always() && github.ref_name == 'main' }}
+        uses: ravsamhq/notify-slack-action@v2
+        with:
+          status: ${{ job.status }}
+          notify_when: 'failure'
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
+      - name: Notify DeepStack Slack on CI failure
+        if: ${{ always() && github.ref_name == 'main' }}
+        uses: ravsamhq/notify-slack-action@v2
+        with:
+          status: ${{ job.status }}
+          notify_when: 'failure'
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
diff --git a/.gitea/workflows/triggers/test-k8s-deployment-control b/.gitea/workflows/triggers/test-k8s-deployment-control
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/k8s-deployment-enhancements.md b/docs/k8s-deployment-enhancements.md
new file mode 100644
index 00000000..424d529f
--- /dev/null
+++ b/docs/k8s-deployment-enhancements.md
@@ -0,0 +1,27 @@
+# K8S Deployment Enhancements
+## Controlling pod placement
+The placement of pods created as part of a stack deployment can be controlled to either avoid certain nodes, or require certain nodes.
+### Pod/Node Affinity
+Node affinity rules applied to pods target node labels. The effect is that a pod can only be placed on a node having the specified label value. Note that other pods that do not have any node affinity rules can also be placed on those same nodes. Thus node affinity for a pod controls where that pod can be placed, but does not control where other pods are placed.
+
+Node affinity for stack pods is specified in the deployment's `spec.yml` file as follows:
+```
+node-affinities:
+  - label: nodetype
+    value: typeb
+```
+This example denotes that the stack's pods should only be placed on nodes that have the label `nodetype` with value `typeb`.
+### Node Taint Toleration
+K8s nodes can be given one or more "taints". These are special fields (distinct from labels) with a name (key) and optional value.
+When placing pods, the k8s scheduler will only assign a pod to a tainted node if the pod possesses a corresponding "toleration".
+This is metadata associated with the pod that specifies that the pod "tolerates" a given taint.
+Taint toleration therefore provides a mechanism by which only certain pods can be placed on specific nodes, complementing node affinity.
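+
+As an example, a cluster operator might label and then taint a node so that only pods tolerating the taint can be scheduled there (the node name below is illustrative):
+```
+kubectl label nodes my-worker-node nodetype=typeb
+kubectl taint nodes my-worker-node nodetype=typeb:NoSchedule
+```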
+
+Taint toleration for stack pods is specified in the deployment's `spec.yml` file as follows:
+```
+node-tolerations:
+  - key: nodetype
+    value: typeb
+```
+This example denotes that the stack's pods will tolerate a taint: `nodetype=typeb`
+
diff --git a/stack_orchestrator/constants.py b/stack_orchestrator/constants.py
index aee36ad8..07fc68f4 100644
--- a/stack_orchestrator/constants.py
+++ b/stack_orchestrator/constants.py
@@ -35,5 +35,7 @@ security_key = "security"
 annotations_key = "annotations"
 labels_key = "labels"
 replicas_key = "replicas"
+node_affinities_key = "node-affinities"
+node_tolerations_key = "node-tolerations"
 kind_config_filename = "kind-config.yml"
 kube_config_filename = "kubeconfig.yml"
diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py
index 35c06e42..0a1f4a71 100644
--- a/stack_orchestrator/deploy/k8s/cluster_info.py
+++ b/stack_orchestrator/deploy/k8s/cluster_info.py
@@ -365,6 +365,8 @@ class ClusterInfo:
 
         annotations = None
         labels = {"app": self.app_name}
+        affinity = None
+        tolerations = None
 
         if self.spec.get_annotations():
             annotations = {}
@@ -377,12 +379,52 @@ class ClusterInfo:
                 for service_name in services:
                     labels[key.replace("{name}", service_name)] = value
 
+        if self.spec.get_node_affinities():
+            affinities = []
+            for rule in self.spec.get_node_affinities():
+                # TODO add some input validation here
+                label_name = rule['label']
+                label_value = rule['value']
+                affinities.append(client.V1NodeSelectorTerm(
+                    match_expressions=[client.V1NodeSelectorRequirement(
+                        key=label_name,
+                        operator="In",
+                        values=[label_value]
+                    )]
+                )
+                )
+            affinity = client.V1Affinity(
+                node_affinity=client.V1NodeAffinity(
+                    required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+                        node_selector_terms=affinities
+                    ))
+            )
+
+        if self.spec.get_node_tolerations():
+            tolerations = []
+            for toleration in self.spec.get_node_tolerations():
+                # TODO add some input validation here
+                toleration_key = toleration['key']
+                toleration_value = toleration['value']
+                tolerations.append(client.V1Toleration(
+                    effect="NoSchedule",
+                    key=toleration_key,
+                    operator="Equal",
+                    value=toleration_value
+                ))
+
         template = client.V1PodTemplateSpec(
             metadata=client.V1ObjectMeta(
                 annotations=annotations, labels=labels
             ),
-            spec=client.V1PodSpec(containers=containers, image_pull_secrets=image_pull_secrets, volumes=volumes),
+            spec=client.V1PodSpec(
+                containers=containers,
+                image_pull_secrets=image_pull_secrets,
+                volumes=volumes,
+                affinity=affinity,
+                tolerations=tolerations
+            ),
         )
         spec = client.V1DeploymentSpec(
             replicas=self.spec.get_replicas(),
diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py
index e8d293e3..99dfb16f 100644
--- a/stack_orchestrator/deploy/spec.py
+++ b/stack_orchestrator/deploy/spec.py
@@ -120,6 +120,12 @@ class Spec:
     def get_replicas(self):
         return self.obj.get(constants.replicas_key, 1)
 
+    def get_node_affinities(self):
+        return self.obj.get(constants.node_affinities_key, [])
+
+    def get_node_tolerations(self):
+        return self.obj.get(constants.node_tolerations_key, [])
+
     def get_labels(self):
         return self.obj.get(constants.labels_key, {})
diff --git a/tests/k8s-deployment-control/run-test.sh b/tests/k8s-deployment-control/run-test.sh
new file mode 100755
index 00000000..8ca9064b
--- /dev/null
+++ b/tests/k8s-deployment-control/run-test.sh
@@ -0,0 +1,222 @@
+#!/usr/bin/env bash
+set -e
+if [ -n "$CERC_SCRIPT_DEBUG" ]; then
+  set -x
+  # Dump environment variables for debugging
+  echo "Environment variables:"
+  env
+fi
+
+if [ "$1" == "from-path" ]; then
+  TEST_TARGET_SO="laconic-so"
+else
+  TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 )
+fi
+
+# Helper functions: TODO move into a separate file
+wait_for_pods_started () {
+  for i in {1..50}
+  do
+    local ps_output=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir ps )
+
+    if [[ "$ps_output" == *"Running containers:"* ]]; then
+      # if ready, return
+      return
+    else
+      # if not ready, wait
+      sleep 5
+    fi
+  done
+  # Timed out, error exit
+  echo "waiting for pods to start: FAILED"
+  delete_cluster_exit
+}
+
+wait_for_log_output () {
+  for i in {1..50}
+  do
+
+    local log_output=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir logs )
+
+    if [[ ! -z "$log_output" ]]; then
+      # if ready, return
+      return
+    else
+      # if not ready, wait
+      sleep 5
+    fi
+  done
+  # Timed out, error exit
+  echo "waiting for pods log content: FAILED"
+  delete_cluster_exit
+}
+
+delete_cluster_exit () {
+  $TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes
+  exit 1
+}
+
+# Set a non-default repo dir
+export CERC_REPO_BASE_DIR=~/stack-orchestrator-test/repo-base-dir
+echo "Testing this package: $TEST_TARGET_SO"
+echo "Test version command"
+reported_version_string=$( $TEST_TARGET_SO version )
+echo "Version reported is: ${reported_version_string}"
+echo "Cloning repositories into: $CERC_REPO_BASE_DIR"
+rm -rf $CERC_REPO_BASE_DIR
+mkdir -p $CERC_REPO_BASE_DIR
+$TEST_TARGET_SO --stack test setup-repositories
+$TEST_TARGET_SO --stack test build-containers
+# Test basic stack-orchestrator deploy to k8s
+test_deployment_dir=$CERC_REPO_BASE_DIR/test-deployment-dir
+test_deployment_spec=$CERC_REPO_BASE_DIR/test-deployment-spec.yml
+
+# Create a deployment that we can use to check our test cases
+$TEST_TARGET_SO --stack test deploy --deploy-to k8s-kind init --output $test_deployment_spec
+# Check the file now exists
+if [ ! -f "$test_deployment_spec" ]; then
+  echo "deploy init test: spec file not present"
+  echo "deploy init test: FAILED"
+  exit 1
+fi
+echo "deploy init test: passed"
+
+$TEST_TARGET_SO --stack test deploy create --spec-file $test_deployment_spec --deployment-dir $test_deployment_dir
+# Check the deployment dir exists
+if [ ! -d "$test_deployment_dir" ]; then
+  echo "deploy create test: deployment directory not present"
+  echo "deploy create test: FAILED"
+  exit 1
+fi
+echo "deploy create test: passed"
+# Check the file written by the create command in the stack now exists
+if [ ! -f "$test_deployment_dir/create-file" ]; then
-f "$test_deployment_dir/create-file" ]; then + echo "deploy create test: create output file not present" + echo "deploy create test: FAILED" + exit 1 +fi +echo "deploy create output file test: passed" + +# At this point the deployment's kind-config.yml will look like this: +# kind: Cluster +# apiVersion: kind.x-k8s.io/v1alpha4 +# nodes: +# - role: control-plane +# kubeadmConfigPatches: +# - | +# kind: InitConfiguration +# nodeRegistration: +# kubeletExtraArgs: +# node-labels: "ingress-ready=true" +# extraPortMappings: +# - containerPort: 80 +# hostPort: 80 + +# We need to change it to this: +# Note we also turn up the log level on the scheduler in order to diagnose placement errors +# See logs like: kubectl -n kube-system logs kube-scheduler-laconic-f185cd245d8dba98-control-plane +kind_config_file=${test_deployment_dir}/kind-config.yml +cat << EOF > ${kind_config_file} +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +kubeadmConfigPatches: +- | + kind: ClusterConfiguration + scheduler: + extraArgs: + v: "3" +nodes: +- role: control-plane + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" + extraPortMappings: + - containerPort: 80 + hostPort: 80 +- role: worker + labels: + nodetype: a +- role: worker + labels: + nodetype: b +- role: worker + labels: + nodetype: c + kubeadmConfigPatches: + - | + kind: JoinConfiguration + nodeRegistration: + taints: + - key: "nodeavoid" + value: "c" + effect: "NoSchedule" +EOF + +# At this point we should have 4 nodes, three labeled like this: +# $ kubectl get nodes --show-labels=true +# NAME STATUS ROLES AGE VERSION LABELS +# laconic-3af549a3ba0e3a3c-control-plane Ready control-plane 2m37s v1.30.0 ...,ingress-ready=true +# laconic-3af549a3ba0e3a3c-worker Ready 2m18s v1.30.0 ...,nodetype=a +# laconic-3af549a3ba0e3a3c-worker2 Ready 2m18s v1.30.0 ...,nodetype=b +# laconic-3af549a3ba0e3a3c-worker3 Ready 2m18s v1.30.0 ...,nodetype=c + +# And with taints like this: +# $ kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints --no-headers +# laconic-3af549a3ba0e3a3c-control-plane [map[effect:NoSchedule key:node-role.kubernetes.io/control-plane]] +# laconic-3af549a3ba0e3a3c-worker +# laconic-3af549a3ba0e3a3c-worker2 +# laconic-3af549a3ba0e3a3c-worker3 [map[effect:NoSchedule key:nodeavoid value:c]] + +# We can now modify the deployment spec file to require a set of affinity and/or taint combinations +# then bring up the deployment and check that the pod is scheduled to an expected node. 
+
+# Add a requirement to schedule on a node labeled nodetype=c and
+# a toleration such that no other pods schedule on that node
+deployment_spec_file=${test_deployment_dir}/spec.yml
+cat << EOF >> ${deployment_spec_file}
+node-affinities:
+  - label: nodetype
+    value: c
+node-tolerations:
+  - key: nodeavoid
+    value: c
+EOF
+
+# Get the deployment ID so we can generate low level kubectl commands later
+deployment_id=$(cat ${test_deployment_dir}/deployment.yml | cut -d ' ' -f 2)
+
+# Try to start the deployment
+$TEST_TARGET_SO deployment --dir $test_deployment_dir start
+wait_for_pods_started
+# Check logs command works
+wait_for_log_output
+sleep 1
+log_output_1=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir logs )
+if [[ "$log_output_1" == *"filesystem is fresh"* ]]; then
+  echo "deployment of pod test: passed"
+else
+  echo "deployment pod test: FAILED"
+  echo $log_output_1
+  delete_cluster_exit
+fi
+
+# The deployment's pod should be scheduled onto node: worker3
+# Check that's what happened
+# Get the node onto which the stack pod has been deployed
+deployment_node=$(kubectl get pods -l app=${deployment_id} -o=jsonpath='{.items..spec.nodeName}')
+expected_node=${deployment_id}-worker3
+echo "Stack pod deployed to node: ${deployment_node}"
+if [[ ${deployment_node} == ${expected_node} ]]; then
+  echo "deployment of pod test: passed"
+else
+  echo "deployment pod test: FAILED"
+  echo "Stack pod deployed to node: ${deployment_node}, expected node: ${expected_node}"
+  delete_cluster_exit
+fi
+
+# Stop and clean up
+$TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes
+echo "Test passed"
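As a usage note (the commands are standard kubectl; the label selector mirrors the `app=<deployment-id>` label applied by the deployment code above, with `<deployment-id>` as a placeholder), placement can also be checked by hand against any running deployment:
```
kubectl get pods -l app=<deployment-id> -o wide
kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints
```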