Add support for k8s pod to node affinity and taint toleration #917

.gitea/workflows/test-k8s-deployment-control.yml  (new file, 69 lines)
@@ -0,0 +1,69 @@
name: K8s Deployment Control Test

on:
  pull_request:
    branches: '*'
  push:
    branches: '*'
    paths:
      - '!**'
      - '.gitea/workflows/triggers/test-k8s-deployment-control'
      - '.gitea/workflows/test-k8s-deployment-control.yml'
      - 'tests/k8s-deployment-control/run-test.sh'
  schedule: # Note: coordinate with other tests to not overload runners at the same time of day
    - cron: '3 30 * * *'

jobs:
  test:
    name: "Run deployment control suite on kind/k8s"
    runs-on: ubuntu-22.04
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Check cgroups version"
        run: mount | grep cgroup
      - name: "Install kind"
        run: ./tests/scripts/install-kind.sh
      - name: "Install Kubectl"
        run: ./tests/scripts/install-kubectl.sh
      - name: "Run k8s deployment control test"
        run: |
          source /opt/bash-utils/cgroup-helper.sh
          join_cgroup
          ./tests/k8s-deployment-control/run-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

docs/k8s-deployment-enhancements.md  (new file, 27 lines)
@@ -0,0 +1,27 @@
# K8S Deployment Enhancements

## Controlling pod placement

The placement of pods created as part of a stack deployment can be controlled either to avoid certain nodes or to require certain nodes.

### Pod/Node Affinity

Node affinity rules applied to pods target node labels. The effect is that a pod can only be placed on a node that has the specified label value. Note that other pods without node affinity rules can still be placed on those same nodes: node affinity controls where the affected pod may be placed, but not where other pods are placed.

Node affinity for stack pods is specified in the deployment's `spec.yml` file as follows:
```
node-affinities:
  - label: nodetype
    value: typeb
```
This example denotes that the stack's pods should only be placed on nodes that have the label `nodetype` with the value `typeb`.
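As a rough reference, such a rule corresponds to a required node-affinity term on the generated pod spec. The sketch below uses the standard Kubernetes field names; the deployer builds the equivalent object via the Kubernetes Python client, using a required-during-scheduling rule with the `In` operator:
```
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: nodetype
              operator: In
              values:
                - typeb
```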
### Node Taint Toleration

K8s nodes can be given one or more "taints": special fields (distinct from labels) with a name (key) and an optional value.
When placing pods, the k8s scheduler will only assign a pod to a tainted node if the pod possesses a corresponding "toleration", metadata associated with the pod specifying that the pod "tolerates" a given taint.
Taint toleration therefore provides a mechanism by which only certain pods can be placed on specific nodes, complementary to node affinity.

Taint toleration for stack pods is specified in the deployment's `spec.yml` file as follows:
```
node-tolerations:
  - key: nodetype
    value: typeb
```
This example denotes that the stack's pods will tolerate the taint `nodetype=typeb`.
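As a rough reference, this toleration corresponds to the following fragment on the generated pod spec (a sketch using the standard Kubernetes field names; the code in this PR uses the `Equal` operator and the `NoSchedule` effect):
```
tolerations:
  - key: nodetype
    operator: Equal
    value: typeb
    effect: NoSchedule
```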

@@ -35,5 +35,7 @@ security_key = "security"
 annotations_key = "annotations"
 labels_key = "labels"
 replicas_key = "replicas"
+node_affinities_key = "node-affinities"
+node_tolerations_key = "node-tolerations"
 kind_config_filename = "kind-config.yml"
 kube_config_filename = "kubeconfig.yml"

@@ -365,6 +365,8 @@ class ClusterInfo:
         annotations = None
         labels = {"app": self.app_name}
+        affinity = None
+        tolerations = None
 
         if self.spec.get_annotations():
             annotations = {}

@@ -377,12 +379,52 @@ class ClusterInfo:
                 for service_name in services:
                     labels[key.replace("{name}", service_name)] = value
 
+        if self.spec.get_node_affinities():
+            affinities = []
+            for rule in self.spec.get_node_affinities():
+                # TODO add some input validation here
+                label_name = rule['label']
+                label_value = rule['value']
+                affinities.append(client.V1NodeSelectorTerm(
+                    match_expressions=[client.V1NodeSelectorRequirement(
+                        key=label_name,
+                        operator="In",
+                        values=[label_value]
+                    )]
+                )
+                )
+            affinity = client.V1Affinity(
+                node_affinity=client.V1NodeAffinity(
+                    required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+                        node_selector_terms=affinities
+                    ))
+            )
+
+        if self.spec.get_node_tolerations():
+            tolerations = []
+            for toleration in self.spec.get_node_tolerations():
+                # TODO add some input validation here
+                toleration_key = toleration['key']
+                toleration_value = toleration['value']
+                tolerations.append(client.V1Toleration(
+                    effect="NoSchedule",
+                    key=toleration_key,
+                    operator="Equal",
+                    value=toleration_value
+                ))
+
         template = client.V1PodTemplateSpec(
             metadata=client.V1ObjectMeta(
                 annotations=annotations,
                 labels=labels
             ),
-            spec=client.V1PodSpec(containers=containers, image_pull_secrets=image_pull_secrets, volumes=volumes),
+            spec=client.V1PodSpec(
+                containers=containers,
+                image_pull_secrets=image_pull_secrets,
+                volumes=volumes,
+                affinity=affinity,
+                tolerations=tolerations
+            ),
         )
         spec = client.V1DeploymentSpec(
             replicas=self.spec.get_replicas(),

@@ -120,6 +120,12 @@ class Spec:
     def get_replicas(self):
         return self.obj.get(constants.replicas_key, 1)
 
+    def get_node_affinities(self):
+        return self.obj.get(constants.node_affinities_key, [])
+
+    def get_node_tolerations(self):
+        return self.obj.get(constants.node_tolerations_key, [])
+
     def get_labels(self):
         return self.obj.get(constants.labels_key, {})
 

tests/k8s-deployment-control/run-test.sh  (new file, executable, 222 lines)
@@ -0,0 +1,222 @@
#!/usr/bin/env bash
set -e
if [ -n "$CERC_SCRIPT_DEBUG" ]; then
  set -x
  # Dump environment variables for debugging
  echo "Environment variables:"
  env
fi

if [ "$1" == "from-path" ]; then
  TEST_TARGET_SO="laconic-so"
else
  TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 )
fi

# Helper functions: TODO move into a separate file
wait_for_pods_started () {
  for i in {1..50}
  do
    local ps_output=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir ps )

    if [[ "$ps_output" == *"Running containers:"* ]]; then
      # if ready, return
      return
    else
      # if not ready, wait
      sleep 5
    fi
  done
  # Timed out, error exit
  echo "waiting for pods to start: FAILED"
  delete_cluster_exit
}

wait_for_log_output () {
  for i in {1..50}
  do
    local log_output=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir logs )

    if [[ ! -z "$log_output" ]]; then
      # if ready, return
      return
    else
      # if not ready, wait
      sleep 5
    fi
  done
  # Timed out, error exit
  echo "waiting for pods log content: FAILED"
  delete_cluster_exit
}

delete_cluster_exit () {
  $TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes
  exit 1
}

# Set a non-default repo dir
export CERC_REPO_BASE_DIR=~/stack-orchestrator-test/repo-base-dir
echo "Testing this package: $TEST_TARGET_SO"
echo "Test version command"
reported_version_string=$( $TEST_TARGET_SO version )
echo "Version reported is: ${reported_version_string}"
echo "Cloning repositories into: $CERC_REPO_BASE_DIR"
rm -rf $CERC_REPO_BASE_DIR
mkdir -p $CERC_REPO_BASE_DIR
$TEST_TARGET_SO --stack test setup-repositories
$TEST_TARGET_SO --stack test build-containers
# Test basic stack-orchestrator deploy to k8s
test_deployment_dir=$CERC_REPO_BASE_DIR/test-deployment-dir
test_deployment_spec=$CERC_REPO_BASE_DIR/test-deployment-spec.yml

# Create a deployment that we can use to check our test cases
$TEST_TARGET_SO --stack test deploy --deploy-to k8s-kind init --output $test_deployment_spec
# Check the file now exists
if [ ! -f "$test_deployment_spec" ]; then
  echo "deploy init test: spec file not present"
  echo "deploy init test: FAILED"
  exit 1
fi
echo "deploy init test: passed"

$TEST_TARGET_SO --stack test deploy create --spec-file $test_deployment_spec --deployment-dir $test_deployment_dir
# Check the deployment dir exists
if [ ! -d "$test_deployment_dir" ]; then
  echo "deploy create test: deployment directory not present"
  echo "deploy create test: FAILED"
  exit 1
fi
echo "deploy create test: passed"
# Check the file written by the create command in the stack now exists
if [ ! -f "$test_deployment_dir/create-file" ]; then
  echo "deploy create test: create output file not present"
  echo "deploy create test: FAILED"
  exit 1
fi
echo "deploy create output file test: passed"

# At this point the deployment's kind-config.yml will look like this:
# kind: Cluster
# apiVersion: kind.x-k8s.io/v1alpha4
# nodes:
# - role: control-plane
#   kubeadmConfigPatches:
#     - |
#       kind: InitConfiguration
#       nodeRegistration:
#         kubeletExtraArgs:
#           node-labels: "ingress-ready=true"
#   extraPortMappings:
#     - containerPort: 80
#       hostPort: 80

# We need to change it to this:
# Note we also turn up the log level on the scheduler in order to diagnose placement errors
# See logs like: kubectl -n kube-system logs kube-scheduler-laconic-f185cd245d8dba98-control-plane
kind_config_file=${test_deployment_dir}/kind-config.yml
cat << EOF > ${kind_config_file}
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
kubeadmConfigPatches:
- |
  kind: ClusterConfiguration
  scheduler:
    extraArgs:
      v: "3"
nodes:
- role: control-plane
  kubeadmConfigPatches:
  - |
    kind: InitConfiguration
    nodeRegistration:
      kubeletExtraArgs:
        node-labels: "ingress-ready=true"
  extraPortMappings:
  - containerPort: 80
    hostPort: 80
- role: worker
  labels:
    nodetype: a
- role: worker
  labels:
    nodetype: b
- role: worker
  labels:
    nodetype: c
  kubeadmConfigPatches:
  - |
    kind: JoinConfiguration
    nodeRegistration:
      taints:
      - key: "nodeavoid"
        value: "c"
        effect: "NoSchedule"
EOF

# At this point we should have 4 nodes, three labeled like this:
# $ kubectl get nodes --show-labels=true
# NAME                                     STATUS   ROLES           AGE     VERSION   LABELS
# laconic-3af549a3ba0e3a3c-control-plane   Ready    control-plane   2m37s   v1.30.0   ...,ingress-ready=true
# laconic-3af549a3ba0e3a3c-worker          Ready    <none>          2m18s   v1.30.0   ...,nodetype=a
# laconic-3af549a3ba0e3a3c-worker2         Ready    <none>          2m18s   v1.30.0   ...,nodetype=b
# laconic-3af549a3ba0e3a3c-worker3         Ready    <none>          2m18s   v1.30.0   ...,nodetype=c

# And with taints like this:
# $ kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints --no-headers
# laconic-3af549a3ba0e3a3c-control-plane   [map[effect:NoSchedule key:node-role.kubernetes.io/control-plane]]
# laconic-3af549a3ba0e3a3c-worker          <none>
# laconic-3af549a3ba0e3a3c-worker2         <none>
# laconic-3af549a3ba0e3a3c-worker3         [map[effect:NoSchedule key:nodeavoid value:c]]

# We can now modify the deployment spec file to require a set of affinity and/or taint combinations
# then bring up the deployment and check that the pod is scheduled to an expected node.

# Add a requirement to schedule on a node labeled nodetype=c and
# a toleration such that no other pods schedule on that node
deployment_spec_file=${test_deployment_dir}/spec.yml
cat << EOF >> ${deployment_spec_file}
node-affinities:
  - label: nodetype
    value: c
node-tolerations:
  - key: nodeavoid
    value: c
EOF

# Get the deployment ID so we can generate low level kubectl commands later
deployment_id=$(cat ${test_deployment_dir}/deployment.yml | cut -d ' ' -f 2)

# Try to start the deployment
$TEST_TARGET_SO deployment --dir $test_deployment_dir start
wait_for_pods_started
# Check logs command works
wait_for_log_output
sleep 1
log_output_1=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir logs )
if [[ "$log_output_1" == *"filesystem is fresh"* ]]; then
  echo "deployment of pod test: passed"
else
  echo "deployment pod test: FAILED"
  echo $log_output_1
  delete_cluster_exit
fi

# The deployment's pod should be scheduled onto node: worker3
# Check that's what happened
# Get the node onto which the stack pod has been deployed
deployment_node=$(kubectl get pods -l app=${deployment_id} -o=jsonpath='{.items..spec.nodeName}')
expected_node=${deployment_id}-worker3
echo "Stack pod deployed to node: ${deployment_node}"
if [[ ${deployment_node} == ${expected_node} ]]; then
  echo "deployment of pod test: passed"
else
  echo "deployment pod test: FAILED"
  echo "Stack pod deployed to node: ${deployment_node}, expected node: ${expected_node}"
  delete_cluster_exit
fi

# Stop and clean up
$TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes
echo "Test passed"