diff --git a/.gitea/workflows/test-k8s-deploy.yml b/.gitea/workflows/test-k8s-deploy.yml
index b3aabb02..84cce91a 100644
--- a/.gitea/workflows/test-k8s-deploy.yml
+++ b/.gitea/workflows/test-k8s-deploy.yml
@@ -10,10 +10,14 @@ on:
     paths-ignore:
       - '.gitea/workflows/triggers/*'
 
+# Needed until we can incorporate docker startup into the executor container
+env:
+  DOCKER_HOST: unix:///var/run/dind.sock
+
 jobs:
   test:
-    name: "Run deploy test suite on kind/k8s"
-    runs-on: ubuntu-22.04-with-syn-ethdb
+    name: "Run deploy test suite"
+    runs-on: ubuntu-latest
     steps:
       - name: "Clone project repository"
         uses: actions/checkout@v3
@@ -37,9 +41,15 @@ jobs:
         run: ./scripts/create_build_tag_file.sh
       - name: "Build local shiv package"
         run: ./scripts/build_shiv_package.sh
-      - name: "Install kind"
-        run: ./tests/scripts/install-kind.sh
-      - name: "Install Kubectl"
-        run: ./tests/scripts/install-kubectl.sh
-      - name: "Run k8s deployment test"
-        run: ./tests/k8s-deploy/run-deploy-test.sh
+      - name: Start dockerd # Also needed until we can incorporate into the executor
+        run: |
+          dockerd -H $DOCKER_HOST --userland-proxy=false &
+          sleep 5
+      - name: "Install Go"
+        uses: actions/setup-go@v4
+        with:
+          go-version: '1.21'
+      - name: "Install Kind"
+        run: go install sigs.k8s.io/kind@v0.20.0
+      - name: "Debug Kind"
+        run: kind create cluster --retain && docker logs kind-control-plane
diff --git a/stack_orchestrator/data/compose/docker-compose-grafana.yml b/stack_orchestrator/data/compose/docker-compose-grafana.yml
index 4aa8d1e1..6d32e37b 100644
--- a/stack_orchestrator/data/compose/docker-compose-grafana.yml
+++ b/stack_orchestrator/data/compose/docker-compose-grafana.yml
@@ -2,8 +2,10 @@ version: "3.7"
 
 services:
   grafana:
-    image: grafana/grafana
+    image: grafana/grafana:10.2.2
     restart: always
+    environment:
+      GF_SERVER_ROOT_URL: ${GF_SERVER_ROOT_URL}
     volumes:
       - ../config/monitoring/grafana/provisioning:/etc/grafana/provisioning
       - ../config/monitoring/grafana/dashboards:/etc/grafana/dashboards
diff --git a/stack_orchestrator/data/compose/docker-compose-prom-server.yml b/stack_orchestrator/data/compose/docker-compose-prom-server.yml
index b085e295..9095b6dc 100644
--- a/stack_orchestrator/data/compose/docker-compose-prom-server.yml
+++ b/stack_orchestrator/data/compose/docker-compose-prom-server.yml
@@ -2,7 +2,7 @@ version: "3.7"
 
 services:
   prometheus:
-    image: prom/prometheus
+    image: prom/prometheus:v2.49.1
     restart: always
     volumes:
       - ../config/monitoring/prometheus:/etc/prometheus
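The `Start dockerd` step in the workflow above backgrounds the daemon and then trusts a fixed `sleep 5`, which fails whenever the socket takes longer than that to appear. A readiness poll is more robust; below is a minimal sketch, assuming only that `DOCKER_HOST` points at a unix socket as in the workflow's `env` block (the helper name `wait_for_dockerd` is illustrative, not part of the repo).

```python
import os
import socket
import time


def wait_for_dockerd(timeout: float = 60.0) -> None:
    # e.g. DOCKER_HOST=unix:///var/run/dind.sock, per the workflow env block
    sock_path = os.environ["DOCKER_HOST"].removeprefix("unix://")
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
                s.connect(sock_path)
                return  # daemon is up and accepting connections
        except OSError:
            time.sleep(1)  # socket missing or not accepting yet; retry
    raise TimeoutError(f"dockerd not ready at {sock_path} after {timeout}s")


if __name__ == "__main__":
    wait_for_dockerd()
```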
diff --git a/stack_orchestrator/data/config/monitoring/watcher-alert-rules.yml b/stack_orchestrator/data/config/monitoring/watcher-alert-rules.yml
index 7e26ba14..9df9472f 100644
--- a/stack_orchestrator/data/config/monitoring/watcher-alert-rules.yml
+++ b/stack_orchestrator/data/config/monitoring/watcher-alert-rules.yml
@@ -95,14 +95,10 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
       - uid: censures_diff_external
@@ -191,14 +187,10 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
       - uid: claims_diff_external
@@ -287,14 +279,10 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
       - uid: conditional_star_release_diff_external
@@ -383,14 +371,10 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
       - uid: delegated_sending_diff_external
@@ -479,14 +463,10 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
       - uid: ecliptic_diff_external
@@ -575,14 +555,10 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
       - uid: linear_star_release_diff_external
@@ -671,14 +647,10 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
       - uid: polls_diff_external
@@ -767,14 +739,10 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
@@ -865,14 +833,10 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
       - uid: merkl_sushiswap_diff_external
@@ -961,13 +925,9 @@ groups:
               maxDataPoints: 43200
               refId: condition
               type: math
-        dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca
-        panelId: 24
         noDataState: Alerting
         execErrState: Alerting
         for: 15m
         annotations:
-          __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca
-          __panelId__: "24"
           summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
         isPaused: false
diff --git a/stack_orchestrator/data/stacks/monitoring/README.md b/stack_orchestrator/data/stacks/monitoring/README.md
index 7b410478..74b1bff8 100644
--- a/stack_orchestrator/data/stacks/monitoring/README.md
+++ b/stack_orchestrator/data/stacks/monitoring/README.md
@@ -140,6 +140,10 @@ Set the following env variables in the deployment env config file (`monitoring-d
   # External ETH RPC endpoint (filecoin)
   # (Optional, default: https://api.node.glif.io/rpc/v1)
   CERC_FIL_RPC_ENDPOINT=
+
+  # Grafana server host URL (used in various links in alerts, etc.)
+  # (Optional, default: http://localhost:3000)
+  GF_SERVER_ROOT_URL=
 ```
 
 ## Start the stack
diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
index 3673b530..bce2d84b 100644
--- a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
+++ b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
@@ -130,6 +130,10 @@ Set the following env variables in the deployment env config file (`monitoring-w
 ```bash
 # Infura key to be used
 CERC_INFURA_KEY=
+
+# Grafana server host URL to be used
+# (Optional, default: http://localhost:3000)
+GF_SERVER_ROOT_URL=
 ```
 
 ## Start the stack
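Both README additions document the same knob: `GF_SERVER_ROOT_URL` feeds the `environment:` entry added to the Grafana compose file above and sets the base URL Grafana uses for the links it embeds in alert notifications. As a quick sanity check on a running deployment, Grafana's unauthenticated `/api/health` endpoint also reports the server version, which should match the pinned `10.2.2`. A stdlib-only sketch, assuming the default URL from the README:

```python
import json
import os
import urllib.request

# Default mirrors the README; override via the deployment env config.
root_url = os.environ.get("GF_SERVER_ROOT_URL", "http://localhost:3000")

with urllib.request.urlopen(f"{root_url}/api/health") as resp:
    health = json.load(resp)

# Expect something like {"database": "ok", "version": "10.2.2", ...}
print(health)
```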
$values "diff" }} isPaused: false - uid: merkl_sushiswap_diff_external @@ -961,13 +925,9 @@ groups: maxDataPoints: 43200 refId: condition type: math - dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca - panelId: 24 noDataState: Alerting execErrState: Alerting for: 15m annotations: - __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca - __panelId__: "24" summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} isPaused: false diff --git a/stack_orchestrator/data/stacks/monitoring/README.md b/stack_orchestrator/data/stacks/monitoring/README.md index 7b410478..74b1bff8 100644 --- a/stack_orchestrator/data/stacks/monitoring/README.md +++ b/stack_orchestrator/data/stacks/monitoring/README.md @@ -140,6 +140,10 @@ Set the following env variables in the deployment env config file (`monitoring-d # External ETH RPC endpoint (filecoin) # (Optional, default: https://api.node.glif.io/rpc/v1) CERC_FIL_RPC_ENDPOINT= + + # Grafana server host URL (used in various links in alerts, etc.) + # (Optional, default: http://localhost:3000) + GF_SERVER_ROOT_URL= ``` ## Start the stack diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md index 3673b530..bce2d84b 100644 --- a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md +++ b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md @@ -130,6 +130,10 @@ Set the following env variables in the deployment env config file (`monitoring-w ```bash # Infura key to be used CERC_INFURA_KEY= + + # Grafana server host URL to be used + # (Optional, default: http://localhost:3000) + GF_SERVER_ROOT_URL= ``` ## Start the stack diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index 85fd63a8..7718c777 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -168,8 +168,8 @@ class ClusterInfo: result.append(pv) return result - # TODO: put things like image pull policy into an object-scope struct - def get_deployment(self, image_pull_policy: str = None): + # to suit the deployment, and also annotate the container specs to point at said volumes + def get_deployment(self): containers = [] for pod_name in self.parsed_pod_yaml_map: pod = self.parsed_pod_yaml_map[pod_name] @@ -189,7 +189,7 @@ class ClusterInfo: container = client.V1Container( name=container_name, image=image_to_use, - image_pull_policy=image_pull_policy, + image_pull_policy="Always", env=envs_from_environment_variables_map(self.environment_variables.map), ports=[client.V1ContainerPort(container_port=port)], volume_mounts=volume_mounts, diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 0a339fe9..bf82ebdf 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -111,7 +111,7 @@ class K8sDeployer(Deployer): print("PVCs created:") print(f"{pvc_resp}") # Process compose files into a Deployment - deployment = self.cluster_info.get_deployment(image_pull_policy=None if self.is_kind() else "Always") + deployment = self.cluster_info.get_deployment() # Create the k8s objects if opts.o.debug: print(f"Sending this deployment: {deployment}") @@ -132,18 +132,18 @@ class K8sDeployer(Deployer): print("Service created:") print(f"{service_resp}") - if not self.is_kind(): - ingress: client.V1Ingress = 
diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py
index 0a339fe9..bf82ebdf 100644
--- a/stack_orchestrator/deploy/k8s/deploy_k8s.py
+++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py
@@ -111,7 +111,7 @@ class K8sDeployer(Deployer):
             print("PVCs created:")
             print(f"{pvc_resp}")
         # Process compose files into a Deployment
-        deployment = self.cluster_info.get_deployment(image_pull_policy=None if self.is_kind() else "Always")
+        deployment = self.cluster_info.get_deployment()
         # Create the k8s objects
         if opts.o.debug:
             print(f"Sending this deployment: {deployment}")
@@ -132,18 +132,18 @@ class K8sDeployer(Deployer):
             print("Service created:")
             print(f"{service_resp}")
 
-        if not self.is_kind():
-            ingress: client.V1Ingress = self.cluster_info.get_ingress()
+        # TODO: disable ingress for kind
+        ingress: client.V1Ingress = self.cluster_info.get_ingress()
 
-            if opts.o.debug:
-                print(f"Sending this ingress: {ingress}")
-            ingress_resp = self.networking_api.create_namespaced_ingress(
-                namespace=self.k8s_namespace,
-                body=ingress
-            )
-            if opts.o.debug:
-                print("Ingress created:")
-                print(f"{ingress_resp}")
+        if opts.o.debug:
+            print(f"Sending this ingress: {ingress}")
+        ingress_resp = self.networking_api.create_namespaced_ingress(
+            namespace=self.k8s_namespace,
+            body=ingress
+        )
+        if opts.o.debug:
+            print("Ingress created:")
+            print(f"{ingress_resp}")
 
     def down(self, timeout, volumes):
         self.connect_api()
@@ -196,16 +196,16 @@ class K8sDeployer(Deployer):
         except client.exceptions.ApiException as e:
             _check_delete_exception(e)
 
-        if not self.is_kind():
-            ingress: client.V1Ingress = self.cluster_info.get_ingress()
-            if opts.o.debug:
-                print(f"Deleting this ingress: {ingress}")
-            try:
-                self.networking_api.delete_namespaced_ingress(
-                    name=ingress.metadata.name, namespace=self.k8s_namespace
-                )
-            except client.exceptions.ApiException as e:
-                _check_delete_exception(e)
+        # TODO: disable ingress for kind
+        ingress: client.V1Ingress = self.cluster_info.get_ingress()
+        if opts.o.debug:
+            print(f"Deleting this ingress: {ingress}")
+        try:
+            self.networking_api.delete_namespaced_ingress(
+                name=ingress.metadata.name, namespace=self.k8s_namespace
+            )
+        except client.exceptions.ApiException as e:
+            _check_delete_exception(e)
 
         if self.is_kind():
             # Destroy the kind cluster
@@ -219,7 +219,7 @@ class K8sDeployer(Deployer):
 
         if all_pods.items:
             for p in all_pods.items:
-                if f"{self.cluster_info.app_name}-deployment" in p.metadata.name:
+                if self.cluster_info.app_name in p.metadata.name:
                     pods.append(p)
 
         if not pods:
@@ -266,7 +266,7 @@ class K8sDeployer(Deployer):
         ret = []
 
         for p in pods.items:
-            if f"{self.cluster_info.app_name}-deployment" in p.metadata.name:
+            if self.cluster_info.app_name in p.metadata.name:
                 pod_ip = p.status.pod_ip
                 ports = AttrDict()
                 for c in p.spec.containers:
@@ -299,20 +299,11 @@ class K8sDeployer(Deployer):
 
     def logs(self, services, tail, follow, stream):
         self.connect_api()
-        pods = pods_in_deployment(self.core_api, self.cluster_info.app_name)
+        pods = pods_in_deployment(self.core_api, "test-deployment")
         if len(pods) > 1:
             print("Warning: more than one pod in the deployment")
-        if len(pods) == 0:
-            log_data = "******* Pods not running ********\n"
-        else:
-            k8s_pod_name = pods[0]
-            # If the pod is not yet started, the logs request below will throw an exception
-            try:
-                log_data = self.core_api.read_namespaced_pod_log(k8s_pod_name, namespace="default", container="test")
-            except client.exceptions.ApiException as e:
-                if opts.o.debug:
-                    print(f"Error from read_namespaced_pod_log: {e}")
-                log_data = "******* No logs available ********\n"
+        k8s_pod_name = pods[0]
+        log_data = self.core_api.read_namespaced_pod_log(k8s_pod_name, namespace="default", container="test")
         return log_stream_from_string(log_data)
 
     def update(self):
diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py
index 62545dfd..9f968dbf 100644
--- a/stack_orchestrator/deploy/k8s/helpers.py
+++ b/stack_orchestrator/deploy/k8s/helpers.py
@@ -21,7 +21,6 @@ from typing import Set, Mapping, List
 
 from stack_orchestrator.opts import opts
 from stack_orchestrator.deploy.deploy_util import parsed_pod_files_map_from_file_names
-from stack_orchestrator.deploy.deployer import DeployerException
 
 
 def _run_command(command: str):
@@ -30,13 +29,10 @@ def _run_command(command: str):
     result = subprocess.run(command, shell=True)
     if opts.o.debug:
         print(f"Result: {result}")
-    return result
 
 
 def create_cluster(name: str, config_file: str):
-    result = _run_command(f"kind create cluster --name {name} --config {config_file}")
-    if result.returncode != 0:
-        raise DeployerException(f"kind create cluster failed: {result}")
+    _run_command(f"kind create cluster --name {name} --config {config_file}")
 
 
 def destroy_cluster(name: str):
@@ -45,14 +41,12 @@ def destroy_cluster(name: str):
 
 def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]):
     for image in image_set:
-        result = _run_command(f"kind load docker-image {image} --name {kind_cluster_name}")
-        if result.returncode != 0:
-            raise DeployerException(f"kind create cluster failed: {result}")
+        _run_command(f"kind load docker-image {image} --name {kind_cluster_name}")
 
 
 def pods_in_deployment(core_api: client.CoreV1Api, deployment_name: str):
     pods = []
-    pod_response = core_api.list_namespaced_pod(namespace="default", label_selector=f"app={deployment_name}")
+    pod_response = core_api.list_namespaced_pod(namespace="default", label_selector="app=test-app")
     if opts.o.debug:
         print(f"pod_response: {pod_response}")
     for pod_info in pod_response.items:
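The `logs` path above now assumes a pod exists and has started: `pods[0]` raises `IndexError` on an empty list, and `read_namespaced_pod_log` raises `ApiException` for a pod whose containers have not started yet. For reference, a guarded version of the same two calls, essentially the shape of the code this hunk removes:

```python
from kubernetes import client


def read_deployment_logs(core_api: client.CoreV1Api, deployment_name: str) -> str:
    pod_response = core_api.list_namespaced_pod(
        namespace="default", label_selector=f"app={deployment_name}"
    )
    pod_names = [p.metadata.name for p in pod_response.items]
    if not pod_names:
        return "******* Pods not running ********\n"
    try:
        # Throws ApiException if the pod exists but has not started yet
        return core_api.read_namespaced_pod_log(
            pod_names[0], namespace="default", container="test"
        )
    except client.exceptions.ApiException:
        return "******* No logs available ********\n"
```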
diff --git a/tests/k8s-deploy/run-deploy-test.sh b/tests/k8s-deploy/run-deploy-test.sh
index 15eb2d3b..b7ee9dd0 100755
--- a/tests/k8s-deploy/run-deploy-test.sh
+++ b/tests/k8s-deploy/run-deploy-test.sh
@@ -1,59 +1,14 @@
 #!/usr/bin/env bash
 set -e
 if [ -n "$CERC_SCRIPT_DEBUG" ]; then
-    set -x
-    # Dump environment variables for debugging
-    echo "Environment variables:"
-    env
+    set -x
 fi
-
-# Helper functions: TODO move into a separate file
-wait_for_pods_started () {
-    for i in {1..5}
-    do
-        local ps_output=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir ps )
-
-        if [[ "$ps_output" == *"Running containers:"* ]]; then
-            # if ready, return
-            return
-        else
-            # if not ready, wait
-            sleep 5
-        fi
-    done
-    # Timed out, error exit
-    echo "waiting for pods to start: FAILED"
-    delete_cluster_exit
-}
-
-wait_for_log_output () {
-    for i in {1..5}
-    do
-
-        local log_output=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir logs )
-
-        if [[ ! -z "$log_output" ]]; then
-            # if ready, return
-            return
-        else
-            # if not ready, wait
-            sleep 5
-        fi
-    done
-    # Timed out, error exit
-    echo "waiting for pods log content: FAILED"
-    delete_cluster_exit
-}
-
-
-delete_cluster_exit () {
-    $TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes
-    exit 1
-}
-
 # Note: eventually this test should be folded into ../deploy/
 # but keeping it separate for now for convenience
 TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 )
+# Dump environment variables for debugging
+echo "Environment variables:"
+env
 # Set a non-default repo dir
 export CERC_REPO_BASE_DIR=~/stack-orchestrator-test/repo-base-dir
 echo "Testing this package: $TEST_TARGET_SO"
-z "$log_output" ]]; then - # if ready, return - return - else - # if not ready, wait - sleep 5 - fi - done - # Timed out, error exit - echo "waiting for pods log content: FAILED" - delete_cluster_exit -} - - -delete_cluster_exit () { - $TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes - exit 1 -} - # Note: eventually this test should be folded into ../deploy/ # but keeping it separate for now for convenience TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 ) +# Dump environment variables for debugging +echo "Environment variables:" +env # Set a non-default repo dir export CERC_REPO_BASE_DIR=~/stack-orchestrator-test/repo-base-dir echo "Testing this package: $TEST_TARGET_SO" @@ -63,9 +18,7 @@ echo "Version reported is: ${reported_version_string}" echo "Cloning repositories into: $CERC_REPO_BASE_DIR" rm -rf $CERC_REPO_BASE_DIR mkdir -p $CERC_REPO_BASE_DIR -$TEST_TARGET_SO --stack test setup-repositories -$TEST_TARGET_SO --stack test build-containers -# Test basic stack-orchestrator deploy to k8s +# Test basic stack-orchestrator deploy test_deployment_dir=$CERC_REPO_BASE_DIR/test-deployment-dir test_deployment_spec=$CERC_REPO_BASE_DIR/test-deployment-spec.yml $TEST_TARGET_SO --stack test deploy --deploy-to k8s-kind init --output $test_deployment_spec --config CERC_TEST_PARAM_1=PASSED @@ -100,36 +53,23 @@ fi echo "deploy create output file test: passed" # Try to start the deployment $TEST_TARGET_SO deployment --dir $test_deployment_dir start -wait_for_pods_started +# TODO: add a check to see if the container is up +# Sleep because k8s not up yet +sleep 30 # Check logs command works -wait_for_log_output log_output_3=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir logs ) if [[ "$log_output_3" == *"Filesystem is fresh"* ]]; then echo "deployment logs test: passed" else echo "deployment logs test: FAILED" - delete_cluster_exit + exit 1 fi # Check the config variable CERC_TEST_PARAM_1 was passed correctly if [[ "$log_output_3" == *"Test-param-1: PASSED"* ]]; then echo "deployment config test: passed" else echo "deployment config test: FAILED" - delete_cluster_exit -fi -# Stop then start again and check the volume was preserved -$TEST_TARGET_SO deployment --dir $test_deployment_dir stop -# Sleep a bit just in case -sleep 2 -$TEST_TARGET_SO deployment --dir $test_deployment_dir start -wait_for_pods_started -wait_for_log_output -log_output_4=$( $TEST_TARGET_SO deployment --dir $test_deployment_dir logs ) -if [[ "$log_output_4" == *"Filesystem is old"* ]]; then - echo "Retain volumes test: passed" -else - echo "Retain volumes test: FAILED" - delete_cluster_exit + exit 1 fi # Stop and clean up $TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes diff --git a/tests/scripts/install-kind.sh b/tests/scripts/install-kind.sh deleted file mode 100755 index 254c3288..00000000 --- a/tests/scripts/install-kind.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash -# TODO: handle ARM -curl --silent -Lo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64 -chmod +x ./kind -mv ./kind /usr/local/bin diff --git a/tests/scripts/install-kubectl.sh b/tests/scripts/install-kubectl.sh deleted file mode 100755 index 7a5062fe..00000000 --- a/tests/scripts/install-kubectl.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash -# TODO: handle ARM -curl --silent -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" -chmod +x ./kubectl -mv ./kubectl /usr/local/bin