Add alerts on blackbox metrics for monitoring endpoints

2024-04-15 17:02:50 +05:30 · 2024-04-15 17:02:50 +05:30 · 1746f7366c
commit 1746f7366c
parent 345d200873
4 changed files with 128 additions and 5 deletions
--- a/stack_orchestrator/data/config/monitoring/blackbox-alert-rules.yml
+++ b/stack_orchestrator/data/config/monitoring/blackbox-alert-rules.yml
@ -0,0 +1,121 @@
+apiVersion: 1  # presumably the alert-provisioning file format version; confirm against Grafana docs
+groups:
+    - orgId: 1
+      name: blackbox  # rule group containing the endpoint probe alerts listed under rules
+      folder: BlackboxAlerts
+      interval: 30s  # presumably how often the rules in this group are evaluated; confirm
+      rules:
+        # Azimuth Gateway endpoint: alerts when the probe_success value for this endpoint is not 1
+        - uid: azimuth_gateway
+          title: azimuth_gateway_endpoint_tracking
+          condition: condition  # refId of the data entry below whose result decides the alert
+          data:
+            - refId: probe  # query step: reads probe_success for the endpoint
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: PBFA97CFB590B2093  # presumably the provisioned Prometheus datasource; confirm the UID matches
+              model:  # NOTE(review): the <..._ENDPOINT> value in expr is a placeholder, presumably replaced with the real probe target - confirm
+                editorMode: code
+                expr: probe_success{instance="<AZIMUTH_GATEWAY_GQL_ENDPOINT>"}
+                instant: true  # instant query (range is disabled below)
+                intervalMs: 1000
+                legendFormat: __auto
+                maxDataPoints: 43200
+                range: false
+                refId: probe
+            - refId: condition  # expression step: evaluated on the result of the probe step
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: __expr__
+              model:
+                conditions:  # NOTE(review): classic-condition style fields; presumably unused because type is math - confirm
+                    - evaluator:
+                        params:
+                            - 0
+                            - 0
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                      reducer:
+                        params: []
+                        type: avg
+                      type: query
+                datasource:
+                    name: Expression
+                    type: __expr__
+                    uid: __expr__
+                expression: ${probe} != 1  # true whenever the probe value is anything other than 1
+                intervalMs: 1000
+                maxDataPoints: 43200
+                refId: condition
+                type: math
+          noDataState: Alerting  # missing data is treated as an alerting state
+          execErrState: Alerting  # evaluation errors are treated as an alerting state
+          for: 5m  # presumably the condition must hold this long before the alert fires; confirm
+          annotations:
+            summary: Probe failed for Azimuth gateway endpoint {{ index $labels "instance" }}
+          labels:  # templated from the series' instance label and the value of the probe step
+            endpoint: '{{ index $labels "instance" }}'
+            probe_success: '{{ index $values "probe" }}'
+          isPaused: false
+        # Laconicd GQL endpoint: alerts when the probe_success value for this endpoint is not 1
+        - uid: laconicd_gql
+          title: laconicd_gql_endpoint_tracking
+          condition: condition  # refId of the data entry below whose result decides the alert
+          data:
+            - refId: probe  # query step: reads probe_success for the endpoint
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: PBFA97CFB590B2093  # presumably the provisioned Prometheus datasource; confirm the UID matches
+              model:  # NOTE(review): the <..._ENDPOINT> value in expr is a placeholder, presumably replaced with the real probe target - confirm
+                editorMode: code
+                expr: probe_success{instance="<LACONICD_GQL_ENDPOINT>"}
+                instant: true  # instant query (range is disabled below)
+                intervalMs: 1000
+                legendFormat: __auto
+                maxDataPoints: 43200
+                range: false
+                refId: probe
+            - refId: condition  # expression step: evaluated on the result of the probe step
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: __expr__
+              model:
+                conditions:  # NOTE(review): classic-condition style fields; presumably unused because type is math - confirm
+                    - evaluator:
+                        params:
+                            - 0
+                            - 0
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                      reducer:
+                        params: []
+                        type: avg
+                      type: query
+                datasource:
+                    name: Expression
+                    type: __expr__
+                    uid: __expr__
+                expression: ${probe} != 1  # true whenever the probe value is anything other than 1
+                intervalMs: 1000
+                maxDataPoints: 43200
+                refId: condition
+                type: math
+          noDataState: Alerting  # missing data is treated as an alerting state
+          execErrState: Alerting  # evaluation errors are treated as an alerting state
+          for: 5m  # presumably the condition must hold this long before the alert fires; confirm
+          annotations:
+            summary: Probe failed for Laconicd GQL endpoint {{ index $labels "instance" }}
+          labels:  # templated from the series' instance label and the value of the probe step
+            endpoint: '{{ index $labels "instance" }}'
+            probe_success: '{{ index $values "probe" }}'
+          isPaused: false
--- a/stack_orchestrator/data/config/monitoring/grafana/dashboards/laconicd-dashboard.json
+++ b/stack_orchestrator/data/config/monitoring/grafana/dashboards/laconicd-dashboard.json
@ -49,7 +49,7 @@
      },
      "gridPos": {
        "h": 3,
-        "w": 3,
+        "w": 4,
        "x": 0,
        "y": 0
      },
--- a/stack_orchestrator/data/stacks/monitoring/README.md
+++ b/stack_orchestrator/data/stacks/monitoring/README.md
@ -123,6 +123,7 @@ laconic-so --stack monitoring deploy create --spec-file monitoring-spec.yml --de
  ```yml
  ...
  - job_name: laconicd
+    ...
    static_configs:
      - targets: ['example-host:1317']
  ...
--- a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
+++ b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
@ -49,6 +49,7 @@ Add the following scrape configs to prometheus config file (`monitoring-watchers
        - <LACONICD_GQL_ENDPOINT>
  ...
  - job_name: laconicd
+    ...
    static_configs:
      - targets: ['LACONICD_REST_HOST:LACONICD_REST_PORT']
  ...