From 1746f7366c3586eaf38975949f97a2e4fb680ae3 Mon Sep 17 00:00:00 2001
From: Prathamesh Musale
Date: Mon, 15 Apr 2024 17:02:50 +0530
Subject: [PATCH] Add alerts on blackbox metrics for monitoring endpoints

---
 .../monitoring/blackbox-alert-rules.yml       | 121 ++++++++++++++++++
 .../dashboards/laconicd-dashboard.json        |   2 +-
 .../data/stacks/monitoring/README.md          |   5 +-
 .../stacks/monitoring/monitoring-watchers.md  |   5 +-
 4 files changed, 128 insertions(+), 5 deletions(-)
 create mode 100644 stack_orchestrator/data/config/monitoring/blackbox-alert-rules.yml

diff --git a/stack_orchestrator/data/config/monitoring/blackbox-alert-rules.yml b/stack_orchestrator/data/config/monitoring/blackbox-alert-rules.yml
new file mode 100644
index 00000000..11a3366f
--- /dev/null
+++ b/stack_orchestrator/data/config/monitoring/blackbox-alert-rules.yml
@@ -0,0 +1,121 @@
+apiVersion: 1
+groups:
+  - orgId: 1
+    name: blackbox
+    folder: BlackboxAlerts
+    interval: 30s
+    rules:
+      # Azimuth Gateway endpoint: alerts when probe_success != 1 holds for 5m; no data and query errors also alert
+      - uid: azimuth_gateway
+        title: azimuth_gateway_endpoint_tracking
+        condition: condition
+        data:
+          - refId: probe
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: PBFA97CFB590B2093
+            model:
+              editorMode: code
+              expr: probe_success{instance=""}
+              instant: true
+              intervalMs: 1000
+              legendFormat: __auto
+              maxDataPoints: 43200
+              range: false
+              refId: probe
+          - refId: condition
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params: []
+                  reducer:
+                    params: []
+                    type: avg
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: ${probe} != 1
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: condition
+              type: math
+        noDataState: Alerting
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          summary: Probe failed for Azimuth gateway endpoint {{ index $labels "instance" }}
+        labels:
+          endpoint: '{{ index $labels "instance" }}'
+          probe_success: '{{ index $values "probe" }}'
+        isPaused: false
+      # Laconicd GQL endpoint: alerts when probe_success != 1 holds for 5m; no data and query errors also alert
+      - uid: laconicd_gql
+        title: laconicd_gql_endpoint_tracking
+        condition: condition
+        data:
+          - refId: probe
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: PBFA97CFB590B2093
+            model:
+              editorMode: code
+              expr: probe_success{instance=""}
+              instant: true
+              intervalMs: 1000
+              legendFormat: __auto
+              maxDataPoints: 43200
+              range: false
+              refId: probe
+          - refId: condition
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params: []
+                  reducer:
+                    params: []
+                    type: avg
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: ${probe} != 1
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: condition
+              type: math
+        noDataState: Alerting
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          summary: Probe failed for Laconicd GQL endpoint {{ index $labels "instance" }}
+        labels:
+          endpoint: '{{ index $labels "instance" }}'
+          probe_success: '{{ index $values "probe" }}'
+        isPaused: false
diff --git a/stack_orchestrator/data/config/monitoring/grafana/dashboards/laconicd-dashboard.json b/stack_orchestrator/data/config/monitoring/grafana/dashboards/laconicd-dashboard.json
index 0735885c..0f8c99ac 100644
--- a/stack_orchestrator/data/config/monitoring/grafana/dashboards/laconicd-dashboard.json
+++ b/stack_orchestrator/data/config/monitoring/grafana/dashboards/laconicd-dashboard.json
@@ -49,7 +49,7 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 3,
+        "w": 4,
         "x": 0,
         "y": 0
       },
diff --git a/stack_orchestrator/data/stacks/monitoring/README.md b/stack_orchestrator/data/stacks/monitoring/README.md
index 99502902..3a507379 100644
--- a/stack_orchestrator/data/stacks/monitoring/README.md
+++ b/stack_orchestrator/data/stacks/monitoring/README.md
@@ -123,8 +123,9 @@ laconic-so --stack monitoring deploy create --spec-file monitoring-spec.yml --de
   ```yml
   ...
   - job_name: laconicd
-    static_configs:
-      - targets: ['example-host:1317']
+    ...
+    static_configs:
+      - targets: ['example-host:1317']
   ...
   ```

diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
index 2f057c3c..e480520f 100644
--- a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
+++ b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
@@ -49,8 +49,9 @@ Add the following scrape configs to prometheus config file (`monitoring-watchers
         -
   ...
   - job_name: laconicd
-    static_configs:
-      - targets: ['LACONICD_REST_HOST:LACONICD_REST_PORT']
+    ...
+    static_configs:
+      - targets: ['LACONICD_REST_HOST:LACONICD_REST_PORT']
   ...
   - job_name: azimuth
     scrape_interval: 10s