[WIP] Add alerts on blackbox metrics for monitoring endpoints #803

Draft
prathamesh wants to merge 4 commits from deep-stack/stack-orchestrator:pm-endpoint-alerts into main
Showing only changes of commit abf4d39a22 - Show all commits

View File

@@ -24,6 +24,20 @@ groups:
 maxDataPoints: 43200
 range: false
 refId: probe
+- refId: http_status_code
+relativeTimeRange:
+from: 600
+to: 0
+datasourceUid: PBFA97CFB590B2093
+model:
+editorMode: code
+expr: probe_http_status_code{destination="azimuth_gateway"}
+instant: true
+intervalMs: 1000
+legendFormat: __auto
+maxDataPoints: 43200
+range: false
+refId: http_status_code
 - refId: condition
 relativeTimeRange:
 from: 600
@@ -48,7 +62,7 @@ groups:
 name: Expression
 type: __expr__
 uid: __expr__
-expression: ${probe} != 1
+expression: ${probe} != 1 || ${http_status_code} != 200
 intervalMs: 1000
 maxDataPoints: 43200
 refId: condition
@@ -57,9 +71,8 @@ groups:
 execErrState: Alerting
 for: 5m
 annotations:
-summary: Probe failed for Azimuth gateway endpoint {{ index $labels "instance" }}
+summary: Probe failed for Azimuth gateway endpoint
 labels:
-endpoint: '{{ index $labels "instance" }}'
 probe_success: '{{ index $values "probe" }}'
 isPaused: false
 # Laconicd GQL endpoint
@@ -81,6 +94,20 @@ groups:
 maxDataPoints: 43200
 range: false
 refId: probe
+- refId: http_status_code
+relativeTimeRange:
+from: 600
+to: 0
+datasourceUid: PBFA97CFB590B2093
+model:
+editorMode: code
+expr: probe_http_status_code{destination="laconicd_gql"}
+instant: true
+intervalMs: 1000
+legendFormat: __auto
+maxDataPoints: 43200
+range: false
+refId: http_status_code
 - refId: condition
 relativeTimeRange:
 from: 600
@@ -105,7 +132,7 @@ groups:
 name: Expression
 type: __expr__
 uid: __expr__
-expression: ${probe} != 1
+expression: ${probe} != 1 || ${http_status_code} != 200
 intervalMs: 1000
 maxDataPoints: 43200
 refId: condition
@@ -114,8 +141,7 @@ groups:
 execErrState: Alerting
 for: 5m
 annotations:
-summary: Probe failed for Laconicd GQL endpoint {{ index $labels "instance" }}
+summary: Probe failed for Laconicd GQL endpoint
 labels:
-endpoint: '{{ index $labels "instance" }}'
 probe_success: '{{ index $values "probe" }}'
 isPaused: false