Compare commits

...

5 Commits

6 changed files with 270 additions and 114 deletions

View File

@ -0,0 +1,147 @@
# Grafana alert rule provisioning: blackbox endpoint probes.
#
# Defines one rule group ("blackbox", folder "BlackboxAlerts") with one rule
# per monitored endpoint. Each rule reads two instant queries (probe_success
# and probe_http_status_code) filtered on a `destination` label, and alerts
# when the probe is not successful or the HTTP status is not 200.
#
# The scrape targets must carry a matching `destination` label, otherwise the
# queries return no series and the rule goes to its noDataState (Alerting).
apiVersion: 1
groups:
  - orgId: 1
    name: blackbox
    folder: BlackboxAlerts
    interval: 30s
    rules:
      # Azimuth Gateway endpoint
      - uid: azimuth_gateway
        title: azimuth_gateway_endpoint_tracking
        # refId of the entry in `data` whose result decides the alert state
        condition: condition
        data:
          # Query 1: probe result for this destination
          - refId: probe
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: probe_success{destination="azimuth_gateway"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: probe
          # Query 2: HTTP status code returned to the probe
          - refId: http_status_code
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: probe_http_status_code{destination="azimuth_gateway"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: http_status_code
          # Expression: combines the two queries above into the alert condition
          - refId: condition
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              # NOTE(review): this `conditions` list looks like classic-condition
              # scaffolding; with `type: math` the `expression` below is
              # presumably what gets evaluated - confirm before relying on it.
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              # Non-zero (alerting) when the probe did not succeed or the
              # status code is anything other than 200
              expression: ${probe} != 1 || ${http_status_code} != 200
              intervalMs: 1000
              maxDataPoints: 43200
              refId: condition
              type: math
        noDataState: Alerting
        execErrState: Alerting
        for: 5m
        annotations:
          summary: Probe failed for Azimuth gateway endpoint
        labels:
          # Label value is templated from the `probe` query result
          probe_success: '{{ index $values "probe" }}'
        isPaused: false
      # Laconicd GQL endpoint
      - uid: laconicd_gql
        title: laconicd_gql_endpoint_tracking
        # refId of the entry in `data` whose result decides the alert state
        condition: condition
        data:
          # Query 1: probe result for this destination
          - refId: probe
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: probe_success{destination="laconicd_gql"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: probe
          # Query 2: HTTP status code returned to the probe
          - refId: http_status_code
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: probe_http_status_code{destination="laconicd_gql"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: http_status_code
          # Expression: combines the two queries above into the alert condition
          - refId: condition
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              # NOTE(review): this `conditions` list looks like classic-condition
              # scaffolding; with `type: math` the `expression` below is
              # presumably what gets evaluated - confirm before relying on it.
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              # Non-zero (alerting) when the probe did not succeed or the
              # status code is anything other than 200
              expression: ${probe} != 1 || ${http_status_code} != 200
              intervalMs: 1000
              maxDataPoints: 43200
              refId: condition
              type: math
        noDataState: Alerting
        execErrState: Alerting
        for: 5m
        annotations:
          summary: Probe failed for Laconicd GQL endpoint
        labels:
          # Label value is templated from the `probe` query result
          probe_success: '{{ index $values "probe" }}'
        isPaused: false

View File

@ -49,7 +49,7 @@
}, },
"gridPos": { "gridPos": {
"h": 3, "h": 3,
"w": 3, "w": 4,
"x": 0, "x": 0,
"y": 0 "y": 0
}, },

View File

@ -24,9 +24,10 @@ scrape_configs:
params: params:
module: [http_2xx] module: [http_2xx]
static_configs: static_configs:
# Add URLs to be monitored below # Add URLs for targets to be monitored below
- targets: # - targets: [https://github.com]
# - https://github.com # labels:
# destination: 'github'
relabel_configs: relabel_configs:
- source_labels: [__address__] - source_labels: [__address__]
regex: (.*)(:80)? regex: (.*)(:80)?

View File

@ -59,29 +59,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -135,29 +135,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -202,7 +202,7 @@ groups:
legendFormat: __auto legendFormat: __auto
range: false range: false
refId: latest_external refId: latest_external
- refId: condition - refId: condition
relativeTimeRange: relativeTimeRange:
from: 600 from: 600
to: 0 to: 0
@ -211,29 +211,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -278,7 +278,7 @@ groups:
legendFormat: __auto legendFormat: __auto
range: false range: false
refId: latest_external refId: latest_external
- refId: condition - refId: condition
relativeTimeRange: relativeTimeRange:
from: 600 from: 600
to: 0 to: 0
@ -287,29 +287,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -363,29 +363,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -439,29 +439,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -515,29 +515,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -591,29 +591,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -669,29 +669,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -745,29 +745,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false
@ -823,29 +823,29 @@ groups:
conditions: conditions:
- evaluator: - evaluator:
params: params:
- 0 - 15
- 0 - 0
type: gt type: gt
operator: operator:
type: and type: when
query: query:
params: [] params:
- diff
reducer: reducer:
params: [] params: []
type: avg type: last
type: query type: query
datasource: datasource:
name: Expression name: Expression
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: ${diff} >= 16 expression: ""
intervalMs: 1000 hide: false
maxDataPoints: 43200
refId: condition refId: condition
type: math type: classic_conditions
noDataState: Alerting noDataState: Alerting
execErrState: Alerting execErrState: Alerting
for: 15m for: 5m
annotations: annotations:
summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }}
isPaused: false isPaused: false

View File

@ -123,8 +123,9 @@ laconic-so --stack monitoring deploy create --spec-file monitoring-spec.yml --de
```yml ```yml
... ...
- job_name: laconicd - job_name: laconicd
static_configs: ...
- targets: ['example-host:1317'] static_configs:
- targets: ['example-host:1317']
... ...
``` ```

View File

@ -44,13 +44,18 @@ Add the following scrape configs to prometheus config file (`monitoring-watchers
- job_name: 'blackbox' - job_name: 'blackbox'
... ...
static_configs: static_configs:
- targets: - targets: [<AZIMUTH_GATEWAY_GQL_ENDPOINT>]
- <AZIMUTH_GATEWAY_GQL_ENDPOINT> labels:
- <LACONICD_GQL_ENDPOINT> # Add destination label for pre-configured alerts
destination: 'azimuth_gateway'
- targets: [<LACONICD_GQL_ENDPOINT>]
labels:
destination: 'laconicd_gql'
... ...
- job_name: laconicd - job_name: laconicd
static_configs: ...
- targets: ['LACONICD_REST_HOST:LACONICD_REST_PORT'] static_configs:
- targets: ['LACONICD_REST_HOST:LACONICD_REST_PORT']
... ...
- job_name: azimuth - job_name: azimuth
scrape_interval: 10s scrape_interval: 10s
@ -119,10 +124,12 @@ Add scrape config as done above for any additional watcher to add it to the Watc
### Grafana alerts config ### Grafana alerts config
Place the pre-configured watcher alerts rules in Grafana provisioning directory: Place the pre-configured watcher and blackbox endpoint alerts rules in Grafana provisioning directory:
```bash ```bash
cp monitoring-watchers-deployment/config/monitoring/watcher-alert-rules.yml monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/ cp monitoring-watchers-deployment/config/monitoring/watcher-alert-rules.yml monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/
cp monitoring-watchers-deployment/config/monitoring/blackbox-alert-rules.yml monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/
``` ```
Update the alerting contact points config (`monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/contactpoints.yml`) with desired contact points Update the alerting contact points config (`monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/contactpoints.yml`) with desired contact points
@ -135,7 +142,7 @@ Add corresponding routes to the notification policies config (`monitoring-watche
- receiver: SlackNotifier - receiver: SlackNotifier
object_matchers: object_matchers:
# Add matchers below # Add matchers below
- ['grafana_folder', '=', 'WatcherAlerts'] - ['grafana_folder', '=~', 'WatcherAlerts|BlackboxAlerts']
``` ```
### Env ### Env