From a3eb3c0bb0162b10ea7e22df56baa2bc54dd147b Mon Sep 17 00:00:00 2001 From: prathamesh0 <42446521+prathamesh0@users.noreply.github.com> Date: Mon, 8 Jan 2024 17:25:30 +0530 Subject: [PATCH] Setup basic alerting for watchers in monitoring stack (#698) * Provision Grafana alert contactpoints and policies for Slack * Add watcher alert rules * Update watcher monitoring instructions * Add listening port flag to node exporter command * Add reference links --- .../compose/docker-compose-node-exporter.yml | 3 +- .../provisioning/alerting/contactpoints.yml | 14 + .../provisioning/alerting/policies.yml | 15 + .../config/monitoring/watcher-alert-rules.yml | 973 ++++++++++++++++++ .../stacks/monitoring/monitoring-watchers.md | 23 + 5 files changed, 1026 insertions(+), 2 deletions(-) create mode 100644 stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/contactpoints.yml create mode 100644 stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml create mode 100644 stack_orchestrator/data/config/monitoring/watcher-alert-rules.yml diff --git a/stack_orchestrator/data/compose/docker-compose-node-exporter.yml b/stack_orchestrator/data/compose/docker-compose-node-exporter.yml index 8b49c269..6fa02d6d 100644 --- a/stack_orchestrator/data/compose/docker-compose-node-exporter.yml +++ b/stack_orchestrator/data/compose/docker-compose-node-exporter.yml @@ -5,12 +5,11 @@ services: image: prom/node-exporter:latest restart: unless-stopped command: + - '--web.listen-address=:9100' - '--path.rootfs=/host' - '--collector.systemd' - '--collector.processes' network_mode: host pid: host - ports: - - 9100 volumes: - '/:/host:ro,rslave' diff --git a/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/contactpoints.yml b/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/contactpoints.yml new file mode 100644 index 00000000..5e924946 --- /dev/null +++ 
b/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/contactpoints.yml @@ -0,0 +1,14 @@ +# https://www.clever-cloud.com/blog/features/2021/12/03/slack-alerts-for-grafana/ + +apiVersion: 1 + +contactPoints: + - orgId: 1 + name: SlackNotifier + receivers: + - uid: a71b06e3-58b6-41fe-af65-fbbb29653951 + type: slack + settings: + # Slack hook URL (see https://api.slack.com/messaging/webhooks) + url: + disableResolveMessage: false diff --git a/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml b/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml new file mode 100644 index 00000000..b1beb31d --- /dev/null +++ b/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml @@ -0,0 +1,15 @@ +# https://grafana.com/docs/grafana/latest/alerting/alerting-rules/create-notification-policy/ + +apiVersion: 1 + +policies: + - orgId: 1 + receiver: grafana-default-email + group_by: + - grafana_folder + - alertname + routes: + - receiver: SlackNotifier + object_matchers: + # Add matchers below + # - ['grafana_folder', '=', 'MyAlerts'] diff --git a/stack_orchestrator/data/config/monitoring/watcher-alert-rules.yml b/stack_orchestrator/data/config/monitoring/watcher-alert-rules.yml new file mode 100644 index 00000000..7e26ba14 --- /dev/null +++ b/stack_orchestrator/data/config/monitoring/watcher-alert-rules.yml @@ -0,0 +1,973 @@ +# https://grafana.com/docs/grafana/latest/alerting/alerting-rules/create-grafana-managed-rule/ + +apiVersion: 1 + +groups: + - orgId: 1 + name: watcher + folder: WatcherAlerts + interval: 30s + rules: + # Azimuth + - uid: azimuth_diff_external + title: azimuth_watcher_head_tracking + condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) 
group_right sync_status_block_number{job="azimuth", instance="azimuth", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="ethereum"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: sync_status_block_number{job="azimuth", instance="azimuth", kind="latest_indexed"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} + isPaused: false + - uid: censures_diff_external + title: censures_watcher_head_tracking + condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + 
datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) group_right sync_status_block_number{job="azimuth", instance="censures", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="ethereum"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: sync_status_block_number{job="azimuth", instance="censures", kind="latest_indexed"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} + isPaused: false + - uid: claims_diff_external + title: claims_watcher_head_tracking + 
condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) group_right sync_status_block_number{job="azimuth", instance="claims", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="ethereum"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: sync_status_block_number{job="azimuth", instance="claims", kind="latest_indexed"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external 
head by {{ index $values "diff" }} + isPaused: false + - uid: conditional_star_release_diff_external + title: conditional_star_release_watcher_head_tracking + condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) group_right sync_status_block_number{job="azimuth", instance="conditional_star_release", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="ethereum"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: sync_status_block_number{job="azimuth", instance="conditional_star_release", kind="latest_indexed"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + 
annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} + isPaused: false + - uid: delegated_sending_diff_external + title: delegated_sending_watcher_head_tracking + condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) group_right sync_status_block_number{job="azimuth", instance="delegated_sending", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="ethereum"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: sync_status_block_number{job="azimuth", instance="delegated_sending", kind="latest_indexed"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 
43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} + isPaused: false + - uid: ecliptic_diff_external + title: ecliptic_watcher_head_tracking + condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) group_right sync_status_block_number{job="azimuth", instance="ecliptic", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="ethereum"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: sync_status_block_number{job="azimuth", instance="ecliptic", kind="latest_indexed"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: 
query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} + isPaused: false + - uid: linear_star_release_diff_external + title: linear_star_release_watcher_head_tracking + condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) group_right sync_status_block_number{job="azimuth", instance="linear_star_release", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="ethereum"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: sync_status_block_number{job="azimuth", instance="linear_star_release", kind="latest_indexed"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid:
__expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} + isPaused: false + - uid: polls_diff_external + title: polls_watcher_head_tracking + condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) group_right sync_status_block_number{job="azimuth", instance="polls", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="ethereum"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: sync_status_block_number{job="azimuth", instance="polls", kind="latest_indexed"} + hide: false + instant: true + legendFormat: 
__auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} + isPaused: false + + # Sushi + - uid: sushiswap_diff_external + title: sushiswap_watcher_head_tracking + condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) group_right sync_status_block_number{job="sushi", instance="sushiswap", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="filecoin"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + 
editorMode: code + expr: sync_status_block_number{job="sushi", instance="sushiswap", kind="latest_indexed"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} + isPaused: false + - uid: merkl_sushiswap_diff_external + title: merkl_sushiswap_watcher_head_tracking + condition: condition + data: + - refId: diff + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + disableTextWrap: false + editorMode: code + expr: latest_block_number - on(chain) group_right sync_status_block_number{job="sushi", instance="merkl_sushiswap", kind="latest_indexed"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: diff + useBackend: false + - refId: latest_external + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: latest_block_number{chain="filecoin"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_external + - refId: 
latest_indexed + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: sync_status_block_number{job="sushi", instance="merkl_sushiswap", kind="latest_indexed"} + hide: false + instant: true + legendFormat: __auto + range: false + refId: latest_indexed + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${diff} >= 16 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + dashboardUid: cb9b746a-9abc-482e-9214-5231e0dd75ca + panelId: 24 + noDataState: Alerting + execErrState: Alerting + for: 15m + annotations: + __dashboardUid__: cb9b746a-9abc-482e-9214-5231e0dd75ca + __panelId__: "24" + summary: Watcher {{ index $labels "instance" }} of group {{ index $labels "job" }} is falling behind external head by {{ index $values "diff" }} + isPaused: false diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md index 23335efb..3673b530 100644 --- a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md +++ b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md @@ -35,6 +35,8 @@ laconic-so --stack monitoring deploy create --spec-file monitoring-watchers-spec ## Configure +### Prometheus scrape config + Add the following scrape configs to prometheus config file (`monitoring-watchers-deployment/config/monitoring/prometheus/prometheus.yml`) in the deployment folder: ```yml @@ -100,6 +102,27 @@ Add the following scrape configs to prometheus config file (`monitoring-watchers Add scrape config as done above for any additional watcher to add it to the Watchers 
dashboard. +### Grafana alerts config + +Place the pre-configured watcher alert rules in the Grafana provisioning directory: + + ```bash + cp monitoring-watchers-deployment/config/monitoring/watcher-alert-rules.yml monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/ + ``` + +Update the alerting contact points config (`monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/contactpoints.yml`) with the desired contact points. + +Add corresponding routes to the notification policies config (`monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/policies.yml`) with appropriate object-matchers: + + ```yml + ... + routes: + - receiver: SlackNotifier + object_matchers: + # Add matchers below + - ['grafana_folder', '=', 'WatcherAlerts'] + ``` + ### Env Set the following env variables in the deployment env config file (`monitoring-watchers-deployment/config.env`):