From 27412519b4917e00aa8f2ccdb44f0f232aa7b1e0 Mon Sep 17 00:00:00 2001 From: Nabarun Date: Wed, 2 Apr 2025 18:54:06 +0530 Subject: [PATCH 1/4] Add readme for monitoring testnet services --- .../data/config/monitoring/blackbox.yml | 1 + .../stacks/monitoring/monitoring-testnet.md | 128 ++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md diff --git a/stack_orchestrator/data/config/monitoring/blackbox.yml b/stack_orchestrator/data/config/monitoring/blackbox.yml index 4a9d72eb..d24e7574 100644 --- a/stack_orchestrator/data/config/monitoring/blackbox.yml +++ b/stack_orchestrator/data/config/monitoring/blackbox.yml @@ -5,3 +5,4 @@ modules: http: valid_status_codes: [] #default to 2xx method: GET + preferred_ip_protocol: ip4 diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md new file mode 100644 index 00000000..e20fab13 --- /dev/null +++ b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md @@ -0,0 +1,128 @@ +# Monitoring Testnet + +Instructions to setup and run monitoring stack for testnet services + +## Create a deployment + +After completing [setup](./README.md#setup), create a spec file for the deployment, which will map the stack's ports and volumes to the host: + +```bash +laconic-so --stack monitoring deploy init --output monitoring-testnet-spec.yml +``` + +### Ports + +Edit `network` in spec file to map container ports to same ports in host: + +``` +... +network: + ports: + prometheus: + - '9090:9090' + grafana: + - '3000:3000' +... 
+``` + +--- + +Once you've made any needed changes to the spec file, create a deployment from it: + +```bash +laconic-so --stack monitoring deploy create --spec-file monitoring-testnet-spec.yml --deployment-dir monitoring-testnet-deployment +``` + +## Configure + +### Prometheus scrape config + +Add the following scrape configs to prometheus config file (`monitoring-testnet-deployment/config/monitoring/prometheus/prometheus.yml`) in the deployment folder: + + ```yml + ... + - job_name: 'blackbox' + ... + static_configs: + - targets: + - https://wallet.laconic.com + - https://laconicd-sapo.laconic.com + - https://console-sapo.laconic.com + - https://fixturenet-eth.laconic.com + - https://deploy.laconic.com + - https://deploy-backend.laconic.com/staging/version + - https://container-registry.apps.vaasl.io + - https://webapp-deployer-api.apps.vaasl.io + - https://webapp-deployer-ui.apps.vaasl.io + ... + - job_name: laconicd + ... + static_configs: + - targets: ['LACONICD_REST_HOST:LACONICD_REST_PORT'] + # Example: 'host.docker.internal:3317' + ``` + +### Grafana alerts config + +Place the pre-configured alerts rules in Grafana provisioning directory: + + ```bash + # testnet alert rules + cp monitoring-testnet-deployment/config/monitoring/testnet-alert-rules.yml monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/ + ``` + +Update the alerting contact points config (`monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/contactpoints.yml`) with the desired contact points. + +Add corresponding routes to the notification policies config (`monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/policies.yml`) with appropriate object-matchers: + + ```yml + ... 
+ routes: + - receiver: SlackNotifier + object_matchers: + # Add matchers below + - ['grafana_folder', '=~', 'TestnetAlerts'] + ``` + +### Env + +Set the following env variables in the deployment env config file (`monitoring-testnet-deployment/config.env`): + + ```bash + # Grafana server host URL to be used + # (Optional, default: http://localhost:3000) + GF_SERVER_ROOT_URL= + ``` + +## Start the stack + +Start the deployment: + +```bash +laconic-so deployment --dir monitoring-testnet-deployment start +``` + +* List and check the health status of all the containers using `docker ps` and wait for them to be `healthy` + +* Grafana should now be visible at http://localhost:3000 with configured dashboards + +## Clean up + +To stop monitoring services running in the background, while preserving data: + +```bash +# Only stop the docker containers +laconic-so deployment --dir monitoring-testnet-deployment stop + +# Run 'start' to restart the deployment +``` + +To stop monitoring services and also delete data: + +```bash +# Stop the docker containers +laconic-so deployment --dir monitoring-testnet-deployment stop --delete-volumes + +# Remove deployment directory (deployment will have to be recreated for a re-run) +rm -rf monitoring-testnet-deployment +``` -- 2.45.2 From fb0138e9754df942992e30e13239cf487be98fb5 Mon Sep 17 00:00:00 2001 From: Nabarun Date: Fri, 4 Apr 2025 18:06:53 +0530 Subject: [PATCH 2/4] Add alerts for testnet services --- .../config/monitoring/testnet-alert-rules.yml | 64 +++++++++++++++++++ .../stacks/monitoring/monitoring-testnet.md | 20 +++++- 2 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 stack_orchestrator/data/config/monitoring/testnet-alert-rules.yml diff --git a/stack_orchestrator/data/config/monitoring/testnet-alert-rules.yml b/stack_orchestrator/data/config/monitoring/testnet-alert-rules.yml new file mode 100644 index 00000000..60d77bd1 --- /dev/null +++ 
b/stack_orchestrator/data/config/monitoring/testnet-alert-rules.yml @@ -0,0 +1,64 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: testnet + folder: TestnetAlerts + interval: 30s + rules: + - uid: endpoint_down + title: endpoint_down + condition: condition + data: + - refId: probe_success + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: PBFA97CFB590B2093 + model: + datasource: + type: prometheus + uid: PBFA97CFB590B2093 + editorMode: code + expr: probe_success{job="blackbox"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: probe_success + - refId: condition + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: eq + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: ${probe_success} == 0 + intervalMs: 1000 + maxDataPoints: 43200 + refId: condition + type: math + noDataState: Alerting + execErrState: Alerting + for: 5m + annotations: + summary: Endpoint {{ $labels.instance }} is down + isPaused: false diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md index e20fab13..64399995 100644 --- a/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md +++ b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md @@ -4,7 +4,7 @@ Instructions to setup and run monitoring stack for testnet services ## Create a deployment -After completing [setup](./README.md#setup), create a spec file for the deployment, which will map the stack's ports and volumes to the host: +Create a spec file for the deployment, which will map the stack's ports and volumes to the host: ```bash laconic-so --stack monitoring deploy init --output monitoring-testnet-spec.yml @@ -37,7 +37,7 @@ laconic-so --stack monitoring deploy create 
--spec-file monitoring-testnet-spec. ### Prometheus scrape config -Add the following scrape configs to prometheus config file (`monitoring-testnet-deployment/config/monitoring/prometheus/prometheus.yml`) in the deployment folder: +- Set up the following scrape configs in the Prometheus config file (`monitoring-testnet-deployment/config/monitoring/prometheus/prometheus.yml`) in the deployment folder: ```yml ... @@ -62,6 +62,22 @@ Add the following scrape configs to prometheus config file (`monitoring-testnet- # Example: 'host.docker.internal:3317' ``` +- Remove the docker compose services that are not required from `monitoring-testnet-deployment/compose/docker-compose-prom-server.yml`: + - `ethereum-chain-head-exporter` + - `filecoin-chain-head-exporter` + - `graph-node-upstream-head-exporter` + - `postgres-exporter` + +### Grafana dashboards + +Remove the existing dashboards that are not required for testnet monitoring: +```bash +cd monitoring-testnet-deployment/config/monitoring/grafana/dashboards +rm postgres-dashboard.json subgraphs-dashboard.json watcher-dashboard.json +cd - +``` + + ### Grafana alerts config Place the pre-configured alerts rules in Grafana provisioning directory: -- 2.45.2 From 159402921dc3c1ec3e519fefaf8d08b3b04cbe86 Mon Sep 17 00:00:00 2001 From: Nabarun Date: Mon, 7 Apr 2025 14:23:14 +0530 Subject: [PATCH 3/4] Use alias for blackbox targets in dashboard --- .../data/config/monitoring/blackbox.yml | 2 +- .../prometheus-blackbox-exporter.json | 2 +- .../provisioning/alerting/policies.yml | 1 + .../monitoring/prometheus/prometheus.yml | 20 +++++++--- .../stacks/monitoring/monitoring-testnet.md | 37 ++++++++++++++----- .../stacks/monitoring/monitoring-watchers.md | 9 +++-- 6 files changed, 50 insertions(+), 21 deletions(-) diff --git a/stack_orchestrator/data/config/monitoring/blackbox.yml b/stack_orchestrator/data/config/monitoring/blackbox.yml index d24e7574..30cb9ffa 100644 --- a/stack_orchestrator/data/config/monitoring/blackbox.yml +++ 
b/stack_orchestrator/data/config/monitoring/blackbox.yml @@ -1,7 +1,7 @@ modules: http_2xx: prober: http - timeout: 5s + timeout: 15s http: valid_status_codes: [] #default to 2xx method: GET diff --git a/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json b/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json index 638c4d5b..6c018154 100644 --- a/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json +++ b/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json @@ -136,7 +136,7 @@ "refId": "A" } ], - "title": "$target status", + "title": "$target", "type": "row" }, { diff --git a/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml b/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml index b1beb31d..498b9cf5 100644 --- a/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml +++ b/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml @@ -8,6 +8,7 @@ policies: group_by: - grafana_folder - alertname + - instance routes: - receiver: SlackNotifier object_matchers: diff --git a/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml b/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml index dea7052d..42e56795 100644 --- a/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml +++ b/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml @@ -28,17 +28,25 @@ scrape_configs: - targets: # - https://github.com relabel_configs: + # Forward the original target URL as the 'target' parameter. - source_labels: [__address__] regex: (.*)(:80)? target_label: __param_target - - source_labels: [__param_target] - regex: (.*) + # Use the custom alias if defined for the 'instance' label. 
+ - source_labels: [alias] target_label: instance - replacement: ${1} - - source_labels: [] - regex: .* - target_label: __address__ + action: replace + # If no alias is set, fall back to the target URL. + - source_labels: [instance] + regex: ^$ + target_label: instance + replacement: ${__param_target} + # Finally, tell Prometheus to scrape the blackbox_exporter. + - target_label: __address__ replacement: blackbox:9115 + # Drop the original alias label as it's now redundant with instance + - action: labeldrop + regex: ^alias$ - job_name: chain_heads scrape_interval: 10s diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md index 64399995..7f109307 100644 --- a/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md +++ b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md @@ -44,16 +44,33 @@ laconic-so --stack monitoring deploy create --spec-file monitoring-testnet-spec. - job_name: 'blackbox' ... 
static_configs: - - targets: - - https://wallet.laconic.com - - https://laconicd-sapo.laconic.com - - https://console-sapo.laconic.com - - https://fixturenet-eth.laconic.com - - https://deploy.laconic.com - - https://deploy-backend.laconic.com/staging/version - - https://container-registry.apps.vaasl.io - - https://webapp-deployer-api.apps.vaasl.io - - https://webapp-deployer-ui.apps.vaasl.io + - targets: ["https://wallet.laconic.com"] + labels: + alias: "Wallet App" + - targets: ["https://laconicd-sapo.laconic.com"] + labels: + alias: "Node (laconicd)" + - targets: ["https://console-sapo.laconic.com"] + labels: + alias: "Console App" + - targets: ["https://fixturenet-eth.laconic.com"] + labels: + alias: "Fixturenet ETH" + - targets: ["https://deploy.laconic.com"] + labels: + alias: "Deploy App" + - targets: ["https://deploy-backend.laconic.com/staging/version"] + labels: + alias: "Deploy Backend" + - targets: ["https://container-registry.apps.vaasl.io"] + labels: + alias: "Container Registry" + - targets: ["https://webapp-deployer-api.apps.vaasl.io"] + labels: + alias: "Webapp Deployer API" + - targets: ["https://webapp-deployer-ui.apps.vaasl.io"] + labels: + alias: "Webapp Deployer UI" ... - job_name: laconicd ... diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md index 158da503..bd325a96 100644 --- a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md +++ b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md @@ -44,9 +44,12 @@ Add the following scrape configs to prometheus config file (`monitoring-watchers - job_name: 'blackbox' ... static_configs: - - targets: - - - - + - targets: [""] + labels: + alias: "Azimuth Watcher" + - targets: [""] + labels: + alias: "Node (laconicd)" ... 
- job_name: laconicd static_configs: -- 2.45.2 From 320517b6b0c6f402715de5165970f918da5b2df6 Mon Sep 17 00:00:00 2001 From: Nabarun Date: Tue, 8 Apr 2025 12:21:50 +0530 Subject: [PATCH 4/4] Add URL to dashboard panel label --- .../prometheus-blackbox-exporter.json | 28 ++++++++++++++++++- .../monitoring/prometheus/prometheus.yml | 10 +++++-- .../stacks/monitoring/monitoring-testnet.md | 11 +++++++- 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json b/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json index 6c018154..0a534bec 100644 --- a/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json +++ b/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json @@ -133,10 +133,13 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "expr": "probe_success{instance=~\"$target\"}", + "format": "time_series", + "instant": true, "refId": "A" } ], - "title": "$target", + "title": "$target ($url)", "type": "row" }, { @@ -1057,6 +1060,29 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(probe_success{instance=~\"$target\"}, url)", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "url", + "options": [], + "query": "label_values(probe_success{instance=~\"$target\"}, url)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" } ] }, diff --git a/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml b/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml index 42e56795..9dd83904 100644 --- a/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml +++ 
b/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml @@ -25,8 +25,10 @@ scrape_configs: module: [http_2xx] static_configs: # Add URLs to be monitored below - - targets: - # - https://github.com + # - targets: ["https://github.com"] + # labels: + # alias: "GitHub" + # url: "https://github.com" relabel_configs: # Forward the original target URL as the 'target' parameter. - source_labels: [__address__] @@ -36,11 +38,17 @@ scrape_configs: - source_labels: [alias] target_label: instance action: replace + # Preserve the URL label + - source_labels: [url] + target_label: url + action: replace # If no alias is set, fall back to the target URL. - - source_labels: [instance] - regex: ^$ + # 'replacement' can only expand regex capture groups (not label names), so + # join instance with __param_target and capture the URL when instance is empty. + - source_labels: [instance, __param_target] + regex: ';(.+)' target_label: instance - replacement: ${__param_target} + replacement: ${1} # Finally, tell Prometheus to scrape the blackbox_exporter. - target_label: __address__ replacement: blackbox:9115 diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md index 7f109307..12d0383e 100644 --- a/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md +++ b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md @@ -47,30 +47,39 @@ laconic-so --stack monitoring deploy create --spec-file monitoring-testnet-spec. 
- targets: ["https://wallet.laconic.com"] labels: alias: "Wallet App" + url: "https://wallet.laconic.com" - targets: ["https://laconicd-sapo.laconic.com"] labels: - alias: "Node (laconicd)" + alias: "Node laconicd" + url: "https://laconicd-sapo.laconic.com" - targets: ["https://console-sapo.laconic.com"] labels: alias: "Console App" + url: "https://console-sapo.laconic.com" - targets: ["https://fixturenet-eth.laconic.com"] labels: alias: "Fixturenet ETH" + url: "https://fixturenet-eth.laconic.com" - targets: ["https://deploy.laconic.com"] labels: alias: "Deploy App" + url: "https://deploy.laconic.com" - targets: ["https://deploy-backend.laconic.com/staging/version"] labels: alias: "Deploy Backend" + url: "https://deploy-backend.laconic.com/staging/version" - targets: ["https://container-registry.apps.vaasl.io"] labels: alias: "Container Registry" + url: "https://container-registry.apps.vaasl.io" - targets: ["https://webapp-deployer-api.apps.vaasl.io"] labels: alias: "Webapp Deployer API" + url: "https://webapp-deployer-api.apps.vaasl.io" - targets: ["https://webapp-deployer-ui.apps.vaasl.io"] labels: alias: "Webapp Deployer UI" + url: "https://webapp-deployer-ui.apps.vaasl.io" ... - job_name: laconicd ... -- 2.45.2