diff --git a/stack_orchestrator/data/config/monitoring/blackbox.yml b/stack_orchestrator/data/config/monitoring/blackbox.yml
index 4a9d72eb..30cb9ffa 100644
--- a/stack_orchestrator/data/config/monitoring/blackbox.yml
+++ b/stack_orchestrator/data/config/monitoring/blackbox.yml
@@ -1,7 +1,8 @@
 modules:
   http_2xx:
     prober: http
-    timeout: 5s
+    timeout: 15s
     http:
       valid_status_codes: [] #default to 2xx
       method: GET
+      preferred_ip_protocol: ip4
diff --git a/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json b/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json
index 638c4d5b..0a534bec 100644
--- a/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json
+++ b/stack_orchestrator/data/config/monitoring/grafana/dashboards/prometheus-blackbox-exporter.json
@@ -133,10 +133,13 @@
             "type": "prometheus",
             "uid": "PBFA97CFB590B2093"
           },
+          "expr": "probe_success{instance=~\"$target\"}",
+          "format": "time_series",
+          "instant": true,
           "refId": "A"
         }
       ],
-      "title": "$target status",
+      "title": "$target ($url)",
       "type": "row"
     },
     {
@@ -1057,6 +1060,29 @@
         "tagsQuery": "",
         "type": "query",
         "useTags": false
+      },
+      {
+        "current": {
+          "selected": false,
+          "text": "",
+          "value": ""
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "PBFA97CFB590B2093"
+        },
+        "definition": "label_values(probe_success{instance=~\"$target\"}, url)",
+        "hide": 2,
+        "includeAll": false,
+        "multi": false,
+        "name": "url",
+        "options": [],
+        "query": "label_values(probe_success{instance=~\"$target\"}, url)",
+        "refresh": 2,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "type": "query"
       }
     ]
   },
diff --git a/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml b/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml
index b1beb31d..498b9cf5 100644
--- a/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml
+++ b/stack_orchestrator/data/config/monitoring/grafana/provisioning/alerting/policies.yml
@@ -8,6 +8,7 @@ policies:
     group_by:
       - grafana_folder
       - alertname
+      - instance
    routes:
       - receiver: SlackNotifier
         object_matchers:
diff --git a/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml b/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml
index dea7052d..9dd83904 100644
--- a/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml
+++ b/stack_orchestrator/data/config/monitoring/prometheus/prometheus.yml
@@ -25,20 +25,34 @@ scrape_configs:
       module: [http_2xx]
     static_configs:
       # Add URLs to be monitored below
-      - targets:
-        # - https://github.com
+      # - targets: ["https://github.com"]
+      #   labels:
+      #     alias: "GitHub"
+      #     url: "https://github.com"
     relabel_configs:
+      # Forward the original target URL as the 'target' parameter.
       - source_labels: [__address__]
         regex: (.*)(:80)?
         target_label: __param_target
-      - source_labels: [__param_target]
-        regex: (.*)
+      # Use the custom alias if defined for the 'instance' label.
+      - source_labels: [alias]
         target_label: instance
-        replacement: ${1}
-      - source_labels: []
-        regex: .*
-        target_label: __address__
+        action: replace
+      # Preserve the URL label
+      - source_labels: [url]
+        target_label: url
+        action: replace
+      # If no alias is set, fall back to the target URL.
+      - source_labels: [instance]
+        regex: ^$
+        target_label: instance
+        replacement: ${__param_target}
+      # Finally, tell Prometheus to scrape the blackbox_exporter.
+      - target_label: __address__
         replacement: blackbox:9115
+      # Drop the original alias label as it's now redundant with instance
+      - action: labeldrop
+        regex: ^alias$
 
   - job_name: chain_heads
     scrape_interval: 10s
diff --git a/stack_orchestrator/data/config/monitoring/testnet-alert-rules.yml b/stack_orchestrator/data/config/monitoring/testnet-alert-rules.yml
new file mode 100644
index 00000000..60d77bd1
--- /dev/null
+++ b/stack_orchestrator/data/config/monitoring/testnet-alert-rules.yml
@@ -0,0 +1,64 @@
+apiVersion: 1
+
+groups:
+  - orgId: 1
+    name: testnet
+    folder: TestnetAlerts
+    interval: 30s
+    rules:
+      - uid: endpoint_down
+        title: endpoint_down
+        condition: condition
+        data:
+          - refId: probe_success
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: PBFA97CFB590B2093
+            model:
+              datasource:
+                type: prometheus
+                uid: PBFA97CFB590B2093
+              editorMode: code
+              expr: probe_success{job="blackbox"}
+              instant: true
+              intervalMs: 1000
+              legendFormat: __auto
+              maxDataPoints: 43200
+              range: false
+              refId: probe_success
+          - refId: condition
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                      - 0
+                    type: eq
+                  operator:
+                    type: and
+                  query:
+                    params: []
+                  reducer:
+                    params: []
+                    type: avg
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: ${probe_success} == 0
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: condition
+              type: math
+        noDataState: Alerting
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          summary: Endpoint {{ $labels.instance }} is down
+        isPaused: false
diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md
new file mode 100644
index 00000000..12d0383e
--- /dev/null
+++ b/stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md
@@ -0,0 +1,170 @@
+# Monitoring Testnet
+
+Instructions to set up and run a monitoring stack for testnet services
+
+## Create a deployment
+
+Create a spec file for the deployment, which will map the stack's ports and volumes to the host:
+
+```bash
+laconic-so --stack monitoring deploy init --output monitoring-testnet-spec.yml
+```
+
+### Ports
+
+Edit `network` in the spec file to map container ports to the same ports on the host:
+
+```
+...
+network:
+  ports:
+    prometheus:
+      - '9090:9090'
+    grafana:
+      - '3000:3000'
+...
+```
+
+---
+
+Once you've made any needed changes to the spec file, create a deployment from it:
+
+```bash
+laconic-so --stack monitoring deploy create --spec-file monitoring-testnet-spec.yml --deployment-dir monitoring-testnet-deployment
+```
+
+## Configure
+
+### Prometheus scrape config
+
+- Set up the following scrape configs in the Prometheus config file (`monitoring-testnet-deployment/config/monitoring/prometheus/prometheus.yml`) in the deployment folder:
+
+  ```yml
+  ...
+  - job_name: 'blackbox'
+    ...
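+    # The 'alias' and 'url' labels on each target below feed the relabel_configs in
+    # this file: 'alias' becomes the 'instance' label (falling back to the target URL
+    # when unset) and 'url' is passed through for use in the dashboards and alerts.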
+    static_configs:
+      - targets: ["https://wallet.laconic.com"]
+        labels:
+          alias: "Wallet App"
+          url: "https://wallet.laconic.com"
+      - targets: ["https://laconicd-sapo.laconic.com"]
+        labels:
+          alias: "Node laconicd"
+          url: "https://laconicd-sapo.laconic.com"
+      - targets: ["https://console-sapo.laconic.com"]
+        labels:
+          alias: "Console App"
+          url: "https://console-sapo.laconic.com"
+      - targets: ["https://fixturenet-eth.laconic.com"]
+        labels:
+          alias: "Fixturenet ETH"
+          url: "https://fixturenet-eth.laconic.com"
+      - targets: ["https://deploy.laconic.com"]
+        labels:
+          alias: "Deploy App"
+          url: "https://deploy.laconic.com"
+      - targets: ["https://deploy-backend.laconic.com/staging/version"]
+        labels:
+          alias: "Deploy Backend"
+          url: "https://deploy-backend.laconic.com/staging/version"
+      - targets: ["https://container-registry.apps.vaasl.io"]
+        labels:
+          alias: "Container Registry"
+          url: "https://container-registry.apps.vaasl.io"
+      - targets: ["https://webapp-deployer-api.apps.vaasl.io"]
+        labels:
+          alias: "Webapp Deployer API"
+          url: "https://webapp-deployer-api.apps.vaasl.io"
+      - targets: ["https://webapp-deployer-ui.apps.vaasl.io"]
+        labels:
+          alias: "Webapp Deployer UI"
+          url: "https://webapp-deployer-ui.apps.vaasl.io"
+  ...
+  - job_name: laconicd
+    ...
+    static_configs:
+      - targets: ['LACONICD_REST_HOST:LACONICD_REST_PORT']
+        # Example: 'host.docker.internal:3317'
+  ```
+
+- Remove the docker compose services that are not required in `monitoring-testnet-deployment/compose/docker-compose-prom-server.yml`:
+  - `ethereum-chain-head-exporter`
+  - `filecoin-chain-head-exporter`
+  - `graph-node-upstream-head-exporter`
+  - `postgres-exporter`
+
+### Grafana dashboards
+
+Remove the existing dashboards that are not required for monitoring the testnet:
+
+```
+cd monitoring-testnet-deployment/config/monitoring/grafana/dashboards
+rm postgres-dashboard.json subgraphs-dashboard.json watcher-dashboard.json
+cd -
+```
+
+### Grafana alerts config
+
+Place the pre-configured alert rules in the Grafana provisioning directory:
+
+  ```bash
+  # testnet alert rules
+  cp monitoring-testnet-deployment/config/monitoring/testnet-alert-rules.yml monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/
+  ```
+
+Update the alerting contact points config (`monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/contactpoints.yml`) with the desired contact points.
+
+Add corresponding routes to the notification policies config (`monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/policies.yml`) with appropriate object-matchers:
+
+  ```yml
+  ...
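+  # 'SlackNotifier' should match a contact point name defined in contactpoints.yml;
+  # the matcher below routes alerts from the 'TestnetAlerts' folder (the folder set
+  # in testnet-alert-rules.yml) to that receiver.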
+  routes:
+    - receiver: SlackNotifier
+      object_matchers:
+        # Add matchers below
+        - ['grafana_folder', '=~', 'TestnetAlerts']
+  ```
+
+### Env
+
+Set the following env variables in the deployment env config file (`monitoring-testnet-deployment/config.env`):
+
+  ```bash
+  # Grafana server host URL to be used
+  # (Optional, default: http://localhost:3000)
+  GF_SERVER_ROOT_URL=
+  ```
+
+## Start the stack
+
+Start the deployment:
+
+```bash
+laconic-so deployment --dir monitoring-testnet-deployment start
+```
+
+* List and check the health status of all the containers using `docker ps` and wait for them to be `healthy`
+
+* Grafana should now be visible at http://localhost:3000 with the configured dashboards
+
+## Clean up
+
+To stop the monitoring services running in the background while preserving data:
+
+```bash
+# Only stop the docker containers
+laconic-so deployment --dir monitoring-testnet-deployment stop
+
+# Run 'start' to restart the deployment
+```
+
+To stop the monitoring services and also delete data:
+
+```bash
+# Stop the docker containers
+laconic-so deployment --dir monitoring-testnet-deployment stop --delete-volumes
+
+# Remove the deployment directory (deployment will have to be recreated for a re-run)
+rm -rf monitoring-testnet-deployment
+```
diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
index 158da503..bd325a96 100644
--- a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
+++ b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
@@ -44,9 +44,12 @@ Add the following scrape configs to prometheus config file (`monitoring-watchers
   - job_name: 'blackbox'
     ...
     static_configs:
-      - targets:
-        -
-        -
+      - targets: [""]
+        labels:
+          alias: "Azimuth Watcher"
+      - targets: [""]
+        labels:
+          alias: "Node (laconicd)"
     ...
   - job_name: laconicd
     static_configs: