Add readme for monitoring testnet services #967
@ -1,7 +1,8 @@
|
|||||||
modules:
|
modules:
|
||||||
http_2xx:
|
http_2xx:
|
||||||
prober: http
|
prober: http
|
||||||
timeout: 5s
|
timeout: 15s
|
||||||
http:
|
http:
|
||||||
valid_status_codes: [] #default to 2xx
|
valid_status_codes: [] #default to 2xx
|
||||||
method: GET
|
method: GET
|
||||||
|
preferred_ip_protocol: ip4
|
||||||
|
@ -133,10 +133,13 @@
|
|||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "PBFA97CFB590B2093"
|
"uid": "PBFA97CFB590B2093"
|
||||||
},
|
},
|
||||||
|
"expr": "probe_success{instance=~\"$target\"}",
|
||||||
|
"format": "time_series",
|
||||||
|
"instant": true,
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"title": "$target status",
|
"title": "$target ($url)",
|
||||||
"type": "row"
|
"type": "row"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1057,6 +1060,29 @@
|
|||||||
"tagsQuery": "",
|
"tagsQuery": "",
|
||||||
"type": "query",
|
"type": "query",
|
||||||
"useTags": false
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"current": {
|
||||||
|
"selected": false,
|
||||||
|
"text": "",
|
||||||
|
"value": ""
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "PBFA97CFB590B2093"
|
||||||
|
},
|
||||||
|
"definition": "label_values(probe_success{instance=~\"$target\"}, url)",
|
||||||
|
"hide": 2,
|
||||||
|
"includeAll": false,
|
||||||
|
"multi": false,
|
||||||
|
"name": "url",
|
||||||
|
"options": [],
|
||||||
|
"query": "label_values(probe_success{instance=~\"$target\"}, url)",
|
||||||
|
"refresh": 2,
|
||||||
|
"regex": "",
|
||||||
|
"skipUrlSync": false,
|
||||||
|
"sort": 0,
|
||||||
|
"type": "query"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -8,6 +8,7 @@ policies:
|
|||||||
group_by:
|
group_by:
|
||||||
- grafana_folder
|
- grafana_folder
|
||||||
- alertname
|
- alertname
|
||||||
|
- instance
|
||||||
routes:
|
routes:
|
||||||
- receiver: SlackNotifier
|
- receiver: SlackNotifier
|
||||||
object_matchers:
|
object_matchers:
|
||||||
|
@ -25,20 +25,34 @@ scrape_configs:
|
|||||||
module: [http_2xx]
|
module: [http_2xx]
|
||||||
static_configs:
|
static_configs:
|
||||||
# Add URLs to be monitored below
|
# Add URLs to be monitored below
|
||||||
- targets:
|
# - targets: ["https://github.com"]
|
||||||
# - https://github.com
|
# labels:
|
||||||
|
# alias: "GitHub"
|
||||||
|
# url: "https://github.com"
|
||||||
relabel_configs:
|
relabel_configs:
|
||||||
|
# Forward the original target URL as the 'target' parameter.
|
||||||
- source_labels: [__address__]
|
- source_labels: [__address__]
|
||||||
regex: (.*)(:80)?
|
regex: (.*)(:80)?
|
||||||
target_label: __param_target
|
target_label: __param_target
|
||||||
- source_labels: [__param_target]
|
# Use the custom alias if defined for the 'instance' label.
|
||||||
regex: (.*)
|
- source_labels: [alias]
|
||||||
target_label: instance
|
target_label: instance
|
||||||
replacement: ${1}
|
action: replace
|
||||||
- source_labels: []
|
# Preserve the URL label
|
||||||
regex: .*
|
- source_labels: [url]
|
||||||
target_label: __address__
|
target_label: url
|
||||||
|
action: replace
|
||||||
|
# If no alias is set, fall back to the target URL.
|
||||||
|
- source_labels: [instance]
|
||||||
|
regex: ^$
|
||||||
|
target_label: instance
|
||||||
|
replacement: ${__param_target}
|
||||||
|
# Finally, tell Prometheus to scrape the blackbox_exporter.
|
||||||
|
- target_label: __address__
|
||||||
replacement: blackbox:9115
|
replacement: blackbox:9115
|
||||||
|
# Drop the original alias label as it's now redundant with instance
|
||||||
|
- action: labeldrop
|
||||||
|
regex: ^alias$
|
||||||
|
|
||||||
- job_name: chain_heads
|
- job_name: chain_heads
|
||||||
scrape_interval: 10s
|
scrape_interval: 10s
|
||||||
|
@ -0,0 +1,64 @@
|
|||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
groups:
|
||||||
|
- orgId: 1
|
||||||
|
name: testnet
|
||||||
|
folder: TestnetAlerts
|
||||||
|
interval: 30s
|
||||||
|
rules:
|
||||||
|
- uid: endpoint_down
|
||||||
|
title: endpoint_down
|
||||||
|
condition: condition
|
||||||
|
data:
|
||||||
|
- refId: probe_success
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
datasourceUid: PBFA97CFB590B2093
|
||||||
|
model:
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: PBFA97CFB590B2093
|
||||||
|
editorMode: code
|
||||||
|
expr: probe_success{job="blackbox"}
|
||||||
|
instant: true
|
||||||
|
intervalMs: 1000
|
||||||
|
legendFormat: __auto
|
||||||
|
maxDataPoints: 43200
|
||||||
|
range: false
|
||||||
|
refId: probe_success
|
||||||
|
- refId: condition
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 0
|
||||||
|
- 0
|
||||||
|
type: eq
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params: []
|
||||||
|
reducer:
|
||||||
|
params: []
|
||||||
|
type: avg
|
||||||
|
type: query
|
||||||
|
datasource:
|
||||||
|
name: Expression
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: ${probe_success} == 0
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
refId: condition
|
||||||
|
type: math
|
||||||
|
noDataState: Alerting
|
||||||
|
execErrState: Alerting
|
||||||
|
for: 5m
|
||||||
|
annotations:
|
||||||
|
summary: Endpoint {{ $labels.instance }} is down
|
||||||
|
isPaused: false
|
170
stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md
Normal file
170
stack_orchestrator/data/stacks/monitoring/monitoring-testnet.md
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
# Monitoring Testnet
|
||||||
|
|
||||||
|
Instructions to set up and run the monitoring stack for testnet services
|
||||||
|
|
||||||
|
## Create a deployment
|
||||||
|
|
||||||
|
Create a spec file for the deployment, which will map the stack's ports and volumes to the host:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
laconic-so --stack monitoring deploy init --output monitoring-testnet-spec.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Ports
|
||||||
|
|
||||||
|
Edit `network` in the spec file to map container ports to the same ports on the host:
|
||||||
|
|
||||||
|
```
|
||||||
|
...
|
||||||
|
network:
|
||||||
|
ports:
|
||||||
|
prometheus:
|
||||||
|
- '9090:9090'
|
||||||
|
grafana:
|
||||||
|
- '3000:3000'
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Once you've made any needed changes to the spec file, create a deployment from it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
laconic-so --stack monitoring deploy create --spec-file monitoring-testnet-spec.yml --deployment-dir monitoring-testnet-deployment
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configure
|
||||||
|
|
||||||
|
### Prometheus scrape config
|
||||||
|
|
||||||
|
- Set up the following scrape configs in the Prometheus config file (`monitoring-testnet-deployment/config/monitoring/prometheus/prometheus.yml`) in the deployment folder:
|
||||||
|
|
||||||
|
```yml
|
||||||
|
...
|
||||||
|
- job_name: 'blackbox'
|
||||||
|
...
|
||||||
|
static_configs:
|
||||||
|
- targets: ["https://wallet.laconic.com"]
|
||||||
|
labels:
|
||||||
|
alias: "Wallet App"
|
||||||
|
url: "https://wallet.laconic.com"
|
||||||
|
- targets: ["https://laconicd-sapo.laconic.com"]
|
||||||
|
labels:
|
||||||
|
alias: "Node laconicd"
|
||||||
|
url: "https://laconicd-sapo.laconic.com"
|
||||||
|
- targets: ["https://console-sapo.laconic.com"]
|
||||||
|
labels:
|
||||||
|
alias: "Console App"
|
||||||
|
url: "https://console-sapo.laconic.com"
|
||||||
|
- targets: ["https://fixturenet-eth.laconic.com"]
|
||||||
|
labels:
|
||||||
|
alias: "Fixturenet ETH"
|
||||||
|
url: "https://fixturenet-eth.laconic.com"
|
||||||
|
- targets: ["https://deploy.laconic.com"]
|
||||||
|
labels:
|
||||||
|
alias: "Deploy App"
|
||||||
|
url: "https://deploy.laconic.com"
|
||||||
|
- targets: ["https://deploy-backend.laconic.com/staging/version"]
|
||||||
|
labels:
|
||||||
|
alias: "Deploy Backend"
|
||||||
|
url: "https://deploy-backend.laconic.com/staging/version"
|
||||||
|
- targets: ["https://container-registry.apps.vaasl.io"]
|
||||||
|
labels:
|
||||||
|
alias: "Container Registry"
|
||||||
|
url: "https://container-registry.apps.vaasl.io"
|
||||||
|
- targets: ["https://webapp-deployer-api.apps.vaasl.io"]
|
||||||
|
labels:
|
||||||
|
alias: "Webapp Deployer API"
|
||||||
|
url: "https://webapp-deployer-api.apps.vaasl.io"
|
||||||
|
- targets: ["https://webapp-deployer-ui.apps.vaasl.io"]
|
||||||
|
labels:
|
||||||
|
alias: "Webapp Deployer UI"
|
||||||
|
url: "https://webapp-deployer-ui.apps.vaasl.io"
|
||||||
|
|||||||
|
...
|
||||||
|
- job_name: laconicd
|
||||||
|
...
|
||||||
|
static_configs:
|
||||||
|
- targets: ['LACONICD_REST_HOST:LACONICD_REST_PORT']
|
||||||
|
# Example: 'host.docker.internal:3317'
|
||||||
|
```
|
||||||
|
|
||||||
|
- Remove the following docker compose services, which are not required for testnet monitoring, from `monitoring-testnet-deployment/compose/docker-compose-prom-server.yml`:
|
||||||
|
- `ethereum-chain-head-exporter`
|
||||||
|
- `filecoin-chain-head-exporter`
|
||||||
|
- `graph-node-upstream-head-exporter`
|
||||||
|
- `postgres-exporter`
|
||||||
|
|
||||||
|
### Grafana dashboards
|
||||||
|
|
||||||
|
Remove the existing dashboards that are not required for testnet monitoring:
|
||||||
|
```
|
||||||
|
cd monitoring-testnet-deployment/config/monitoring/grafana/dashboards
|
||||||
|
rm postgres-dashboard.json subgraphs-dashboard.json watcher-dashboard.json
|
||||||
|
cd -
|
||||||
|
```
|
||||||
|
<!-- TODO: Check node-exporter-full.json, nodejs-app-dashboard.json -->
|
||||||
|
|
||||||
|
### Grafana alerts config
|
||||||
|
|
||||||
|
Place the pre-configured alert rules in the Grafana provisioning directory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# testnet alert rules
|
||||||
|
cp monitoring-testnet-deployment/config/monitoring/testnet-alert-rules.yml monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/
|
||||||
|
```
|
||||||
|
|
||||||
|
Update the alerting contact points config (`monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/contactpoints.yml`) with the desired contact points.
|
||||||
|
|
||||||
|
Add corresponding routes to the notification policies config (`monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/policies.yml`) with appropriate object-matchers:
|
||||||
|
|
||||||
|
```yml
|
||||||
|
...
|
||||||
|
routes:
|
||||||
|
- receiver: SlackNotifier
|
||||||
|
object_matchers:
|
||||||
|
# Add matchers below
|
||||||
|
- ['grafana_folder', '=~', 'TestnetAlerts']
|
||||||
|
```
|
||||||
|
|
||||||
|
### Env
|
||||||
|
|
||||||
|
Set the following env variables in the deployment env config file (`monitoring-testnet-deployment/config.env`):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Grafana server host URL to be used
|
||||||
|
# (Optional, default: http://localhost:3000)
|
||||||
|
GF_SERVER_ROOT_URL=
|
||||||
|
```
|
||||||
|
|
||||||
|
## Start the stack
|
||||||
|
|
||||||
|
Start the deployment:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
laconic-so deployment --dir monitoring-testnet-deployment start
|
||||||
|
```
|
||||||
|
|
||||||
|
* List and check the health status of all the containers using `docker ps` and wait for them to be `healthy`
|
||||||
|
|
||||||
|
* Grafana should now be visible at http://localhost:3000 with configured dashboards
|
||||||
|
|
||||||
|
## Clean up
|
||||||
|
|
||||||
|
To stop monitoring services running in the background, while preserving data:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Only stop the docker containers
|
||||||
|
laconic-so deployment --dir monitoring-testnet-deployment stop
|
||||||
|
|
||||||
|
# Run 'start' to restart the deployment
|
||||||
|
```
|
||||||
|
|
||||||
|
To stop monitoring services and also delete data:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop the docker containers
|
||||||
|
laconic-so deployment --dir monitoring-testnet-deployment stop --delete-volumes
|
||||||
|
|
||||||
|
# Remove deployment directory (deployment will have to be recreated for a re-run)
|
||||||
|
rm -rf monitoring-testnet-deployment
|
||||||
|
```
|
@ -44,9 +44,12 @@ Add the following scrape configs to prometheus config file (`monitoring-watchers
|
|||||||
- job_name: 'blackbox'
|
- job_name: 'blackbox'
|
||||||
...
|
...
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets:
|
- targets: ["<AZIMUTH_GATEWAY_GQL_ENDPOINT>"]
|
||||||
- <AZIMUTH_GATEWAY_GQL_ENDPOINT>
|
labels:
|
||||||
- <LACONICD_GQL_ENDPOINT>
|
alias: "Azimuth Watcher"
|
||||||
|
- targets: ["<LACONICD_GQL_ENDPOINT>"]
|
||||||
|
labels:
|
||||||
|
alias: "Node (laconicd)"
|
||||||
...
|
...
|
||||||
- job_name: laconicd
|
- job_name: laconicd
|
||||||
static_configs:
|
static_configs:
|
||||||
|
Loading…
Reference in New Issue
Block a user
we should add explorer.laconic.com, in addition to its backend RPC