[WIP] Add stack for monitoring testnet services #46
31
stack-orchestrator/compose/docker-compose-grafana.yml
Normal file
31
stack-orchestrator/compose/docker-compose-grafana.yml
Normal file
@ -0,0 +1,31 @@
|
||||
version: "3.7"
|
||||
|
||||
services:
|
||||
grafana:
|
||||
image: grafana/grafana:10.2.3
|
||||
restart: always
|
||||
environment:
|
||||
GF_SERVER_ROOT_URL: ${GF_SERVER_ROOT_URL}
|
||||
CERC_GRAFANA_ALERTS_SUBGRAPH_IDS: ${CERC_GRAFANA_ALERTS_SUBGRAPH_IDS}
|
||||
volumes:
|
||||
- ../config/monitoring/grafana/provisioning:/etc/grafana/provisioning
|
||||
- ../config/monitoring/grafana/dashboards:/etc/grafana/dashboards
|
||||
- ../config/monitoring/update-grafana-alerts-config.sh:/update-grafana-alerts-config.sh
|
||||
- grafana_storage:/var/lib/grafana
|
||||
user: root
|
||||
entrypoint: ["bash", "-c"]
|
||||
command: |
|
||||
"/run.sh"
|
||||
ports:
|
||||
- "3000"
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
healthcheck:
|
||||
test: ["CMD", "nc", "-vz", "localhost", "3000"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
start_period: 3s
|
||||
|
||||
volumes:
|
||||
grafana_storage:
|
||||
42
stack-orchestrator/compose/docker-compose-prom-server.yml
Normal file
42
stack-orchestrator/compose/docker-compose-prom-server.yml
Normal file
@ -0,0 +1,42 @@
|
||||
version: "3.7"
|
||||
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.49.1
|
||||
restart: always
|
||||
volumes:
|
||||
- ../config/monitoring/prometheus:/etc/prometheus
|
||||
- prometheus_data:/prometheus
|
||||
ports:
|
||||
- "9090"
|
||||
healthcheck:
|
||||
test: ["CMD", "nc", "-vz", "localhost", "9090"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
start_period: 3s
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
||||
blackbox:
|
||||
image: prom/blackbox-exporter:latest
|
||||
restart: always
|
||||
volumes:
|
||||
- ../config/monitoring/blackbox.yml:/etc/blackbox_exporter/config.yml
|
||||
ports:
|
||||
- '9115'
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
||||
postgres-exporter:
|
||||
image: quay.io/prometheuscommunity/postgres-exporter
|
||||
restart: always
|
||||
volumes:
|
||||
- ../config/monitoring/postgres-exporter.yml:/postgres_exporter.yml
|
||||
ports:
|
||||
- '9187'
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
||||
volumes:
|
||||
prometheus_data:
|
||||
7
stack-orchestrator/config/monitoring/blackbox.yml
Normal file
7
stack-orchestrator/config/monitoring/blackbox.yml
Normal file
@ -0,0 +1,7 @@
|
||||
modules:
|
||||
http_2xx:
|
||||
prober: http
|
||||
timeout: 5s
|
||||
http:
|
||||
valid_status_codes: []  # defaults to 2xx
|
||||
method: GET
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,943 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "grafana"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "node.js prometheus client basic metrics",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"gnetId": 11159,
|
||||
"graphTooltip": 0,
|
||||
"id": 15,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 10,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"paceLength": 10,
|
||||
"percentage": false,
|
||||
"pluginVersion": "10.2.2",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "irate(process_cpu_user_seconds_total{instance=~\"$instance\"}[2m]) * 100",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "User CPU - {{instance}}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "irate(process_cpu_system_seconds_total{instance=~\"$instance\"}[2m]) * 100",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Sys CPU - {{instance}}",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeRegions": [],
|
||||
"title": "Process CPU Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percent",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 9,
|
||||
"x": 10,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 8,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"paceLength": 10,
|
||||
"percentage": false,
|
||||
"pluginVersion": "10.2.2",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "nodejs_eventloop_lag_seconds{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeRegions": [],
|
||||
"title": "Event Loop Lag",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"fixedColor": "text",
|
||||
"mode": "fixed"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"match": "null",
|
||||
"result": {
|
||||
"text": "N/A"
|
||||
}
|
||||
},
|
||||
"type": "special"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 5,
|
||||
"x": 19,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"interval": "",
|
||||
"links": [],
|
||||
"maxDataPoints": 100,
|
||||
"options": {
|
||||
"colorMode": "none",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"mean"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "name",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "10.2.2",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "sum(nodejs_version_info{instance=~\"$instance\"}) by (version)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "{{version}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Node.js Version",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"fixedColor": "#F2495C",
|
||||
"mode": "fixed"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"match": "null",
|
||||
"result": {
|
||||
"text": "N/A"
|
||||
}
|
||||
},
|
||||
"type": "special"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 19,
|
||||
"y": 3
|
||||
},
|
||||
"id": 4,
|
||||
"links": [],
|
||||
"maxDataPoints": 100,
|
||||
"options": {
|
||||
"colorMode": "none",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "10.2.2",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "sum(changes(process_start_time_seconds{instance=~\"$instance\"}[1m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Process Restart Times",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 16,
|
||||
"x": 0,
|
||||
"y": 7
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 7,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"paceLength": 10,
|
||||
"percentage": false,
|
||||
"pluginVersion": "10.2.2",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Process Memory - {{instance}}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "nodejs_heap_size_total_bytes{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Heap Total - {{instance}}",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "nodejs_heap_size_used_bytes{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Heap Used - {{instance}}",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "nodejs_external_memory_bytes{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "External Memory - {{instance}}",
|
||||
"refId": "D"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeRegions": [],
|
||||
"title": "Process Memory Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 7
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 9,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"paceLength": 10,
|
||||
"percentage": false,
|
||||
"pluginVersion": "10.2.2",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "nodejs_active_handles_total{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Active Handler - {{instance}}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "nodejs_active_requests_total{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Active Request - {{instance}}",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeRegions": [],
|
||||
"title": "Active Handlers/Requests Total",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 10,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"paceLength": 10,
|
||||
"percentage": false,
|
||||
"pluginVersion": "10.2.2",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "nodejs_heap_space_size_total_bytes{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Heap Total - {{instance}} - {{space}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeRegions": [],
|
||||
"title": "Heap Total Detail",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 14
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 11,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"paceLength": 10,
|
||||
"percentage": false,
|
||||
"pluginVersion": "10.2.2",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "nodejs_heap_space_size_used_bytes{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Heap Used - {{instance}} - {{space}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeRegions": [],
|
||||
"title": "Heap Used Detail",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 14
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 12,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"paceLength": 10,
|
||||
"percentage": false,
|
||||
"pluginVersion": "10.2.2",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"expr": "nodejs_heap_space_size_available_bytes{instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Heap Available - {{instance}} - {{space}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeRegions": [],
|
||||
"title": "Heap Available Detail",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"mode": "time",
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"logBase": 1,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false
|
||||
}
|
||||
}
|
||||
],
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 38,
|
||||
"tags": [
|
||||
"nodejs"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": [
|
||||
"All"
|
||||
],
|
||||
"value": [
|
||||
"$__all"
|
||||
]
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"definition": "label_values(nodejs_version_info, instance)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "instance",
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"options": [],
|
||||
"query": "label_values(nodejs_version_info, instance)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "NodeJS Application Dashboard",
|
||||
"uid": "PTSqcpJWk",
|
||||
"version": 3,
|
||||
"weekStart": ""
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,14 @@
|
||||
# https://www.clever-cloud.com/blog/features/2021/12/03/slack-alerts-for-grafana/
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
contactPoints:
|
||||
- orgId: 1
|
||||
name: SlackNotifier
|
||||
receivers:
|
||||
- uid: a71b06e3-58b6-41fe-af65-fbbb29653951
|
||||
type: slack
|
||||
settings:
|
||||
# Slack hook URL (see https://api.slack.com/messaging/webhooks)
|
||||
url: <YOUR_SLACK_HOOK_URL>
|
||||
disableResolveMessage: false
|
||||
@ -0,0 +1,15 @@
|
||||
# https://grafana.com/docs/grafana/latest/alerting/alerting-rules/create-notification-policy/
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
policies:
|
||||
- orgId: 1
|
||||
receiver: grafana-default-email
|
||||
group_by:
|
||||
- grafana_folder
|
||||
- alertname
|
||||
routes:
|
||||
- receiver: SlackNotifier
|
||||
object_matchers:
|
||||
# Add matchers below
|
||||
# - ['grafana_folder', '=', 'MyAlerts']
|
||||
@ -0,0 +1,10 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: dashboards
|
||||
type: file
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/dashboards
|
||||
foldersFromFilesStructure: true
|
||||
@ -0,0 +1,16 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- id: 1
|
||||
orgId: 1
|
||||
name: Prometheus
|
||||
type: prometheus
|
||||
typeName: Prometheus
|
||||
typeLogoUrl: public/app/plugins/datasource/prometheus/img/prometheus_logo.svg
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
version: 1
|
||||
editable: true
|
||||
@ -0,0 +1,69 @@
|
||||
global:
|
||||
scrape_interval: 10s
|
||||
evaluation_interval: 15s
|
||||
|
||||
rule_files:
|
||||
# - "first.rules"
|
||||
# - "second.rules"
|
||||
|
||||
scrape_configs:
|
||||
- job_name: prometheus
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
- job_name: node
|
||||
static_configs:
|
||||
# Add node-exporter targets to be monitored below
|
||||
# - targets: ['example-host:9100']
|
||||
# labels:
|
||||
# instance: 'my-host'
|
||||
|
||||
- job_name: 'blackbox'
|
||||
scrape_interval: 10s
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
# Add URLs to be monitored below
|
||||
- targets:
|
||||
# - https://github.com
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
regex: (.*)(:80)?
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
regex: (.*)
|
||||
target_label: instance
|
||||
replacement: ${1}
|
||||
- source_labels: []
|
||||
regex: .*
|
||||
target_label: __address__
|
||||
replacement: blackbox:9115
|
||||
|
||||
- job_name: 'postgres'
|
||||
scrape_interval: 30s
|
||||
scrape_timeout: 30s
|
||||
static_configs:
|
||||
# Add DB targets below
|
||||
# - targets: [example-server:5432]
|
||||
# labels:
|
||||
# instance: 'example-label'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
auth_module: [foo]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: postgres-exporter:9187
|
||||
|
||||
- job_name: laconicd
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
# Add laconicd REST endpoint target with host and port (1317)
|
||||
# - targets: ['example-host:1317']
|
||||
params:
|
||||
format: ['prometheus']
|
||||
122
stack-orchestrator/stacks/monitoring/monitoring-testnet.md
Normal file
122
stack-orchestrator/stacks/monitoring/monitoring-testnet.md
Normal file
@ -0,0 +1,122 @@
|
||||
# Monitoring Testnet services
|
||||
|
||||
Instructions to set up and run the monitoring stack
|
||||
|
||||
## Create a deployment
|
||||
|
||||
After completing [setup](./README.md#setup), create a spec file for the deployment, which will map the stack's ports and volumes to the host:
|
||||
|
||||
```bash
|
||||
laconic-so --stack monitoring deploy init --output monitoring-testnet-spec.yml
|
||||
```
|
||||
|
||||
### Ports
|
||||
|
||||
Edit `network` in the spec file to map container ports to the same ports on the host:
|
||||
|
||||
```
|
||||
...
|
||||
network:
|
||||
ports:
|
||||
prometheus:
|
||||
- '9090:9090'
|
||||
grafana:
|
||||
- '3000:3000'
|
||||
...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
Once you've made any needed changes to the spec file, create a deployment from it:
|
||||
|
||||
```bash
|
||||
laconic-so --stack monitoring deploy create --spec-file monitoring-testnet-spec.yml --deployment-dir monitoring-testnet-deployment
|
||||
```
|
||||
|
||||
## Configure
|
||||
|
||||
### Prometheus scrape config
|
||||
|
||||
Add the following scrape configs to the Prometheus config file (`monitoring-testnet-deployment/config/monitoring/prometheus/prometheus.yml`) in the deployment folder:
|
||||
|
||||
```yml
|
||||
...
|
||||
- job_name: 'blackbox'
|
||||
...
|
||||
static_configs:
|
||||
- targets:
|
||||
- <LACONICD_GQL_ENDPOINT>
|
||||
...
|
||||
- job_name: laconicd
|
||||
static_configs:
|
||||
- targets: ['LACONICD_REST_HOST:LACONICD_REST_PORT']
|
||||
...
|
||||
|
||||
```
|
||||
|
||||
### Grafana alerts config
|
||||
|
||||
Place the pre-configured alert rules in the Grafana provisioning directory:
|
||||
|
||||
```bash
|
||||
cp monitoring-testnet-deployment/config/monitoring/testnet-alert-rules.yml monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/
|
||||
```
|
||||
|
||||
Update the alerting contact points config (`monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/contactpoints.yml`) with the desired contact points.
|
||||
|
||||
Add corresponding routes to the notification policies config (`monitoring-testnet-deployment/config/monitoring/grafana/provisioning/alerting/policies.yml`) with appropriate object-matchers:
|
||||
|
||||
```yml
|
||||
...
|
||||
routes:
|
||||
- receiver: SlackNotifier
|
||||
object_matchers:
|
||||
# Add matchers below
|
||||
- ['grafana_folder', '=~', 'TestnetAlerts']
|
||||
```
|
||||
|
||||
### Env
|
||||
|
||||
Set the following env variables in the deployment env config file (`monitoring-testnet-deployment/config.env`):
|
||||
|
||||
```bash
|
||||
# Infura key to be used
|
||||
CERC_INFURA_KEY=
|
||||
|
||||
# Grafana server host URL to be used
|
||||
# (Optional, default: http://localhost:3000)
|
||||
GF_SERVER_ROOT_URL=
|
||||
```
|
||||
|
||||
## Start the stack
|
||||
|
||||
Start the deployment:
|
||||
|
||||
```bash
|
||||
laconic-so deployment --dir monitoring-testnet-deployment start
|
||||
```
|
||||
|
||||
* List and check the health status of all the containers using `docker ps` and wait for them to be `healthy`
|
||||
|
||||
* Grafana should now be visible at http://localhost:3000 with the configured dashboards
|
||||
|
||||
## Clean up
|
||||
|
||||
To stop monitoring services running in the background, while preserving data:
|
||||
|
||||
```bash
|
||||
# Only stop the docker containers
|
||||
laconic-so deployment --dir monitoring-testnet-deployment stop
|
||||
|
||||
# Run 'start' to restart the deployment
|
||||
```
|
||||
|
||||
To stop monitoring services and also delete data:
|
||||
|
||||
```bash
|
||||
# Stop the docker containers
|
||||
laconic-so deployment --dir monitoring-testnet-deployment stop --delete-volumes
|
||||
|
||||
# Remove deployment directory (deployment will have to be recreated for a re-run)
|
||||
rm -rf monitoring-testnet-deployment
|
||||
```
|
||||
5
stack-orchestrator/stacks/monitoring/stack.yml
Normal file
5
stack-orchestrator/stacks/monitoring/stack.yml
Normal file
@ -0,0 +1,5 @@
|
||||
version: "0.1"
|
||||
name: monitoring testnet
|
||||
pods:
|
||||
- prom-server
|
||||
- grafana
|
||||
Loading…
Reference in New Issue
Block a user