feat(monitoring): add Prometheus metrics endpoint and Grafana dashboards
Add observability stack with @willsoto/nestjs-prometheus for /metrics endpoint, Prometheus scraping config, and 4 auto-provisioned Grafana dashboards (API overview, database, search, business metrics). Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
149
monitoring/grafana/dashboards/api-overview.json
Normal file
149
monitoring/grafana/dashboards/api-overview.json
Normal file
@@ -0,0 +1,149 @@
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Request Rate (req/s)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total[5m])) by (method)",
|
||||
"legendFormat": "{{method}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Error Rate (5xx)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))",
|
||||
"legendFormat": "5xx errors/s",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10 },
|
||||
"color": { "mode": "fixed", "fixedColor": "red" }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Request Latency (p50 / p95 / p99)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 5 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Requests by Route",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total[5m])) by (route)",
|
||||
"legendFormat": "{{route}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": { "drawStyle": "bars", "fillOpacity": 50 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Requests by Status Code",
|
||||
"type": "piechart",
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 16 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(http_requests_total[1h])) by (status_code)",
|
||||
"legendFormat": "{{status_code}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Process Memory (RSS)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 16 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_resident_memory_bytes{job=\"goodgo-api\"}",
|
||||
"legendFormat": "RSS",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Node.js Event Loop Lag",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 16 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "nodejs_eventloop_lag_seconds{job=\"goodgo-api\"}",
|
||||
"legendFormat": "Event Loop Lag",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10 }
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["goodgo", "api"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "GoodGo API Overview",
|
||||
"uid": "goodgo-api-overview",
|
||||
"version": 1
|
||||
}
|
||||
117
monitoring/grafana/dashboards/business-metrics.json
Normal file
117
monitoring/grafana/dashboards/business-metrics.json
Normal file
@@ -0,0 +1,117 @@
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Listings Created (rate)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(listings_created_total[5m])) by (category)",
|
||||
"legendFormat": "{{category}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": { "drawStyle": "bars", "fillOpacity": 50 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Listings Created (last 24h)",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(listings_created_total[24h]))",
|
||||
"legendFormat": "Last 24h",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "short" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Payments Processed (rate)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(payments_processed_total[5m])) by (status)",
|
||||
"legendFormat": "{{status}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Payment Success Rate",
|
||||
"type": "gauge",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(payments_processed_total{status=\"success\"}[1h])) / sum(rate(payments_processed_total[1h]))",
|
||||
"legendFormat": "Success Rate",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 0.9 },
|
||||
{ "color": "green", "value": 0.95 }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Active Subscriptions by Plan",
|
||||
"type": "bargauge",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "active_subscriptions",
|
||||
"legendFormat": "{{plan}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "color": "blue", "value": null }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["goodgo", "business"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "GoodGo Business Metrics",
|
||||
"uid": "goodgo-business",
|
||||
"version": 1
|
||||
}
|
||||
108
monitoring/grafana/dashboards/database.json
Normal file
108
monitoring/grafana/dashboards/database.json
Normal file
@@ -0,0 +1,108 @@
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Query Latency (p50 / p95 / p99)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(db_query_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(db_query_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Query Rate by Operation",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(db_query_duration_seconds_count[5m])) by (operation)",
|
||||
"legendFormat": "{{operation}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": { "drawStyle": "bars", "fillOpacity": 50 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Active DB Connections",
|
||||
"type": "gauge",
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 8 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "db_pool_active_connections",
|
||||
"legendFormat": "Active",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 15 },
|
||||
{ "color": "red", "value": 25 }
|
||||
]
|
||||
},
|
||||
"max": 30
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Queries Completing Within 100ms",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 16, "x": 8, "y": 8 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(db_query_duration_seconds_bucket{le=\"0.1\"}[5m])) / sum(rate(db_query_duration_seconds_count[5m]))",
|
||||
"legendFormat": "% queries < 100ms",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10 }
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["goodgo", "database"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "GoodGo Database",
|
||||
"uid": "goodgo-database",
|
||||
"version": 1
|
||||
}
|
||||
82
monitoring/grafana/dashboards/search.json
Normal file
82
monitoring/grafana/dashboards/search.json
Normal file
@@ -0,0 +1,82 @@
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Search Latency (p50 / p95 / p99)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(search_query_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(search_query_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(search_query_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Search Query Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(search_query_duration_seconds_count[5m])) by (collection)",
|
||||
"legendFormat": "{{collection}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10 }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Search Query Rate by Type",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(search_query_duration_seconds_count[5m])) by (type)",
|
||||
"legendFormat": "{{type}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": { "drawStyle": "bars", "fillOpacity": 50 }
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["goodgo", "search"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "GoodGo Search (Typesense)",
|
||||
"uid": "goodgo-search",
|
||||
"version": 1
|
||||
}
|
||||
12
monitoring/grafana/provisioning/dashboards/dashboard.yml
Normal file
12
monitoring/grafana/provisioning/dashboards/dashboard.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'GoodGo Dashboards'
|
||||
orgId: 1
|
||||
folder: 'GoodGo'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
foldersFromFilesStructure: false
|
||||
@@ -0,0 +1,9 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: true
|
||||
16
monitoring/prometheus/prometheus.yml
Normal file
16
monitoring/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'goodgo-api'
|
||||
metrics_path: '/metrics'
|
||||
static_configs:
|
||||
- targets: ['host.docker.internal:3001']
|
||||
labels:
|
||||
service: 'goodgo-api'
|
||||
environment: 'development'
|
||||
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
Reference in New Issue
Block a user