Files
goodgo-platform/monitoring/grafana/dashboards/api-overview.json
Ho Ngoc Hai 90839cf542 feat(monitoring): add API latency Grafana dashboard and alerting rules
Create a comprehensive Grafana dashboard for API latency monitoring with:
- p50/p95/p99 stat panels and time series for all endpoints
- Per-endpoint latency breakdown with route/method template variables
- Top 10 slowest endpoints table and bar chart (by p99)
- Request rate (by method) and error rate (4xx/5xx) panels
- Error rate percentage (5xx/total) with SLO threshold
- Latency heatmap and histogram distribution panels

Add Prometheus alerting rules:
- ApiLatencyP99High: p99 > 1s for 5m (warning)
- ApiEndpointLatencyP99High: per-endpoint p99 > 2s (warning)
- ApiLatencyP99Critical: p99 > 3s for 3m (critical/SLO breach)
- ApiErrorRate5xxHigh: 5xx rate > 1% for 5m (warning)

Fix api-overview.json, which was using the wrong metric name
(http_request_duration_seconds → goodgo_api_request_duration_seconds).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-10 23:18:09 +07:00

150 lines
3.9 KiB
JSON

{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"panels": [
{
  "title": "Request Rate (req/s)",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 0,
    "y": 0
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum(rate(http_requests_total[5m])) by (method)",
      "legendFormat": "{{method}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "reqps",
      "custom": {
        "drawStyle": "line",
        "fillOpacity": 10
      }
    }
  }
},
{
  "title": "Error Rate (5xx)",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 12,
    "y": 0
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) or vector(0)",
      "legendFormat": "5xx errors/s"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "reqps",
      "color": {
        "mode": "fixed",
        "fixedColor": "red"
      },
      "custom": {
        "drawStyle": "line",
        "fillOpacity": 10
      }
    }
  }
},
{
  "title": "Request Latency (p50 / p95 / p99)",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 0,
    "y": 8
  },
  "targets": [
    {
      "refId": "A",
      "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
      "legendFormat": "p50"
    },
    {
      "refId": "B",
      "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
      "legendFormat": "p95"
    },
    {
      "refId": "C",
      "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
      "legendFormat": "p99"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "s",
      "custom": {
        "drawStyle": "line",
        "fillOpacity": 5
      }
    }
  }
},
{
  "title": "Requests by Route",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 12,
    "y": 8
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum(rate(http_requests_total[5m])) by (route)",
      "legendFormat": "{{route}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "reqps",
      "custom": {
        "drawStyle": "bars",
        "fillOpacity": 50
      }
    }
  }
},
{
  "title": "Requests by Status Code",
  "type": "piechart",
  "gridPos": {
    "h": 8,
    "w": 8,
    "x": 0,
    "y": 16
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum(increase(http_requests_total[$__range])) by (status_code)",
      "legendFormat": "{{status_code}}",
      "instant": true,
      "range": false
    }
  ]
},
{
  "title": "Process Memory (RSS)",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 8,
    "x": 8,
    "y": 16
  },
  "targets": [
    {
      "refId": "A",
      "expr": "process_resident_memory_bytes{job=\"goodgo-api\"}",
      "legendFormat": "RSS"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "bytes",
      "custom": {
        "drawStyle": "line",
        "fillOpacity": 10
      }
    }
  }
},
{
  "title": "Node.js Event Loop Lag",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 8,
    "x": 16,
    "y": 16
  },
  "targets": [
    {
      "refId": "A",
      "expr": "nodejs_eventloop_lag_seconds{job=\"goodgo-api\"}",
      "legendFormat": "Event Loop Lag"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "s",
      "custom": {
        "drawStyle": "line",
        "fillOpacity": 10
      }
    }
  }
}
],
"schemaVersion": 39,
"tags": ["goodgo", "api"],
"templating": { "list": [] },
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "GoodGo API Overview",
"uid": "goodgo-api-overview",
"version": 1
}