Create a comprehensive Grafana dashboard for API latency monitoring with:

- p50/p95/p99 stat panels and time series for all endpoints
- Per-endpoint latency breakdown with route/method template variables
- Top 10 slowest endpoints table and bar chart (by p99)
- Request rate (by method) and error rate (4xx/5xx) panels
- Error rate percentage (5xx/total) with SLO threshold
- Latency heatmap and histogram distribution panels

Add Prometheus alerting rules:

- ApiLatencyP99High: p99 > 1s for 5m (warning)
- ApiEndpointLatencyP99High: per-endpoint p99 > 2s (warning)
- ApiLatencyP99Critical: p99 > 3s for 3m (critical/SLO breach)
- ApiErrorRate5xxHigh: 5xx rate > 1% for 5m (warning)

Fix api-overview.json, which was using the wrong metric name (http_request_duration_seconds → goodgo_api_request_duration_seconds).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
150 lines
3.9 KiB
JSON
150 lines
3.9 KiB
JSON
{
  "annotations": { "list": [] },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
  "links": [],
  "panels": [
    {
      "title": "Request Rate (req/s)",
      "type": "timeseries",
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
      "targets": [
        {
          "expr": "sum(rate(http_requests_total[5m])) by (method)",
          "legendFormat": "{{method}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "reqps",
          "custom": { "drawStyle": "line", "fillOpacity": 10 }
        }
      }
    },
    {
      "title": "Error Rate (5xx)",
      "type": "timeseries",
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
      "targets": [
        {
          "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))",
          "legendFormat": "5xx errors/s",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "reqps",
          "custom": { "drawStyle": "line", "fillOpacity": 10 },
          "color": { "mode": "fixed", "fixedColor": "red" }
        }
      }
    },
    {
      "title": "Request Latency (p50 / p95 / p99)",
      "type": "timeseries",
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
      "targets": [
        {
          "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p50",
          "refId": "A"
        },
        {
          "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p95",
          "refId": "B"
        },
        {
          "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p99",
          "refId": "C"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "custom": { "drawStyle": "line", "fillOpacity": 5 }
        }
      }
    },
    {
      "title": "Requests by Route",
      "type": "timeseries",
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
      "targets": [
        {
          "expr": "sum(rate(http_requests_total[5m])) by (route)",
          "legendFormat": "{{route}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "reqps",
          "custom": { "drawStyle": "bars", "fillOpacity": 50 }
        }
      }
    },
    {
      "title": "Requests by Status Code",
      "type": "piechart",
      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 16 },
      "targets": [
        {
          "expr": "sum(increase(http_requests_total[1h])) by (status_code)",
          "legendFormat": "{{status_code}}",
          "refId": "A"
        }
      ]
    },
    {
      "title": "Process Memory (RSS)",
      "type": "timeseries",
      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 16 },
      "targets": [
        {
          "expr": "process_resident_memory_bytes{job=\"goodgo-api\"}",
          "legendFormat": "RSS",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "bytes",
          "custom": { "drawStyle": "line", "fillOpacity": 10 }
        }
      }
    },
    {
      "title": "Node.js Event Loop Lag",
      "type": "timeseries",
      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 16 },
      "targets": [
        {
          "expr": "nodejs_eventloop_lag_seconds{job=\"goodgo-api\"}",
          "legendFormat": "Event Loop Lag",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "custom": { "drawStyle": "line", "fillOpacity": 10 }
        }
      }
    }
  ],
  "schemaVersion": 39,
  "tags": ["goodgo", "api"],
  "templating": { "list": [] },
  "time": { "from": "now-1h", "to": "now" },
  "timepicker": {},
  "timezone": "browser",
  "title": "GoodGo API Overview",
  "uid": "goodgo-api-overview",
  "version": 1
}