Files
goodgo-platform/monitoring/grafana/dashboards/api-latency.json
Ho Ngoc Hai 90839cf542 feat(monitoring): add API latency Grafana dashboard and alerting rules
Create comprehensive Grafana dashboard for API latency monitoring with:
- p50/p95/p99 stat panels and time series for all endpoints
- Per-endpoint latency breakdown with route/method template variables
- Top 10 slowest endpoints table and bar chart (by p99)
- Request rate (by method) and error rate (4xx/5xx) panels
- Error rate percentage (5xx/total) with SLO threshold
- Latency heatmap and histogram distribution panels

Add Prometheus alerting rules:
- ApiLatencyP99High: p99 > 1s for 5m (warning)
- ApiEndpointLatencyP99High: per-endpoint p99 > 2s (warning)
- ApiLatencyP99Critical: p99 > 3s for 3m (critical/SLO breach)
- ApiErrorRate5xxHigh: 5xx rate > 1% for 5m (warning)

Fix api-overview.json using wrong metric name
(http_request_duration_seconds → goodgo_api_request_duration_seconds).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-10 23:18:09 +07:00

689 lines
21 KiB
JSON

{
"__inputs": [],
"annotations": { "list": [] },
"description": "API latency monitoring with p50/p95/p99 percentiles, top slowest endpoints, request & error rates, and alerting thresholds.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [
{
"asDropdown": false,
"icon": "dashboard",
"includeVars": true,
"keepTime": true,
"tags": ["goodgo", "api"],
"targetBlank": false,
"title": "GoodGo API Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"title": "Overview",
"type": "row"
},
{
"id": 1,
"title": "Current p50 Latency",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
"legendFormat": "p50",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.25 },
{ "color": "orange", "value": 0.5 },
{ "color": "red", "value": 1 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "value",
"graphMode": "area",
"textMode": "auto"
}
},
{
"id": 2,
"title": "Current p95 Latency",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
"legendFormat": "p95",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "orange", "value": 1 },
{ "color": "red", "value": 2 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "value",
"graphMode": "area",
"textMode": "auto"
}
},
{
"id": 3,
"title": "Current p99 Latency",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
"legendFormat": "p99",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "orange", "value": 1 },
{ "color": "red", "value": 2 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "value",
"graphMode": "area",
"textMode": "auto"
}
},
{
"id": 4,
"title": "Request Rate",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval]))",
"legendFormat": "req/s",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "blue", "value": null }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "value",
"graphMode": "area",
"textMode": "auto"
}
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 101,
"title": "Latency Percentiles",
"type": "row"
},
{
"id": 10,
"title": "API Latency — p50 / p95 / p99 (All Endpoints)",
"description": "Aggregated request latency percentiles across all API endpoints.",
"type": "timeseries",
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 6 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
"legendFormat": "p50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
"legendFormat": "p95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
"legendFormat": "p99",
"refId": "C"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 10,
"pointSize": 5,
"showPoints": "auto",
"spanNulls": false,
"stacking": { "mode": "none" },
"thresholdsStyle": { "mode": "dashed" }
},
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
}
},
"overrides": [
{
"matcher": { "id": "byName", "options": "p50" },
"properties": [
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } }
]
},
{
"matcher": { "id": "byName", "options": "p95" },
"properties": [
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "yellow" } }
]
},
{
"matcher": { "id": "byName", "options": "p99" },
"properties": [
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } }
]
}
]
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
}
},
{
"id": 11,
"title": "Latency p50 / p95 / p99 per Endpoint",
"description": "Per-endpoint latency breakdown. Use the Route and Method variables to filter specific endpoints and HTTP methods.",
"type": "timeseries",
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 15 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\", method=~\"$method\"}[$__rate_interval])) by (le, route))",
"legendFormat": "{{route}} p50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\", method=~\"$method\"}[$__rate_interval])) by (le, route))",
"legendFormat": "{{route}} p95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\", method=~\"$method\"}[$__rate_interval])) by (le, route))",
"legendFormat": "{{route}} p99",
"refId": "C"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 5,
"pointSize": 5,
"showPoints": "auto",
"spanNulls": false,
"stacking": { "mode": "none" }
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] }
}
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
"id": 102,
"title": "Top Slowest Endpoints",
"type": "row"
},
{
"id": 20,
"title": "Top 10 Slowest Endpoints (by p99 Latency)",
"description": "Endpoints ranked by their p99 latency. Helps identify which routes need optimization.",
"type": "table",
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 25 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "topk(10, histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le, route, method)))",
"legendFormat": "{{method}} {{route}}",
"refId": "A",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": { "Time": true, "le": true },
"renameByName": {
"method": "Method",
"route": "Route",
"Value": "p99 Latency (s)"
},
"indexByName": { "method": 0, "route": 1, "Value": 2 }
}
},
{
"id": "sortBy",
"options": {
"sort": [{ "field": "p99 Latency (s)", "desc": true }]
}
}
],
"fieldConfig": {
"defaults": {
"custom": { "align": "auto" }
},
"overrides": [
{
"matcher": { "id": "byName", "options": "p99 Latency (s)" },
"properties": [
{ "id": "unit", "value": "s" },
{ "id": "decimals", "value": 3 },
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "orange", "value": 1 },
{ "color": "red", "value": 2 }
]
}
},
{ "id": "custom.displayMode", "value": "color-background-solid" }
]
}
]
},
"options": {
"showHeader": true,
"sortBy": [{ "displayName": "p99 Latency (s)", "desc": true }],
"footer": { "show": false }
}
},
{
"id": 21,
"title": "Top 10 Slowest Endpoints (Bar Chart)",
"description": "Visual ranking of the slowest endpoints by p99 latency.",
"type": "barchart",
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 25 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "topk(10, histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le, route)))",
"legendFormat": "{{route}}",
"refId": "A",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": { "Time": true, "le": true },
"renameByName": {
"route": "Route",
"Value": "p99 Latency (s)"
}
}
},
{
"id": "sortBy",
"options": {
"sort": [{ "field": "p99 Latency (s)", "desc": true }]
}
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "continuous-YlRd" },
"custom": {
"fillOpacity": 80,
"gradientMode": "scheme"
}
}
},
"options": {
"orientation": "horizontal",
"xTickLabelRotation": 0,
"showValue": "always",
"barWidth": 0.7
}
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 },
"id": 103,
"title": "Request & Error Rates",
"type": "row"
},
{
"id": 30,
"title": "Request Rate (req/s)",
"description": "Total request throughput per HTTP method.",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval])) by (method)",
"legendFormat": "{{method}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 15,
"stacking": { "mode": "normal" },
"showPoints": "never"
},
"color": { "mode": "palette-classic" }
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
}
},
{
"id": 31,
"title": "Error Rate (4xx & 5xx)",
"description": "Client and server error rates. Spikes here may indicate API issues or bad client requests.",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"5..\"}[$__rate_interval]))",
"legendFormat": "5xx errors/s",
"refId": "A"
},
{
"expr": "sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"4..\"}[$__rate_interval]))",
"legendFormat": "4xx errors/s",
"refId": "B"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 15,
"showPoints": "never"
}
},
"overrides": [
{
"matcher": { "id": "byName", "options": "5xx errors/s" },
"properties": [
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } }
]
},
{
"matcher": { "id": "byName", "options": "4xx errors/s" },
"properties": [
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } }
]
}
]
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom" }
}
},
{
"id": 32,
"title": "Error Rate % (5xx / Total)",
"description": "Percentage of requests resulting in server errors. SLO target: < 0.1% (yellow line); the red line at 1% is the 5xx alerting threshold.",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "100 * sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"5..\"}[$__rate_interval])) / sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval]))",
"legendFormat": "5xx error %",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 10,
"showPoints": "never",
"thresholdsStyle": { "mode": "dashed" }
},
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.1 },
{ "color": "red", "value": 1 }
]
},
"color": { "mode": "fixed", "fixedColor": "red" }
}
},
"options": {
"tooltip": { "mode": "single" },
"legend": { "displayMode": "list", "placement": "bottom" }
}
},
{
"id": 33,
"title": "Request Rate by Route (Top 10)",
"description": "Top 10 most active routes by request rate.",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "topk(10, sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval])) by (route))",
"legendFormat": "{{route}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 10,
"showPoints": "never"
},
"color": { "mode": "palette-classic" }
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] }
}
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 52 },
"id": 104,
"title": "Latency Distribution",
"type": "row"
},
{
"id": 40,
"title": "Latency Heatmap",
"description": "Distribution of request latencies over time. Dense clusters indicate common latency ranges.",
"type": "heatmap",
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 53 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "sum(increase(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le)",
"legendFormat": "{{le}}",
"refId": "A",
"format": "heatmap"
}
],
"options": {
"calculate": false,
"yAxis": { "unit": "s" },
"color": {
"scheme": "Spectral",
"mode": "scheme",
"reverse": true
},
"tooltip": { "show": true, "yHistogram": true },
"cellGap": 1
}
},
{
"id": 41,
"title": "Latency Distribution (Histogram)",
"description": "Current request rate per latency bucket. Each bar is one histogram bucket, labelled with its upper bound (le) in seconds.",
"type": "bargauge",
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 62 },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le)",
"legendFormat": "{{le}}",
"refId": "A",
"format": "heatmap",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps",
"min": 0,
"color": { "mode": "palette-classic" }
}
},
"options": {
"orientation": "vertical",
"displayMode": "gradient",
"showUnfilled": true,
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["goodgo", "api", "latency", "sre"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "All", "value": "$__all" },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"definition": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, route)",
"hide": 0,
"includeAll": true,
"label": "Route",
"multi": true,
"name": "route",
"options": [],
"query": {
"query": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, route)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": { "selected": false, "text": "All", "value": "$__all" },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"definition": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, method)",
"hide": 0,
"includeAll": true,
"label": "Method",
"multi": true,
"name": "method",
"options": [],
"query": {
"query": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, method)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
}
]
},
"time": { "from": "now-1h", "to": "now" },
"timepicker": {
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m"]
},
"timezone": "browser",
"title": "GoodGo API Latency",
"uid": "goodgo-api-latency",
"version": 1
}