feat(monitoring): add API latency Grafana dashboard and alerting rules
Create comprehensive Grafana dashboard for API latency monitoring with:
- p50/p95/p99 stat panels and time series for all endpoints
- Per-endpoint latency breakdown with route/method template variables
- Top 10 slowest endpoints table and bar chart (by p99)
- Request rate (by method) and error rate (4xx/5xx) panels
- Error rate percentage (5xx/total) with SLO threshold
- Latency heatmap and histogram distribution panels

Add Prometheus alerting rules:
- ApiLatencyP99High: p99 > 1s for 5m (warning)
- ApiEndpointLatencyP99High: per-endpoint p99 > 2s for 5m (warning)
- ApiLatencyP99Critical: p99 > 3s for 3m (critical/SLO breach)
- ApiErrorRate5xxHigh: 5xx rate > 1% for 5m (warning)

Fix api-overview.json, which used the wrong metric name (http_request_duration_seconds → goodgo_api_request_duration_seconds).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
688
monitoring/grafana/dashboards/api-latency.json
Normal file
688
monitoring/grafana/dashboards/api-latency.json
Normal file
@@ -0,0 +1,688 @@
|
||||
{
|
||||
"__inputs": [],
|
||||
"annotations": { "list": [] },
|
||||
"description": "API latency monitoring with p50/p95/p99 percentiles, top slowest endpoints, request & error rates, and alerting thresholds.",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [
|
||||
{
|
||||
"asDropdown": false,
|
||||
"icon": "dashboard",
|
||||
"includeVars": true,
|
||||
"keepTime": true,
|
||||
"tags": ["goodgo", "api"],
|
||||
"targetBlank": false,
|
||||
"title": "GoodGo API Dashboards",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 100,
|
||||
"title": "Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Current p50 Latency",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.25 },
|
||||
{ "color": "orange", "value": 0.5 },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"color": { "mode": "thresholds" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"textMode": "auto"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Current p95 Latency",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"color": { "mode": "thresholds" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"textMode": "auto"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Current p99 Latency",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"color": { "mode": "thresholds" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"textMode": "auto"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Request Rate",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval]))",
|
||||
"legendFormat": "req/s",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "blue", "value": null }
|
||||
]
|
||||
},
|
||||
"color": { "mode": "thresholds" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"textMode": "auto"
|
||||
}
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
|
||||
"id": 101,
|
||||
"title": "Latency Percentiles",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "API Latency — p50 / p95 / p99 (All Endpoints)",
|
||||
"description": "Aggregated request latency percentiles across all API endpoints.",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 6 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 10,
|
||||
"pointSize": 5,
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "dashed" }
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "p50" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "p95" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "yellow" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "p99" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Latency p50 / p95 / p99 per Endpoint",
|
||||
"description": "Per-endpoint latency breakdown. Use the route variable to filter specific endpoints; the method variable is not applied to these queries.",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 15 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\"}[$__rate_interval])) by (le, route))",
|
||||
"legendFormat": "{{route}} p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\"}[$__rate_interval])) by (le, route))",
|
||||
"legendFormat": "{{route}} p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\"}[$__rate_interval])) by (le, route))",
|
||||
"legendFormat": "{{route}} p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 5,
|
||||
"pointSize": 5,
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "mode": "none" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] }
|
||||
}
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
|
||||
"id": 102,
|
||||
"title": "Top Slowest Endpoints",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"title": "Top 10 Slowest Endpoints (by p99 Latency)",
|
||||
"description": "Endpoints ranked by their p99 latency. Helps identify which routes need optimization.",
|
||||
"type": "table",
|
||||
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 25 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10, histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le, route, method)))",
|
||||
"legendFormat": "{{method}} {{route}}",
|
||||
"refId": "A",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "le": true },
|
||||
"renameByName": {
|
||||
"method": "Method",
|
||||
"route": "Route",
|
||||
"Value": "p99 Latency (s)"
|
||||
},
|
||||
"indexByName": { "method": 0, "route": 1, "Value": 2 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"sort": [{ "field": "p99 Latency (s)", "desc": true }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "auto" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "p99 Latency (s)" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "s" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
}
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background-solid" }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [{ "displayName": "p99 Latency (s)", "desc": true }],
|
||||
"footer": { "show": false }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"title": "Top 10 Slowest Endpoints (Bar Chart)",
|
||||
"description": "Visual ranking of the slowest endpoints by p99 latency.",
|
||||
"type": "barchart",
|
||||
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 25 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10, histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le, route)))",
|
||||
"legendFormat": "{{route}}",
|
||||
"refId": "A",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "le": true },
|
||||
"renameByName": {
|
||||
"route": "Route",
|
||||
"Value": "p99 Latency (s)"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"sort": [{ "field": "p99 Latency (s)", "desc": true }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"color": { "mode": "continuous-YlRd" },
|
||||
"custom": {
|
||||
"fillOpacity": 80,
|
||||
"gradientMode": "scheme"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"xTickLabelRotation": 0,
|
||||
"showValue": "always",
|
||||
"barWidth": 0.7
|
||||
}
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 },
|
||||
"id": 103,
|
||||
"title": "Request & Error Rates",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"title": "Request Rate (req/s)",
|
||||
"description": "Total request throughput per HTTP method.",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval])) by (method)",
|
||||
"legendFormat": "{{method}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 15,
|
||||
"stacking": { "mode": "normal" },
|
||||
"showPoints": "never"
|
||||
},
|
||||
"color": { "mode": "palette-classic" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"title": "Error Rate (4xx & 5xx)",
|
||||
"description": "Client and server error rates. Spikes here may indicate API issues or bad client requests.",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"5..\"}[$__rate_interval]))",
|
||||
"legendFormat": "5xx errors/s",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"4..\"}[$__rate_interval]))",
|
||||
"legendFormat": "4xx errors/s",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 15,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "5xx errors/s" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "4xx errors/s" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 32,
|
||||
"title": "Error Rate % (5xx / Total)",
|
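||||
"description": "Percentage of requests resulting in server errors. SLO target: < 0.1%. The dashed threshold line marks 1%, the level above which the ApiErrorRate5xxHigh alert fires when sustained for 5 minutes.",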
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"5..\"}[$__rate_interval])) / sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval]))",
|
||||
"legendFormat": "5xx error %",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 10,
|
||||
"showPoints": "never",
|
||||
"thresholdsStyle": { "mode": "dashed" }
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"color": { "mode": "fixed", "fixedColor": "red" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 33,
|
||||
"title": "Request Rate by Route (Top 10)",
|
||||
"description": "Top 10 most active routes by request rate.",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10, sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval])) by (route))",
|
||||
"legendFormat": "{{route}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 10,
|
||||
"showPoints": "never"
|
||||
},
|
||||
"color": { "mode": "palette-classic" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] }
|
||||
}
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 52 },
|
||||
"id": 104,
|
||||
"title": "Latency Distribution",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"id": 40,
|
||||
"title": "Latency Heatmap",
|
||||
"description": "Distribution of request latencies over time. Dense clusters indicate common latency ranges.",
|
||||
"type": "heatmap",
|
||||
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 53 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le)",
|
||||
"legendFormat": "{{le}}",
|
||||
"refId": "A",
|
||||
"format": "heatmap"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"calculate": false,
|
||||
"yAxis": { "unit": "s" },
|
||||
"color": {
|
||||
"scheme": "Spectral",
|
||||
"mode": "scheme",
|
||||
"reverse": true
|
||||
},
|
||||
"tooltip": { "show": true, "yHistogram": true },
|
||||
"cellGap": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"title": "Latency Distribution (Histogram)",
|
||||
"description": "Current request latency distribution across histogram buckets.",
|
||||
"type": "histogram",
|
||||
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 62 },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le)",
|
||||
"legendFormat": "{{le}}",
|
||||
"refId": "A",
|
||||
"format": "heatmap"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "fillOpacity": 80 }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"bucketSize": 0,
|
||||
"combine": false,
|
||||
"fillOpacity": 80
|
||||
}
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"tags": ["goodgo", "api", "latency", "sre"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "selected": false, "text": "All", "value": "$__all" },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"definition": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, route)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Route",
|
||||
"multi": true,
|
||||
"name": "route",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, route)",
|
||||
"refId": "StandardVariableQuery"
|
||||
},
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"current": { "selected": false, "text": "All", "value": "$__all" },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"definition": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, method)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Method",
|
||||
"multi": true,
|
||||
"name": "method",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, method)",
|
||||
"refId": "StandardVariableQuery"
|
||||
},
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {
|
||||
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m"]
|
||||
},
|
||||
"timezone": "browser",
|
||||
"title": "GoodGo API Latency",
|
||||
"uid": "goodgo-api-latency",
|
||||
"version": 1
|
||||
}
|
||||
@@ -48,17 +48,17 @@
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
+ "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
- "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
+ "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
|
||||
81
monitoring/prometheus/alert-rules.yml
Normal file
81
monitoring/prometheus/alert-rules.yml
Normal file
@@ -0,0 +1,81 @@
|
||||
groups:
|
||||
- name: goodgo_api_latency
|
||||
rules:
|
||||
# ── p99 latency alert ──
|
||||
- alert: ApiLatencyP99High
|
||||
expr: >
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
|
||||
) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: goodgo-api
|
||||
annotations:
|
||||
summary: "API p99 latency exceeds 1s"
|
||||
description: >
|
||||
The overall API p99 latency has been above 1 second for the last 5 minutes.
|
||||
Current value: {{ $value | printf "%.3f" }}s.
|
||||
Investigate slow endpoints using the GoodGo API Latency dashboard.
|
||||
dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
|
||||
runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-high"
|
||||
|
||||
# ── Per-endpoint (route + method) p99 latency alert ──
|
||||
- alert: ApiEndpointLatencyP99High
|
||||
expr: >
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le, route, method)
|
||||
) > 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: goodgo-api
|
||||
annotations:
|
||||
summary: "Endpoint {{ $labels.method }} {{ $labels.route }} p99 > 2s"
|
||||
description: >
|
||||
The {{ $labels.method }} {{ $labels.route }} endpoint has a p99 latency
|
||||
above 2 seconds for the last 5 minutes.
|
||||
Current value: {{ $value | printf "%.3f" }}s.
|
||||
dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
|
||||
|
||||
# ── p99 critical (SLO breach) ──
|
||||
- alert: ApiLatencyP99Critical
|
||||
expr: >
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
|
||||
) > 3
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
team: sre
|
||||
service: goodgo-api
|
||||
annotations:
|
||||
summary: "CRITICAL: API p99 latency exceeds 3s (SLO breach)"
|
||||
description: >
|
||||
The overall API p99 latency has been above 3 seconds for the last 3 minutes.
|
||||
This is a potential SLO breach. Immediate investigation required.
|
||||
Current value: {{ $value | printf "%.3f" }}s.
|
||||
dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
|
||||
runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-critical"
|
||||
|
||||
# ── 5xx error rate spike ──
|
||||
- alert: ApiErrorRate5xxHigh
|
||||
expr: >
|
||||
(
|
||||
sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total{job="goodgo-api"}[5m]))
|
||||
) * 100 > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: goodgo-api
|
||||
annotations:
|
||||
summary: "API 5xx error rate above 1%"
|
||||
description: >
|
||||
The 5xx error rate for the GoodGo API has been above 1% for the last 5 minutes.
|
||||
Current value: {{ $value | printf "%.2f" }}%.
|
||||
dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
|
||||
@@ -2,6 +2,9 @@ global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
rule_files:
|
||||
- 'alert-rules.yml'
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'goodgo-api'
|
||||
metrics_path: '/metrics'
|
||||
|
||||
Reference in New Issue
Block a user