feat(monitoring): add API latency Grafana dashboard and alerting rules

Create comprehensive Grafana dashboard for API latency monitoring with:
- p50/p95/p99 stat panels and time series for all endpoints
- Per-endpoint latency breakdown with route/method template variables
- Top 10 slowest endpoints table and bar chart (by p99)
- Request rate (by method) and error rate (4xx/5xx) panels
- Error rate percentage (5xx/total) with SLO threshold
- Latency heatmap and histogram distribution panels

Add Prometheus alerting rules:
- ApiLatencyP99High: p99 > 1s for 5m (warning)
- ApiEndpointLatencyP99High: per-endpoint p99 > 2s for 5m (warning)
- ApiLatencyP99Critical: p99 > 3s for 3m (critical/SLO breach)
- ApiErrorRate5xxHigh: 5xx rate > 1% for 5m (warning)

Fix api-overview.json using wrong metric name
(http_request_duration_seconds → goodgo_api_request_duration_seconds).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-10 23:18:09 +07:00
parent 59272e9321
commit 90839cf542
4 changed files with 775 additions and 3 deletions
--- a/monitoring/grafana/dashboards/api-latency.json
+++ b/monitoring/grafana/dashboards/api-latency.json
@@ -0,0 +1,688 @@
+{
+  "__inputs": [],
+  "annotations": { "list": [] },
+  "description": "API latency monitoring with p50/p95/p99 percentiles, top slowest endpoints, request & error rates, and alerting thresholds.",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "links": [
+    {
+      "asDropdown": false,
+      "icon": "dashboard",
+      "includeVars": true,
+      "keepTime": true,
+      "tags": ["goodgo", "api"],
+      "targetBlank": false,
+      "title": "GoodGo API Dashboards",
+      "type": "dashboards"
+    }
+  ],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+      "id": 100,
+      "title": "Overview",
+      "type": "row"
+    },
+    {
+      "id": 1,
+      "title": "Current p50 Latency",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
+          "legendFormat": "p50",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 0.25 },
+              { "color": "orange", "value": 0.5 },
+              { "color": "red", "value": 1 }
+            ]
+          },
+          "color": { "mode": "thresholds" }
+        }
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"] },
+        "colorMode": "value",
+        "graphMode": "area",
+        "textMode": "auto"
+      }
+    },
+    {
+      "id": 2,
+      "title": "Current p95 Latency",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 0.5 },
+              { "color": "orange", "value": 1 },
+              { "color": "red", "value": 2 }
+            ]
+          },
+          "color": { "mode": "thresholds" }
+        }
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"] },
+        "colorMode": "value",
+        "graphMode": "area",
+        "textMode": "auto"
+      }
+    },
+    {
+      "id": 3,
+      "title": "Current p99 Latency",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
+          "legendFormat": "p99",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 0.5 },
+              { "color": "orange", "value": 1 },
+              { "color": "red", "value": 2 }
+            ]
+          },
+          "color": { "mode": "thresholds" }
+        }
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"] },
+        "colorMode": "value",
+        "graphMode": "area",
+        "textMode": "auto"
+      }
+    },
+    {
+      "id": 4,
+      "title": "Request Rate",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval]))",
+          "legendFormat": "req/s",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "blue", "value": null }
+            ]
+          },
+          "color": { "mode": "thresholds" }
+        }
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"] },
+        "colorMode": "value",
+        "graphMode": "area",
+        "textMode": "auto"
+      }
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
+      "id": 101,
+      "title": "Latency Percentiles",
+      "type": "row"
+    },
+    {
+      "id": 10,
+      "title": "API Latency — p50 / p95 / p99 (All Endpoints)",
+      "description": "Aggregated request latency percentiles across all API endpoints.",
+      "type": "timeseries",
+      "gridPos": { "h": 9, "w": 24, "x": 0, "y": 6 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
+          "legendFormat": "p50",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
+          "legendFormat": "p95",
+          "refId": "B"
+        },
+        {
+          "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))",
+          "legendFormat": "p99",
+          "refId": "C"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "smooth",
+            "fillOpacity": 10,
+            "pointSize": 5,
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": { "mode": "none" },
+            "thresholdsStyle": { "mode": "dashed" }
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 1 }
+            ]
+          }
+        },
+        "overrides": [
+          {
+            "matcher": { "id": "byName", "options": "p50" },
+            "properties": [
+              { "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "p95" },
+            "properties": [
+              { "id": "color", "value": { "mode": "fixed", "fixedColor": "yellow" } }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "p99" },
+            "properties": [
+              { "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } }
+            ]
+          }
+        ]
+      },
+      "options": {
+        "tooltip": { "mode": "multi", "sort": "desc" },
+        "legend": { "displayMode": "list", "placement": "bottom" }
+      }
+    },
+    {
+      "id": 11,
+      "title": "Latency p50 / p95 / p99 per Endpoint",
+      "description": "Per-endpoint latency breakdown. Use the Route and Method variables to filter specific endpoints.",
+      "type": "timeseries",
+      "gridPos": { "h": 9, "w": 24, "x": 0, "y": 15 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\", method=~\"$method\"}[$__rate_interval])) by (le, route))",
+          "legendFormat": "{{route}} p50",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\", method=~\"$method\"}[$__rate_interval])) by (le, route))",
+          "legendFormat": "{{route}} p95",
+          "refId": "B"
+        },
+        {
+          "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\", method=~\"$method\"}[$__rate_interval])) by (le, route))",
+          "legendFormat": "{{route}} p99",
+          "refId": "C"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "smooth",
+            "fillOpacity": 5,
+            "pointSize": 5,
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": { "mode": "none" }
+          }
+        }
+      },
+      "options": {
+        "tooltip": { "mode": "multi", "sort": "desc" },
+        "legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] }
+      }
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
+      "id": 102,
+      "title": "Top Slowest Endpoints",
+      "type": "row"
+    },
+    {
+      "id": 20,
+      "title": "Top 10 Slowest Endpoints (by p99 Latency)",
+      "description": "Endpoints ranked by their p99 latency. Helps identify which routes need optimization.",
+      "type": "table",
+      "gridPos": { "h": 10, "w": 12, "x": 0, "y": 25 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "topk(10, histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le, route, method)))",
+          "legendFormat": "{{method}} {{route}}",
+          "refId": "A",
+          "format": "table",
+          "instant": true
+        }
+      ],
+      "transformations": [
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": { "Time": true, "le": true },
+            "renameByName": {
+              "method": "Method",
+              "route": "Route",
+              "Value": "p99 Latency (s)"
+            },
+            "indexByName": { "method": 0, "route": 1, "Value": 2 }
+          }
+        },
+        {
+          "id": "sortBy",
+          "options": {
+            "sort": [{ "field": "p99 Latency (s)", "desc": true }]
+          }
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "custom": { "align": "auto" }
+        },
+        "overrides": [
+          {
+            "matcher": { "id": "byName", "options": "p99 Latency (s)" },
+            "properties": [
+              { "id": "unit", "value": "s" },
+              { "id": "decimals", "value": 3 },
+              {
+                "id": "thresholds",
+                "value": {
+                  "mode": "absolute",
+                  "steps": [
+                    { "color": "green", "value": null },
+                    { "color": "yellow", "value": 0.5 },
+                    { "color": "orange", "value": 1 },
+                    { "color": "red", "value": 2 }
+                  ]
+                }
+              },
+              { "id": "custom.displayMode", "value": "color-background-solid" }
+            ]
+          }
+        ]
+      },
+      "options": {
+        "showHeader": true,
+        "sortBy": [{ "displayName": "p99 Latency (s)", "desc": true }],
+        "footer": { "show": false }
+      }
+    },
+    {
+      "id": 21,
+      "title": "Top 10 Slowest Endpoints (Bar Chart)",
+      "description": "Visual ranking of the slowest endpoints by p99 latency.",
+      "type": "barchart",
+      "gridPos": { "h": 10, "w": 12, "x": 12, "y": 25 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "topk(10, histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le, route)))",
+          "legendFormat": "{{route}}",
+          "refId": "A",
+          "format": "table",
+          "instant": true
+        }
+      ],
+      "transformations": [
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": { "Time": true, "le": true },
+            "renameByName": {
+              "route": "Route",
+              "Value": "p99 Latency (s)"
+            }
+          }
+        },
+        {
+          "id": "sortBy",
+          "options": {
+            "sort": [{ "field": "p99 Latency (s)", "desc": true }]
+          }
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "color": { "mode": "continuous-YlRd" },
+          "custom": {
+            "fillOpacity": 80,
+            "gradientMode": "scheme"
+          }
+        }
+      },
+      "options": {
+        "orientation": "horizontal",
+        "xTickLabelRotation": 0,
+        "showValue": "always",
+        "barWidth": 0.7
+      }
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 },
+      "id": 103,
+      "title": "Request & Error Rates",
+      "type": "row"
+    },
+    {
+      "id": 30,
+      "title": "Request Rate (req/s)",
+      "description": "Total request throughput per HTTP method.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval])) by (method)",
+          "legendFormat": "{{method}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "smooth",
+            "fillOpacity": 15,
+            "stacking": { "mode": "normal" },
+            "showPoints": "never"
+          },
+          "color": { "mode": "palette-classic" }
+        }
+      },
+      "options": {
+        "tooltip": { "mode": "multi", "sort": "desc" },
+        "legend": { "displayMode": "list", "placement": "bottom" }
+      }
+    },
+    {
+      "id": 31,
+      "title": "Error Rate (4xx & 5xx)",
+      "description": "Client and server error rates. Spikes here may indicate API issues or bad client requests.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"5..\"}[$__rate_interval]))",
+          "legendFormat": "5xx errors/s",
+          "refId": "A"
+        },
+        {
+          "expr": "sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"4..\"}[$__rate_interval]))",
+          "legendFormat": "4xx errors/s",
+          "refId": "B"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "smooth",
+            "fillOpacity": 15,
+            "showPoints": "never"
+          }
+        },
+        "overrides": [
+          {
+            "matcher": { "id": "byName", "options": "5xx errors/s" },
+            "properties": [
+              { "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "4xx errors/s" },
+            "properties": [
+              { "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } }
+            ]
+          }
+        ]
+      },
+      "options": {
+        "tooltip": { "mode": "multi", "sort": "desc" },
+        "legend": { "displayMode": "list", "placement": "bottom" }
+      }
+    },
+    {
+      "id": 32,
+      "title": "Error Rate % (5xx / Total)",
+      "description": "Percentage of requests resulting in server errors. SLO target: < 0.1%. The dashed red threshold at 1% matches the ApiErrorRate5xxHigh alert.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "100 * sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"5..\"}[$__rate_interval])) / sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval]))",
+          "legendFormat": "5xx error %",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "smooth",
+            "fillOpacity": 10,
+            "showPoints": "never",
+            "thresholdsStyle": { "mode": "dashed" }
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 1 }
+            ]
+          },
+          "color": { "mode": "fixed", "fixedColor": "red" }
+        }
+      },
+      "options": {
+        "tooltip": { "mode": "single" },
+        "legend": { "displayMode": "list", "placement": "bottom" }
+      }
+    },
+    {
+      "id": 33,
+      "title": "Request Rate by Route (Top 10)",
+      "description": "Top 10 most active routes by request rate.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "topk(10, sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval])) by (route))",
+          "legendFormat": "{{route}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "smooth",
+            "fillOpacity": 10,
+            "showPoints": "never"
+          },
+          "color": { "mode": "palette-classic" }
+        }
+      },
+      "options": {
+        "tooltip": { "mode": "multi", "sort": "desc" },
+        "legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] }
+      }
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 52 },
+      "id": 104,
+      "title": "Latency Distribution",
+      "type": "row"
+    },
+    {
+      "id": 40,
+      "title": "Latency Heatmap",
+      "description": "Distribution of request latencies over time. Dense clusters indicate common latency ranges.",
+      "type": "heatmap",
+      "gridPos": { "h": 9, "w": 24, "x": 0, "y": 53 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(increase(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le)",
+          "legendFormat": "{{le}}",
+          "refId": "A",
+          "format": "heatmap"
+        }
+      ],
+      "options": {
+        "calculate": false,
+        "yAxis": { "unit": "s" },
+        "color": {
+          "scheme": "Spectral",
+          "mode": "scheme",
+          "reverse": true
+        },
+        "tooltip": { "show": true, "yHistogram": true },
+        "cellGap": 1
+      }
+    },
+    {
+      "id": 41,
+      "title": "Latency Distribution (Histogram)",
+      "description": "Current request latency distribution across histogram buckets.",
+      "type": "bargauge",
+      "gridPos": { "h": 9, "w": 24, "x": 0, "y": 62 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le)",
+          "legendFormat": "{{le}}",
+          "refId": "A",
+          "format": "heatmap"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "color": { "mode": "palette-classic" },
+          "min": 0
+        }
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"] },
+        "orientation": "vertical",
+        "displayMode": "basic"
+      }
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 39,
+  "tags": ["goodgo", "api", "latency", "sre"],
+  "templating": {
+    "list": [
+      {
+        "current": { "selected": false, "text": "All", "value": "$__all" },
+        "datasource": { "type": "prometheus", "uid": "prometheus" },
+        "definition": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, route)",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Route",
+        "multi": true,
+        "name": "route",
+        "options": [],
+        "query": {
+          "query": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, route)",
+          "refId": "StandardVariableQuery"
+        },
+        "refresh": 2,
+        "regex": "",
+        "sort": 1,
+        "type": "query"
+      },
+      {
+        "current": { "selected": false, "text": "All", "value": "$__all" },
+        "datasource": { "type": "prometheus", "uid": "prometheus" },
+        "definition": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, method)",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Method",
+        "multi": true,
+        "name": "method",
+        "options": [],
+        "query": {
+          "query": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, method)",
+          "refId": "StandardVariableQuery"
+        },
+        "refresh": 2,
+        "regex": "",
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": { "from": "now-1h", "to": "now" },
+  "timepicker": {
+    "refresh_intervals": ["5s", "10s", "30s", "1m", "5m"]
+  },
+  "timezone": "browser",
+  "title": "GoodGo API Latency",
+  "uid": "goodgo-api-latency",
+  "version": 1
+}
--- a/monitoring/grafana/dashboards/api-overview.json
+++ b/monitoring/grafana/dashboards/api-overview.json
@@ -48,17 +48,17 @@
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
      "targets": [
        {
-          "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
+          "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p50",
          "refId": "A"
        },
        {
-          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
+          "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p95",
          "refId": "B"
        },
        {
-          "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
+          "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p99",
          "refId": "C"
        }
--- a/monitoring/prometheus/alert-rules.yml
+++ b/monitoring/prometheus/alert-rules.yml
@@ -0,0 +1,81 @@
+groups:
+  - name: goodgo_api_latency
+    rules:
+      # ── Overall p99 latency (all series aggregated by le) above 1s → warning ──
+      - alert: ApiLatencyP99High
+        expr: >
+          histogram_quantile(0.99,
+            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
+          ) > 1
+        for: 5m  # the expression must stay true for 5 minutes before the alert fires
+        labels:
+          severity: warning
+          team: sre
+          service: goodgo-api
+        annotations:
+          summary: "API p99 latency exceeds 1s"
+          description: >
+            The overall API p99 latency has been above 1 second for the last 5 minutes.
+            Current value: {{ $value | printf "%.3f" }}s.
+            Investigate slow endpoints using the GoodGo API Latency dashboard.
+          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
+          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-high"
+
+      # ── Per route + method p99 latency above 2s → warning (one alert per pair) ──
+      - alert: ApiEndpointLatencyP99High
+        expr: >
+          histogram_quantile(0.99,
+            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le, route, method)
+          ) > 2
+        for: 5m  # the expression must stay true for 5 minutes before the alert fires
+        labels:
+          severity: warning
+          team: sre
+          service: goodgo-api
+        annotations:
+          summary: "Endpoint {{ $labels.method }} {{ $labels.route }} p99 > 2s"
+          description: >
+            The {{ $labels.method }} {{ $labels.route }} endpoint has a p99 latency
+            above 2 seconds for the last 5 minutes.
+            Current value: {{ $value | printf "%.3f" }}s.
+          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
+
+      # ── Overall p99 latency above 3s → critical (potential SLO breach) ──
+      - alert: ApiLatencyP99Critical
+        expr: >
+          histogram_quantile(0.99,
+            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
+          ) > 3
+        for: 3m  # shorter hold time than the 1s warning rule (3 minutes instead of 5)
+        labels:
+          severity: critical
+          team: sre
+          service: goodgo-api
+        annotations:
+          summary: "CRITICAL: API p99 latency exceeds 3s (SLO breach)"
+          description: >
+            The overall API p99 latency has been above 3 seconds for the last 3 minutes.
+            This is a potential SLO breach. Immediate investigation required.
+            Current value: {{ $value | printf "%.3f" }}s.
+          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
+          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-critical"
+
+      # ── 5xx responses as a percentage of all requests above 1% → warning ──
+      - alert: ApiErrorRate5xxHigh
+        expr: >
+          (
+            sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
+            /
+            sum(rate(http_requests_total{job="goodgo-api"}[5m]))
+          ) * 100 > 1
+        for: 5m  # the expression must stay true for 5 minutes before the alert fires
+        labels:
+          severity: warning
+          team: sre
+          service: goodgo-api
+        annotations:
+          summary: "API 5xx error rate above 1%"
+          description: >
+            The 5xx error rate for the GoodGo API has been above 1% for the last 5 minutes.
+            Current value: {{ $value | printf "%.2f" }}%.
+          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
--- a/monitoring/prometheus/prometheus.yml
+++ b/monitoring/prometheus/prometheus.yml
@@ -2,6 +2,9 @@ global:
  scrape_interval: 15s
  evaluation_interval: 15s

+rule_files:
+  - 'alert-rules.yml'  # relative path; Prometheus resolves it against this config file's directory
+
 scrape_configs:
  - job_name: 'goodgo-api'
    metrics_path: '/metrics'