diff --git a/monitoring/grafana/dashboards/api-latency.json b/monitoring/grafana/dashboards/api-latency.json new file mode 100644 index 0000000..21b590d --- /dev/null +++ b/monitoring/grafana/dashboards/api-latency.json @@ -0,0 +1,688 @@ +{ + "__inputs": [], + "annotations": { "list": [] }, + "description": "API latency monitoring with p50/p95/p99 percentiles, top slowest endpoints, request & error rates, and alerting thresholds.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [ + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": ["goodgo", "api"], + "targetBlank": false, + "title": "GoodGo API Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Overview", + "type": "row" + }, + { + "id": 1, + "title": "Current p50 Latency", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.25 }, + { "color": "orange", "value": 0.5 }, + { "color": "red", "value": 1 } + ] + }, + "color": { "mode": "thresholds" } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "value", + "graphMode": "area", + "textMode": "auto" + } + }, + { + "id": 2, + "title": "Current p95 Latency", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, 
sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))", + "legendFormat": "p95", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 2 } + ] + }, + "color": { "mode": "thresholds" } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "value", + "graphMode": "area", + "textMode": "auto" + } + }, + { + "id": 3, + "title": "Current p99 Latency", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 2 } + ] + }, + "color": { "mode": "thresholds" } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "value", + "graphMode": "area", + "textMode": "auto" + } + }, + { + "id": 4, + "title": "Request Rate", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval]))", + "legendFormat": "req/s", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "blue", "value": null } + ] + }, + "color": { "mode": "thresholds" } + } + }, + "options": { + "reduceOptions": { "calcs": 
["lastNotNull"] }, + "colorMode": "value", + "graphMode": "area", + "textMode": "auto" + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "title": "Latency Percentiles", + "type": "row" + }, + { + "id": 10, + "title": "API Latency — p50 / p95 / p99 (All Endpoints)", + "description": "Aggregated request latency percentiles across all API endpoints.", + "type": "timeseries", + "gridPos": { "h": 9, "w": 24, "x": 0, "y": 6 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "mode": "none" }, + "thresholdsStyle": { "mode": "dashed" } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "p50" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } } + ] + }, + { + "matcher": { "id": "byName", "options": "p95" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "yellow" } } + ] + }, + { + "matcher": { "id": "byName", "options": "p99" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", 
"fixedColor": "orange" } } + ] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom" } + } + }, + { + "id": 11, + "title": "Latency p50 / p95 / p99 per Endpoint", + "description": "Per-endpoint latency breakdown. Use the Route and Method variables to filter specific endpoints.", + "type": "timeseries", + "gridPos": { "h": 9, "w": 24, "x": 0, "y": 15 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\", method=~\"$method\"}[$__rate_interval])) by (le, route))", + "legendFormat": "{{route}} p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\", method=~\"$method\"}[$__rate_interval])) by (le, route))", + "legendFormat": "{{route}} p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\", route=~\"$route\", method=~\"$method\"}[$__rate_interval])) by (le, route))", + "legendFormat": "{{route}} p99", + "refId": "C" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 5, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "mode": "none" } + } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }, + "id": 102, + "title": "Top Slowest Endpoints", + "type": "row" + }, + { + "id": 20, + "title": "Top 10 Slowest Endpoints (by p99 Latency)", + "description": "Endpoints ranked by their p99 latency. 
Helps identify which routes need optimization.", + "type": "table", + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 25 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "topk(10, histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le, route, method)))", + "legendFormat": "{{method}} {{route}}", + "refId": "A", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "le": true }, + "renameByName": { + "method": "Method", + "route": "Route", + "Value": "p99 Latency (s)" + }, + "indexByName": { "method": 0, "route": 1, "Value": 2 } + } + }, + { + "id": "sortBy", + "options": { + "sort": [{ "field": "p99 Latency (s)", "desc": true }] + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "auto" } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "p99 Latency (s)" }, + "properties": [ + { "id": "unit", "value": "s" }, + { "id": "decimals", "value": 3 }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 2 } + ] + } + }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "basic" } } + ] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{ "displayName": "p99 Latency (s)", "desc": true }], + "footer": { "show": false } + } + }, + { + "id": 21, + "title": "Top 10 Slowest Endpoints (Bar Chart)", + "description": "Visual ranking of the slowest endpoints by p99 latency.", + "type": "barchart", + "gridPos": { "h": 10, "w": 12, "x": 12, "y": 25 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "topk(10, histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by 
(le, route)))", + "legendFormat": "{{route}}", + "refId": "A", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "le": true }, + "renameByName": { + "route": "Route", + "Value": "p99 Latency (s)" + } + } + }, + { + "id": "sortBy", + "options": { + "sort": [{ "field": "p99 Latency (s)", "desc": true }] + } + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "color": { "mode": "continuous-YlRd" }, + "custom": { + "fillOpacity": 80, + "gradientMode": "scheme" + } + } + }, + "options": { + "orientation": "horizontal", + "xTickLabelRotation": 0, + "showValue": "always", + "barWidth": 0.7 + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }, + "id": 103, + "title": "Request & Error Rates", + "type": "row" + }, + { + "id": 30, + "title": "Request Rate (req/s)", + "description": "Total request throughput per HTTP method.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval])) by (method)", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 15, + "stacking": { "mode": "normal" }, + "showPoints": "never" + }, + "color": { "mode": "palette-classic" } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom" } + } + }, + { + "id": 31, + "title": "Error Rate (4xx & 5xx)", + "description": "Client and server error rates. 
Spikes here may indicate API issues or bad client requests.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"5..\"}[$__rate_interval]))", + "legendFormat": "5xx errors/s", + "refId": "A" + }, + { + "expr": "sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"4..\"}[$__rate_interval]))", + "legendFormat": "4xx errors/s", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 15, + "showPoints": "never" + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "5xx errors/s" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } } + ] + }, + { + "matcher": { "id": "byName", "options": "4xx errors/s" }, + "properties": [ + { "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } } + ] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom" } + } + }, + { + "id": 32, + "title": "Error Rate % (5xx / Total)", + "description": "Percentage of requests resulting in server errors. 
SLO target: < 0.1%. The dashed red threshold marks 1%, the level at which the ApiErrorRate5xxHigh alert fires.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "100 * sum(rate(http_requests_total{job=\"goodgo-api\", status_code=~\"5..\"}[$__rate_interval])) / sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval]))", + "legendFormat": "5xx error %", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "showPoints": "never", + "thresholdsStyle": { "mode": "dashed" } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "color": { "mode": "fixed", "fixedColor": "red" } + } + }, + "options": { + "tooltip": { "mode": "single" }, + "legend": { "displayMode": "list", "placement": "bottom" } + } + }, + { + "id": 33, + "title": "Request Rate by Route (Top 10)", + "description": "Top 10 most active routes by request rate.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "topk(10, sum(rate(http_requests_total{job=\"goodgo-api\"}[$__rate_interval])) by (route))", + "legendFormat": "{{route}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "showPoints": "never" + }, + "color": { "mode": "palette-classic" } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 52 }, + "id": 104, + "title": "Latency Distribution", + "type": "row" + }, + { + "id": 40, + "title": "Latency Heatmap", + "description": 
"Distribution of request latencies over time. Dense clusters indicate common latency ranges.", + "type": "heatmap", + "gridPos": { "h": 9, "w": 24, "x": 0, "y": 53 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(increase(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le)", + "legendFormat": "{{le}}", + "refId": "A", + "format": "heatmap" + } + ], + "options": { + "calculate": false, + "yAxis": { "unit": "s" }, + "color": { + "scheme": "Spectral", + "mode": "scheme", + "reverse": true + }, + "tooltip": { "show": true, "yHistogram": true }, + "cellGap": 1 + } + }, + { + "id": 41, + "title": "Latency Distribution (Histogram)", + "description": "Current request rate per latency bucket; each bar is one histogram bucket labelled by its upper bound (le, in seconds).", + "type": "bargauge", + "gridPos": { "h": 9, "w": 24, "x": 0, "y": 62 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}[$__rate_interval])) by (le)", + "legendFormat": "{{le}}", + "refId": "A", + "format": "heatmap" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "color": { "mode": "palette-classic" }, + "min": 0 + } + }, + "options": { + "displayMode": "gradient", + "orientation": "vertical", + "reduceOptions": { "calcs": ["lastNotNull"] } + } + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["goodgo", "api", "latency", "sre"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, route)", + "hide": 0, + "includeAll": true, + "label": "Route", + "multi": true, + "name": "route", + "options": [], + "query": { + "query": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, route)", + "refId": "StandardVariableQuery" + }, + 
"refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, method)", + "hide": 0, + "includeAll": true, + "label": "Method", + "multi": true, + "name": "method", + "options": [], + "query": { + "query": "label_values(goodgo_api_request_duration_seconds_bucket{job=\"goodgo-api\"}, method)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": { + "refresh_intervals": ["5s", "10s", "30s", "1m", "5m"] + }, + "timezone": "browser", + "title": "GoodGo API Latency", + "uid": "goodgo-api-latency", + "version": 1 +} diff --git a/monitoring/grafana/dashboards/api-overview.json b/monitoring/grafana/dashboards/api-overview.json index b919a82..6e83e04 100644 --- a/monitoring/grafana/dashboards/api-overview.json +++ b/monitoring/grafana/dashboards/api-overview.json @@ -48,17 +48,17 @@ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(goodgo_api_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p99", "refId": "C" } diff --git a/monitoring/prometheus/alert-rules.yml 
b/monitoring/prometheus/alert-rules.yml new file mode 100644 index 0000000..d2995a3 --- /dev/null +++ b/monitoring/prometheus/alert-rules.yml @@ -0,0 +1,81 @@ +groups: + - name: goodgo_api_latency + rules: + # ── Overall p99 latency (warning): p99 across all routes above 1s, held for 5m ── + - alert: ApiLatencyP99High + expr: > + histogram_quantile(0.99, + sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le) + ) > 1 + for: 5m + labels: + severity: warning + team: sre + service: goodgo-api + annotations: + summary: "API p99 latency exceeds 1s" + description: > + The overall API p99 latency has been above 1 second for the last 5 minutes. + Current value: {{ $value | printf "%.3f" }}s. + Investigate slow endpoints using the GoodGo API Latency dashboard. + dashboard: "/d/goodgo-api-latency/goodgo-api-latency" + runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-high" + + # ── Per-endpoint p99 latency (warning): p99 per route and method above 2s, held for 5m ── + - alert: ApiEndpointLatencyP99High + expr: > + histogram_quantile(0.99, + sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le, route, method) + ) > 2 + for: 5m + labels: + severity: warning + team: sre + service: goodgo-api + annotations: + summary: "Endpoint {{ $labels.method }} {{ $labels.route }} p99 > 2s" + description: > + The {{ $labels.method }} {{ $labels.route }} endpoint has a p99 latency + above 2 seconds for the last 5 minutes. + Current value: {{ $value | printf "%.3f" }}s. + dashboard: "/d/goodgo-api-latency/goodgo-api-latency" + + # ── Overall p99 latency (critical, potential SLO breach): p99 above 3s, held for 3m ── + - alert: ApiLatencyP99Critical + expr: > + histogram_quantile(0.99, + sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le) + ) > 3 + for: 3m + labels: + severity: critical + team: sre + service: goodgo-api + annotations: + summary: "CRITICAL: API p99 latency exceeds 3s (SLO breach)" + description: > + The overall API p99 latency has been above 3 seconds for the last 3 minutes. + This is a potential SLO breach. Immediate investigation required. 
+ Current value: {{ $value | printf "%.3f" }}s. + dashboard: "/d/goodgo-api-latency/goodgo-api-latency" + runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-critical" + + # ── 5xx error ratio (warning): 5xx responses above 1% of all requests, held for 5m ── + - alert: ApiErrorRate5xxHigh + expr: > + ( + sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m])) + / + sum(rate(http_requests_total{job="goodgo-api"}[5m])) + ) * 100 > 1 + for: 5m + labels: + severity: warning + team: sre + service: goodgo-api + annotations: + summary: "API 5xx error rate above 1%" + description: > + The 5xx error rate for the GoodGo API has been above 1% for the last 5 minutes. + Current value: {{ $value | printf "%.2f" }}%. + dashboard: "/d/goodgo-api-latency/goodgo-api-latency" diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml index 4f255ed..3c55b5a 100644 --- a/monitoring/prometheus/prometheus.yml +++ b/monitoring/prometheus/prometheus.yml @@ -2,6 +2,9 @@ global: scrape_interval: 15s evaluation_interval: 15s +rule_files: + - 'alert-rules.yml' + scrape_configs: - job_name: 'goodgo-api' metrics_path: '/metrics'