feat(monitoring): add API latency Grafana dashboard and alerting rules
Create a comprehensive Grafana dashboard for API latency monitoring with:
- p50/p95/p99 stat panels and time series for all endpoints
- Per-endpoint latency breakdown with route/method template variables
- Top 10 slowest endpoints table and bar chart (by p99)
- Request rate (by method) and error rate (4xx/5xx) panels
- Error rate percentage (5xx/total) with SLO threshold
- Latency heatmap and histogram distribution panels

Add Prometheus alerting rules:
- ApiLatencyP99High: p99 > 1s for 5m (warning)
- ApiEndpointLatencyP99High: per-endpoint p99 > 2s for 5m (warning)
- ApiLatencyP99Critical: p99 > 3s for 3m (critical/SLO breach)
- ApiErrorRate5xxHigh: 5xx rate > 1% for 5m (warning)

Fix api-overview.json, which used the wrong metric name (http_request_duration_seconds → goodgo_api_request_duration_seconds).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
81
monitoring/prometheus/alert-rules.yml
Normal file
81
monitoring/prometheus/alert-rules.yml
Normal file
@@ -0,0 +1,81 @@
|
||||
---
# Alerting rules for the GoodGo API, scoped to job="goodgo-api".
#
# The latency rules compute p99 with histogram_quantile over a 5m rate of
# goodgo_api_request_duration_seconds_bucket. The error-rate rule divides the
# 5xx request rate by the total request rate taken from http_requests_total.
groups:
  - name: goodgo_api_latency
    rules:
      # ── p99 latency alert ──
      # Warning tier: p99 aggregated over every series of the job (only the
      # `le` bucket label is kept), above 1s and sustained for 5 minutes.
      - alert: ApiLatencyP99High
        expr: >
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API p99 latency exceeds 1s"
          # `>-` folds the text into one line and strips the trailing newline.
          description: >-
            The overall API p99 latency has been above 1 second for the last 5 minutes.
            Current value: {{ $value | printf "%.3f" }}s.
            Investigate slow endpoints using the GoodGo API Latency dashboard.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-high"

      # ── Per-route p99 latency alert ──
      # Same quantile, but grouped by (route, method), so one alert instance
      # fires per endpoint whose p99 stays above 2s for 5 minutes.
      # NOTE(review): unlike the two overall-latency rules, this rule has no
      # runbook_url annotation; add one if a runbook exists.
      - alert: ApiEndpointLatencyP99High
        expr: >
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le, route, method)
          ) > 2
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "Endpoint {{ $labels.method }} {{ $labels.route }} p99 > 2s"
          description: >-
            The {{ $labels.method }} {{ $labels.route }} endpoint has a p99 latency
            above 2 seconds for the last 5 minutes.
            Current value: {{ $value | printf "%.3f" }}s.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"

      # ── p99 critical (SLO breach) ──
      # Critical tier of ApiLatencyP99High: same expression with a 3s
      # threshold and a shorter 3-minute hold. Both tiers are active while
      # p99 stays above 3s.
      - alert: ApiLatencyP99Critical
        expr: >
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
          ) > 3
        for: 3m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
        annotations:
          summary: "CRITICAL: API p99 latency exceeds 3s (SLO breach)"
          description: >-
            The overall API p99 latency has been above 3 seconds for the last 3 minutes.
            This is a potential SLO breach. Immediate investigation required.
            Current value: {{ $value | printf "%.3f" }}s.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-critical"

      # ── 5xx error rate spike ──
      # Share of requests answered with a 5xx status, as a percentage
      # (ratio * 100), above 1% for 5 minutes.
      # NOTE(review): this rule reads the unprefixed http_requests_total and
      # its status_code label, while the latency rules use goodgo_api_*
      # metrics. Confirm the API exports the counter under this exact name;
      # if the series does not exist the expression returns nothing and the
      # alert can never fire.
      # NOTE(review): no runbook_url annotation here either.
      - alert: ApiErrorRate5xxHigh
        expr: >
          (
            sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="goodgo-api"}[5m]))
          ) * 100 > 1
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API 5xx error rate above 1%"
          description: >-
            The 5xx error rate for the GoodGo API has been above 1% for the last 5 minutes.
            Current value: {{ $value | printf "%.2f" }}%.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
|
||||
@@ -2,6 +2,9 @@ global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
# Rule files evaluated by Prometheus; alert-rules.yml carries the GoodGo API
# latency and error-rate alerts.
rule_files:
  - 'alert-rules.yml'
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'goodgo-api'
|
||||
metrics_path: '/metrics'
|
||||
|
||||
Reference in New Issue
Block a user