# Prometheus alerting rules for the GoodGo API (job="goodgo-api").
# Covers p99 request latency (overall and per route/method) and the 5xx
# error ratio. Folded scalars use `>-` so expressions and annotation values
# do not carry a trailing newline.
groups:
  - name: goodgo_api_latency
    rules:
      # ── Overall p99 latency (warning) ──
      # p99 across all request-duration buckets of the job, computed over a
      # 5m rate window; must stay above 1s for 5 minutes before firing.
      - alert: ApiLatencyP99High
        expr: >-
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m]))
              by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API p99 latency exceeds 1s"
          description: >-
            The overall API p99 latency has been above 1 second for the last
            5 minutes. Current value: {{ $value | printf "%.3f" }}s.
            Investigate slow endpoints using the GoodGo API Latency dashboard.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-high"

      # ── Per-route p99 latency (warning) ──
      # Same quantile, but grouped by route and method so that one alert
      # instance is produced per endpoint; threshold is 2s for 5 minutes.
      # NOTE(review): no runbook_url on this rule, unlike the overall p99
      # rules — confirm whether one should be added.
      - alert: ApiEndpointLatencyP99High
        expr: >-
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m]))
              by (le, route, method)
          ) > 2
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "Endpoint {{ $labels.method }} {{ $labels.route }} p99 > 2s"
          description: >-
            The {{ $labels.method }} {{ $labels.route }} endpoint has a p99
            latency above 2 seconds for the last 5 minutes.
            Current value: {{ $value | printf "%.3f" }}s.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"

      # ── Overall p99 latency (critical, SLO breach) ──
      # Same expression as ApiLatencyP99High with a 3s threshold and a
      # shorter 3-minute hold, labelled severity=critical.
      - alert: ApiLatencyP99Critical
        expr: >-
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m]))
              by (le)
          ) > 3
        for: 3m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
        annotations:
          summary: "CRITICAL: API p99 latency exceeds 3s (SLO breach)"
          description: >-
            The overall API p99 latency has been above 3 seconds for the last
            3 minutes. This is a potential SLO breach. Immediate investigation
            required. Current value: {{ $value | printf "%.3f" }}s.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-critical"

      # ── 5xx error rate spike (warning) ──
      # Percentage of requests whose status_code matches 5.. over a 5m rate
      # window; fires when it stays above 1% for 5 minutes.
      # NOTE(review): this rule lives in the latency group and queries
      # http_requests_total rather than a goodgo_api_* metric, and has no
      # runbook_url — confirm all three are intended.
      - alert: ApiErrorRate5xxHigh
        expr: >-
          (
            sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="goodgo-api"}[5m]))
          ) * 100 > 1
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API 5xx error rate above 1%"
          description: >-
            The 5xx error rate for the GoodGo API has been above 1% for the
            last 5 minutes. Current value: {{ $value | printf "%.2f" }}%.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"