# goodgo-platform/monitoring/prometheus/alert-rules.yml

groups:
  - name: goodgo_api_latency
    rules:
      # ── Warning: overall API p99 latency ──
      # Takes the 0.99 quantile of the request-duration histogram for job
      # "goodgo-api" (5m rate, aggregated by `le` only, so all series are
      # merged into one) and fires once it has stayed above 1 for 5m.
      - alert: ApiLatencyP99High
        expr: |
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          service: goodgo-api
          team: sre
          severity: warning
        annotations:
          summary: 'API p99 latency exceeds 1s'
          description: >
            The overall API p99 latency has been above 1 second
            for the last 5 minutes.
            Current value: {{ $value | printf "%.3f" }}s.
            Investigate slow endpoints using the
            GoodGo API Latency dashboard.
          runbook_url: 'https://docs.goodgo.vn/runbooks/api-latency-high'
          dashboard: '/d/goodgo-api-latency/goodgo-api-latency'

      # ── Warning: p99 latency per route and method ──
      # Same histogram as the overall p99 rules, but aggregated by
      # (le, route, method), so each route/method pair is evaluated and
      # alerted on separately. Fires when a pair stays above 2 for 5m.
      # NOTE(review): no runbook_url annotation here, unlike
      # ApiLatencyP99High and ApiLatencyP99Critical — confirm whether a
      # runbook exists for this alert.
      - alert: ApiEndpointLatencyP99High
        expr: |
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le, route, method)
          ) > 2
        for: 5m
        labels:
          service: goodgo-api
          team: sre
          severity: warning
        annotations:
          summary: 'Endpoint {{ $labels.method }} {{ $labels.route }} p99 > 2s'
          description: >
            The {{ $labels.method }} {{ $labels.route }} endpoint
            has a p99 latency above 2 seconds for the last 5 minutes.
            Current value: {{ $value | printf "%.3f" }}s.
          dashboard: '/d/goodgo-api-latency/goodgo-api-latency'

      # ── Critical: overall API p99 latency (SLO breach) ──
      # Same expression as ApiLatencyP99High with a higher threshold (> 3)
      # and a shorter hold time (3m); labelled severity "critical".
      - alert: ApiLatencyP99Critical
        expr: |
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
          ) > 3
        for: 3m
        labels:
          service: goodgo-api
          team: sre
          severity: critical
        annotations:
          summary: 'CRITICAL: API p99 latency exceeds 3s (SLO breach)'
          description: >
            The overall API p99 latency has been above 3 seconds
            for the last 3 minutes.
            This is a potential SLO breach.
            Immediate investigation required.
            Current value: {{ $value | printf "%.3f" }}s.
          runbook_url: 'https://docs.goodgo.vn/runbooks/api-latency-critical'
          dashboard: '/d/goodgo-api-latency/goodgo-api-latency'

      # ── Warning: 5xx error-rate spike ──
      # Divides the 5m request rate of responses whose status_code matches
      # "5.." by the 5m rate of all requests for job "goodgo-api", scales
      # the ratio to a percentage, and fires when it stays above 1 for 5m.
      # NOTE(review): this rule reads http_requests_total, whereas the
      # latency rules read a goodgo_api_-prefixed metric — confirm the API
      # exports this metric (with a status_code label) under this job.
      # NOTE(review): no runbook_url is visible for this alert — confirm.
      - alert: ApiErrorRate5xxHigh
        expr: |
          (
            sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="goodgo-api"}[5m]))
          ) * 100 > 1
        for: 5m
        labels:
          service: goodgo-api
          team: sre
          severity: warning
        annotations:
          summary: 'API 5xx error rate above 1%'
          description: >
            The 5xx error rate for the GoodGo API has been above 1%
            for the last 5 minutes.
            Current value: {{ $value | printf "%.2f" }}%.
          dashboard: '/d/goodgo-api-latency/goodgo-api-latency'