# goodgo-platform/monitoring/prometheus/alert-rules.yml

groups:
  # ── API Latency & Error Alerts ───────────────────────────────────────────────
  # Latency rules evaluate the goodgo_api_request_duration_seconds histogram;
  # error-rate and traffic rules evaluate the http_requests_total counter.
  # Every selector in this group is pinned to job="goodgo-api".
  - name: goodgo_api_latency
    rules:
      # ── p99 latency alert ──
      # Service-wide p99: buckets are aggregated on `le` only, so all routes and
      # methods are merged. Computed over a 5m rate window; must hold for 5m.
      - alert: ApiLatencyP99High
        expr: >
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API p99 latency exceeds 1s"
          description: >
            The overall API p99 latency has been above 1 second for the last 5 minutes.
            Current value: {{ $value | printf "%.3f" }}s.
            Investigate slow endpoints using the GoodGo API Latency dashboard.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-high"

      # ── Per-route p99 latency alert ──
      # Same quantile, but `route` and `method` are kept in the aggregation, so
      # one alert instance is produced per (method, route) pair above 2s.
      - alert: ApiEndpointLatencyP99High
        expr: >
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le, route, method)
          ) > 2
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "Endpoint {{ $labels.method }} {{ $labels.route }} p99 > 2s"
          description: >
            The {{ $labels.method }} {{ $labels.route }} endpoint has a p99 latency
            above 2 seconds for the last 5 minutes.
            Current value: {{ $value | printf "%.3f" }}s.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"

      # ── p99 critical (SLO breach) ──
      # Critical tier of ApiLatencyP99High: higher threshold (3s), shorter
      # hold time (3m).
      - alert: ApiLatencyP99Critical
        expr: >
          histogram_quantile(0.99,
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])) by (le)
          ) > 3
        for: 3m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
        annotations:
          summary: "CRITICAL: API p99 latency exceeds 3s (SLO breach)"
          description: >
            The overall API p99 latency has been above 3 seconds for the last 3 minutes.
            This is a potential SLO breach. Immediate investigation required.
            Current value: {{ $value | printf "%.3f" }}s.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-critical"

      # ── 5xx error rate spike ──
      # Share of requests whose status_code matches 5.., as a percentage of all
      # requests over the same 5m window.
      - alert: ApiErrorRate5xxHigh
        expr: >
          (
            sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="goodgo-api"}[5m]))
          ) * 100 > 1
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API 5xx error rate above 1%"
          description: >
            The 5xx error rate for the GoodGo API has been above 1% for the last 5 minutes.
            Current value: {{ $value | printf "%.2f" }}%.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"

      # ── 5xx error rate critical ──
      # Critical tier of ApiErrorRate5xxHigh: 5% threshold, 3m hold time.
      - alert: ApiErrorRate5xxCritical
        expr: >
          (
            sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="goodgo-api"}[5m]))
          ) * 100 > 5
        for: 3m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
        annotations:
          summary: "CRITICAL: API 5xx error rate above 5%"
          description: >
            The 5xx error rate for the GoodGo API has been above 5% for the last 3 minutes.
            This indicates a major incident. Immediate investigation required.
            Current value: {{ $value | printf "%.2f" }}%.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/5xx-critical"

      # ── No traffic (possible downtime) ──
      # Two conditions are OR-ed together:
      #   1. the request rate is exactly zero (series exist but do not move);
      #   2. the counter is absent altogether. Without this branch the sum()
      #      over missing series is an empty result, `== 0` matches nothing,
      #      and the alert stays silent when the API stops exporting metrics.
      - alert: ApiNoTraffic
        expr: >
          sum(rate(http_requests_total{job="goodgo-api"}[5m])) == 0
          or
          absent(http_requests_total{job="goodgo-api"})
        for: 10m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API receiving zero traffic for 10 minutes"
          description: >
            The GoodGo API has received no requests in the last 10 minutes,
            or its request counter is missing from Prometheus.
            This may indicate the service is down or unreachable.

  # ── Database Alerts ──────────────────────────────────────────────────────────
  # Connection-count rules read pg_stat_activity_count for datname="goodgo".
  # The metric is split into one series per state (and possibly further labels),
  # so each rule sums the matching series before comparing: the threshold then
  # applies to the real number of connections, not to each fragment.
  - name: goodgo_database
    rules:
      # ── PostgreSQL active connections high ──
      # Connections in state="active" against the goodgo database.
      - alert: PostgresActiveConnectionsHigh
        expr: >
          sum by (job, instance, datname) (
            pg_stat_activity_count{datname="goodgo", state="active"}
          ) > 15
        for: 5m
        labels:
          severity: warning
          team: sre
          service: postgres
        annotations:
          summary: "PostgreSQL active connections above 15"
          description: >
            The number of active PostgreSQL connections has been above 15 for 5 minutes.
            Pool size is 20. Current value: {{ $value }}.
            Check for long-running queries or connection leaks.
          runbook_url: "https://docs.goodgo.vn/runbooks/db-connections"

      # ── PostgreSQL connection pool near exhaustion ──
      # Total connections across every state. The sum is required: without it
      # each state's series is compared with 180 separately, so e.g. 100 idle
      # plus 100 active connections would never trigger.
      # NOTE(review): the rule above cites a pool size of 20 while this one cites
      # a 200-client limit; confirm which limit pg_stat_activity can approach.
      - alert: PostgresConnectionPoolCritical
        expr: >
          sum by (job, instance, datname) (
            pg_stat_activity_count{datname="goodgo"}
          ) > 180
        for: 2m
        labels:
          severity: critical
          team: sre
          service: postgres
        annotations:
          summary: "CRITICAL: PostgreSQL connections near limit (>180/200)"
          description: >
            Total PostgreSQL connections have exceeded 180 (max client connections: 200).
            PgBouncer pool may be exhausted. Immediate action required.
            Current value: {{ $value }}.
          runbook_url: "https://docs.goodgo.vn/runbooks/db-pool-exhaustion"

      # ── PostgreSQL slow queries ──
      # Despite the alert name, this counts active sessions waiting on a lock
      # (wait_event_type="Lock"), summed across individual wait events.
      # NOTE(review): needs an exporter that exposes the wait_event_type label
      # on pg_stat_activity_count — confirm; otherwise the rule never matches.
      - alert: PostgresSlowQueries
        expr: >
          sum by (job, instance, datname) (
            pg_stat_activity_count{datname="goodgo", state="active", wait_event_type="Lock"}
          ) > 5
        for: 5m
        labels:
          severity: warning
          team: sre
          service: postgres
        annotations:
          summary: "Multiple PostgreSQL queries waiting on locks"
          description: >
            More than 5 queries are blocked waiting on locks for 5+ minutes.
            This may indicate lock contention or deadlocks.
            Current value: {{ $value }}.

      # ── PostgreSQL down ──
      # Uses pg_up so the alert reflects PostgreSQL itself. The previous
      # expression, up{job="goodgo-api"} == 0, tracked the API scrape target,
      # which the generic PrometheusTargetDown rule already covers.
      # NOTE(review): assumes the exporter providing pg_stat_activity_count
      # also exposes pg_up — confirm. If the exporter itself is down, pg_up is
      # absent and only PrometheusTargetDown will fire.
      - alert: PostgresDown
        expr: >
          pg_up == 0
        for: 1m
        labels:
          severity: critical
          team: sre
          service: postgres
        annotations:
          summary: "CRITICAL: PostgreSQL is down or unreachable"
          description: >
            The PostgreSQL exporter on {{ $labels.instance }} reports that it
            cannot connect to the database (pg_up == 0).
            PostgreSQL may be down or unreachable; the GoodGo API depends on it.

  # ── Redis Alerts ─────────────────────────────────────────────────────────────
  # Memory rules express used memory as a percentage of redis_memory_max_bytes.
  # The `> 0` filter on the divisor drops instances where no maximum is set:
  # dividing by 0 would yield +Inf and keep both memory alerts firing forever.
  - name: goodgo_redis
    rules:
      # ── Redis memory usage high ──
      - alert: RedisMemoryHigh
        expr: >
          redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 80
        for: 5m
        labels:
          severity: warning
          team: sre
          service: redis
        annotations:
          summary: "Redis memory usage above 80%"
          description: >
            Redis memory usage has exceeded 80% of the configured maximum (512 MB).
            Eviction policy (allkeys-lru) is active but high usage may indicate a problem.
            Current usage: {{ $value | printf "%.1f" }}%.

      # ── Redis memory critical ──
      # Critical tier of RedisMemoryHigh: 95% threshold, 2m hold time.
      - alert: RedisMemoryCritical
        expr: >
          redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 95
        for: 2m
        labels:
          severity: critical
          team: sre
          service: redis
        annotations:
          summary: "CRITICAL: Redis memory usage above 95%"
          description: >
            Redis memory usage has exceeded 95% of the configured maximum.
            Heavy eviction is occurring. Consider increasing maxmemory or investigating cache patterns.
            Current usage: {{ $value | printf "%.1f" }}%.

      # ── Redis connected clients high ──
      # Fires per Redis instance once the client gauge stays above 150 for 5m.
      - alert: RedisConnectedClientsHigh
        expr: >
          redis_connected_clients > 150
        for: 5m
        labels:
          severity: warning
          team: sre
          service: redis
        annotations:
          summary: "Redis connected clients above 150"
          description: >
            The number of connected Redis clients has exceeded 150 for 5+ minutes.
            Current value: {{ $value }}.

      # ── Redis rejected connections ──
      # Any growth of the rejected-connections counter within the last 5m.
      # increase() extrapolates and can return fractions, so the value is
      # rendered without decimals in the description.
      - alert: RedisRejectedConnections
        expr: >
          increase(redis_rejected_connections_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
          team: sre
          service: redis
        annotations:
          summary: "Redis is rejecting connections"
          description: >
            Redis has rejected {{ $value | printf "%.0f" }} connection(s) in the last 5 minutes.
            This indicates maxclients has been reached.

  # ── Container Resource Alerts ────────────────────────────────────────────────
  # All rules select containers whose `name` label matches goodgo-.* and copy
  # that name into the alert's `service` label through templating.
  - name: goodgo_containers
    rules:
      # ── Container restart loop ──
      # More than 3 restarts counted within a 15m window, sustained for 5m.
      # NOTE(review): container_restart_count is presumably provided by a
      # project-specific exporter — confirm that it is actually scraped.
      - alert: ContainerRestartLoop
        expr: >
          increase(container_restart_count{name=~"goodgo-.*"}[15m]) > 3
        for: 5m
        labels:
          severity: critical
          team: sre
          service: "{{ $labels.name }}"
        annotations:
          summary: "Container {{ $labels.name }} restart loop"
          description: >
            Container {{ $labels.name }} has restarted more than 3 times in the last 15 minutes.
            This indicates a crash loop. Check container logs immediately.

      # ── Container memory near limit ──
      # Usage as a percentage of the container's memory limit. The `> 0` filter
      # on the divisor skips containers that have no limit (reported as 0);
      # without it the division gives +Inf and the alert fires permanently.
      - alert: ContainerMemoryHigh
        expr: >
          (container_memory_usage_bytes{name=~"goodgo-.*"}
           / (container_spec_memory_limit_bytes{name=~"goodgo-.*"} > 0)) * 100 > 85
        for: 5m
        labels:
          severity: warning
          team: sre
          service: "{{ $labels.name }}"
        annotations:
          summary: "Container {{ $labels.name }} memory usage above 85%"
          description: >
            Container {{ $labels.name }} is using more than 85% of its memory limit.
            Current usage: {{ $value | printf "%.1f" }}%.
            Risk of OOM kill if usage continues to grow.

      # ── Container CPU throttled ──
      # Throttled time accrues faster than 0.5 seconds per second (5m rate)
      # for at least 10m.
      - alert: ContainerCPUThrottled
        expr: >
          rate(container_cpu_cfs_throttled_seconds_total{name=~"goodgo-.*"}[5m]) > 0.5
        for: 10m
        labels:
          severity: warning
          team: sre
          service: "{{ $labels.name }}"
        annotations:
          summary: "Container {{ $labels.name }} CPU is being throttled"
          description: >
            Container {{ $labels.name }} has been CPU-throttled for 10+ minutes.
            Current throttle rate: {{ $value | printf "%.2f" }}s/s.
            Consider increasing CPU limits.

  # ── Disk & Volume Alerts ─────────────────────────────────────────────────────
  # Both rules compute the used share of the filesystem mounted at "/" as
  # (1 - avail / size) * 100; they differ only in threshold, hold time,
  # severity and runbook.
  - name: goodgo_disk
    rules:
      # Warning tier: root filesystem more than 80% used, sustained for 10m.
      - alert: HostDiskUsageHigh
        for: 10m
        expr: >
          (1 - node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        labels:
          service: host
          team: sre
          severity: warning
        annotations:
          runbook_url: 'https://docs.goodgo.vn/runbooks/disk-space'
          summary: 'Host root disk usage above 80%'
          description: >
            The root filesystem is
            {{ $value | printf "%.1f" }}% full. Clean up Docker artifacts,
            old backups, or logs to free space.

      # Critical tier: root filesystem more than 90% used, sustained for 5m.
      - alert: HostDiskUsageCritical
        for: 5m
        expr: >
          (1 - node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 90
        labels:
          service: host
          team: sre
          severity: critical
        annotations:
          runbook_url: 'https://docs.goodgo.vn/runbooks/disk-space-critical'
          summary: 'CRITICAL: Host root disk usage above 90%'
          description: >
            The root filesystem is
            {{ $value | printf "%.1f" }}% full. Services may fail.
            Immediate cleanup required.

  # ── Service Health Alerts ────────────────────────────────────────────────────
  # Liveness signals: one probe-based check for the API and one catch-all for
  # any scrape target whose `up` series reads 0.
  - name: goodgo_services
    rules:
      # probe_success for job="goodgo-api-health" has been 0 for 2 minutes.
      - alert: ApiHealthCheckFailing
        for: 2m
        expr: >
          probe_success{job="goodgo-api-health"} == 0
        labels:
          service: goodgo-api
          team: sre
          severity: critical
        annotations:
          summary: 'CRITICAL: API health check is failing'
          description: >
            The GoodGo API health endpoint has been unreachable
            for 2+ minutes. The service may be down or unresponsive.

      # Any target, regardless of job, reporting up == 0 for 5 minutes.
      # No `service` label is set; job and instance come from the series.
      - alert: PrometheusTargetDown
        for: 5m
        expr: >
          up == 0
        labels:
          team: sre
          severity: warning
        annotations:
          summary: 'Prometheus target {{ $labels.job }} is down'
          description: >
            Prometheus cannot scrape {{ $labels.instance }}
            (job: {{ $labels.job }}) for 5+ minutes.

  # ── Backup Monitoring Alerts ─────────────────────────────────────────────────
  # Driven by two backup metrics: the timestamp of the last successful backup
  # and the result flag of the verification step.
  - name: goodgo_backups
    rules:
      # Age of the last successful backup exceeds 90000 seconds (25 hours),
      # measured as time() minus the recorded success timestamp.
      - alert: BackupTooOld
        for: 5m
        expr: >
          (time() - goodgo_backup_last_success_timestamp_seconds) > 90000
        labels:
          service: pg-backup
          team: sre
          severity: warning
        annotations:
          summary: 'PostgreSQL backup is more than 25 hours old'
          description: >
            The last successful PostgreSQL backup was
            {{ $value | humanizeDuration }} ago. Daily backups run at 02:00 UTC.
            The backup job may have failed. Check: docker logs goodgo-pg-backup

      # The verification result metric has read 0 for at least one minute.
      - alert: BackupVerificationFailed
        for: 1m
        expr: >
          goodgo_backup_verify_result == 0
        labels:
          service: pg-backup
          team: sre
          severity: warning
        annotations:
          summary: 'PostgreSQL backup verification failed'
          description: >
            The automated backup verification check has failed. Check:
            docker exec goodgo-pg-backup cat /backups/verify-latest.json