groups:
  # ── API Latency & Error Alerts ─────────────────────────────────────────────
  # Latency rules evaluate the goodgo_api_request_duration_seconds histogram;
  # error-rate and traffic rules evaluate http_requests_total. Every selector
  # in this group is scoped to job="goodgo-api".
  - name: goodgo_api_latency
    rules:
      # ── p99 latency alert ──
      # Warning tier: p99 aggregated over all series stays above 1s for 5m.
      - alert: ApiLatencyP99High
        expr: >
          histogram_quantile(
            0.99,
            sum by (le) (
              rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])
            )
          ) > 1
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API p99 latency exceeds 1s"
          description: >
            The overall API p99 latency has been above 1 second for the last
            5 minutes. Current value: {{ $value | printf "%.3f" }}s.
            Investigate slow endpoints using the GoodGo API Latency dashboard.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-high"

      # ── Per-route p99 latency alert ──
      # Same histogram, but grouped by route and method so that one alert is
      # raised per slow endpoint; the threshold is 2s.
      - alert: ApiEndpointLatencyP99High
        expr: >
          histogram_quantile(
            0.99,
            sum by (le, route, method) (
              rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])
            )
          ) > 2
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "Endpoint {{ $labels.method }} {{ $labels.route }} p99 > 2s"
          description: >
            The {{ $labels.method }} {{ $labels.route }} endpoint has a p99
            latency above 2 seconds for the last 5 minutes.
            Current value: {{ $value | printf "%.3f" }}s.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"

      # ── p99 critical (SLO breach) ──
      # Critical tier of ApiLatencyP99High: 3s threshold, shorter 3m hold.
      - alert: ApiLatencyP99Critical
        expr: >
          histogram_quantile(
            0.99,
            sum by (le) (
              rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api"}[5m])
            )
          ) > 3
        for: 3m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
        annotations:
          summary: "CRITICAL: API p99 latency exceeds 3s (SLO breach)"
          description: >
            The overall API p99 latency has been above 3 seconds for the last
            3 minutes. This is a potential SLO breach. Immediate investigation
            required. Current value: {{ $value | printf "%.3f" }}s.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/api-latency-critical"

      # ── 5xx error rate spike ──
      # Percentage of requests whose status_code matches 5.. over 5m.
      # With zero traffic the ratio is NaN, which never satisfies the
      # comparison, so this rule stays silent when there are no requests.
      - alert: ApiErrorRate5xxHigh
        expr: >
          (
            sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="goodgo-api"}[5m]))
          ) * 100 > 1
        for: 5m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API 5xx error rate above 1%"
          description: >
            The 5xx error rate for the GoodGo API has been above 1% for the
            last 5 minutes. Current value: {{ $value | printf "%.2f" }}%.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"

      # ── 5xx error rate critical ──
      # Critical tier of ApiErrorRate5xxHigh: 5% threshold, 3m hold.
      - alert: ApiErrorRate5xxCritical
        expr: >
          (
            sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="goodgo-api"}[5m]))
          ) * 100 > 5
        for: 3m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
        annotations:
          summary: "CRITICAL: API 5xx error rate above 5%"
          description: >
            The 5xx error rate for the GoodGo API has been above 5% for the
            last 3 minutes. This indicates a major incident. Immediate
            investigation required.
            Current value: {{ $value | printf "%.2f" }}%.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
          runbook_url: "https://docs.goodgo.vn/runbooks/5xx-critical"

      # ── No traffic (possible downtime) ──
      # sum() over an empty selection returns no sample at all, so a bare
      # "== 0" comparison can never match once the request series disappear
      # (for example when the target is down and its series go stale).
      # "or vector(0)" substitutes an explicit 0 in that case so the alert
      # also fires when the series are missing, not only when they are flat.
      - alert: ApiNoTraffic
        expr: >
          (
            sum(rate(http_requests_total{job="goodgo-api"}[5m]))
            or
            vector(0)
          ) == 0
        for: 10m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
        annotations:
          summary: "API receiving zero traffic for 10 minutes"
          description: >
            The GoodGo API has received no requests in the last 10 minutes.
            This may indicate the service is down or unreachable.
# ── Database Alerts ──────────────────────────────────────────────────────────
  # pg_stat_activity_count is exposed as one series per label combination
  # (the selectors below already rely on the state and wait_event_type
  # labels), so every threshold in this group is applied to a sum() rather
  # than to each individual series.
  - name: goodgo_database
    rules:
      # ── PostgreSQL active connections high ──
      # Total connections in state="active" for the goodgo database.
      - alert: PostgresActiveConnectionsHigh
        expr: >
          sum by (instance, datname) (
            pg_stat_activity_count{datname="goodgo", state="active"}
          ) > 15
        for: 5m
        labels:
          severity: warning
          team: sre
          service: postgres
        annotations:
          summary: "PostgreSQL active connections above 15"
          description: >
            The number of active PostgreSQL connections has been above 15 for
            5 minutes. Pool size is 20. Current value: {{ $value }}.
            Check for long-running queries or connection leaks.
          runbook_url: "https://docs.goodgo.vn/runbooks/db-connections"

      # ── PostgreSQL connection pool near exhaustion ──
      # Summed over every state so that $value is the total connection count
      # the annotations describe, not the count of a single state.
      - alert: PostgresConnectionPoolCritical
        expr: >
          sum by (instance, datname) (
            pg_stat_activity_count{datname="goodgo"}
          ) > 180
        for: 2m
        labels:
          severity: critical
          team: sre
          service: postgres
        annotations:
          summary: "CRITICAL: PostgreSQL connections near limit (>180/200)"
          description: >
            Total PostgreSQL connections have exceeded 180 (max client
            connections: 200). PgBouncer pool may be exhausted. Immediate
            action required. Current value: {{ $value }}.
          runbook_url: "https://docs.goodgo.vn/runbooks/db-pool-exhaustion"

      # ── PostgreSQL slow queries ──
      # Despite the alert name, this counts active sessions whose
      # wait_event_type is "Lock", i.e. queries blocked on locks.
      - alert: PostgresSlowQueries
        expr: >
          sum by (instance, datname) (
            pg_stat_activity_count{datname="goodgo", state="active", wait_event_type="Lock"}
          ) > 5
        for: 5m
        labels:
          severity: warning
          team: sre
          service: postgres
        annotations:
          summary: "Multiple PostgreSQL queries waiting on locks"
          description: >
            More than 5 queries are blocked waiting on locks for 5+ minutes.
            This may indicate lock contention or deadlocks.
            Current value: {{ $value }}.

      # ── PostgreSQL down ──
      # Checks the exporter's database connectivity gauge instead of the API
      # scrape target, so the alert matches its name and service label.
      # NOTE(review): assumes postgres_exporter's pg_up series is scraped by
      # the same exporter that provides pg_stat_activity_count — confirm.
      - alert: PostgresDown
        expr: >
          pg_up == 0
        for: 1m
        labels:
          severity: critical
          team: sre
          service: postgres
        annotations:
          summary: "CRITICAL: PostgreSQL is down (exporter cannot connect)"
          description: >
            The PostgreSQL exporter on {{ $labels.instance }} has been unable
            to connect to the database for 1+ minute. PostgreSQL may be down
            or unreachable.
# ── Redis Alerts ─────────────────────────────────────────────────────────────
  - name: goodgo_redis
    rules:
      # ── Redis memory usage high ──
      # Used memory as a percentage of the configured maximum. The
      # denominator is filtered with "> 0": a maximum of 0 would turn the
      # ratio into +Inf and trip the threshold with a meaningless value.
      - alert: RedisMemoryHigh
        expr: >
          redis_memory_used_bytes
          / (redis_memory_max_bytes > 0)
          * 100 > 80
        for: 5m
        labels:
          severity: warning
          team: sre
          service: redis
        annotations:
          summary: "Redis memory usage above 80%"
          description: >
            Redis memory usage has exceeded 80% of the configured maximum
            (512 MB). Eviction policy (allkeys-lru) is active but high usage
            may indicate a problem.
            Current usage: {{ $value | printf "%.1f" }}%.

      # ── Redis memory critical ──
      # Critical tier of RedisMemoryHigh (95%, 2m hold); same zero-maximum
      # guard on the denominator.
      - alert: RedisMemoryCritical
        expr: >
          redis_memory_used_bytes
          / (redis_memory_max_bytes > 0)
          * 100 > 95
        for: 2m
        labels:
          severity: critical
          team: sre
          service: redis
        annotations:
          summary: "CRITICAL: Redis memory usage above 95%"
          description: >
            Redis memory usage has exceeded 95% of the configured maximum.
            Heavy eviction is occurring. Consider increasing maxmemory or
            investigating cache patterns.
            Current usage: {{ $value | printf "%.1f" }}%.

      # ── Redis connected clients high ──
      - alert: RedisConnectedClientsHigh
        expr: >
          redis_connected_clients > 150
        for: 5m
        labels:
          severity: warning
          team: sre
          service: redis
        annotations:
          summary: "Redis connected clients above 150"
          description: >
            The number of connected Redis clients has exceeded 150 for
            5+ minutes. Current value: {{ $value }}.

      # ── Redis rejected connections ──
      # Any growth of the rejected-connections counter inside a 5m window.
      - alert: RedisRejectedConnections
        expr: >
          increase(redis_rejected_connections_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
          team: sre
          service: redis
        annotations:
          summary: "Redis is rejecting connections"
          description: >
            Redis has rejected {{ $value }} connection(s) in the last
            5 minutes. This indicates maxclients has been reached.
# ── Container Resource Alerts ────────────────────────────────────────────────
  # Every selector is limited to containers whose name matches "goodgo-.*".
  # The service label is templated from the container name, so each container
  # produces its own alert instance.
  - name: goodgo_containers
    rules:
      # ── Container restart loop ──
      # More than 3 restarts counted inside a 15m window, sustained for 5m.
      - alert: ContainerRestartLoop
        expr: >
          increase(container_restart_count{name=~"goodgo-.*"}[15m]) > 3
        for: 5m
        labels:
          severity: critical
          team: sre
          service: "{{ $labels.name }}"
        annotations:
          summary: "Container {{ $labels.name }} restart loop"
          description: >
            Container {{ $labels.name }} has restarted more than 3 times in
            the last 15 minutes. This indicates a crash loop. Check container
            logs immediately.

      # ── Container memory near limit ──
      # Usage as a percentage of the memory limit. The limit is filtered with
      # "> 0" first: a limit of 0 (no limit configured) would make the ratio
      # +Inf and raise a false alert for a container that cannot hit a limit.
      - alert: ContainerMemoryHigh
        expr: >
          (
            container_memory_usage_bytes{name=~"goodgo-.*"}
            /
            (container_spec_memory_limit_bytes{name=~"goodgo-.*"} > 0)
          ) * 100 > 85
        for: 5m
        labels:
          severity: warning
          team: sre
          service: "{{ $labels.name }}"
        annotations:
          summary: "Container {{ $labels.name }} memory usage above 85%"
          description: >
            Container {{ $labels.name }} is using more than 85% of its memory
            limit. Current usage: {{ $value | printf "%.1f" }}%.
            Risk of OOM kill if usage continues to grow.

      # ── Container CPU throttled ──
      # Throttled seconds accumulated per second of wall time, over 5m.
      - alert: ContainerCPUThrottled
        expr: >
          rate(container_cpu_cfs_throttled_seconds_total{name=~"goodgo-.*"}[5m]) > 0.5
        for: 10m
        labels:
          severity: warning
          team: sre
          service: "{{ $labels.name }}"
        annotations:
          summary: "Container {{ $labels.name }} CPU is being throttled"
          description: >
            Container {{ $labels.name }} has been CPU-throttled for
            10+ minutes. Current throttle rate:
            {{ $value | printf "%.2f" }}s/s. Consider increasing CPU limits.
# ── Disk & Volume Alerts ─────────────────────────────────────────────────────
  # Root filesystem fullness, computed as (1 - available / size) * 100 for
  # the series whose mountpoint label is "/".
  - name: goodgo_disk
    rules:
      # Warning tier: more than 80% used for 10 minutes.
      - alert: HostDiskUsageHigh
        expr: >
          (1 - node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        for: 10m
        labels:
          service: host
          severity: warning
          team: sre
        annotations:
          summary: 'Host root disk usage above 80%'
          description: >
            The root filesystem is {{ $value | printf "%.1f" }}% full.
            Clean up Docker artifacts, old backups, or logs to free space.
          runbook_url: 'https://docs.goodgo.vn/runbooks/disk-space'

      # Critical tier: more than 90% used for 5 minutes.
      - alert: HostDiskUsageCritical
        expr: >
          (1 - node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 90
        for: 5m
        labels:
          service: host
          severity: critical
          team: sre
        annotations:
          summary: 'CRITICAL: Host root disk usage above 90%'
          description: >
            The root filesystem is {{ $value | printf "%.1f" }}% full.
            Services may fail. Immediate cleanup required.
          runbook_url: 'https://docs.goodgo.vn/runbooks/disk-space-critical'

  # ── Service Health Alerts ──────────────────────────────────────────────────
  - name: goodgo_services
    rules:
      # Fires when probe_success for job="goodgo-api-health" has been 0
      # for 2 minutes.
      - alert: ApiHealthCheckFailing
        expr: >
          probe_success{job="goodgo-api-health"} == 0
        for: 2m
        labels:
          service: goodgo-api
          severity: critical
          team: sre
        annotations:
          summary: 'CRITICAL: API health check is failing'
          description: >
            The GoodGo API health endpoint has been unreachable for
            2+ minutes. The service may be down or unresponsive.

      # Catch-all over every scrape target: any series with up == 0 for
      # 5 minutes. No service label is set; the job and instance labels of
      # the failing target are surfaced through the annotations.
      - alert: PrometheusTargetDown
        expr: >
          up == 0
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: 'Prometheus target {{ $labels.job }} is down'
          description: >
            Prometheus cannot scrape {{ $labels.instance }}
            (job: {{ $labels.job }}) for 5+ minutes.
# ── Backup Monitoring Alerts ─────────────────────────────────────────────────
  - name: goodgo_backups
    rules:
      # Age of the last successful backup: current time minus the recorded
      # success timestamp. The 90000-second threshold equals 25 hours.
      - alert: BackupTooOld
        expr: >
          (time() - goodgo_backup_last_success_timestamp_seconds) > 90000
        for: 5m
        labels:
          service: pg-backup
          severity: warning
          team: sre
        annotations:
          summary: 'PostgreSQL backup is more than 25 hours old'
          description: >
            The last successful PostgreSQL backup was
            {{ $value | humanizeDuration }} ago.
            Daily backups run at 02:00 UTC. The backup job may have failed.
            Check: docker logs goodgo-pg-backup

      # Fires when the verification result gauge has been 0 for 1 minute.
      - alert: BackupVerificationFailed
        expr: >
          goodgo_backup_verify_result == 0
        for: 1m
        labels:
          service: pg-backup
          severity: warning
          team: sre
        annotations:
          summary: 'PostgreSQL backup verification failed'
          description: >
            The automated backup verification check has failed.
            Check: docker exec goodgo-pg-backup cat /backups/verify-latest.json