feat(monitoring): add comprehensive alerting rules, Alertmanager, and DR validation

Expand production monitoring with full alert coverage for database connections, Redis memory/connections, container resources, disk usage, service health, and backup integrity. Add Alertmanager service with Slack routing for critical and warning alerts, and add automated backup verification to the pg-backup cron schedule. Update runbook with DR validation procedures and quarterly checklist.

- Expand Prometheus alert rules from 4 to 24 alerts across 7 groups
- Add Alertmanager container (prom/alertmanager:v0.27.0) with Slack routing
- Configure inhibition rules (critical suppresses warning for same service)
- Schedule automated backup verification at 04:00 UTC daily
- Add Alertmanager datasource to Grafana provisioning
- Update runbook with Section 9: DR Validation (automated + manual procedures)
- Add SLACK_WEBHOOK_URL and Grafana vars to .env.example

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-11 20:15:36 +07:00
parent 33c2e5ac1d
commit 9409706c58
8 changed files with 1108 additions and 2 deletions
--- a/monitoring/alertmanager/alertmanager.yml
+++ b/monitoring/alertmanager/alertmanager.yml
@@ -0,0 +1,90 @@
+# GoodGo Platform — Alertmanager Configuration
+# Routes alerts from Prometheus to notification channels.
+#
+# Environment variables (set in .env):
+#   SLACK_WEBHOOK_URL      — Slack incoming webhook for alert notifications
+#   ALERTMANAGER_SMTP_*    — SMTP settings for email alerts (optional)
+
+global:
+  resolve_timeout: 5m
+  # NOTE(review): Alertmanager does not substitute ${VAR} placeholders in its
+  # config file, so this presumably relies on the file being rendered (e.g. by
+  # envsubst in the container entrypoint) before startup — confirm.
+  slack_api_url: '${SLACK_WEBHOOK_URL}'
+
+# ── Notification Templates ─────────────────────────────────────────────────────
+# Glob of template files to load.
+templates: ['/etc/alertmanager/templates/*.tmpl']
+
+# ── Inhibition Rules ──────────────────────────────────────────────────────────
+# While a critical alert fires for a service, mute that service's warning alerts.
+inhibit_rules:
+  - equal:
+      - service
+    source_matchers:
+      - severity = critical
+    target_matchers:
+      - severity = warning
+
+# ── Routing Tree ──────────────────────────────────────────────────────────────
+# Root route: alerts not claimed by a child route below go to 'slack-sre'.
+route:
+  receiver: 'slack-sre'
+  group_by:
+    - alertname
+    - service
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+
+  routes:
+    # severity=critical: first notification after 10s, repeated hourly.
+    - receiver: 'slack-critical'
+      matchers:
+        - severity = critical
+      group_wait: 10s
+      repeat_interval: 1h
+      continue: false
+
+    # Alert names matching "Backup.*" go to the infrastructure receiver.
+    - receiver: 'slack-infrastructure'
+      matchers:
+        - alertname =~ "Backup.*"
+      group_wait: 1m
+      repeat_interval: 6h
+
+# ── Receivers ─────────────────────────────────────────────────────────────────
+receivers:
+  # Default receiver (root of the routing tree): posts to #sre-oncall.
+  - name: 'slack-sre'
+    slack_configs:
+      - channel: '#sre-oncall'
+        send_resolved: true
+        title: '{{ if eq .Status "firing" }}🔥{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
+        # Literal block (|-) keeps one field per line in Slack; a folded block
+        # (>-) would join every line into a single run of text. The {{- ... }}
+        # markers trim the newlines the template actions would otherwise leave.
+        text: |-
+          *Service:* {{ .CommonLabels.service }}
+          *Severity:* {{ .CommonLabels.severity }}
+          {{- range .Alerts }}
+
+          *Summary:* {{ .Annotations.summary }}
+          *Description:* {{ .Annotations.description }}
+          {{- if .Annotations.runbook_url }}
+          *Runbook:* {{ .Annotations.runbook_url }}
+          {{- end }}
+          {{- if .Annotations.dashboard }}
+          *Dashboard:* {{ .Annotations.dashboard }}
+          {{- end }}
+          {{- end }}
+
+  # Receiver used by the severity=critical route: posts to #sre-oncall.
+  - name: 'slack-critical'
+    slack_configs:
+      - channel: '#sre-oncall'
+        send_resolved: true
+        title: '{{ if eq .Status "firing" }}🚨 CRITICAL{{ else }}✅ RESOLVED{{ end }} {{ .CommonLabels.alertname }}'
+        # Literal block (|-) keeps one field per line in Slack; a folded block
+        # (>-) would join every line into a single run of text. The {{- ... }}
+        # markers trim the newlines the template actions would otherwise leave.
+        text: |-
+          *Service:* {{ .CommonLabels.service }}
+          *Severity:* CRITICAL — Immediate action required
+          {{- range .Alerts }}
+
+          *Summary:* {{ .Annotations.summary }}
+          *Description:* {{ .Annotations.description }}
+          {{- if .Annotations.runbook_url }}
+          *Runbook:* {{ .Annotations.runbook_url }}
+          {{- end }}
+          {{- end }}
+
+  # Receiver used by the "Backup.*" route: posts to #infrastructure.
+  - name: 'slack-infrastructure'
+    slack_configs:
+      - channel: '#infrastructure'
+        send_resolved: true
+        title: '{{ if eq .Status "firing" }}⚠️{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
+        # Literal block (|-) keeps one field per line in Slack; a folded block
+        # (>-) would join every line into a single run of text. The {{- ... }}
+        # markers trim the newlines the template actions would otherwise leave.
+        text: |-
+          *Service:* {{ .CommonLabels.service }}
+          {{- range .Alerts }}
+
+          *Summary:* {{ .Annotations.summary }}
+          *Description:* {{ .Annotations.description }}
+          {{- end }}
--- a/monitoring/grafana/provisioning/datasources/datasource.yml
+++ b/monitoring/grafana/provisioning/datasources/datasource.yml
@@ -21,3 +21,12 @@ datasources:
          matcherRegex: 'correlationId":"([^"]+)'
          name: correlationId
          url: '$${__value.raw}'
+
+  # Alertmanager datasource, queried through the Grafana backend (proxy access).
+  - name: Alertmanager
+    type: alertmanager
+    uid: alertmanager
+    url: http://alertmanager:9093
+    access: proxy
+    jsonData:
+      implementation: prometheus
+    editable: true
--- a/monitoring/prometheus/alert-rules.yml
+++ b/monitoring/prometheus/alert-rules.yml
@@ -1,4 +1,5 @@
 groups:
+  # ── API Latency & Error Alerts ───────────────────────────────────────────────
  - name: goodgo_api_latency
    rules:
      # ── p99 latency alert ──
@@ -79,3 +80,324 @@ groups:
            The 5xx error rate for the GoodGo API has been above 1% for the last 5 minutes.
            Current value: {{ $value | printf "%.2f" }}%.
          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
+
+      # ── 5xx error rate critical (share of 5xx responses > 5% for 3m) ──
+      - alert: ApiErrorRate5xxCritical
+        expr: >
+          (
+            sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
+            / sum(rate(http_requests_total{job="goodgo-api"}[5m]))
+          ) * 100 > 5
+        for: 3m
+        labels:
+          service: goodgo-api
+          severity: critical
+          team: sre
+        annotations:
+          summary: "CRITICAL: API 5xx error rate above 5%"
+          description: >
+            The 5xx error rate for the GoodGo API has been above 5% for the
+            last 3 minutes. This indicates a major incident. Immediate
+            investigation required. Current value: {{ $value | printf "%.2f" }}%.
+          runbook_url: "https://docs.goodgo.vn/runbooks/5xx-critical"
+          dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
+
+      # ── No traffic (possible downtime) ──
+      # `sum(rate(...)) == 0` yields no result at all when the series are missing
+      # (target not scraped, metric never exported), so the absent() branch makes
+      # the alert fire in that case too instead of staying silent.
+      - alert: ApiNoTraffic
+        expr: >
+          sum(rate(http_requests_total{job="goodgo-api"}[5m])) == 0
+          or
+          absent(http_requests_total{job="goodgo-api"})
+        for: 10m
+        labels:
+          severity: warning
+          team: sre
+          service: goodgo-api
+        annotations:
+          summary: "API receiving zero traffic for 10 minutes"
+          description: >
+            The GoodGo API has received no requests in the last 10 minutes.
+            This may indicate the service is down or unreachable.
+
+  # ── Database Alerts ──────────────────────────────────────────────────────────
+  - name: goodgo_database
+    rules:
+      # ── PostgreSQL active connections high ──
+      # sum() adds up every matching series before comparing: the metric is split
+      # by further labels (e.g. wait_event_type, which PostgresSlowQueries filters
+      # on), so a per-series comparison would undercount active connections.
+      - alert: PostgresActiveConnectionsHigh
+        expr: >
+          sum(pg_stat_activity_count{datname="goodgo", state="active"}) > 15
+        for: 5m
+        labels:
+          severity: warning
+          team: sre
+          service: postgres
+        annotations:
+          summary: "PostgreSQL active connections above 15"
+          description: >
+            The number of active PostgreSQL connections has been above 15 for 5 minutes.
+            Pool size is 20. Current value: {{ $value }}.
+            Check for long-running queries or connection leaks.
+          runbook_url: "https://docs.goodgo.vn/runbooks/db-connections"
+
+      # ── PostgreSQL connection pool near exhaustion ──
+      # The limit applies to the total, so sum() over all states first; without
+      # it each state's series would be compared against 180 on its own.
+      - alert: PostgresConnectionPoolCritical
+        expr: >
+          sum(pg_stat_activity_count{datname="goodgo"}) > 180
+        for: 2m
+        labels:
+          severity: critical
+          team: sre
+          service: postgres
+        annotations:
+          summary: "CRITICAL: PostgreSQL connections near limit (>180/200)"
+          description: >
+            Total PostgreSQL connections have exceeded 180 (max client connections: 200).
+            PgBouncer pool may be exhausted. Immediate action required.
+            Current value: {{ $value }}.
+          runbook_url: "https://docs.goodgo.vn/runbooks/db-pool-exhaustion"
+
+      # ── PostgreSQL slow queries (active queries waiting on locks) ──
+      # sum() counts lock waiters across all matching series; a per-series
+      # comparison would only fire if a single series exceeded 5 on its own.
+      - alert: PostgresSlowQueries
+        expr: >
+          sum(pg_stat_activity_count{datname="goodgo", state="active", wait_event_type="Lock"}) > 5
+        for: 5m
+        labels:
+          severity: warning
+          team: sre
+          service: postgres
+        annotations:
+          summary: "Multiple PostgreSQL queries waiting on locks"
+          description: >
+            More than 5 queries are blocked waiting on locks for 5+ minutes.
+            This may indicate lock contention or deadlocks.
+            Current value: {{ $value }}.
+
+      # ── PostgreSQL down ──
+      # NOTE(review): despite its name, this rule watches the goodgo-api scrape
+      # target, not PostgreSQL itself; detecting a database-only outage would
+      # need a direct probe (e.g. pg_up from postgres_exporter) — confirm.
+      # The service label names the target actually measured, so grouping and
+      # inhibition act on the API's alerts rather than the database's.
+      - alert: PostgresDown
+        expr: >
+          up{job="goodgo-api"} == 0
+        for: 1m
+        labels:
+          severity: critical
+          team: sre
+          service: goodgo-api
+        annotations:
+          summary: "CRITICAL: Cannot scrape GoodGo API (possible service down)"
+          description: >
+            Prometheus cannot scrape the GoodGo API metrics endpoint.
+            The API or its dependencies (PostgreSQL, Redis) may be down.
+
+  # ── Redis Alerts ─────────────────────────────────────────────────────────────
+  - name: goodgo_redis
+    rules:
+      # ── Redis memory usage high ──
+      # The `> 0` filter drops instances that report maxmemory as 0; dividing by
+      # zero would otherwise yield +Inf and raise a false alert.
+      - alert: RedisMemoryHigh
+        expr: >
+          redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+          team: sre
+          service: redis
+        annotations:
+          summary: "Redis memory usage above 80%"
+          description: >
+            Redis memory usage has exceeded 80% of the configured maximum (512 MB).
+            Eviction policy (allkeys-lru) is active but high usage may indicate a problem.
+            Current usage: {{ $value | printf "%.1f" }}%.
+
+      # ── Redis memory critical ──
+      # The `> 0` filter drops instances that report maxmemory as 0; dividing by
+      # zero would otherwise yield +Inf and raise a false critical alert.
+      - alert: RedisMemoryCritical
+        expr: >
+          redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 95
+        for: 2m
+        labels:
+          severity: critical
+          team: sre
+          service: redis
+        annotations:
+          summary: "CRITICAL: Redis memory usage above 95%"
+          description: >
+            Redis memory usage has exceeded 95% of the configured maximum.
+            Heavy eviction is occurring. Consider increasing maxmemory or investigating cache patterns.
+            Current usage: {{ $value | printf "%.1f" }}%.
+
+      # ── Redis connected clients high (more than 150 clients for 5m) ──
+      - alert: RedisConnectedClientsHigh
+        expr: >
+          redis_connected_clients > 150
+        for: 5m
+        labels:
+          service: redis
+          severity: warning
+          team: sre
+        annotations:
+          summary: "Redis connected clients above 150"
+          description: >
+            The number of connected Redis clients has exceeded 150
+            for 5+ minutes. Current value: {{ $value }}.
+
+      # ── Redis rejected connections (any rejection within a 5m window) ──
+      - alert: RedisRejectedConnections
+        expr: >
+          increase(redis_rejected_connections_total[5m]) > 0
+        for: 1m
+        labels:
+          service: redis
+          severity: critical
+          team: sre
+        annotations:
+          summary: "Redis is rejecting connections"
+          description: >
+            Redis has rejected {{ $value }} connection(s) in the last
+            5 minutes. This indicates maxclients has been reached.
+
+  # ── Container Resource Alerts ────────────────────────────────────────────────
+  - name: goodgo_containers
+    rules:
+      # ── Container restart loop ──
+      # Fires when a goodgo-* container's restart counter grew by more than 3
+      # within 15 minutes and the condition held for 5 minutes.
+      # NOTE(review): the other rules in this group use cAdvisor metric names,
+      # and container_restart_count does not appear to be one of them — confirm
+      # that some exporter provides it, otherwise this rule can never fire
+      # (changes(container_start_time_seconds[15m]) is a common cAdvisor-based
+      # alternative).
+      - alert: ContainerRestartLoop
+        expr: >
+          increase(container_restart_count{name=~"goodgo-.*"}[15m]) > 3
+        for: 5m
+        labels:
+          severity: critical
+          team: sre
+          service: "{{ $labels.name }}"
+        annotations:
+          summary: "Container {{ $labels.name }} restart loop"
+          description: >
+            Container {{ $labels.name }} has restarted more than 3 times in the last 15 minutes.
+            This indicates a crash loop. Check container logs immediately.
+
+      # ── Container memory near limit ──
+      # The `> 0` filter skips containers with no memory limit configured (limit
+      # reported as 0); dividing by zero would otherwise yield +Inf and raise a
+      # false alert for every unlimited container.
+      - alert: ContainerMemoryHigh
+        expr: >
+          (container_memory_usage_bytes{name=~"goodgo-.*"}
+           / (container_spec_memory_limit_bytes{name=~"goodgo-.*"} > 0)) * 100 > 85
+        for: 5m
+        labels:
+          severity: warning
+          team: sre
+          service: "{{ $labels.name }}"
+        annotations:
+          summary: "Container {{ $labels.name }} memory usage above 85%"
+          description: >
+            Container {{ $labels.name }} is using more than 85% of its memory limit.
+            Current usage: {{ $value | printf "%.1f" }}%.
+            Risk of OOM kill if usage continues to grow.
+
+      # ── Container CPU throttled (more than 0.5s throttled per second, 10m) ──
+      - alert: ContainerCPUThrottled
+        expr: >
+          rate(container_cpu_cfs_throttled_seconds_total{name=~"goodgo-.*"}[5m]) > 0.5
+        for: 10m
+        labels:
+          service: "{{ $labels.name }}"
+          severity: warning
+          team: sre
+        annotations:
+          summary: "Container {{ $labels.name }} CPU is being throttled"
+          description: >
+            Container {{ $labels.name }} has been CPU-throttled for 10+
+            minutes. Current throttle rate: {{ $value | printf "%.2f" }}s/s.
+            Consider increasing CPU limits.
+
+  # ── Disk & Volume Alerts ─────────────────────────────────────────────────────
+  - name: goodgo_disk
+    rules:
+      # ── Host disk usage high (root filesystem more than 80% used, 10m) ──
+      - alert: HostDiskUsageHigh
+        expr: >
+          (
+            1 - node_filesystem_avail_bytes{mountpoint="/"}
+                / node_filesystem_size_bytes{mountpoint="/"}
+          ) * 100 > 80
+        for: 10m
+        labels:
+          service: host
+          severity: warning
+          team: sre
+        annotations:
+          summary: "Host root disk usage above 80%"
+          description: >
+            The root filesystem is {{ $value | printf "%.1f" }}% full. Clean up
+            Docker artifacts, old backups, or logs to free space.
+          runbook_url: "https://docs.goodgo.vn/runbooks/disk-space"
+
+      # ── Host disk usage critical (root filesystem more than 90% used, 5m) ──
+      - alert: HostDiskUsageCritical
+        expr: >
+          (
+            1 - node_filesystem_avail_bytes{mountpoint="/"}
+                / node_filesystem_size_bytes{mountpoint="/"}
+          ) * 100 > 90
+        for: 5m
+        labels:
+          service: host
+          severity: critical
+          team: sre
+        annotations:
+          summary: "CRITICAL: Host root disk usage above 90%"
+          description: >
+            The root filesystem is {{ $value | printf "%.1f" }}% full. Services
+            may fail. Immediate cleanup required.
+          runbook_url: "https://docs.goodgo.vn/runbooks/disk-space-critical"
+
+  # ── Service Health Alerts ────────────────────────────────────────────────────
+  - name: goodgo_services
+    rules:
+      # ── API health check failing (probe unsuccessful for 2m) ──
+      - alert: ApiHealthCheckFailing
+        expr: >
+          probe_success{job="goodgo-api-health"} == 0
+        for: 2m
+        labels:
+          service: goodgo-api
+          severity: critical
+          team: sre
+        annotations:
+          summary: "CRITICAL: API health check is failing"
+          description: >
+            The GoodGo API health endpoint has been unreachable for 2+
+            minutes. The service may be down or unresponsive.
+
+      # ── Prometheus target down ──
+      # Catch-all for any scrape target that has been unreachable for 5 minutes.
+      # The service label is derived from the job name: Alertmanager groups and
+      # inhibits on `service`, and the Slack messages print it, so leaving it
+      # unset would produce an empty "Service:" field.
+      - alert: PrometheusTargetDown
+        expr: >
+          up == 0
+        for: 5m
+        labels:
+          severity: warning
+          team: sre
+          service: "{{ $labels.job }}"
+        annotations:
+          summary: "Prometheus target {{ $labels.job }} is down"
+          description: >
+            Prometheus cannot scrape {{ $labels.instance }} (job: {{ $labels.job }}) for 5+ minutes.
+
+  # ── Backup Monitoring Alerts ─────────────────────────────────────────────────
+  - name: goodgo_backups
+    rules:
+      # ── Backup age too old (last success more than 90000s = 25h ago) ──
+      - alert: BackupTooOld
+        expr: >
+          time() - goodgo_backup_last_success_timestamp_seconds > 90000
+        for: 5m
+        labels:
+          service: pg-backup
+          severity: warning
+          team: sre
+        annotations:
+          summary: "PostgreSQL backup is more than 25 hours old"
+          description: >
+            The last successful PostgreSQL backup was
+            {{ $value | humanizeDuration }} ago. Daily backups run at 02:00 UTC.
+            The backup job may have failed. Check: docker logs goodgo-pg-backup
+
+      # ── Backup verification failed (verify result gauge reads 0 for 1m) ──
+      - alert: BackupVerificationFailed
+        expr: >
+          goodgo_backup_verify_result == 0
+        for: 1m
+        labels:
+          service: pg-backup
+          severity: warning
+          team: sre
+        annotations:
+          summary: "PostgreSQL backup verification failed"
+          description: >
+            The automated backup verification check has failed. Check:
+            docker exec goodgo-pg-backup cat /backups/verify-latest.json
--- a/monitoring/prometheus/prometheus.yml
+++ b/monitoring/prometheus/prometheus.yml
@@ -5,6 +5,11 @@ global:
 rule_files:
  - 'alert-rules.yml'

+# Deliver firing alerts to the Alertmanager instance at alertmanager:9093.
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - 'alertmanager:9093'
+
 scrape_configs:
  - job_name: 'goodgo-api'
    metrics_path: '/metrics'