feat(monitoring): add comprehensive alerting rules, Alertmanager, and DR validation
Expand production monitoring with full alert coverage for database connections, Redis memory/connections, container resources, disk usage, service health, and backup integrity. Add Alertmanager service with Slack routing for critical and warning alerts, and add automated backup verification to the pg-backup cron schedule. Update runbook with DR validation procedures and quarterly checklist.

- Expand Prometheus alert rules from 4 to 24 alerts across 7 groups
- Add Alertmanager container (prom/alertmanager:v0.27.0) with Slack routing
- Configure inhibition rules (critical suppresses warning for same service)
- Schedule automated backup verification at 04:00 UTC daily
- Add Alertmanager datasource to Grafana provisioning
- Update runbook with Section 9: DR Validation (automated + manual procedures)
- Add SLACK_WEBHOOK_URL and Grafana vars to .env.example

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
90
monitoring/alertmanager/alertmanager.yml
Normal file
90
monitoring/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,90 @@
|
||||
# GoodGo Platform — Alertmanager Configuration
|
||||
# Routes alerts from Prometheus to notification channels.
|
||||
#
|
||||
# Environment variables (set in .env):
|
||||
# SLACK_WEBHOOK_URL — Slack incoming webhook for alert notifications
|
||||
# ALERTMANAGER_SMTP_* — SMTP settings for email alerts (optional)
|
||||
|
||||
# Global defaults: how long before an alert without updates is marked resolved,
# and the Slack webhook URL used by the slack_configs in the receivers.
# NOTE(review): Alertmanager does not appear to expand ${VAR} placeholders in
# its config file by itself. Confirm the container entrypoint substitutes
# SLACK_WEBHOOK_URL (e.g. envsubst) before start; otherwise the literal string
# is used as the URL. Mounting the secret and using slack_api_url_file is an
# alternative — TODO confirm against the compose/entrypoint setup.
global:
|
||||
resolve_timeout: 5m
|
||||
slack_api_url: '${SLACK_WEBHOOK_URL}'
|
||||
|
||||
# ── Notification Templates ─────────────────────────────────────────────────────
|
||||
templates:
|
||||
- '/etc/alertmanager/templates/*.tmpl'
|
||||
|
||||
# ── Inhibition Rules ──────────────────────────────────────────────────────────
|
||||
# Suppress warning alerts when a critical alert is already firing for the same service
|
||||
inhibit_rules:
|
||||
- source_matchers:
|
||||
- severity = critical
|
||||
target_matchers:
|
||||
- severity = warning
|
||||
equal: ['service']
|
||||
|
||||
# ── Routing Tree ──────────────────────────────────────────────────────────────
|
||||
route:
|
||||
receiver: 'slack-sre'
|
||||
group_by: ['alertname', 'service']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
|
||||
routes:
|
||||
# Critical alerts — immediate notification, shorter repeat
|
||||
- matchers:
|
||||
- severity = critical
|
||||
receiver: 'slack-critical'
|
||||
group_wait: 10s
|
||||
repeat_interval: 1h
|
||||
continue: false
|
||||
|
||||
# Backup alerts — route to infrastructure channel
|
||||
- matchers:
|
||||
- alertname =~ "Backup.*"
|
||||
receiver: 'slack-infrastructure'
|
||||
group_wait: 1m
|
||||
repeat_interval: 6h
|
||||
|
||||
# ── Receivers ─────────────────────────────────────────────────────────────────
|
||||
receivers:
|
||||
# Default receiver: posts firing and resolved notifications to #sre-oncall.
- name: 'slack-sre'
  slack_configs:
    - channel: '#sre-oncall'
      send_resolved: true
      title: '{{ if eq .Status "firing" }}🔥{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
      # Literal block scalar (|-) keeps one field per line in the Slack message;
      # a folded scalar (>-) would join these lines into a single line.
      text: |-
        *Service:* {{ .CommonLabels.service }}
        *Severity:* {{ .CommonLabels.severity }}
        {{ range .Alerts }}
        *Summary:* {{ .Annotations.summary }}
        *Description:* {{ .Annotations.description }}
        {{ if .Annotations.runbook_url }}*Runbook:* {{ .Annotations.runbook_url }}{{ end }}
        {{ if .Annotations.dashboard }}*Dashboard:* {{ .Annotations.dashboard }}{{ end }}
        {{ end }}
|
||||
|
||||
# Receiver for severity=critical alerts: posts to #sre-oncall with a
# CRITICAL / RESOLVED title prefix.
- name: 'slack-critical'
  slack_configs:
    - channel: '#sre-oncall'
      send_resolved: true
      title: '{{ if eq .Status "firing" }}🚨 CRITICAL{{ else }}✅ RESOLVED{{ end }} {{ .CommonLabels.alertname }}'
      # Literal block scalar (|-) keeps one field per line in the Slack message;
      # a folded scalar (>-) would join these lines into a single line.
      text: |-
        *Service:* {{ .CommonLabels.service }}
        *Severity:* CRITICAL — Immediate action required
        {{ range .Alerts }}
        *Summary:* {{ .Annotations.summary }}
        *Description:* {{ .Annotations.description }}
        {{ if .Annotations.runbook_url }}*Runbook:* {{ .Annotations.runbook_url }}{{ end }}
        {{ end }}
|
||||
|
||||
# Receiver used by the backup route: posts to #infrastructure.
- name: 'slack-infrastructure'
  slack_configs:
    - channel: '#infrastructure'
      send_resolved: true
      title: '{{ if eq .Status "firing" }}⚠️{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
      # Literal block scalar (|-) keeps one field per line in the Slack message;
      # a folded scalar (>-) would join these lines into a single line.
      text: |-
        *Service:* {{ .CommonLabels.service }}
        {{ range .Alerts }}
        *Summary:* {{ .Annotations.summary }}
        *Description:* {{ .Annotations.description }}
        {{ end }}
|
||||
@@ -21,3 +21,12 @@ datasources:
|
||||
matcherRegex: 'correlationId":"([^"]+)'
|
||||
name: correlationId
|
||||
url: '$${__value.raw}'
|
||||
|
||||
- name: Alertmanager
|
||||
uid: alertmanager
|
||||
type: alertmanager
|
||||
access: proxy
|
||||
url: http://alertmanager:9093
|
||||
editable: true
|
||||
jsonData:
|
||||
implementation: prometheus
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
groups:
|
||||
# ── API Latency & Error Alerts ───────────────────────────────────────────────
|
||||
- name: goodgo_api_latency
|
||||
rules:
|
||||
# ── p99 latency alert ──
|
||||
@@ -79,3 +80,324 @@ groups:
|
||||
The 5xx error rate for the GoodGo API has been above 1% for the last 5 minutes.
|
||||
Current value: {{ $value | printf "%.2f" }}%.
|
||||
dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
|
||||
|
||||
# ── 5xx error rate critical ──
|
||||
- alert: ApiErrorRate5xxCritical
|
||||
expr: >
|
||||
(
|
||||
sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total{job="goodgo-api"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
team: sre
|
||||
service: goodgo-api
|
||||
annotations:
|
||||
summary: "CRITICAL: API 5xx error rate above 5%"
|
||||
description: >
|
||||
The 5xx error rate for the GoodGo API has been above 5% for the last 3 minutes.
|
||||
This indicates a major incident. Immediate investigation required.
|
||||
Current value: {{ $value | printf "%.2f" }}%.
|
||||
dashboard: "/d/goodgo-api-latency/goodgo-api-latency"
|
||||
runbook_url: "https://docs.goodgo.vn/runbooks/5xx-critical"
|
||||
|
||||
# ── No traffic (possible downtime) ──
|
||||
- alert: ApiNoTraffic
  # Fires when the API request rate is zero for 10 minutes.
  # `or vector(0)` supplies a 0 sample when the http_requests_total series are
  # absent (e.g. the target is down and its series went stale); without it the
  # sum() is empty and the comparison can never match.
  expr: >
    (sum(rate(http_requests_total{job="goodgo-api"}[5m])) or vector(0)) == 0
  for: 10m
  labels:
    severity: warning
    team: sre
    service: goodgo-api
  annotations:
    summary: "API receiving zero traffic for 10 minutes"
    description: >
      The GoodGo API has received no requests in the last 10 minutes.
      This may indicate the service is down or unreachable.
|
||||
|
||||
# ── Database Alerts ──────────────────────────────────────────────────────────
|
||||
- name: goodgo_database
|
||||
rules:
|
||||
# ── PostgreSQL active connections high ──
|
||||
- alert: PostgresActiveConnectionsHigh
|
||||
expr: >
|
||||
pg_stat_activity_count{datname="goodgo", state="active"} > 15
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: postgres
|
||||
annotations:
|
||||
summary: "PostgreSQL active connections above 15"
|
||||
description: >
|
||||
The number of active PostgreSQL connections has been above 15 for 5 minutes.
|
||||
Pool size is 20. Current value: {{ $value }}.
|
||||
Check for long-running queries or connection leaks.
|
||||
runbook_url: "https://docs.goodgo.vn/runbooks/db-connections"
|
||||
|
||||
# ── PostgreSQL connection pool near exhaustion ──
|
||||
- alert: PostgresConnectionPoolCritical
  # Fires when the total number of connections to the goodgo database stays
  # above 180 for 2 minutes.
  # pg_stat_activity_count carries a `state` label (one series per state), so
  # the series are summed to compare the *total* against the threshold instead
  # of testing each state separately.
  expr: >
    sum(pg_stat_activity_count{datname="goodgo"}) > 180
  for: 2m
  labels:
    severity: critical
    team: sre
    service: postgres
  annotations:
    summary: "CRITICAL: PostgreSQL connections near limit (>180/200)"
    description: >
      Total PostgreSQL connections have exceeded 180 (max client connections: 200).
      PgBouncer pool may be exhausted. Immediate action required.
      Current value: {{ $value }}.
    runbook_url: "https://docs.goodgo.vn/runbooks/db-pool-exhaustion"
|
||||
|
||||
# ── PostgreSQL slow queries ──
|
||||
- alert: PostgresSlowQueries
|
||||
expr: >
|
||||
pg_stat_activity_count{datname="goodgo", state="active", wait_event_type="Lock"} > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: postgres
|
||||
annotations:
|
||||
summary: "Multiple PostgreSQL queries waiting on locks"
|
||||
description: >
|
||||
More than 5 queries are blocked waiting on locks for 5+ minutes.
|
||||
This may indicate lock contention or deadlocks.
|
||||
Current value: {{ $value }}.
|
||||
|
||||
# ── PostgreSQL down ──
|
||||
- alert: PostgresDown
  # Fires when the PostgreSQL exporter reports the database as unreachable for
  # 1 minute. The previous expression checked up{job="goodgo-api"}, i.e. the
  # API scrape, while labelling the alert service=postgres.
  # NOTE(review): assumes pg_up is exposed by the same exporter that provides
  # pg_stat_activity_count — confirm the metric exists in this Prometheus.
  expr: >
    pg_up == 0
  for: 1m
  labels:
    severity: critical
    team: sre
    service: postgres
  annotations:
    summary: "CRITICAL: PostgreSQL is down or unreachable"
    description: >
      The PostgreSQL exporter reports that it cannot connect to the database
      (pg_up == 0). PostgreSQL may be down or unreachable.
      The API and other dependent services are likely affected.
|
||||
|
||||
# ── Redis Alerts ─────────────────────────────────────────────────────────────
|
||||
- name: goodgo_redis
|
||||
rules:
|
||||
# ── Redis memory usage high ──
|
||||
- alert: RedisMemoryHigh
|
||||
expr: >
|
||||
redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: redis
|
||||
annotations:
|
||||
summary: "Redis memory usage above 80%"
|
||||
description: >
|
||||
Redis memory usage has exceeded 80% of the configured maximum (512 MB).
|
||||
Eviction policy (allkeys-lru) is active but high usage may indicate a problem.
|
||||
Current usage: {{ $value | printf "%.1f" }}%.
|
||||
|
||||
# ── Redis memory critical ──
|
||||
- alert: RedisMemoryCritical
|
||||
expr: >
|
||||
redis_memory_used_bytes / redis_memory_max_bytes * 100 > 95
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
team: sre
|
||||
service: redis
|
||||
annotations:
|
||||
summary: "CRITICAL: Redis memory usage above 95%"
|
||||
description: >
|
||||
Redis memory usage has exceeded 95% of the configured maximum.
|
||||
Heavy eviction is occurring. Consider increasing maxmemory or investigating cache patterns.
|
||||
Current usage: {{ $value | printf "%.1f" }}%.
|
||||
|
||||
# ── Redis connected clients high ──
|
||||
- alert: RedisConnectedClientsHigh
|
||||
expr: >
|
||||
redis_connected_clients > 150
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: redis
|
||||
annotations:
|
||||
summary: "Redis connected clients above 150"
|
||||
description: >
|
||||
The number of connected Redis clients has exceeded 150 for 5+ minutes.
|
||||
Current value: {{ $value }}.
|
||||
|
||||
# ── Redis rejected connections ──
|
||||
- alert: RedisRejectedConnections
|
||||
expr: >
|
||||
increase(redis_rejected_connections_total[5m]) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
team: sre
|
||||
service: redis
|
||||
annotations:
|
||||
summary: "Redis is rejecting connections"
|
||||
description: >
|
||||
Redis has rejected {{ $value }} connection(s) in the last 5 minutes.
|
||||
This indicates maxclients has been reached.
|
||||
|
||||
# ── Container Resource Alerts ────────────────────────────────────────────────
|
||||
- name: goodgo_containers
|
||||
rules:
|
||||
# ── Container restart loop ──
|
||||
- alert: ContainerRestartLoop
|
||||
expr: >
|
||||
increase(container_restart_count{name=~"goodgo-.*"}[15m]) > 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: sre
|
||||
service: "{{ $labels.name }}"
|
||||
annotations:
|
||||
summary: "Container {{ $labels.name }} restart loop"
|
||||
description: >
|
||||
Container {{ $labels.name }} has restarted more than 3 times in the last 15 minutes.
|
||||
This indicates a crash loop. Check container logs immediately.
|
||||
|
||||
# ── Container memory near limit ──
|
||||
- alert: ContainerMemoryHigh
  # Fires when a goodgo-* container uses more than 85% of its memory limit for
  # 5 minutes.
  # The `> 0` filter drops containers that have no memory limit configured
  # (limit reported as 0); dividing by 0 would yield +Inf and keep the alert
  # firing permanently for those containers.
  expr: >
    (container_memory_usage_bytes{name=~"goodgo-.*"}
    / (container_spec_memory_limit_bytes{name=~"goodgo-.*"} > 0)) * 100 > 85
  for: 5m
  labels:
    severity: warning
    team: sre
    service: "{{ $labels.name }}"
  annotations:
    summary: "Container {{ $labels.name }} memory usage above 85%"
    description: >
      Container {{ $labels.name }} is using more than 85% of its memory limit.
      Current usage: {{ $value | printf "%.1f" }}%.
      Risk of OOM kill if usage continues to grow.
|
||||
|
||||
# ── Container CPU throttled ──
|
||||
- alert: ContainerCPUThrottled
|
||||
expr: >
|
||||
rate(container_cpu_cfs_throttled_seconds_total{name=~"goodgo-.*"}[5m]) > 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: "{{ $labels.name }}"
|
||||
annotations:
|
||||
summary: "Container {{ $labels.name }} CPU is being throttled"
|
||||
description: >
|
||||
Container {{ $labels.name }} has been CPU-throttled for 10+ minutes.
|
||||
Current throttle rate: {{ $value | printf "%.2f" }}s/s.
|
||||
Consider increasing CPU limits.
|
||||
|
||||
# ── Disk & Volume Alerts ─────────────────────────────────────────────────────
|
||||
- name: goodgo_disk
|
||||
rules:
|
||||
# ── Host disk usage high ──
|
||||
- alert: HostDiskUsageHigh
|
||||
expr: >
|
||||
(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: host
|
||||
annotations:
|
||||
summary: "Host root disk usage above 80%"
|
||||
description: >
|
||||
The root filesystem is {{ $value | printf "%.1f" }}% full.
|
||||
Clean up Docker artifacts, old backups, or logs to free space.
|
||||
runbook_url: "https://docs.goodgo.vn/runbooks/disk-space"
|
||||
|
||||
# ── Host disk usage critical ──
|
||||
- alert: HostDiskUsageCritical
|
||||
expr: >
|
||||
(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: sre
|
||||
service: host
|
||||
annotations:
|
||||
summary: "CRITICAL: Host root disk usage above 90%"
|
||||
description: >
|
||||
The root filesystem is {{ $value | printf "%.1f" }}% full.
|
||||
Services may fail. Immediate cleanup required.
|
||||
runbook_url: "https://docs.goodgo.vn/runbooks/disk-space-critical"
|
||||
|
||||
# ── Service Health Alerts ────────────────────────────────────────────────────
|
||||
- name: goodgo_services
|
||||
rules:
|
||||
# ── API health check failing ──
|
||||
- alert: ApiHealthCheckFailing
|
||||
expr: >
|
||||
probe_success{job="goodgo-api-health"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
team: sre
|
||||
service: goodgo-api
|
||||
annotations:
|
||||
summary: "CRITICAL: API health check is failing"
|
||||
description: >
|
||||
The GoodGo API health endpoint has been unreachable for 2+ minutes.
|
||||
The service may be down or unresponsive.
|
||||
|
||||
# ── Prometheus target down ──
|
||||
- alert: PrometheusTargetDown
  # Fires when any scrape target has been unreachable for 5 minutes.
  expr: >
    up == 0
  for: 5m
  labels:
    severity: warning
    team: sre
    # Every other rule in this file sets `service`; Alertmanager groups and
    # inhibits on it and the Slack templates print it, so derive it from the
    # scrape job rather than leaving it empty.
    service: "{{ $labels.job }}"
  annotations:
    summary: "Prometheus target {{ $labels.job }} is down"
    description: >
      Prometheus cannot scrape {{ $labels.instance }} (job: {{ $labels.job }}) for 5+ minutes.
|
||||
|
||||
# ── Backup Monitoring Alerts ─────────────────────────────────────────────────
|
||||
- name: goodgo_backups
|
||||
rules:
|
||||
# ── Backup age too old (no recent backup) ──
|
||||
- alert: BackupTooOld
|
||||
expr: >
|
||||
(time() - goodgo_backup_last_success_timestamp_seconds) > 90000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: pg-backup
|
||||
annotations:
|
||||
summary: "PostgreSQL backup is more than 25 hours old"
|
||||
description: >
|
||||
The last successful PostgreSQL backup was {{ $value | humanizeDuration }} ago.
|
||||
Daily backups run at 02:00 UTC. The backup job may have failed.
|
||||
Check: docker logs goodgo-pg-backup
|
||||
|
||||
# ── Backup verification failed ──
|
||||
- alert: BackupVerificationFailed
|
||||
expr: >
|
||||
goodgo_backup_verify_result == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
team: sre
|
||||
service: pg-backup
|
||||
annotations:
|
||||
summary: "PostgreSQL backup verification failed"
|
||||
description: >
|
||||
The automated backup verification check has failed.
|
||||
Check: docker exec goodgo-pg-backup cat /backups/verify-latest.json
|
||||
|
||||
@@ -5,6 +5,11 @@ global:
|
||||
rule_files:
|
||||
- 'alert-rules.yml'
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ['alertmanager:9093']
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'goodgo-api'
|
||||
metrics_path: '/metrics'
|
||||
|
||||
Reference in New Issue
Block a user