feat(monitoring): add comprehensive alerting rules, Alertmanager, and DR validation

Expand production monitoring with full alert coverage for database connections,
Redis memory/connections, container resources, disk usage, service health, and
backup integrity. Add Alertmanager service with Slack routing for critical and
warning alerts, and add automated backup verification to the pg-backup cron
schedule. Update runbook with DR validation procedures and quarterly checklist.

- Expand Prometheus alert rules from 4 to 24 alerts across 7 groups
- Add Alertmanager container (prom/alertmanager:v0.27.0) with Slack routing
- Configure inhibition rules (critical suppresses warning for same service)
- Schedule automated backup verification at 04:00 UTC daily
- Add Alertmanager datasource to Grafana provisioning
- Update runbook with Section 9: DR Validation (automated + manual procedures)
- Add SLACK_WEBHOOK_URL and Grafana vars to .env.example

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Ho Ngoc Hai
2026-04-11 20:15:36 +07:00
parent 33c2e5ac1d
commit 9409706c58
8 changed files with 1108 additions and 2 deletions

View File

@@ -314,7 +314,8 @@ services:
- -c
- |
apt-get update -qq && apt-get install -y -qq cron > /dev/null 2>&1
echo "0 2 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER} PGDATABASE=${DB_NAME} PGPASSWORD=${DB_PASSWORD} BACKUP_DIR=/backups RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7} /scripts/pg-backup.sh >> /var/log/pg-backup.log 2>&1" | crontab -
(echo "0 2 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER} PGDATABASE=${DB_NAME} PGPASSWORD=${DB_PASSWORD} BACKUP_DIR=/backups RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7} /scripts/pg-backup.sh >> /var/log/pg-backup.log 2>&1"
echo "0 4 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER} PGDATABASE=${DB_NAME} PGPASSWORD=${DB_PASSWORD} BACKUP_DIR=/backups REPORT_FILE=/backups/verify-latest.json /scripts/pg-verify-backup.sh >> /var/log/pg-verify.log 2>&1") | crontab -
/scripts/pg-backup.sh
cron -f
environment:
@@ -410,7 +411,11 @@ services:
- '--web.enable-lifecycle'
volumes:
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./monitoring/prometheus/alert-rules.yml:/etc/prometheus/alert-rules.yml:ro
- prometheus_data:/prometheus
depends_on:
alertmanager:
condition: service_healthy
healthcheck:
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9090/-/healthy']
interval: 15s
@@ -434,6 +439,41 @@ services:
networks:
- goodgo-net
alertmanager:
image: prom/alertmanager:v0.27.0
container_name: goodgo-alertmanager
restart: unless-stopped
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--data.retention=120h'
environment:
SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL:-}
volumes:
- ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
healthcheck:
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9093/-/healthy']
interval: 15s
timeout: 5s
retries: 3
start_period: 10s
deploy:
resources:
limits:
memory: 256m
cpus: '0.25'
reservations:
memory: 64m
security_opt:
- no-new-privileges:true
logging:
driver: json-file
options:
max-size: '5m'
max-file: '3'
networks:
- goodgo-net
grafana:
image: grafana/grafana:10.4.1
container_name: goodgo-grafana
@@ -457,6 +497,8 @@ services:
condition: service_healthy
loki:
condition: service_healthy
alertmanager:
condition: service_healthy
healthcheck:
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3000/api/health']
interval: 15s