feat(monitoring): add comprehensive alerting rules, Alertmanager, and DR validation
Expand production monitoring with full alert coverage for database connections, Redis memory/connections, container resources, disk usage, service health, and backup integrity. Add Alertmanager service with Slack routing for critical and warning alerts, and add automated backup verification to the pg-backup cron schedule. Update runbook with DR validation procedures and quarterly checklist. - Expand Prometheus alert rules from 4 to 24 alerts across 7 groups - Add Alertmanager container (prom/alertmanager:v0.27.0) with Slack routing - Configure inhibition rules (critical suppresses warning for same service) - Schedule automated backup verification at 04:00 UTC daily - Add Alertmanager datasource to Grafana provisioning - Update runbook with Section 9: DR Validation (automated + manual procedures) - Add SLACK_WEBHOOK_URL and Grafana vars to .env.example Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -314,7 +314,8 @@ services:
|
||||
- -c
|
||||
- |
|
||||
apt-get update -qq && apt-get install -y -qq cron > /dev/null 2>&1
|
||||
echo "0 2 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER} PGDATABASE=${DB_NAME} PGPASSWORD=${DB_PASSWORD} BACKUP_DIR=/backups RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7} /scripts/pg-backup.sh >> /var/log/pg-backup.log 2>&1" | crontab -
|
||||
(echo "0 2 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER} PGDATABASE=${DB_NAME} PGPASSWORD=${DB_PASSWORD} BACKUP_DIR=/backups RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7} /scripts/pg-backup.sh >> /var/log/pg-backup.log 2>&1"
|
||||
echo "0 4 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER} PGDATABASE=${DB_NAME} PGPASSWORD=${DB_PASSWORD} BACKUP_DIR=/backups REPORT_FILE=/backups/verify-latest.json /scripts/pg-verify-backup.sh >> /var/log/pg-verify.log 2>&1") | crontab -
|
||||
/scripts/pg-backup.sh
|
||||
cron -f
|
||||
environment:
|
||||
@@ -410,7 +411,11 @@ services:
|
||||
- '--web.enable-lifecycle'
|
||||
volumes:
|
||||
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./monitoring/prometheus/alert-rules.yml:/etc/prometheus/alert-rules.yml:ro
|
||||
- prometheus_data:/prometheus
|
||||
depends_on:
|
||||
alertmanager:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9090/-/healthy']
|
||||
interval: 15s
|
||||
@@ -434,6 +439,41 @@ services:
|
||||
networks:
|
||||
- goodgo-net
|
||||
|
||||
alertmanager:
|
||||
image: prom/alertmanager:v0.27.0
|
||||
container_name: goodgo-alertmanager
|
||||
restart: unless-stopped
|
||||
command:
|
||||
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
- '--data.retention=120h'
|
||||
environment:
|
||||
SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL:-}
|
||||
volumes:
|
||||
- ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||
healthcheck:
|
||||
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9093/-/healthy']
|
||||
interval: 15s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 256m
|
||||
cpus: '0.25'
|
||||
reservations:
|
||||
memory: 64m
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
logging:
|
||||
driver: json-file
|
||||
options:
|
||||
max-size: '5m'
|
||||
max-file: '3'
|
||||
networks:
|
||||
- goodgo-net
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:10.4.1
|
||||
container_name: goodgo-grafana
|
||||
@@ -457,6 +497,8 @@ services:
|
||||
condition: service_healthy
|
||||
loki:
|
||||
condition: service_healthy
|
||||
alertmanager:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3000/api/health']
|
||||
interval: 15s
|
||||
|
||||
Reference in New Issue
Block a user