feat(monitoring): add comprehensive alerting rules, Alertmanager, and DR validation
Expand production monitoring with full alert coverage for database connections, Redis memory/connections, container resources, disk usage, service health, and backup integrity. Add Alertmanager service with Slack routing for critical and warning alerts, and add automated backup verification to the pg-backup cron schedule. Update runbook with DR validation procedures and quarterly checklist.

- Expand Prometheus alert rules from 4 to 24 alerts across 7 groups
- Add Alertmanager container (prom/alertmanager:v0.27.0) with Slack routing
- Configure inhibition rules (critical suppresses warning for same service)
- Schedule automated backup verification at 04:00 UTC daily
- Add Alertmanager datasource to Grafana provisioning
- Update runbook with Section 9: DR Validation (automated + manual procedures)
- Add SLACK_WEBHOOK_URL and Grafana vars to .env.example

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
90
monitoring/alertmanager/alertmanager.yml
Normal file
90
monitoring/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,90 @@
|
||||
# GoodGo Platform — Alertmanager Configuration
|
||||
# Routes alerts from Prometheus to notification channels.
|
||||
#
|
||||
# Environment variables (set in .env):
|
||||
# SLACK_WEBHOOK_URL — Slack incoming webhook for alert notifications
|
||||
# ALERTMANAGER_SMTP_* — SMTP settings for email alerts (optional)
|
||||
|
||||
# Defaults shared by every receiver below.
global:
  # Webhook used by any slack_configs entry that does not set its own api_url.
  # NOTE(review): Alertmanager does not expand ${VAR} references in its
  # configuration file, so this placeholder presumably has to be substituted
  # before startup (e.g. by an entrypoint running envsubst) — confirm against
  # the compose/service definition. `slack_api_url_file` is the built-in
  # alternative for supplying the secret.
  slack_api_url: '${SLACK_WEBHOOK_URL}'
  # How long an alert may go without updates before it is declared resolved.
  resolve_timeout: 5m
|
||||
|
||||
# ── Notification Templates ─────────────────────────────────────────────────────
|
||||
# Glob of custom notification template files to load.
templates:
- '/etc/alertmanager/templates/*.tmpl'
|
||||
|
||||
# ── Inhibition Rules ──────────────────────────────────────────────────────────
|
||||
# Suppress warning alerts when a critical alert is already firing for the same service
|
||||
inhibit_rules:
# Mute warning alerts while a critical alert with the same `service` label
# value is firing.
- source_matchers:
  - 'severity = critical'
  # Alertmanager treats a missing label and an empty label as equal, so
  # without this guard a critical alert that has no `service` label would
  # silence every warning alert that also lacks one.
  - 'service != ""'
  target_matchers:
  - 'severity = warning'
  equal:
  - 'service'
|
||||
|
||||
# ── Routing Tree ──────────────────────────────────────────────────────────────
|
||||
route:
  # Fallback receiver for every alert that no child route below claims.
  receiver: 'slack-sre'
  # One notification group per (alertname, service) pair.
  group_by:
  - 'alertname'
  - 'service'
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

  # Child routes are evaluated top to bottom.
  routes:
  # Critical alerts: shorter initial wait and shorter repeat than the default.
  # `continue: false` stops evaluation here, so a critical alert whose name
  # matches the backup route below is still delivered to slack-critical only.
  - matchers:
    - 'severity = critical'
    receiver: 'slack-critical'
    group_wait: 10s
    repeat_interval: 1h
    continue: false

  # Non-critical alerts whose name matches Backup.* go to the
  # infrastructure channel.
  - matchers:
    - 'alertname =~ "Backup.*"'
    receiver: 'slack-infrastructure'
    group_wait: 1m
    repeat_interval: 6h
|
||||
|
||||
# ── Receivers ─────────────────────────────────────────────────────────────────
|
||||
receivers:
# Default receiver (referenced by the top-level route): posts to #sre-oncall
# for both firing and resolved notifications.
- name: 'slack-sre'
  slack_configs:
  - channel: '#sre-oncall'
    send_resolved: true
    title: '{{ if eq .Status "firing" }}🔥{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
    # Literal block (|-) keeps one field per line in the Slack message; a
    # folded block (>-) would join all of these lines with spaces.
    # The `{{-` markers strip the newline in front of each template action so
    # control lines and absent optional fields leave no blank lines behind.
    text: |-
      *Service:* {{ .CommonLabels.service }}
      *Severity:* {{ .CommonLabels.severity }}
      {{- range .Alerts }}
      *Summary:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      {{- if .Annotations.runbook_url }}
      *Runbook:* {{ .Annotations.runbook_url }}
      {{- end }}
      {{- if .Annotations.dashboard }}
      *Dashboard:* {{ .Annotations.dashboard }}
      {{- end }}
      {{- end }}
|
||||
|
||||
# Receiver for severity=critical alerts: posts to #sre-oncall with a fixed
# CRITICAL banner for both firing and resolved notifications.
- name: 'slack-critical'
  slack_configs:
  - channel: '#sre-oncall'
    send_resolved: true
    title: '{{ if eq .Status "firing" }}🚨 CRITICAL{{ else }}✅ RESOLVED{{ end }} {{ .CommonLabels.alertname }}'
    # Literal block (|-) keeps one field per line in the Slack message; a
    # folded block (>-) would join all of these lines with spaces.
    # The `{{-` markers strip the newline in front of each template action so
    # control lines and an absent runbook link leave no blank lines behind.
    text: |-
      *Service:* {{ .CommonLabels.service }}
      *Severity:* CRITICAL — Immediate action required
      {{- range .Alerts }}
      *Summary:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      {{- if .Annotations.runbook_url }}
      *Runbook:* {{ .Annotations.runbook_url }}
      {{- end }}
      {{- end }}
|
||||
|
||||
# Receiver for backup-related alerts: posts to #infrastructure for both
# firing and resolved notifications.
- name: 'slack-infrastructure'
  slack_configs:
  - channel: '#infrastructure'
    send_resolved: true
    title: '{{ if eq .Status "firing" }}⚠️{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
    # Literal block (|-) keeps one field per line in the Slack message; a
    # folded block (>-) would join all of these lines with spaces.
    # The `{{-` markers strip the newline in front of each template action so
    # the range/end control lines leave no blank lines behind.
    text: |-
      *Service:* {{ .CommonLabels.service }}
      {{- range .Alerts }}
      *Summary:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      {{- end }}
|
||||
Reference in New Issue
Block a user