Expand production monitoring with full alert coverage for database connections, Redis memory/connections, container resources, disk usage, service health, and backup integrity. Add Alertmanager service with Slack routing for critical and warning alerts, and add automated backup verification to the pg-backup cron schedule. Update runbook with DR validation procedures and quarterly checklist. - Expand Prometheus alert rules from 4 to 24 alerts across 7 groups - Add Alertmanager container (prom/alertmanager:v0.27.0) with Slack routing - Configure inhibition rules (critical suppresses warning for same service) - Schedule automated backup verification at 04:00 UTC daily - Add Alertmanager datasource to Grafana provisioning - Update runbook with Section 9: DR Validation (automated + manual procedures) - Add SLACK_WEBHOOK_URL and Grafana vars to .env.example Co-Authored-By: Paperclip <noreply@paperclip.ing>
91 lines
3.7 KiB
YAML
91 lines
3.7 KiB
YAML
# GoodGo Platform — Alertmanager Configuration
# Routes alerts from Prometheus to notification channels.
#
# Environment variables (set in .env):
#   SLACK_WEBHOOK_URL   — Slack incoming webhook for alert notifications
#   ALERTMANAGER_SMTP_* — SMTP settings for email alerts (optional; no
#                         receiver in this file uses email)
|
|
|
|
# Defaults inherited by every receiver unless a receiver overrides them.
global:
  # Default Slack incoming-webhook endpoint used by all slack_configs.
  # NOTE(review): Alertmanager does not expand ${VAR} placeholders on its own;
  # this presumably relies on the file being rendered (e.g. with envsubst)
  # before startup — confirm against the container entrypoint.
  slack_api_url: '${SLACK_WEBHOOK_URL}'
  # Time after which an alert that has stopped receiving updates is
  # declared resolved.
  resolve_timeout: 5m
|
|
|
|
# ── Notification Templates ─────────────────────────────────────────────────────
# Glob of custom notification template files to load. The value is quoted so
# the `*` is never read as a YAML alias.
templates:
  - '/etc/alertmanager/templates/*.tmpl'
|
|
|
|
# ── Inhibition Rules ──────────────────────────────────────────────────────────
# While a critical alert is firing, mute warning alerts that carry the same
# `service` label value.
# NOTE(review): Alertmanager treats a missing label and an empty label as
# equal, so a critical alert without `service` would also mute every warning
# alert without `service` — confirm that all alert rules set this label.
inhibit_rules:
  - source_matchers:
      - 'severity = critical'
    target_matchers:
      - 'severity = warning'
    equal:
      - service
|
|
|
|
# ── Routing Tree ──────────────────────────────────────────────────────────────
route:
  # Fallback receiver for alerts that match no child route; the grouping and
  # timing values below are the defaults child routes inherit.
  receiver: slack-sre
  group_by:
    - alertname
    - service
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

  # Child routes are evaluated top to bottom; the first match wins unless
  # that route sets `continue: true`.
  routes:
    # Critical alerts — immediate notification, shorter repeat.
    - receiver: slack-critical
      matchers:
        - 'severity = critical'
      group_wait: 10s
      repeat_interval: 1h
      continue: false

    # Backup alerts — route to the infrastructure channel. Because the
    # critical route above matches first and does not continue, critical
    # Backup* alerts are delivered to slack-critical, not here.
    - receiver: slack-infrastructure
      matchers:
        - 'alertname =~ "Backup.*"'
      group_wait: 1m
      repeat_interval: 6h
|
|
|
|
# ── Receivers ─────────────────────────────────────────────────────────────────
# Each receiver posts to Slack through the webhook set in global.slack_api_url.
# Message bodies use literal block scalars (`|-`) so every field is rendered on
# its own line; a folded scalar (`>-`) would join same-indent lines with spaces.
# When an optional annotation is absent, its line is rendered empty.
receivers:
  # Default receiver: any alert not claimed by a more specific route.
  - name: 'slack-sre'
    slack_configs:
      - channel: '#sre-oncall'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}🔥{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
        text: |-
          *Service:* {{ .CommonLabels.service }}
          *Severity:* {{ .CommonLabels.severity }}
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ if .Annotations.runbook_url }}*Runbook:* {{ .Annotations.runbook_url }}{{ end }}
          {{ if .Annotations.dashboard }}*Dashboard:* {{ .Annotations.dashboard }}{{ end }}
          {{ end }}

  # Critical alerts: same channel as the default receiver, louder title and a
  # fixed severity line.
  - name: 'slack-critical'
    slack_configs:
      - channel: '#sre-oncall'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}🚨 CRITICAL{{ else }}✅ RESOLVED{{ end }} {{ .CommonLabels.alertname }}'
        text: |-
          *Service:* {{ .CommonLabels.service }}
          *Severity:* CRITICAL — Immediate action required
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ if .Annotations.runbook_url }}*Runbook:* {{ .Annotations.runbook_url }}{{ end }}
          {{ end }}

  # Backup alerts: delivered to the infrastructure channel.
  - name: 'slack-infrastructure'
    slack_configs:
      - channel: '#infrastructure'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}⚠️{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
        text: |-
          *Service:* {{ .CommonLabels.service }}
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ end }}
|