Files
goodgo-platform/monitoring/alertmanager/alertmanager.yml
Ho Ngoc Hai 9409706c58 feat(monitoring): add comprehensive alerting rules, Alertmanager, and DR validation
Expand production monitoring with full alert coverage for database connections,
Redis memory/connections, container resources, disk usage, service health, and
backup integrity. Add Alertmanager service with Slack routing for critical and
warning alerts, and add automated backup verification to the pg-backup cron
schedule. Update runbook with DR validation procedures and quarterly checklist.

- Expand Prometheus alert rules from 4 to 24 alerts across 7 groups
- Add Alertmanager container (prom/alertmanager:v0.27.0) with Slack routing
- Configure inhibition rules (critical suppresses warning for same service)
- Schedule automated backup verification at 04:00 UTC daily
- Add Alertmanager datasource to Grafana provisioning
- Update runbook with Section 9: DR Validation (automated + manual procedures)
- Add SLACK_WEBHOOK_URL and Grafana vars to .env.example

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-11 20:15:36 +07:00

91 lines
3.7 KiB
YAML

# GoodGo Platform — Alertmanager Configuration
# Routes alerts from Prometheus to notification channels.
#
# Environment variables (set in .env):
# SLACK_WEBHOOK_URL — Slack incoming webhook for alert notifications
# ALERTMANAGER_SMTP_* — SMTP settings for email alerts (optional; no SMTP settings or email receiver are configured in this file as shown)
global:
  # Time after which an alert is treated as resolved when the sender has not
  # supplied an end time / stopped refreshing it.
  resolve_timeout: 5m
  # Default Slack webhook used by every slack_configs entry that does not set
  # its own api_url.
  # NOTE(review): Alertmanager does not expand environment variables inside its
  # configuration file. This placeholder must be replaced with the real webhook
  # before Alertmanager starts (e.g. by an envsubst/sed step in the container
  # entrypoint) — confirm the deployment does so, or switch to
  # `slack_api_url_file` with a mounted secret.
  slack_api_url: '${SLACK_WEBHOOK_URL}'
# ── Notification Templates ─────────────────────────────────────────────────────
# Glob of template files loaded by Alertmanager for use in notifications.
templates: ['/etc/alertmanager/templates/*.tmpl']
# ── Inhibition Rules ──────────────────────────────────────────────────────────
# Mute warning alerts while a critical alert is firing for the same service.
# "Same service" means the `service` label has an identical value on both
# alerts. Alertmanager treats a missing label like an empty one, so a critical
# alert WITHOUT a `service` label also mutes every warning that lacks one.
inhibit_rules:
- source_matchers:
  - 'severity = critical'
  target_matchers:
  - 'severity = warning'
  equal: ['service']
# ── Routing Tree ──────────────────────────────────────────────────────────────
# Top-level route: anything not claimed by a child route goes to 'slack-sre'.
# Child routes are evaluated in order and the first match wins unless that
# route sets `continue: true`.
route:
  receiver: 'slack-sre'
  # Alerts sharing these label values are batched into one notification.
  group_by: ['alertname', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
  # Critical alerts — immediate notification, shorter repeat.
  # Listed first with `continue: false`, so a critical Backup* alert is handled
  # here and never reaches the backup route below.
  - matchers:
    - 'severity = critical'
    receiver: 'slack-critical'
    group_wait: 10s
    repeat_interval: 1h
    continue: false
  # Backup alerts — route to infrastructure channel.
  # Matches any alertname that starts with "Backup".
  - matchers:
    - 'alertname =~ "Backup.*"'
    receiver: 'slack-infrastructure'
    group_wait: 1m
    repeat_interval: 6h
# ── Receivers ─────────────────────────────────────────────────────────────────
receivers:
# Default receiver: posts to #sre-oncall, including resolved notifications.
- name: 'slack-sre'
  slack_configs:
  - channel: '#sre-oncall'
    send_resolved: true
    title: '{{ if eq .Status "firing" }}🔥{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
    # Literal block scalar (|-) keeps one field per line in the Slack message;
    # a folded scalar (>-) would join these lines with spaces. The `{{-` trim
    # markers drop the newline left behind by template control lines, so
    # optional fields (runbook, dashboard) leave no blank lines when absent.
    text: |-
      *Service:* {{ .CommonLabels.service }}
      *Severity:* {{ .CommonLabels.severity }}
      {{- range .Alerts }}
      *Summary:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      {{- if .Annotations.runbook_url }}
      *Runbook:* {{ .Annotations.runbook_url }}
      {{- end }}
      {{- if .Annotations.dashboard }}
      *Dashboard:* {{ .Annotations.dashboard }}
      {{- end }}
      {{- end }}
# Receiver for critical alerts: posts to #sre-oncall with a CRITICAL banner,
# including resolved notifications.
- name: 'slack-critical'
  slack_configs:
  - channel: '#sre-oncall'
    send_resolved: true
    title: '{{ if eq .Status "firing" }}🚨 CRITICAL{{ else }}✅ RESOLVED{{ end }} {{ .CommonLabels.alertname }}'
    # Literal block scalar (|-) keeps one field per line in the Slack message;
    # a folded scalar (>-) would join these lines with spaces. The `{{-` trim
    # markers drop the newline left behind by template control lines, so a
    # missing runbook_url leaves no blank line.
    text: |-
      *Service:* {{ .CommonLabels.service }}
      *Severity:* CRITICAL — Immediate action required
      {{- range .Alerts }}
      *Summary:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      {{- if .Annotations.runbook_url }}
      *Runbook:* {{ .Annotations.runbook_url }}
      {{- end }}
      {{- end }}
# Receiver for infrastructure alerts: posts to #infrastructure, including
# resolved notifications.
- name: 'slack-infrastructure'
  slack_configs:
  - channel: '#infrastructure'
    send_resolved: true
    title: '{{ if eq .Status "firing" }}⚠️{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
    # Literal block scalar (|-) keeps one field per line in the Slack message;
    # a folded scalar (>-) would join these lines with spaces. The `{{-` trim
    # markers drop the newline left behind by the range/end control lines.
    text: |-
      *Service:* {{ .CommonLabels.service }}
      {{- range .Alerts }}
      *Summary:* {{ .Annotations.summary }}
      *Description:* {{ .Annotations.description }}
      {{- end }}