Finishes the half-implemented MFA enforcement work and ships the SLO
monitoring rules at the same time.
MFA grace period (auth):
- New `mfa-policy.ts` central source of truth: `MFA_REQUIRED_ROLES = [ADMIN]`,
`MFA_GRACE_PERIOD_DAYS = 14`, `MFA_REAUTH_WINDOW_MINUTES = 15`.
- New columns `User.mfaGraceStartedAt` + `User.mfaLastVerifiedAt`
(migration `20260429000000_add_mfa_grace_columns`).
- `JwtPayload.mfa: 'none' | 'grace' | 'enrollment_required'` claim now
carried in every access token so the FE + admin guards can react.
- `LoginUserHandler.resolveMfaGraceClaim()`:
* If role requires MFA and user has not enrolled, lazy-stamp
`mfaGraceStartedAt` on first login (returns `mfa: 'grace'`,
`remainingDays: 14`).
* After the grace window expires → `mfa: 'enrollment_required'`, `remainingDays: 0`
(callers must force enrollment on sensitive routes).
* Otherwise → `mfa: 'none'`.
- `LocalStrategy` now passes `totpEnabled` + `mfaGraceStartedAt` through
to the command so the handler can branch without an extra query.
- `IUserRepository` + `PrismaUserRepository` get
`updateMfaGraceStartedAt` / `updateMfaLastVerifiedAt`.
- `UserEntity` carries the two new fields end-to-end (props, getters,
`createNew` + `createPasswordless` factories). Fixed an orphan-property
syntax bug in `createPasswordless` that was breaking typecheck.
- `oauth.service.ts` `UserEntity` construction now includes `deletedAt`
+ the two MFA fields (was missing required props).
- Add missing `jsonwebtoken` + `@types/jsonwebtoken` to `apps/api`
(transitively pulled in via `jwt-rotation.ts` from commit 3705193 but
never declared, so `tsc --noEmit` was failing).
- Update `login-user.handler.spec.ts` + `local.strategy.spec.ts` to cover
grace-window + enrollment-required branches. 338/338 auth tests pass.
Ops monitoring:
- New `monitoring/prometheus/slo-rules.yml` with recording + alerting
rules for the agreed SLOs.
- Wire it into `prometheus.yml` + alertmanager routing.
- Capture the SLO soak-test results in
`docs/audits/slo-soak-test-log.md`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
113 lines
4.5 KiB
YAML
113 lines
4.5 KiB
YAML
# GoodGo Platform — Alertmanager Configuration
# Routes alerts from Prometheus to notification channels.
#
# Environment variables (set in .env):
#   SLACK_WEBHOOK_URL — Slack incoming webhook for alert notifications
#   ALERTMANAGER_SMTP_* — SMTP settings for email alerts (optional)

# Defaults shared by every receiver that does not override them.
global:
  resolve_timeout: 5m
  # NOTE(review): Alertmanager does not expand environment variables in its
  # config file on its own, so this placeholder presumably gets substituted by
  # a pre-processing step (e.g. envsubst) before startup — TODO confirm.
  slack_api_url: "${SLACK_WEBHOOK_URL}"

# ── Notification Templates ─────────────────────────────────────────────────────
# Glob of template files made available to the receivers' notifications.
templates:
  - "/etc/alertmanager/templates/*.tmpl"

# ── Inhibition Rules ──────────────────────────────────────────────────────────
# Suppress warning alerts when a critical alert is already firing for the same
# service in the same environment.
#
# `environment` is listed in `equal` so that a critical alert from one
# environment (for example the staging SLO soak alerts this config routes)
# cannot mute warnings for the same service in a different environment.
# Alerts that carry no `environment` label at all still match each other,
# because Alertmanager treats a missing label and an empty label as equal.
# NOTE(review): if critical alerts carry `environment` but the matching
# warnings do not (or vice versa), they will no longer inhibit — confirm the
# label is applied consistently by the alerting rules.
inhibit_rules:
  - source_matchers:
      - severity = critical
    target_matchers:
      - severity = warning
    equal: ['service', 'environment']

# ── Routing Tree ──────────────────────────────────────────────────────────────
# Alerts that match none of the child routes are delivered to the root
# receiver. Child routes are tried top to bottom and, with `continue` at
# false, the first match ends the search — so the order below is significant.
route:
  receiver: slack-sre
  group_by:
    - alertname
    - service
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

  routes:
    # Staging SLO soak — burn-rate alerts go to a dedicated Slack channel
    # only, no pager.
    # NOTE(review): the soak receiver's template reads `.CommonLabels.method`,
    # but `method` is not a group_by label here, so it renders empty whenever
    # one group mixes methods — confirm that is acceptable.
    - receiver: slack-sre-staging-soak
      matchers:
        - 'environment = staging'
        - 'slo_type =~ "availability|latency"'
      group_by:
        - alertname
        - route
        - burn_window
      group_wait: 15s
      group_interval: 5m
      repeat_interval: 30m
      continue: false

    # Critical alerts — notified quickly and repeated more often. Listed
    # before the backup route, so a critical Backup* alert is handled here.
    - receiver: slack-critical
      matchers:
        - 'severity = critical'
      group_wait: 10s
      repeat_interval: 1h
      continue: false

    # Backup alerts — sent to the infrastructure channel.
    - receiver: slack-infrastructure
      matchers:
        - 'alertname =~ "Backup.*"'
      group_wait: 1m
      repeat_interval: 6h

# ── Receivers ─────────────────────────────────────────────────────────────────
# None of the receivers sets its own `api_url`; they all rely on the webhook
# configured in `global.slack_api_url`.
#
# Message bodies use literal block scalars (`|-`) so that each template line
# becomes its own line in Slack. A folded scalar (`>-`) joins consecutive
# lines with spaces, which collapses the whole message into a single line.
receivers:
  # Root-route receiver: anything not claimed by a child route.
  - name: 'slack-sre'
    slack_configs:
      - channel: '#sre-oncall'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}🔥{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
        text: |-
          *Service:* {{ .CommonLabels.service }}
          *Severity:* {{ .CommonLabels.severity }}
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ if .Annotations.runbook_url }}*Runbook:* {{ .Annotations.runbook_url }}{{ end }}
          {{ if .Annotations.dashboard }}*Dashboard:* {{ .Annotations.dashboard }}{{ end }}
          {{ end }}

  # Target of the `severity = critical` route; same channel, louder title.
  - name: 'slack-critical'
    slack_configs:
      - channel: '#sre-oncall'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}🚨 CRITICAL{{ else }}✅ RESOLVED{{ end }} {{ .CommonLabels.alertname }}'
        text: |-
          *Service:* {{ .CommonLabels.service }}
          *Severity:* CRITICAL — Immediate action required
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ if .Annotations.runbook_url }}*Runbook:* {{ .Annotations.runbook_url }}{{ end }}
          {{ end }}

  # Target of the staging SLO soak route. `.CommonLabels.*` values are empty
  # when the alerts in a group do not all share that label.
  - name: 'slack-sre-staging-soak'
    slack_configs:
      - channel: '#sre-staging-soak'
        send_resolved: true
        title: 'SOAK {{ .CommonLabels.alertname }}'
        text: |-
          Route: {{ .CommonLabels.method }} {{ .CommonLabels.route }}
          Burn: {{ .CommonLabels.burn_window }} | {{ .CommonLabels.severity }}
          {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}
          Staging soak — NOT paging.

  # Target of the `alertname =~ "Backup.*"` route.
  - name: 'slack-infrastructure'
    slack_configs:
      - channel: '#infrastructure'
        send_resolved: true
        title: '{{ if eq .Status "firing" }}⚠️{{ else }}✅{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'
        text: |-
          *Service:* {{ .CommonLabels.service }}
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ end }}