Finishes the half-implemented MFA enforcement work and ships the SLO
monitoring rules at the same time.
MFA grace period (auth):
- New `mfa-policy.ts` central source of truth: `MFA_REQUIRED_ROLES = [ADMIN]`,
`MFA_GRACE_PERIOD_DAYS = 14`, `MFA_REAUTH_WINDOW_MINUTES = 15`.
- New columns `User.mfaGraceStartedAt` + `User.mfaLastVerifiedAt`
(migration `20260429000000_add_mfa_grace_columns`).
- `JwtPayload.mfa: 'none' | 'grace' | 'enrollment_required'` claim is now
carried in every access token so the frontend and admin guards can react.
- `LoginUserHandler.resolveMfaGraceClaim()`:
* If role requires MFA and user has not enrolled, lazy-stamp
`mfaGraceStartedAt` on first login (returns `mfa: 'grace'`,
`remainingDays: 14`).
* After the grace window expires → `mfa: 'enrollment_required'`, `remainingDays: 0`
(callers must force enrollment on sensitive routes).
* Otherwise → `mfa: 'none'`.
- `LocalStrategy` now passes `totpEnabled` + `mfaGraceStartedAt` through
to the command so the handler can branch without an extra query.
- `IUserRepository` + `PrismaUserRepository` get
`updateMfaGraceStartedAt` / `updateMfaLastVerifiedAt`.
- `UserEntity` carries the two new fields end-to-end (props, getters,
`createNew` + `createPasswordless` factories). Fixed an orphan-property
syntax bug in `createPasswordless` that was breaking typecheck.
- `oauth.service.ts` `UserEntity` construction now includes `deletedAt`
+ the two MFA fields (was missing required props).
- Add missing `jsonwebtoken` + `@types/jsonwebtoken` to `apps/api`
(imported by `jwt-rotation.ts` since commit 3705193 but never declared as a
dependency, so `tsc --noEmit` was failing).
- Update `login-user.handler.spec.ts` + `local.strategy.spec.ts` to cover the
grace-window + enrollment-required branches. 338/338 auth tests pass.
Ops monitoring:
- New `monitoring/prometheus/slo-rules.yml` with recording + alerting
rules for the agreed SLOs.
- Wire it into `prometheus.yml` + alertmanager routing.
- Capture the SLO soak-test results in
`docs/audits/slo-soak-test-log.md`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
151 lines
10 KiB
YAML
151 lines
10 KiB
YAML
groups:
# Availability SLI recording rules for job "goodgo-api".
#
# For every lookback window W in {5m, 30m, 1h, 6h, 1d, 3d} three series are
# recorded per (route, method):
#   slo:http_requests:rate<W>  per-second rate of all requests
#   slo:http_errors:rate<W>    per-second rate of requests whose status_code is 5xx
#   slo:error_ratio:rate<W>    errors / requests
#
# The ratio only has samples for (route, method) pairs that are present on
# both sides of the division.
- name: slo:availability:recording
  interval: 30s
  rules:
  # ---- 5m window ----
  - record: slo:http_requests:rate5m
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api"}[5m]))
  - record: slo:http_errors:rate5m
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m]))
  - record: slo:error_ratio:rate5m
    expr: slo:http_errors:rate5m / slo:http_requests:rate5m

  # ---- 30m window ----
  - record: slo:http_requests:rate30m
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api"}[30m]))
  - record: slo:http_errors:rate30m
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[30m]))
  - record: slo:error_ratio:rate30m
    expr: slo:http_errors:rate30m / slo:http_requests:rate30m

  # ---- 1h window ----
  - record: slo:http_requests:rate1h
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api"}[1h]))
  - record: slo:http_errors:rate1h
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[1h]))
  - record: slo:error_ratio:rate1h
    expr: slo:http_errors:rate1h / slo:http_requests:rate1h

  # ---- 6h window ----
  - record: slo:http_requests:rate6h
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api"}[6h]))
  - record: slo:http_errors:rate6h
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[6h]))
  - record: slo:error_ratio:rate6h
    expr: slo:http_errors:rate6h / slo:http_requests:rate6h

  # ---- 1d window ----
  - record: slo:http_requests:rate1d
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api"}[1d]))
  - record: slo:http_errors:rate1d
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[1d]))
  - record: slo:error_ratio:rate1d
    expr: slo:http_errors:rate1d / slo:http_requests:rate1d

  # ---- 3d window ----
  - record: slo:http_requests:rate3d
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api"}[3d]))
  - record: slo:http_errors:rate3d
    expr: sum by (route, method) (rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[3d]))
  - record: slo:error_ratio:rate3d
    expr: slo:http_errors:rate3d / slo:http_requests:rate3d
# Latency SLI recording rules for job "goodgo-api", built from the
# goodgo_api_request_duration_seconds histogram.
#
# For every lookback window W in {5m, 1h, 6h} three series are recorded per
# (route, method):
#   slo:latency_good:rate<W>        per-second rate of requests that landed in
#                                   the route's threshold bucket
#   slo:latency_total:rate<W>       per-second rate of all requests on the
#                                   five SLO routes
#   slo:latency_good_ratio:rate<W>  good / total
#
# Per-route bucket bound (`le`, seconds):
#   /api/listings         0.5
#   /api/listings/:id     0.25
#   /api/payments/create  1
#   /api/auth/login       0.5
#   /api/search           0.5
#
# The payments bound is matched with le=~"1|1\\.0" rather than le="1":
# Prometheus 3.x normalizes classic-histogram `le` values to float form on
# ingestion (le="1" is stored as le="1.0"), so an exact match on "1" would
# select nothing there and the payments good-ratio would silently disappear.
# The regex matches both spellings. "0.5" and "0.25" are already in
# normalized form.
- name: slo:latency:recording
  interval: 30s
  rules:
  # ---- 5m window ----
  - record: slo:latency_good:rate5m
    expr: |
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/listings", le="0.5"}[5m]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/listings/:id", le="0.25"}[5m]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/payments/create", le=~"1|1\\.0"}[5m]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/auth/login", le="0.5"}[5m]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/search", le="0.5"}[5m]))
  - record: slo:latency_total:rate5m
    expr: sum by (route, method) (rate(goodgo_api_request_duration_seconds_count{job="goodgo-api", route=~"/api/listings|/api/listings/:id|/api/payments/create|/api/auth/login|/api/search"}[5m]))
  - record: slo:latency_good_ratio:rate5m
    expr: slo:latency_good:rate5m / slo:latency_total:rate5m

  # ---- 1h window ----
  - record: slo:latency_good:rate1h
    expr: |
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/listings", le="0.5"}[1h]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/listings/:id", le="0.25"}[1h]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/payments/create", le=~"1|1\\.0"}[1h]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/auth/login", le="0.5"}[1h]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/search", le="0.5"}[1h]))
  - record: slo:latency_total:rate1h
    expr: sum by (route, method) (rate(goodgo_api_request_duration_seconds_count{job="goodgo-api", route=~"/api/listings|/api/listings/:id|/api/payments/create|/api/auth/login|/api/search"}[1h]))
  - record: slo:latency_good_ratio:rate1h
    expr: slo:latency_good:rate1h / slo:latency_total:rate1h

  # ---- 6h window ----
  - record: slo:latency_good:rate6h
    expr: |
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/listings", le="0.5"}[6h]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/listings/:id", le="0.25"}[6h]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/payments/create", le=~"1|1\\.0"}[6h]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/auth/login", le="0.5"}[6h]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/search", le="0.5"}[6h]))
  - record: slo:latency_total:rate6h
    expr: sum by (route, method) (rate(goodgo_api_request_duration_seconds_count{job="goodgo-api", route=~"/api/listings|/api/listings/:id|/api/payments/create|/api/auth/login|/api/search"}[6h]))
  - record: slo:latency_good_ratio:rate6h
    expr: slo:latency_good:rate6h / slo:latency_total:rate6h
# Availability burn-rate alerts (multiwindow: a long and a short window must
# both exceed the threshold, and the route must carry more than 1 req/s over
# the long window).
#
# Thresholds are burn-rate x error budget:
#   slo_target 99.9  (budget 0.001):  fast 14.4x -> 0.0144, slow 6x -> 0.006
#   slo_target 99.95 (budget 0.0005): fast 14.4x -> 0.0072, slow 6x -> 0.003
#
# `and` keeps the left-hand sample, so {{ $value }} in the annotations is the
# long-window error ratio.
- name: slo:availability:burn_rate_alerts
  rules:
  # Fast burn, general routes: 1h long window + 5m short window.
  - alert: SloAvailFastBurn
    expr: |
      (
        slo:error_ratio:rate1h{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} > 0.0144
        and
        slo:error_ratio:rate5m{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} > 0.0144
      )
      and
      slo:http_requests:rate1h{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} > 1
    for: 2m
    labels:
      severity: critical
      team: sre
      slo_type: availability
      burn_window: fast
      slo_target: "99.9"
      environment: staging
    annotations:
      summary: "SLO FAST BURN: {{ $labels.method }} {{ $labels.route }} availability (14.4x)"
      description: "Error ratio {{ $value | printf \"%.4f\" }} exceeds 14.4x burn threshold 0.0144."

  # Fast burn, payments: same windows, tighter threshold.
  - alert: SloAvailFastBurnPayments
    expr: |
      (
        slo:error_ratio:rate1h{route="/api/payments/create"} > 0.0072
        and
        slo:error_ratio:rate5m{route="/api/payments/create"} > 0.0072
      )
      and
      slo:http_requests:rate1h{route="/api/payments/create"} > 1
    for: 2m
    labels:
      severity: critical
      team: sre
      slo_type: availability
      burn_window: fast
      slo_target: "99.95"
      environment: staging
    annotations:
      summary: "SLO FAST BURN: payments availability (14.4x)"
      description: "Payments error ratio {{ $value | printf \"%.4f\" }} exceeds threshold 0.0072."

  # Slow burn, general routes: 6h long window + 30m short window.
  - alert: SloAvailSlowBurn
    expr: |
      (
        slo:error_ratio:rate6h{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} > 0.006
        and
        slo:error_ratio:rate30m{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} > 0.006
      )
      and
      slo:http_requests:rate6h{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} > 1
    for: 5m
    labels:
      severity: warning
      team: sre
      slo_type: availability
      burn_window: slow
      slo_target: "99.9"
      environment: staging
    annotations:
      summary: "SLO SLOW BURN: {{ $labels.method }} {{ $labels.route }} availability (6x)"
      description: "6h error ratio {{ $value | printf \"%.4f\" }} exceeds 6x threshold 0.006."

  # Slow burn, payments: same windows, tighter threshold.
  - alert: SloAvailSlowBurnPayments
    expr: |
      (
        slo:error_ratio:rate6h{route="/api/payments/create"} > 0.003
        and
        slo:error_ratio:rate30m{route="/api/payments/create"} > 0.003
      )
      and
      slo:http_requests:rate6h{route="/api/payments/create"} > 1
    for: 5m
    labels:
      severity: warning
      team: sre
      slo_type: availability
      burn_window: slow
      slo_target: "99.95"
      environment: staging
    annotations:
      summary: "SLO SLOW BURN: payments availability (6x)"
      description: "Payments 6h error ratio {{ $value | printf \"%.4f\" }} exceeds threshold 0.003."
# 30m latency SLI, recorded here so the slow-burn latency alerts below have a
# 30m short window to pair with their 6h long window (the same 6h/30m pairing
# the availability slow-burn alerts use). Same per-route bucket bounds as the
# other latency windows: 0.5 for /api/listings, /api/auth/login and
# /api/search, 0.25 for /api/listings/:id, 1 for /api/payments/create.
# The payments bound is matched as le=~"1|1\\.0" because Prometheus 3.x stores
# classic-histogram le="1" as le="1.0".
- name: slo:latency:recording_30m
  interval: 30s
  rules:
  - record: slo:latency_good:rate30m
    expr: |
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/listings", le="0.5"}[30m]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/listings/:id", le="0.25"}[30m]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/payments/create", le=~"1|1\\.0"}[30m]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/auth/login", le="0.5"}[30m]))
      or
      sum by (route, method) (rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api", route="/api/search", le="0.5"}[30m]))
  - record: slo:latency_total:rate30m
    expr: sum by (route, method) (rate(goodgo_api_request_duration_seconds_count{job="goodgo-api", route=~"/api/listings|/api/listings/:id|/api/payments/create|/api/auth/login|/api/search"}[30m]))
  - record: slo:latency_good_ratio:rate30m
    expr: slo:latency_good:rate30m / slo:latency_total:rate30m

# Latency burn-rate alerts (multiwindow: the good-request ratio must be below
# the threshold on both a long and a short window, and the route must carry
# more than 1 req/s over the long window).
#
# Thresholds are 1 - burn-rate x error budget:
#   general routes (budget 0.001):  fast 14.4x -> 0.9856, slow 6x -> 0.994
#   payments       (budget 0.0005): fast 14.4x -> 0.9928, slow 6x -> 0.997
#
# `and` keeps the left-hand sample, so {{ $value }} in the annotations is the
# long-window good ratio.
- name: slo:latency:burn_rate_alerts
  rules:
  # Fast burn, general routes: 1h long window + 5m short window.
  - alert: SloLatencyFastBurn
    expr: |
      (
        slo:latency_good_ratio:rate1h{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} < 0.9856
        and
        slo:latency_good_ratio:rate5m{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} < 0.9856
      )
      and
      slo:latency_total:rate1h{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} > 1
    for: 2m
    labels:
      severity: critical
      team: sre
      slo_type: latency
      burn_window: fast
      environment: staging
    annotations:
      summary: "SLO LATENCY FAST BURN: {{ $labels.method }} {{ $labels.route }} (14.4x)"
      description: "Good ratio {{ $value | printf \"%.4f\" }} below 0.9856 threshold."

  # Fast burn, payments: same windows, tighter threshold.
  - alert: SloLatencyFastBurnPayments
    expr: |
      (
        slo:latency_good_ratio:rate1h{route="/api/payments/create"} < 0.9928
        and
        slo:latency_good_ratio:rate5m{route="/api/payments/create"} < 0.9928
      )
      and
      slo:latency_total:rate1h{route="/api/payments/create"} > 1
    for: 2m
    labels:
      severity: critical
      team: sre
      slo_type: latency
      burn_window: fast
      environment: staging
    annotations:
      summary: "SLO LATENCY FAST BURN: payments (14.4x)"
      description: "Payments good ratio {{ $value | printf \"%.4f\" }} below 0.9928."

  # Slow burn, general routes: 6h long window + 30m short window.
  # The short window was previously rate5m, which made this alert clear (and
  # restart its `for` timer) on any 5-minute lull, unlike the availability
  # slow-burn alerts that use 30m.
  - alert: SloLatencySlowBurn
    expr: |
      (
        slo:latency_good_ratio:rate6h{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} < 0.994
        and
        slo:latency_good_ratio:rate30m{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} < 0.994
      )
      and
      slo:latency_total:rate6h{route=~"/api/listings|/api/listings/:id|/api/auth/login|/api/search"} > 1
    for: 5m
    labels:
      severity: warning
      team: sre
      slo_type: latency
      burn_window: slow
      environment: staging
    annotations:
      summary: "SLO latency slow burn: {{ $labels.method }} {{ $labels.route }} (6x)"
      description: "6h good ratio {{ $value | printf \"%.4f\" }} below 0.994."

  # Slow burn, payments: fills the gap next to SloLatencyFastBurnPayments so
  # payments latency has the same fast + slow coverage as payments
  # availability. Threshold 0.997 = 1 - 6 x 0.0005.
  - alert: SloLatencySlowBurnPayments
    expr: |
      (
        slo:latency_good_ratio:rate6h{route="/api/payments/create"} < 0.997
        and
        slo:latency_good_ratio:rate30m{route="/api/payments/create"} < 0.997
      )
      and
      slo:latency_total:rate6h{route="/api/payments/create"} > 1
    for: 5m
    labels:
      severity: warning
      team: sre
      slo_type: latency
      burn_window: slow
      environment: staging
    annotations:
      summary: "SLO LATENCY SLOW BURN: payments (6x)"
      description: "Payments 6h good ratio {{ $value | printf \"%.4f\" }} below 0.997."