# ──────────────────────────────────────────────────────────────────────────────
# SLO recording + alerting rules for the top 5 GoodGo API endpoints.
# Source of truth for SLI/SLO definitions: docs/observability/slo.md
# Issue: GOO-119
#
# Endpoint label values (set by HttpMetricsInterceptor, NestJS route paths
# without the /api/v1 prefix):
#   - /auth/login
#   - /search
#   - /listings/:id
#   - /payments/callback/:provider
#   - /inquiries
#
# NOTE(review): the list above describes label values WITHOUT the /api/v1
# prefix, but every selector below matches route="/api/v1/...". If the live
# series really are unprefixed, the recording rules return no samples and no
# alert in this file can ever fire. Confirm against a live series and align
# either this comment or the selectors.
#
# Multi-window, multi-burn-rate alert pattern (Google SRE Workbook ch. 5):
#   fast page  : burn 14.4 over  1 h &  5 m
#   slow ticket: burn 6    over  6 h & 30 m
#   slow ticket: burn 3    over 24 h &  2 h
#   slow ticket: burn 1    over  3 d &  6 h
#
# Only the 14.4x and 6x tiers have alert rules below. The 2h/1d/3d availability
# ratios recorded for /auth/login correspond to the 3x and 1x tiers, and the 6h
# latency ratios correspond to a slow latency tier; no alert in this file
# consumes them yet.
# ──────────────────────────────────────────────────────────────────────────────

groups:
  # ─── Recording rules: error and slow-request ratios per endpoint, per window ───
  # Every ratio divides by `(... > 0)`, so a window with no traffic yields no
  # sample (instead of NaN from 0/0) and the dependent alerts stay silent.
  - name: goodgo_slo_recording
    interval: 30s
    rules:
      # ── /auth/login ──────────────────────────────────────────────────────
      # Availability: share of requests answered with a 5xx status code.
      - record: slo:request_errors:ratio_rate5m
        labels: { route: "/auth/login", slo: "auth_login_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login",status_code=~"5.."}[5m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login"}[5m])) > 0)

      - record: slo:request_errors:ratio_rate30m
        labels: { route: "/auth/login", slo: "auth_login_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login",status_code=~"5.."}[30m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login"}[30m])) > 0)

      - record: slo:request_errors:ratio_rate1h
        labels: { route: "/auth/login", slo: "auth_login_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login",status_code=~"5.."}[1h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login"}[1h])) > 0)

      - record: slo:request_errors:ratio_rate2h
        labels: { route: "/auth/login", slo: "auth_login_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login",status_code=~"5.."}[2h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login"}[2h])) > 0)

      - record: slo:request_errors:ratio_rate6h
        labels: { route: "/auth/login", slo: "auth_login_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login",status_code=~"5.."}[6h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login"}[6h])) > 0)

      - record: slo:request_errors:ratio_rate1d
        labels: { route: "/auth/login", slo: "auth_login_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login",status_code=~"5.."}[1d]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login"}[1d])) > 0)

      - record: slo:request_errors:ratio_rate3d
        labels: { route: "/auth/login", slo: "auth_login_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login",status_code=~"5.."}[3d]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/auth/login"}[3d])) > 0)

      # Latency: share of requests NOT finished within the le="0.4" bucket.
      - record: slo:latency_slow:ratio_rate5m
        labels: { route: "/auth/login", slo: "auth_login_latency", threshold_seconds: "0.4" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/auth/login",le="0.4"}[5m]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/auth/login"}[5m])) > 0)
          )

      - record: slo:latency_slow:ratio_rate1h
        labels: { route: "/auth/login", slo: "auth_login_latency", threshold_seconds: "0.4" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/auth/login",le="0.4"}[1h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/auth/login"}[1h])) > 0)
          )

      - record: slo:latency_slow:ratio_rate6h
        labels: { route: "/auth/login", slo: "auth_login_latency", threshold_seconds: "0.4" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/auth/login",le="0.4"}[6h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/auth/login"}[6h])) > 0)
          )

      # ── /search (listings discovery) ─────────────────────────────────────
      - record: slo:request_errors:ratio_rate5m
        labels: { route: "/search", slo: "search_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/search",status_code=~"5.."}[5m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/search"}[5m])) > 0)

      # Short window paired with the 6 h window by the slow-burn alert.
      - record: slo:request_errors:ratio_rate30m
        labels: { route: "/search", slo: "search_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/search",status_code=~"5.."}[30m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/search"}[30m])) > 0)

      - record: slo:request_errors:ratio_rate1h
        labels: { route: "/search", slo: "search_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/search",status_code=~"5.."}[1h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/search"}[1h])) > 0)

      - record: slo:request_errors:ratio_rate6h
        labels: { route: "/search", slo: "search_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/search",status_code=~"5.."}[6h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/search"}[6h])) > 0)

      - record: slo:latency_slow:ratio_rate5m
        labels: { route: "/search", slo: "search_latency", threshold_seconds: "0.8" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/search",le="0.8"}[5m]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/search"}[5m])) > 0)
          )

      - record: slo:latency_slow:ratio_rate1h
        labels: { route: "/search", slo: "search_latency", threshold_seconds: "0.8" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/search",le="0.8"}[1h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/search"}[1h])) > 0)
          )

      - record: slo:latency_slow:ratio_rate6h
        labels: { route: "/search", slo: "search_latency", threshold_seconds: "0.8" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/search",le="0.8"}[6h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/search"}[6h])) > 0)
          )

      # ── /listings/:id (detail page) ──────────────────────────────────────
      - record: slo:request_errors:ratio_rate5m
        labels: { route: "/listings/:id", slo: "listing_detail_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/listings/:id",status_code=~"5.."}[5m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/listings/:id"}[5m])) > 0)

      # Short window paired with the 6 h window by the slow-burn alert.
      - record: slo:request_errors:ratio_rate30m
        labels: { route: "/listings/:id", slo: "listing_detail_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/listings/:id",status_code=~"5.."}[30m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/listings/:id"}[30m])) > 0)

      - record: slo:request_errors:ratio_rate1h
        labels: { route: "/listings/:id", slo: "listing_detail_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/listings/:id",status_code=~"5.."}[1h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/listings/:id"}[1h])) > 0)

      - record: slo:request_errors:ratio_rate6h
        labels: { route: "/listings/:id", slo: "listing_detail_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/listings/:id",status_code=~"5.."}[6h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/listings/:id"}[6h])) > 0)

      - record: slo:latency_slow:ratio_rate5m
        labels: { route: "/listings/:id", slo: "listing_detail_latency", threshold_seconds: "0.5" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/listings/:id",le="0.5"}[5m]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/listings/:id"}[5m])) > 0)
          )

      - record: slo:latency_slow:ratio_rate1h
        labels: { route: "/listings/:id", slo: "listing_detail_latency", threshold_seconds: "0.5" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/listings/:id",le="0.5"}[1h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/listings/:id"}[1h])) > 0)
          )

      - record: slo:latency_slow:ratio_rate6h
        labels: { route: "/listings/:id", slo: "listing_detail_latency", threshold_seconds: "0.5" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/listings/:id",le="0.5"}[6h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/listings/:id"}[6h])) > 0)
          )

      # ── /payments/callback/:provider ─────────────────────────────────────
      # Payment callbacks: 4xx >=422 also counts as failure (provider validation).
      # The status_code regex therefore matches 5xx plus 422-499.
      - record: slo:request_errors:ratio_rate5m
        labels: { route: "/payments/callback/:provider", slo: "payment_callback_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/payments/callback/:provider",status_code=~"5..|4(2[2-9]|[3-9].)"}[5m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/payments/callback/:provider"}[5m])) > 0)

      # Short window paired with the 6 h window by the slow-burn alert.
      - record: slo:request_errors:ratio_rate30m
        labels: { route: "/payments/callback/:provider", slo: "payment_callback_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/payments/callback/:provider",status_code=~"5..|4(2[2-9]|[3-9].)"}[30m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/payments/callback/:provider"}[30m])) > 0)

      - record: slo:request_errors:ratio_rate1h
        labels: { route: "/payments/callback/:provider", slo: "payment_callback_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/payments/callback/:provider",status_code=~"5..|4(2[2-9]|[3-9].)"}[1h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/payments/callback/:provider"}[1h])) > 0)

      - record: slo:request_errors:ratio_rate6h
        labels: { route: "/payments/callback/:provider", slo: "payment_callback_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/payments/callback/:provider",status_code=~"5..|4(2[2-9]|[3-9].)"}[6h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/payments/callback/:provider"}[6h])) > 0)

      - record: slo:latency_slow:ratio_rate5m
        labels: { route: "/payments/callback/:provider", slo: "payment_callback_latency", threshold_seconds: "2.0" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/payments/callback/:provider",le="2"}[5m]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/payments/callback/:provider"}[5m])) > 0)
          )

      - record: slo:latency_slow:ratio_rate1h
        labels: { route: "/payments/callback/:provider", slo: "payment_callback_latency", threshold_seconds: "2.0" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/payments/callback/:provider",le="2"}[1h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/payments/callback/:provider"}[1h])) > 0)
          )

      - record: slo:latency_slow:ratio_rate6h
        labels: { route: "/payments/callback/:provider", slo: "payment_callback_latency", threshold_seconds: "2.0" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/payments/callback/:provider",le="2"}[6h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/payments/callback/:provider"}[6h])) > 0)
          )

      # ── /inquiries (lead capture) ────────────────────────────────────────
      - record: slo:request_errors:ratio_rate5m
        labels: { route: "/inquiries", slo: "inquiries_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/inquiries",status_code=~"5.."}[5m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/inquiries"}[5m])) > 0)

      # Short window paired with the 6 h window by the slow-burn alert.
      - record: slo:request_errors:ratio_rate30m
        labels: { route: "/inquiries", slo: "inquiries_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/inquiries",status_code=~"5.."}[30m]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/inquiries"}[30m])) > 0)

      - record: slo:request_errors:ratio_rate1h
        labels: { route: "/inquiries", slo: "inquiries_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/inquiries",status_code=~"5.."}[1h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/inquiries"}[1h])) > 0)

      - record: slo:request_errors:ratio_rate6h
        labels: { route: "/inquiries", slo: "inquiries_availability" }
        expr: |
          sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/inquiries",status_code=~"5.."}[6h]))
          /
          (sum(rate(http_requests_total{job="goodgo-api",route="/api/v1/inquiries"}[6h])) > 0)

      - record: slo:latency_slow:ratio_rate5m
        labels: { route: "/inquiries", slo: "inquiries_latency", threshold_seconds: "0.6" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/inquiries",le="0.6"}[5m]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/inquiries"}[5m])) > 0)
          )

      - record: slo:latency_slow:ratio_rate1h
        labels: { route: "/inquiries", slo: "inquiries_latency", threshold_seconds: "0.6" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/inquiries",le="0.6"}[1h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/inquiries"}[1h])) > 0)
          )

      - record: slo:latency_slow:ratio_rate6h
        labels: { route: "/inquiries", slo: "inquiries_latency", threshold_seconds: "0.6" }
        expr: |
          1 - (
            sum(rate(goodgo_api_request_duration_seconds_bucket{job="goodgo-api",route="/api/v1/inquiries",le="0.6"}[6h]))
            /
            (sum(rate(goodgo_api_request_duration_seconds_count{job="goodgo-api",route="/api/v1/inquiries"}[6h])) > 0)
          )

  # ─── Burn-rate alerts ──────────────────────────────────────────────────────
  # Each pair fires only when BOTH the long and short window are simultaneously
  # above the burn-rate threshold; this kills false positives from short blips.
  # Threshold = burn rate × error budget, where error budget = 1 - SLO target
  # (e.g. 99.9 % availability -> 0.001).
  # Fast alerts pair 1 h with 5 m; slow alerts pair 6 h with 30 m.
  - name: goodgo_slo_burn_rate
    rules:
      # ────────────── /auth/login (availability target 99.9 %) ────────────
      - alert: SLOBurnFastAuthLoginAvailability
        expr: |
          slo:request_errors:ratio_rate1h{slo="auth_login_availability"} > (14.4 * 0.001)
          and
          slo:request_errors:ratio_rate5m{slo="auth_login_availability"} > (14.4 * 0.001)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: auth_login_availability
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: /auth/login availability eating 2% budget per hour"
          # 30 d / 14.4 = 2.08 d, i.e. roughly 50 h to exhaust the budget.
          description: >
            POST /auth/login is burning the availability error budget at 14.4×
            the sustainable rate. At this rate the 30-day budget is consumed in
            about 2 days (roughly 50 hours).
            Investigate auth service, JWT signing, and dependency health.
          runbook_url: "https://docs.goodgo.vn/runbooks/slo-auth-login"

      - alert: SLOBurnSlowAuthLoginAvailability
        expr: |
          slo:request_errors:ratio_rate6h{slo="auth_login_availability"} > (6 * 0.001)
          and
          slo:request_errors:ratio_rate30m{slo="auth_login_availability"} > (6 * 0.001)
        for: 15m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
          slo: auth_login_availability
          burn_rate: "6"
        annotations:
          summary: "SLOW burn: /auth/login availability"
          description: >
            POST /auth/login has been burning availability budget at 6× the
            sustainable rate over the last 6 h. Open a reliability ticket.

      # NOTE(review): the summary says "p95" but the threshold uses a 1 % budget
      # (0.01, i.e. a 99 % target). Confirm the target in
      # docs/observability/slo.md and align one of the two.
      - alert: SLOBurnFastAuthLoginLatency
        expr: |
          slo:latency_slow:ratio_rate1h{slo="auth_login_latency"} > (14.4 * 0.01)
          and
          slo:latency_slow:ratio_rate5m{slo="auth_login_latency"} > (14.4 * 0.01)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: auth_login_latency
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: /auth/login p95 latency budget"
          description: >
            POST /auth/login is serving more than expected slow requests
            (>400 ms) at 14.4× the sustainable burn. Check DB latency, JWT
            signing CPU, and bcrypt cost factor.

      # ────────────── /search (availability 99.5%, latency 95%) ───────────
      - alert: SLOBurnFastSearchAvailability
        expr: |
          slo:request_errors:ratio_rate1h{slo="search_availability"} > (14.4 * 0.005)
          and
          slo:request_errors:ratio_rate5m{slo="search_availability"} > (14.4 * 0.005)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: search_availability
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: /search availability"
          description: >
            GET /search 5xx rate is burning the 99.5% availability budget at
            14.4×. Likely Typesense, Postgres, or PostGIS regression.

      - alert: SLOBurnSlowSearchAvailability
        expr: |
          slo:request_errors:ratio_rate6h{slo="search_availability"} > (6 * 0.005)
          and
          slo:request_errors:ratio_rate30m{slo="search_availability"} > (6 * 0.005)
        for: 15m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
          slo: search_availability
          burn_rate: "6"
        annotations:
          summary: "SLOW burn: /search availability over 6 h"
          description: GET /search has been burning availability at >=6× for 6 h.

      - alert: SLOBurnFastSearchLatency
        expr: |
          slo:latency_slow:ratio_rate1h{slo="search_latency"} > (14.4 * 0.05)
          and
          slo:latency_slow:ratio_rate5m{slo="search_latency"} > (14.4 * 0.05)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: search_latency
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: /search p95 latency"
          description: >
            GET /search latency budget burning at 14.4×. Check Typesense and
            PostGIS query plans.

      # ────────────── /listings/:id (99.9% / 99% under 500 ms) ────────────
      - alert: SLOBurnFastListingDetailAvailability
        expr: |
          slo:request_errors:ratio_rate1h{slo="listing_detail_availability"} > (14.4 * 0.001)
          and
          slo:request_errors:ratio_rate5m{slo="listing_detail_availability"} > (14.4 * 0.001)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: listing_detail_availability
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: /listings/:id availability"
          description: GET /listings/:id 5xx rate is burning availability budget at 14.4×.

      - alert: SLOBurnSlowListingDetailAvailability
        expr: |
          slo:request_errors:ratio_rate6h{slo="listing_detail_availability"} > (6 * 0.001)
          and
          slo:request_errors:ratio_rate30m{slo="listing_detail_availability"} > (6 * 0.001)
        for: 15m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
          slo: listing_detail_availability
          burn_rate: "6"
        annotations:
          summary: "SLOW burn: /listings/:id availability"
          description: GET /listings/:id availability burn at >=6× for 6 h.

      - alert: SLOBurnFastListingDetailLatency
        expr: |
          slo:latency_slow:ratio_rate1h{slo="listing_detail_latency"} > (14.4 * 0.01)
          and
          slo:latency_slow:ratio_rate5m{slo="listing_detail_latency"} > (14.4 * 0.01)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: listing_detail_latency
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: /listings/:id latency"
          description: GET /listings/:id slow-request rate burning at 14.4×.

      # ────────────── /payments/callback/:provider (99.95% / 99% under 2s) ─
      - alert: SLOBurnFastPaymentCallbackAvailability
        expr: |
          slo:request_errors:ratio_rate1h{slo="payment_callback_availability"} > (14.4 * 0.0005)
          and
          slo:request_errors:ratio_rate5m{slo="payment_callback_availability"} > (14.4 * 0.0005)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: payment_callback_availability
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: payment callback availability"
          description: >
            POST /payments/callback/:provider is failing (5xx or signature
            rejection) at 14.4× the sustainable burn. Revenue at risk — page
            payments on-call immediately.
          runbook_url: "https://docs.goodgo.vn/runbooks/slo-payment-callback"

      - alert: SLOBurnSlowPaymentCallbackAvailability
        expr: |
          slo:request_errors:ratio_rate6h{slo="payment_callback_availability"} > (6 * 0.0005)
          and
          slo:request_errors:ratio_rate30m{slo="payment_callback_availability"} > (6 * 0.0005)
        for: 15m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
          slo: payment_callback_availability
          burn_rate: "6"
        annotations:
          summary: "SLOW burn: payment callback availability"
          description: >
            POST /payments/callback/:provider has been burning availability
            budget at >=6× over the last 6 h.

      - alert: SLOBurnFastPaymentCallbackLatency
        expr: |
          slo:latency_slow:ratio_rate1h{slo="payment_callback_latency"} > (14.4 * 0.01)
          and
          slo:latency_slow:ratio_rate5m{slo="payment_callback_latency"} > (14.4 * 0.01)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: payment_callback_latency
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: payment callback p99 latency"
          description: >
            POST /payments/callback/:provider slow-request rate (>2 s) is
            burning the latency budget at 14.4×.

      # ────────────── /inquiries (99.9% / 99% under 600 ms) ───────────────
      - alert: SLOBurnFastInquiriesAvailability
        expr: |
          slo:request_errors:ratio_rate1h{slo="inquiries_availability"} > (14.4 * 0.001)
          and
          slo:request_errors:ratio_rate5m{slo="inquiries_availability"} > (14.4 * 0.001)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: inquiries_availability
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: /inquiries availability"
          description: POST /inquiries 5xx rate burning at 14.4×.

      - alert: SLOBurnSlowInquiriesAvailability
        expr: |
          slo:request_errors:ratio_rate6h{slo="inquiries_availability"} > (6 * 0.001)
          and
          slo:request_errors:ratio_rate30m{slo="inquiries_availability"} > (6 * 0.001)
        for: 15m
        labels:
          severity: warning
          team: sre
          service: goodgo-api
          slo: inquiries_availability
          burn_rate: "6"
        annotations:
          summary: "SLOW burn: /inquiries availability"
          description: POST /inquiries availability burn at >=6× for 6 h.

      - alert: SLOBurnFastInquiriesLatency
        expr: |
          slo:latency_slow:ratio_rate1h{slo="inquiries_latency"} > (14.4 * 0.01)
          and
          slo:latency_slow:ratio_rate5m{slo="inquiries_latency"} > (14.4 * 0.01)
        for: 2m
        labels:
          severity: critical
          team: sre
          service: goodgo-api
          slo: inquiries_latency
          burn_rate: "14.4"
        annotations:
          summary: "FAST burn: /inquiries latency"
          description: POST /inquiries slow-request rate (>600 ms) burning at 14.4×.