Files
pos-system/infra/observability/prometheus/alert-rules.yml
Ho Ngoc Hai dc1ea7c0d2 feat: Phase 2 W7-8 production readiness — QR menu, analytics, E2E tests, observability
- Public QR menu: BFF proxy endpoints (no auth), PosDataService public methods
- Revenue analytics + staff performance: Dapper queries, validators, BFF proxy
- Playwright E2E tests: 8 spec files covering auth, admin, 5 POS verticals, reports
- Observability: Grafana dashboard (HTTP metrics, infra, business), Prometheus alert rules
- Fixes: validator frozen-date bug (Must vs LessThanOrEqualTo), PublicMenuController logging + CancellationToken

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-06 19:51:37 +07:00

166 lines
6.5 KiB
YAML

groups:
# =========================================================================
# GoodGo Platform - Prometheus Alert Rules
# =========================================================================
# EN: Critical alerts for service health, performance, and infrastructure.
# VI: Canh bao nghiem trong cho suc khoe dich vu, hieu nang, va ha tang.
# =========================================================================
- name: service_health
  # Rules in this group are evaluated every 30 seconds.
  interval: 30s
  rules:
  # -------------------------------------------------------------------
  # ServiceDown — scrape target reports up == 0 for more than 1 minute
  # -------------------------------------------------------------------
  - alert: ServiceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Service {{ $labels.job }} is DOWN"
      description: |
        Service {{ $labels.job }} (instance {{ $labels.instance }}) has been
        unreachable for more than 1 minute. Check container health and logs.
      runbook_url: "https://docs.goodgo.vn/runbooks/service-down"
  # -------------------------------------------------------------------
  # HighErrorRate — more than 5% of requests return 5xx for 5 minutes
  # -------------------------------------------------------------------
  # Ratio of the 5xx request rate to the total request rate, per job.
  # A job with no traffic yields 0/0 = NaN, which never satisfies "> 0.05".
  - alert: HighErrorRate
    expr: |
      (
        sum(rate(http_requests_received_total{code=~"5.."}[5m])) by (job)
        /
        sum(rate(http_requests_received_total[5m])) by (job)
      ) > 0.05
    for: 5m
    labels:
      severity: critical
      team: backend
    annotations:
      summary: "High 5xx error rate on {{ $labels.job }}"
      description: |
        Service {{ $labels.job }} has a 5xx error rate of {{ $value | humanizePercentage }}
        over the last 5 minutes. Investigate application logs for exceptions.
      runbook_url: "https://docs.goodgo.vn/runbooks/high-error-rate"
  # -------------------------------------------------------------------
  # HighLatencyP95 — p95 response time above 2s for 5 minutes
  # -------------------------------------------------------------------
  # Buckets are summed by (le, job) so the quantile is computed per job
  # across all of its instances.
  - alert: HighLatencyP95
    expr: |
      histogram_quantile(0.95,
        sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)
      ) > 2
    for: 5m
    labels:
      severity: warning
      team: backend
    annotations:
      summary: "High p95 latency on {{ $labels.job }}"
      description: |
        Service {{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}
        (threshold: 2s). Check for slow database queries or external calls.
      runbook_url: "https://docs.goodgo.vn/runbooks/high-latency"
- name: infrastructure_health
  # Rules in this group are evaluated every 60 seconds.
  interval: 60s
  rules:
  # -------------------------------------------------------------------
  # DatabaseConnectionPoolExhausted — pool utilization above 90%
  # -------------------------------------------------------------------
  # NOTE(review): confirm both metric names match what the Npgsql
  # exporter in use actually emits, and that they share a label set;
  # otherwise the division returns no series and the alert never fires.
  - alert: DatabaseConnectionPoolExhausted
    expr: |
      (
        dotnet_npgsql_busy_connections
        /
        dotnet_npgsql_max_pool_size
      ) > 0.9
    for: 2m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "PostgreSQL connection pool near exhaustion on {{ $labels.job }}"
      description: |
        Service {{ $labels.job }} is using {{ $value | humanizePercentage }} of its
        connection pool. Consider increasing MaxPoolSize or investigating connection leaks.
      runbook_url: "https://docs.goodgo.vn/runbooks/db-pool-exhausted"
  # -------------------------------------------------------------------
  # DiskUsageHigh — root filesystem more than 85% used
  # -------------------------------------------------------------------
  # Expressed as "less than 15% available", so $value is the fraction
  # of space still free (which is what the description reports).
  - alert: DiskUsageHigh
    expr: |
      (
        node_filesystem_avail_bytes{mountpoint="/"}
        /
        node_filesystem_size_bytes{mountpoint="/"}
      ) < 0.15
    for: 5m
    labels:
      severity: warning
      team: devops
    annotations:
      summary: "Disk usage above 85% on {{ $labels.instance }}"
      description: |
        Node {{ $labels.instance }} has only {{ $value | humanizePercentage }}
        disk space remaining. Clean up old data or expand storage.
      runbook_url: "https://docs.goodgo.vn/runbooks/disk-usage"
  # -------------------------------------------------------------------
  # MemoryUsageHigh — node memory more than 80% used
  # -------------------------------------------------------------------
  # Usage is derived as 1 - (available / total).
  - alert: MemoryUsageHigh
    expr: |
      (
        1 - (
          node_memory_MemAvailable_bytes
          /
          node_memory_MemTotal_bytes
        )
      ) > 0.8
    for: 5m
    labels:
      severity: warning
      team: devops
    annotations:
      summary: "Memory usage above 80% on {{ $labels.instance }}"
      description: |
        Node {{ $labels.instance }} memory usage is at {{ $value | humanizePercentage }}.
        Check for memory leaks or scale horizontally.
      runbook_url: "https://docs.goodgo.vn/runbooks/memory-usage"
  # -------------------------------------------------------------------
  # RedisMemoryHigh — Redis using more than 80% of its memory limit
  # -------------------------------------------------------------------
  # Guard: if redis_memory_max_bytes is 0 (e.g. no maxmemory limit is
  # configured) the division yields +Inf, which satisfies "> 0.8" and
  # would keep this alert firing permanently. The "and" clause drops
  # those series; $value is still the ratio from the left-hand side.
  - alert: RedisMemoryHigh
    expr: |
      (redis_memory_used_bytes / redis_memory_max_bytes > 0.8)
      and
      (redis_memory_max_bytes > 0)
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Redis memory usage above 80%"
      description: |
        Redis is using {{ $value | humanizePercentage }} of max memory.
        Review cache eviction policies and key TTLs.
  # -------------------------------------------------------------------
  # RabbitMQQueueBacklog — more than 1000 messages in a queue for 5 minutes
  # -------------------------------------------------------------------
  - alert: RabbitMQQueueBacklog
    expr: rabbitmq_queue_messages > 1000
    for: 5m
    labels:
      severity: warning
      team: backend
    annotations:
      summary: "RabbitMQ queue {{ $labels.queue }} has backlog"
      description: |
        Queue {{ $labels.queue }} has {{ $value }} messages pending.
        Check consumer health and throughput.