- Public QR menu: BFF proxy endpoints (no auth), PosDataService public methods - Revenue analytics + staff performance: Dapper queries, validators, BFF proxy - Playwright E2E tests: 8 spec files covering auth, admin, 5 POS verticals, reports - Observability: Grafana dashboard (HTTP metrics, infra, business), Prometheus alert rules - Fixes: validator frozen-date bug (Must vs LessThanOrEqualTo), PublicMenuController logging + CancellationToken Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
166 lines
6.5 KiB
YAML
166 lines
6.5 KiB
YAML
groups:
|
|
# =========================================================================
|
|
# GoodGo Platform - Prometheus Alert Rules
|
|
# =========================================================================
|
|
# EN: Critical alerts for service health, performance, and infrastructure.
|
|
# VI: Canh bao nghiem trong cho suc khoe dich vu, hieu nang, va ha tang.
|
|
# =========================================================================
|
|
|
|
- name: service_health
|
|
interval: 30s
|
|
rules:
|
|
# -------------------------------------------------------------------
# ServiceDown
#   Fires when a scrape target has reported `up == 0` continuously
#   for one minute. Critical; routed to the platform team.
# -------------------------------------------------------------------
- alert: ServiceDown
  expr: up == 0
  for: 1m
  labels:
    severity: critical
    team: platform
  annotations:
    summary: 'Service {{ $labels.job }} is DOWN'
    description: |
      Service {{ $labels.job }} (instance {{ $labels.instance }}) has been
      unreachable for more than 1 minute. Check container health and logs.
    runbook_url: 'https://docs.goodgo.vn/runbooks/service-down'
|
|
|
|
# -------------------------------------------------------------------
# HighErrorRate
#   Per-job share of HTTP responses with a 5xx status code, computed
#   from 5-minute request rates. Fires once the share has stayed
#   above 5% (0.05) for five minutes. Critical; routed to backend.
# -------------------------------------------------------------------
- alert: HighErrorRate
  # Numerator: 5xx request rate per job. Denominator: total request
  # rate per job. Both sides are aggregated by `job` so they match.
  expr: |
    (
      sum(rate(http_requests_received_total{code=~"5.."}[5m])) by (job)
      /
      sum(rate(http_requests_received_total[5m])) by (job)
    ) > 0.05
  for: 5m
  labels:
    severity: critical
    team: backend
  annotations:
    summary: 'High 5xx error rate on {{ $labels.job }}'
    description: |
      Service {{ $labels.job }} has a 5xx error rate of {{ $value | humanizePercentage }}
      over the last 5 minutes. Investigate application logs for exceptions.
    runbook_url: 'https://docs.goodgo.vn/runbooks/high-error-rate'
|
|
|
|
# -------------------------------------------------------------------
# HighLatencyP95
#   95th-percentile request duration per job, estimated from the
#   http_request_duration_seconds histogram over a 5-minute window.
#   Fires after the estimate has exceeded 2 seconds for five minutes.
#   Warning; routed to backend.
# -------------------------------------------------------------------
- alert: HighLatencyP95
  # `le` must be kept in the aggregation: histogram_quantile needs the
  # bucket boundaries to interpolate the quantile.
  expr: |
    histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)
    ) > 2
  for: 5m
  labels:
    severity: warning
    team: backend
  annotations:
    summary: 'High p95 latency on {{ $labels.job }}'
    description: |
      Service {{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}
      (threshold: 2s). Check for slow database queries or external calls.
    runbook_url: 'https://docs.goodgo.vn/runbooks/high-latency'
|
|
|
|
- name: infrastructure_health
|
|
interval: 60s
|
|
rules:
|
|
# -------------------------------------------------------------------
# DatabaseConnectionPoolExhausted
#   Ratio of busy Npgsql connections to the configured maximum pool
#   size. Fires after the ratio has stayed above 90% (0.9) for two
#   minutes. Critical; routed to the platform team.
# -------------------------------------------------------------------
- alert: DatabaseConnectionPoolExhausted
  expr: |
    (
      dotnet_npgsql_busy_connections
      /
      dotnet_npgsql_max_pool_size
    ) > 0.9
  for: 2m
  labels:
    severity: critical
    team: platform
  annotations:
    summary: 'PostgreSQL connection pool near exhaustion on {{ $labels.job }}'
    description: |
      Service {{ $labels.job }} is using {{ $value | humanizePercentage }} of its
      connection pool. Consider increasing MaxPoolSize or investigating connection leaks.
    runbook_url: 'https://docs.goodgo.vn/runbooks/db-pool-exhausted'
|
|
|
|
# -------------------------------------------------------------------
# DiskUsageHigh
#   Watches the root filesystem (mountpoint "/"). The expression is
#   the fraction of space still available, so "usage above 85%" is
#   expressed as "available below 15%" (0.15). Must hold for five
#   minutes. Warning; routed to devops.
# -------------------------------------------------------------------
- alert: DiskUsageHigh
  expr: |
    (
      node_filesystem_avail_bytes{mountpoint="/"}
      /
      node_filesystem_size_bytes{mountpoint="/"}
    ) < 0.15
  for: 5m
  labels:
    severity: warning
    team: devops
  annotations:
    summary: 'Disk usage above 85% on {{ $labels.instance }}'
    # $value here is the remaining (free) fraction, not the used one.
    description: |
      Node {{ $labels.instance }} has only {{ $value | humanizePercentage }}
      disk space remaining. Clean up old data or expand storage.
    runbook_url: 'https://docs.goodgo.vn/runbooks/disk-usage'
|
|
|
|
# -------------------------------------------------------------------
# MemoryUsageHigh
#   Node memory utilisation, derived as 1 - (MemAvailable / MemTotal).
#   Fires after utilisation has stayed above 80% (0.8) for five
#   minutes. Warning; routed to devops.
# -------------------------------------------------------------------
- alert: MemoryUsageHigh
  expr: |
    (
      1 - (
        node_memory_MemAvailable_bytes
        /
        node_memory_MemTotal_bytes
      )
    ) > 0.8
  for: 5m
  labels:
    severity: warning
    team: devops
  annotations:
    summary: 'Memory usage above 80% on {{ $labels.instance }}'
    description: |
      Node {{ $labels.instance }} memory usage is at {{ $value | humanizePercentage }}.
      Check for memory leaks or scale horizontally.
    runbook_url: 'https://docs.goodgo.vn/runbooks/memory-usage'
|
|
|
|
# -------------------------------------------------------------------
# RedisMemoryHigh
#   Used memory as a fraction of the configured memory limit. Fires
#   after the fraction has stayed above 80% (0.8) for five minutes.
#   Warning; routed to the platform team.
# -------------------------------------------------------------------
- alert: RedisMemoryHigh
  # The `> 0` filter removes instances whose limit is reported as 0
  # (Redis treats maxmemory 0 as "no limit"). Without the filter the
  # division yields +Inf for those instances, +Inf > 0.8 is true, and
  # the alert would fire permanently even though nothing is wrong.
  expr: |
    redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.8
  for: 5m
  labels:
    severity: warning
    team: platform
  annotations:
    summary: "Redis memory usage above 80%"
    description: |
      Redis is using {{ $value | humanizePercentage }} of max memory.
      Review cache eviction policies and key TTLs.
    # NOTE(review): unlike the other infrastructure alerts, this rule
    # has no runbook_url annotation - add one once a runbook exists.
|
|
|
|
# -------------------------------------------------------------------
# RabbitMQQueueBacklog
#   Fires when a queue has held more than 1000 messages for five
#   minutes. Warning; routed to backend.
# -------------------------------------------------------------------
- alert: RabbitMQQueueBacklog
  expr: rabbitmq_queue_messages > 1000
  for: 5m
  labels:
    severity: warning
    team: backend
  annotations:
    summary: 'RabbitMQ queue {{ $labels.queue }} has backlog'
    description: |
      Queue {{ $labels.queue }} has {{ $value }} messages pending.
      Check consumer health and throughput.
|