# pos-system/infra/observability/prometheus/alert-rules.yml

groups:
  # =========================================================================
  # GoodGo Platform - Prometheus Alert Rules
  # =========================================================================
  # EN: Critical alerts for service health, performance, and infrastructure.
  # VI: Canh bao nghiem trong cho suc khoe dich vu, hieu nang, va ha tang.
  # =========================================================================

  # Rule group covering scrape availability, HTTP 5xx ratio, and p95 latency.
  - name: service_health
    # Evaluate every rule in this group once per 30 seconds; a group-level
    # interval takes precedence over the global evaluation interval.
    interval: 30s
    rules:
      # -------------------------------------------------------------------
      # ServiceDown: a scrape target has reported `up == 0` for one minute.
      # -------------------------------------------------------------------
      # `up` is the per-target series Prometheus records on every scrape
      # (1 = scrape succeeded, 0 = scrape failed). A target that is no longer
      # part of the scrape configuration has no `up` series at all, so it is
      # not covered by this rule.
      - alert: ServiceDown
        expr: up == 0
        # The condition must hold continuously for 1 minute before firing.
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: 'Service {{ $labels.job }} is DOWN'
          description: |
            Service {{ $labels.job }} (instance {{ $labels.instance }}) has been
            unreachable for more than 1 minute. Check container health and logs.
          runbook_url: 'https://docs.goodgo.vn/runbooks/service-down'

      # -------------------------------------------------------------------
      # HighErrorRate: more than 5% of a job's requests answered with 5xx.
      # -------------------------------------------------------------------
      # Numerator: per-job request rate restricted to status codes matching
      # `5..`; denominator: per-job rate of all requests. Both use a 5-minute
      # rate window. A job with no matching 5xx series yields no sample from
      # the division and therefore cannot trigger the alert.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_received_total{code=~"5.."}[5m])) by (job)
            /
            sum(rate(http_requests_received_total[5m])) by (job)
          ) > 0.05
        # The ratio must stay above the threshold for 5 minutes before firing.
        for: 5m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: 'High 5xx error rate on {{ $labels.job }}'
          # $value is the error ratio (0..1); humanizePercentage renders it as %.
          description: |
            Service {{ $labels.job }} has a 5xx error rate of {{ $value | humanizePercentage }}
            over the last 5 minutes. Investigate application logs for exceptions.
          runbook_url: 'https://docs.goodgo.vn/runbooks/high-error-rate'

      # -------------------------------------------------------------------
      # HighLatencyP95: per-job 95th-percentile request duration above 2s.
      # -------------------------------------------------------------------
      # histogram_quantile estimates the 0.95 quantile from the 5-minute
      # bucket rates, summed per (le, job) so that all instances of a job
      # contribute to a single per-job estimate.
      - alert: HighLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)
          ) > 2
        # The estimate must stay above 2 for 5 minutes before firing.
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: 'High p95 latency on {{ $labels.job }}'
          # $value is the quantile estimate; the metric name indicates seconds,
          # which is the unit humanizeDuration expects.
          description: |
            Service {{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}
            (threshold: 2s). Check for slow database queries or external calls.
          runbook_url: 'https://docs.goodgo.vn/runbooks/high-latency'

  # Rule group covering database pool, host disk/memory, Redis, and RabbitMQ.
  - name: infrastructure_health
    # Evaluate every rule in this group once per 60 seconds; a group-level
    # interval takes precedence over the global evaluation interval.
    interval: 60s
    rules:
      # -------------------------------------------------------------------
      # DatabaseConnectionPoolExhausted: busy connections exceed 90% of the
      # pool's maximum size.
      # -------------------------------------------------------------------
      # The ratio is computed per series: busy connections divided by the
      # maximum pool size carrying the same label set.
      # NOTE(review): both metrics are presumably emitted by the services'
      # Npgsql instrumentation with identical labels - confirm, since series
      # whose labels differ would silently drop out of the division.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            dotnet_npgsql_busy_connections
            /
            dotnet_npgsql_max_pool_size
          ) > 0.9
        # The ratio must stay above 0.9 for 2 minutes before firing.
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: 'PostgreSQL connection pool near exhaustion on {{ $labels.job }}'
          # $value is the utilisation ratio (0..1), rendered as a percentage.
          description: |
            Service {{ $labels.job }} is using {{ $value | humanizePercentage }} of its
            connection pool. Consider increasing MaxPoolSize or investigating connection leaks.
          runbook_url: 'https://docs.goodgo.vn/runbooks/db-pool-exhausted'

      # -------------------------------------------------------------------
      # DiskUsageHigh: less than 15% of the root filesystem is available
      # (i.e. usage above 85%).
      # -------------------------------------------------------------------
      # Only series with mountpoint="/" are inspected; other mount points are
      # not covered by this rule. The expression yields the *available*
      # fraction, so the alert condition is "below 0.15" rather than
      # "above 0.85".
      - alert: DiskUsageHigh
        expr: |
          (
            node_filesystem_avail_bytes{mountpoint="/"}
            /
            node_filesystem_size_bytes{mountpoint="/"}
          ) < 0.15
        # The available fraction must stay below 0.15 for 5 minutes.
        for: 5m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Disk usage above 85% on {{ $labels.instance }}'
          # $value is the free-space ratio, hence "remaining" in the text below.
          description: |
            Node {{ $labels.instance }} has only {{ $value | humanizePercentage }}
            disk space remaining. Clean up old data or expand storage.
          runbook_url: 'https://docs.goodgo.vn/runbooks/disk-usage'

      # -------------------------------------------------------------------
      # MemoryUsageHigh: node memory utilisation above 80%.
      # -------------------------------------------------------------------
      # Utilisation is derived as 1 - (MemAvailable / MemTotal), so $value is
      # the used fraction in the range 0..1.
      - alert: MemoryUsageHigh
        expr: |
          (
            1 - (
              node_memory_MemAvailable_bytes
              /
              node_memory_MemTotal_bytes
            )
          ) > 0.8
        # The used fraction must stay above 0.8 for 5 minutes before firing.
        for: 5m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Memory usage above 80% on {{ $labels.instance }}'
          description: |
            Node {{ $labels.instance }} memory usage is at {{ $value | humanizePercentage }}.
            Check for memory leaks or scale horizontally.
          runbook_url: 'https://docs.goodgo.vn/runbooks/memory-usage'

      # -------------------------------------------------------------------
      # RedisMemoryHigh: Redis uses more than 80% of its configured maxmemory.
      # -------------------------------------------------------------------
      # redis_memory_max_bytes is 0 when Redis runs without a maxmemory limit.
      # Dividing by 0 yields +Inf, which satisfies "> 0.8" and would keep the
      # alert firing permanently. The denominator is therefore filtered to
      # series with a real (> 0) limit; instances without a limit produce no
      # sample and are not evaluated by this rule.
      # NOTE(review): unlike most rules in this file, no runbook_url annotation
      # is set here - add one once a runbook exists.
      - alert: RedisMemoryHigh
        expr: |
          (
            redis_memory_used_bytes
            /
            (redis_memory_max_bytes > 0)
          ) > 0.8
        # The ratio must stay above 0.8 for 5 minutes before firing.
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Redis memory usage above 80%"
          # $value is the used/max ratio (0..1), rendered as a percentage.
          description: |
            Redis is using {{ $value | humanizePercentage }} of max memory.
            Review cache eviction policies and key TTLs.

      # -------------------------------------------------------------------
      # RabbitMQQueueBacklog: a queue holds more than 1000 messages.
      # -------------------------------------------------------------------
      # Every rabbitmq_queue_messages series is compared against the fixed
      # threshold of 1000.
      # NOTE(review): the annotations rely on a `queue` label being present
      # on the metric - confirm the exporter emits per-queue series.
      - alert: RabbitMQQueueBacklog
        expr: rabbitmq_queue_messages > 1000
        # The backlog must persist for 5 minutes before firing.
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: 'RabbitMQ queue {{ $labels.queue }} has backlog'
          # $value is the raw message count reported by the metric.
          description: |
            Queue {{ $labels.queue }} has {{ $value }} messages pending.
            Check consumer health and throughput.