From 9409706c5875ab5166a51d29684de033dc580767 Mon Sep 17 00:00:00 2001 From: Ho Ngoc Hai Date: Sat, 11 Apr 2026 20:15:36 +0700 Subject: [PATCH] feat(monitoring): add comprehensive alerting rules, Alertmanager, and DR validation Expand production monitoring with full alert coverage for database connections, Redis memory/connections, container resources, disk usage, service health, and backup integrity. Add Alertmanager service with Slack routing for critical and warning alerts, and add automated backup verification to the pg-backup cron schedule. Update runbook with DR validation procedures and quarterly checklist. - Expand Prometheus alert rules from 4 to 24 alerts across 7 groups - Add Alertmanager container (prom/alertmanager:v0.27.0) with Slack routing - Configure inhibition rules (critical suppresses warning for same service) - Schedule automated backup verification at 04:00 UTC daily - Add Alertmanager datasource to Grafana provisioning - Update runbook with Section 9: DR Validation (automated + manual procedures) - Add SLACK_WEBHOOK_URL and Grafana vars to .env.example Co-Authored-By: Paperclip --- .env.example | 11 + EXPLORATION_SUMMARY.md | 419 ++++++++++++++++++ docker-compose.prod.yml | 44 +- docs/RUNBOOK.md | 210 ++++++++- monitoring/alertmanager/alertmanager.yml | 90 ++++ .../provisioning/datasources/datasource.yml | 9 + monitoring/prometheus/alert-rules.yml | 322 ++++++++++++++ monitoring/prometheus/prometheus.yml | 5 + 8 files changed, 1108 insertions(+), 2 deletions(-) create mode 100644 EXPLORATION_SUMMARY.md create mode 100644 monitoring/alertmanager/alertmanager.yml diff --git a/.env.example b/.env.example index dd12629..7985748 100644 --- a/.env.example +++ b/.env.example @@ -164,3 +164,14 @@ KYC_ENCRYPTION_KEY_VERSION=1 # Logging # ----------------------------------------------------------------------------- LOG_LEVEL=info + +# ----------------------------------------------------------------------------- +# Monitoring & Alerting +# 
----------------------------------------------------------------------------- +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=CHANGE_ME +GRAFANA_PORT=3002 +GRAFANA_ROOT_URL=http://localhost:3002 + +# Slack webhook for alert notifications (Alertmanager + CI/CD) +SLACK_WEBHOOK_URL=https://hooks.slack.com/services/CHANGE_ME diff --git a/EXPLORATION_SUMMARY.md b/EXPLORATION_SUMMARY.md new file mode 100644 index 0000000..530cda4 --- /dev/null +++ b/EXPLORATION_SUMMARY.md @@ -0,0 +1,419 @@ +# GoodGo Platform - Codebase Exploration Summary + +## πŸ“‹ Overview + +This exploration provides a comprehensive analysis of the GoodGo Platform codebase to establish architectural patterns and best practices for building new Inquiry & Lead Management UI pages. + +**Two detailed documents have been created:** +1. **`codebase_exploration.md`** - Full technical deep-dive with code samples +2. **`CODEBASE_QUICK_REFERENCE.md`** - Quick reference templates and checklists + +--- + +## 🎯 Key Findings + +### Architecture Overview +- **Frontend**: Next.js 15+ with App Router, TypeScript, Tailwind CSS +- **Backend**: NestJS with CQRS pattern, modular architecture +- **Communication**: REST API with JWT + CSRF protection +- **State Management**: Zustand + React Query +- **UI Components**: Radix UI-inspired compound components with Tailwind styling +- **i18n**: next-intl with Vietnamese (vi) and English (en) +- **Database**: Prisma ORM + +### Authentication Flow +- **Cookies**: httpOnly JWT cookies (user management via `useAuthStore`) +- **CSRF**: Token-based via `XSRF-TOKEN` cookie +- **Authorization**: Role-based access (AGENT, ADMIN, USER roles) +- **Protected Routes**: `/dashboard` routes protected by JwtAuthGuard + +--- + +## πŸ“ Directory Structure (Key Paths) + +``` +apps/web/ +β”œβ”€β”€ app/[locale]/ +β”‚ └── (dashboard)/ ← Place new pages here +β”‚ β”œβ”€β”€ inquiries/ ← New: /inquiries, /inquiries/[id] +β”‚ └── leads/ ← New: /leads, /leads/[id] +β”œβ”€β”€ components/ +β”‚ β”œβ”€β”€ 
ui/ ← Reusable base components +β”‚ β”œβ”€β”€ inquiries/ ← New: domain components +β”‚ └── leads/ ← New: domain components +β”œβ”€β”€ lib/ +β”‚ β”œβ”€β”€ api-client.ts ← Base fetch wrapper +β”‚ β”œβ”€β”€ inquiries-api.ts ← New: API service +β”‚ β”œβ”€β”€ leads-api.ts ← New: API service +β”‚ β”œβ”€β”€ hooks/ +β”‚ β”‚ β”œβ”€β”€ use-inquiries.ts ← New: React Query hooks +β”‚ β”‚ └── use-leads.ts ← New: React Query hooks +β”‚ └── validations/ ← Zod schemas +└── messages/ + β”œβ”€β”€ vi.json ← Add inquiries/leads translations + └── en.json ← Add inquiries/leads translations + +apps/api/src/modules/ +β”œβ”€β”€ inquiries/ +β”‚ β”œβ”€β”€ presentation/controllers/inquiries.controller.ts βœ… EXISTS +β”‚ β”œβ”€β”€ presentation/dto/ βœ… EXISTS +β”‚ └── domain/repositories/ βœ… EXISTS +└── leads/ + β”œβ”€β”€ presentation/controllers/leads.controller.ts βœ… EXISTS + β”œβ”€β”€ presentation/dto/ βœ… EXISTS + └── domain/repositories/ βœ… EXISTS +``` + +--- + +## πŸ”Œ Backend API Endpoints (Ready to Use) + +### Inquiries Module +``` +POST /api/v1/inquiries +GET /api/v1/inquiries/listing/{listingId} +GET /api/v1/inquiries/agent/me +PATCH /api/v1/inquiries/{id}/read +``` + +**Response Types:** +- `InquiryReadDto` - Single inquiry data +- `PaginatedResult` - List with pagination + +### Leads Module +``` +POST /api/v1/leads +GET /api/v1/leads +GET /api/v1/leads/stats +PATCH /api/v1/leads/{id}/status +DELETE /api/v1/leads/{id} +``` + +**Response Types:** +- `LeadReadDto` - Single lead data +- `PaginatedResult` - List with pagination +- `LeadStatsData` - Statistics + +--- + +## πŸ—οΈ Patterns to Follow + +### 1. Page Structure (Follow listings page pattern) +```typescript +'use client'; + +// Components + Hooks + Store +import { useTranslations } from 'next-intl'; +import { useQuery } from '@tanstack/react-query'; +import { useState } from 'react'; + +// Layout: Header > Stats > Filters > Content +// Features: Stats cards, filter dropdowns, table/grid view, pagination +``` + +### 2. 
API Service (Use apiClient) +```typescript +// apps/web/lib/inquiries-api.ts +import { apiClient } from './api-client'; + +export const inquiriesApi = { + list: (params) => apiClient.get('/inquiries', params), + getById: (id) => apiClient.get(`/inquiries/${id}`), + markAsRead: (id) => apiClient.patch(`/inquiries/${id}/read`, {}), +}; +``` + +### 3. React Query Hooks (Use key factory) +```typescript +// apps/web/lib/hooks/use-inquiries.ts +export const inquiriesKeys = { + all: ['inquiries'] as const, + list: (params) => ['inquiries', 'list', params] as const, +}; + +export function useInquiries(params = {}) { + return useQuery({ + queryKey: inquiriesKeys.list(params), + queryFn: () => inquiriesApi.list(params), + }); +} +``` + +### 4. Status Badge Component +```typescript +// apps/web/components/inquiries/inquiry-status-badge.tsx +// Map status enum to badge variant (success, warning, info, etc.) +``` + +### 5. Translations (Hierarchical JSON) +```json +{ + "inquiries": { + "title": "QuαΊ£n lΓ½ LiΓͺn hệ", + "status": { "new": "Mα»›i", "read": "Đã xem" } + } +} +``` + +--- + +## 🎨 Component Library + +### Base UI Components (Ready to Use) +- `Button` - Variants: default, outline, ghost, destructive +- `Card` - Compound: CardHeader, CardTitle, CardDescription, CardContent +- `Badge` - Variants: default, secondary, destructive, outline, success, warning, info +- `Table` - Compound: TableHeader, TableBody, TableRow, TableHead, TableCell +- `Select` - Native HTML with Tailwind styling +- `Input` - Text input with consistent styling +- `Textarea` - Text area with consistent styling +- `Dialog` - Modal dialog component +- `Tabs` - Tab navigation component +- `Label` - Form label component + +### Styling Conventions +```typescript +// Grid layout (responsive) +className="grid gap-4 sm:grid-cols-2 lg:grid-cols-3" + +// Flex layout +className="flex items-center justify-between gap-3" + +// Typography +className="text-2xl font-bold" // Heading +className="text-sm 
text-muted-foreground" // Secondary text + +// Status indicators +className="text-green-600 bg-green-50" // Success +className="text-yellow-600 bg-yellow-50" // Warning +className="text-blue-600 bg-blue-50" // Info +``` + +### Theme Colors (CSS Variables) +- Primary: Green (#36A653) +- Secondary: Light gray-blue +- Accent: Light gray-blue +- Muted: Gray +- Destructive: Red +- Dark mode: Automatically inverted + +--- + +## πŸ”„ Data Flow Example + +``` +User clicks filter +↓ +setFilters(newFilters) +↓ +queryKey changes +↓ +React Query automatically fetches +↓ +useQuery({ queryKey, queryFn: () => inquiriesApi.list(filters) }) +↓ +API call to /api/v1/inquiries?status=new&page=1 +↓ +useAuthStore provides JWT cookie + CSRF token +↓ +Response: { items: [], total: 10, page: 1, limit: 20 } +↓ +Component re-renders with new data +``` + +--- + +## βœ… Implementation Checklist + +### Phase 1: Setup +- [ ] Create `inquiries-api.ts` in `apps/web/lib/` +- [ ] Create `leads-api.ts` in `apps/web/lib/` +- [ ] Define DTOs matching backend responses +- [ ] Test API endpoints with Postman/cURL + +### Phase 2: Hooks & Queries +- [ ] Create `use-inquiries.ts` hook with React Query +- [ ] Create `use-leads.ts` hook with React Query +- [ ] Test data fetching with loading/error states + +### Phase 3: Components +- [ ] Create `inquiry-status-badge.tsx` component +- [ ] Create `lead-status-badge.tsx` component +- [ ] Create filter bar / filter component +- [ ] Test components in isolation + +### Phase 4: Pages +- [ ] Create `/inquiries/page.tsx` (list view) +- [ ] Create `/inquiries/[id]/page.tsx` (detail view - if needed) +- [ ] Create `/leads/page.tsx` (list view) +- [ ] Create `/leads/[id]/page.tsx` (detail view - if needed) + +### Phase 5: i18n & Polish +- [ ] Add translations to `messages/vi.json` +- [ ] Add translations to `messages/en.json` +- [ ] Test all languages +- [ ] Test dark mode +- [ ] Test responsive design (mobile/tablet/desktop) +- [ ] Add loading skeletons +- [ ] Add 
error boundaries +- [ ] Add empty state messages + +### Phase 6: Testing & QA +- [ ] Unit tests for components +- [ ] Integration tests for API calls +- [ ] E2E tests for user flows +- [ ] Performance testing (React Query caching) +- [ ] Accessibility testing (ARIA labels, keyboard nav) + +--- + +## πŸ“š Reference Files + +### Essential Reading +1. **Dashboard Layout** - `apps/web/app/[locale]/(dashboard)/layout.tsx` + - Responsive navigation patterns + - User info display + - Theme toggle + +2. **Listings Page** - `apps/web/app/[locale]/(dashboard)/listings/page.tsx` + - Complete list view example + - Filter state management + - Grid/table view toggle + - Stats cards + - Pagination pattern + +3. **Dashboard Page** - `apps/web/app/[locale]/(dashboard)/dashboard/page.tsx` + - Stats card component + - Chart integration + - Market data fetching + +4. **API Client** - `apps/web/lib/api-client.ts` + - Request wrapper + - CSRF token handling + - Error handling + +5. **Listings API** - `apps/web/lib/listings-api.ts` + - API service pattern + - Type definitions + - Search params handling + +6. **Use Listings Hook** - `apps/web/lib/hooks/use-listings.ts` + - React Query pattern + - Key factory pattern + +7. **Auth Store** - `apps/web/lib/auth-store.ts` + - Zustand pattern + - Async actions + - Error handling + +8. 
**Comparison Store** - `apps/web/lib/comparison-store.ts` + - Zustand with persistence + - Complex state management + +### Backend API Examples +- `apps/api/src/modules/inquiries/presentation/controllers/inquiries.controller.ts` +- `apps/api/src/modules/leads/presentation/controllers/leads.controller.ts` +- `apps/api/src/modules/listings/presentation/controllers/listings.controller.ts` + +--- + +## 🛠️ Development Tips + +### Local Testing +```bash +# Start frontend dev server +cd apps/web && npm run dev + +# Start backend dev server (in another terminal) +cd apps/api && npm run dev + +# API will be at http://localhost:3001/api/v1 +# Frontend will be at http://localhost:3000 +``` + +### API Testing +```bash +# Test inquiry list endpoint +curl -H "Authorization: Bearer {token}" \ + http://localhost:3001/api/v1/inquiries/agent/me + +# Test lead creation +curl -X POST \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer {token}" \ + -d '{ + "name": "John Doe", + "phone": "0912345678", + "source": "website", + "score": 80 + }' \ + http://localhost:3001/api/v1/leads +``` + +### React Query Debugging +```typescript +// Add this to see React Query state +import { ReactQueryDevtools } from '@tanstack/react-query-devtools'; + +// In provider: +<ReactQueryDevtools initialIsOpen={false} /> +``` + +### i18n Testing +- Switch language in UI +- Verify all strings translate +- Test RTL (if adding Arabic) + +--- + +## 🚨 Common Pitfalls to Avoid + +1. **Forgetting `'use client'`** - Required for hooks (useQuery, useTranslations) +2. **Not using query key factory** - Makes cache invalidation hard +3. **Hardcoding API URLs** - Use environment variables (`NEXT_PUBLIC_API_URL`) +4. **Missing error states** - Always handle loading/error/empty states +5. **Not testing pagination** - Verify page params work correctly +6. **Forgetting translations** - Add to both vi.json and en.json +7. **Not handling 401/403 errors** - Redirect to login on auth errors +8. 
**Ignoring mobile responsive** - Test on all breakpoints (sm, md, lg) +9. **Not using semantic HTML** - Use proper heading hierarchy, ARIA labels +10. **Direct DOM manipulation** - Use React state/hooks instead of getElementById + +--- + +## πŸ“ž Contact & Questions + +For implementation questions: +1. Check `codebase_exploration.md` for detailed explanations +2. Check `CODEBASE_QUICK_REFERENCE.md` for code templates +3. Reference existing pages (listings, dashboard) +4. Inspect backend DTOs for API response shapes + +--- + +## πŸ“„ Document Files + +- **`codebase_exploration.md`** (29.8 KB) + - Complete technical deep-dive + - 10 major sections covering all aspects + - Code snippets and examples + - Architecture diagrams in text form + +- **`CODEBASE_QUICK_REFERENCE.md`** (12 KB) + - Quick reference guide + - Template code snippets + - Checklists + - Key file references + - Development tips + +- **`EXPLORATION_SUMMARY.md`** (This file) + - High-level overview + - Key findings summary + - Directory structure + - Implementation checklist + +--- + +**Total Exploration:** 10 sections, 50+ code examples, 100+ file references + +**Ready to start building!** πŸš€ diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index ce54dc2..ee95b44 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -314,7 +314,8 @@ services: - -c - | apt-get update -qq && apt-get install -y -qq cron > /dev/null 2>&1 - echo "0 2 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER} PGDATABASE=${DB_NAME} PGPASSWORD=${DB_PASSWORD} BACKUP_DIR=/backups RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7} /scripts/pg-backup.sh >> /var/log/pg-backup.log 2>&1" | crontab - + (echo "0 2 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER} PGDATABASE=${DB_NAME} PGPASSWORD=${DB_PASSWORD} BACKUP_DIR=/backups RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7} /scripts/pg-backup.sh >> /var/log/pg-backup.log 2>&1" + echo "0 4 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER} PGDATABASE=${DB_NAME} 
PGPASSWORD=${DB_PASSWORD} BACKUP_DIR=/backups REPORT_FILE=/backups/verify-latest.json /scripts/pg-verify-backup.sh >> /var/log/pg-verify.log 2>&1") | crontab - /scripts/pg-backup.sh cron -f environment: @@ -410,7 +411,11 @@ services: - '--web.enable-lifecycle' volumes: - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./monitoring/prometheus/alert-rules.yml:/etc/prometheus/alert-rules.yml:ro - prometheus_data:/prometheus + depends_on: + alertmanager: + condition: service_healthy healthcheck: test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9090/-/healthy'] interval: 15s @@ -434,6 +439,41 @@ services: networks: - goodgo-net + alertmanager: + image: prom/alertmanager:v0.27.0 + container_name: goodgo-alertmanager + restart: unless-stopped + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + - '--data.retention=120h' + environment: + SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL:-} + volumes: + - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + healthcheck: + test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9093/-/healthy'] + interval: 15s + timeout: 5s + retries: 3 + start_period: 10s + deploy: + resources: + limits: + memory: 256m + cpus: '0.25' + reservations: + memory: 64m + security_opt: + - no-new-privileges:true + logging: + driver: json-file + options: + max-size: '5m' + max-file: '3' + networks: + - goodgo-net + grafana: image: grafana/grafana:10.4.1 container_name: goodgo-grafana @@ -457,6 +497,8 @@ services: condition: service_healthy loki: condition: service_healthy + alertmanager: + condition: service_healthy healthcheck: test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3000/api/health'] interval: 15s diff --git a/docs/RUNBOOK.md b/docs/RUNBOOK.md index a1e75c9..4ef0aa0 100644 --- a/docs/RUNBOOK.md +++ b/docs/RUNBOOK.md @@ -53,6 +53,7 @@ | **promtail** | `grafana/promtail:3.0.0` | β€” | 0.25 CPU / 256 MB | β€” | | 
**prometheus** | `prom/prometheus:v2.51.0` | 9090 (internal) | 0.5 CPU / 1 GB | `wget /-/healthy` | | **grafana** | `grafana/grafana:10.4.1` | 3002 (external) | 0.5 CPU / 512 MB | `wget /api/health` | +| **alertmanager** | `prom/alertmanager:v0.27.0` | 9093 (internal) | 0.25 CPU / 256 MB | `wget /-/healthy` | ### Development-Only Services (`docker-compose.yml`) @@ -67,7 +68,7 @@ web --> api --> pgbouncer --> postgres |-> minio |-> ai-services -grafana --> prometheus +grafana --> prometheus --> alertmanager |-> loki --> promtail (Docker socket) pg-backup --> postgres @@ -128,6 +129,9 @@ curl -sf http://localhost:3100/ready && echo "Loki OK" # Grafana curl -sf http://localhost:3002/api/health | jq . + +# Alertmanager +curl -sf http://localhost:9093/-/healthy && echo "Alertmanager OK" ``` ### Container Resource Usage @@ -864,6 +868,7 @@ All dashboards are provisioned automatically via `monitoring/grafana/provisionin **Data Sources:** - **Prometheus** (`http://prometheus:9090`) β€” Metrics (default) - **Loki** (`http://loki:3100`) β€” Logs, with correlation ID linking to Prometheus +- **Alertmanager** (`http://alertmanager:9093`) β€” Alert state and silences --- @@ -963,13 +968,216 @@ rate(container_cpu_usage_seconds_total{name=~"goodgo-.*"}[5m]) --- +## 9. Disaster Recovery Validation + +### Automated Verification + +Backup verification runs **daily at 04:00 UTC** inside the `pg-backup` container. It restores the latest backup to an isolated test database and checks: + +- Table existence (all 22 Prisma models) +- Row count comparison against live database +- Data checksums on critical tables (User, Property, Listing, Payment, Subscription, Transaction, Plan) +- PostGIS extension availability +- Index count match +- Enum type count match + +**Check latest verification report:** + +```bash +docker exec goodgo-pg-backup cat /backups/verify-latest.json | jq . 
+``` + +**Check verification logs:** + +```bash +docker exec goodgo-pg-backup cat /var/log/pg-verify.log +``` + +### Manual DR Validation Procedure + +Run this quarterly (or after major schema changes) to validate the full DR process end-to-end. + +#### Step 1: Verify Backups Exist and Are Recent + +```bash +# List backups with timestamps and sizes +docker exec goodgo-pg-backup ls -lht /backups/goodgo_*.sql.gz + +# Verify latest backup is < 25 hours old +LATEST=$(docker exec goodgo-pg-backup ls -t /backups/goodgo_*.sql.gz | head -1) +echo "Latest backup: $LATEST" +``` + +#### Step 2: Run Verification Against Latest Backup + +```bash +# Automated verification (creates temp DB, validates, drops) +docker exec -e REPORT_FILE=/backups/verify-latest.json goodgo-pg-backup \ + /scripts/pg-verify-backup.sh + +# Review results +docker exec goodgo-pg-backup cat /backups/verify-latest.json | jq . +``` + +**Expected output:** All checks pass, restore completes in < 60 seconds for typical dataset. + +#### Step 3: Test Full Restore (Staging Only) + +> ⚠️ **WARNING:** Only perform this on a staging or isolated environment. Never on production. + +```bash +# 1. Create a separate test environment +docker compose -f docker-compose.yml -p goodgo-dr-test up -d postgres + +# 2. Wait for PostgreSQL to be ready +docker exec goodgo-dr-test-postgres-1 pg_isready + +# 3. Run restore against the test environment +PGHOST=localhost PGPORT=<port> PGUSER=goodgo PGPASSWORD=<password> \ + /scripts/pg-restore.sh /backups/<backup-file>.sql.gz + +# 4. Verify key tables +docker exec goodgo-dr-test-postgres-1 psql -U goodgo -d goodgo -c \ + "SELECT count(*) FROM \"User\"; SELECT count(*) FROM \"Property\"; SELECT count(*) FROM \"Listing\";" + +# 5. Clean up test environment +docker compose -f docker-compose.yml -p goodgo-dr-test down -v +``` + +#### Step 4: Validate Service Recovery Chain + +Test that all services can start from a clean state with restored data: + +```bash +# 1. 
Note current service status +docker compose -f docker-compose.prod.yml ps --format "table {{.Name}}\t{{.Status}}\t{{.Health}}" + +# 2. Restart all services in dependency order +docker compose -f docker-compose.prod.yml restart postgres +sleep 10 # Wait for PostgreSQL + +docker compose -f docker-compose.prod.yml restart pgbouncer redis typesense +sleep 10 # Wait for data services + +docker compose -f docker-compose.prod.yml restart api web ai-services +sleep 15 # Wait for application services + +# 3. Verify all health checks +curl -sf http://localhost:3001/health/ready | jq . +curl -sf http://localhost:3000 > /dev/null && echo "Web OK" +curl -sf http://localhost:9090/-/healthy && echo "Prometheus OK" +curl -sf http://localhost:9093/-/healthy && echo "Alertmanager OK" +curl -sf http://localhost:3002/api/health | jq . +``` + +#### Step 5: Validate Alerting Pipeline + +```bash +# 1. Check Prometheus is loading alert rules +curl -sf http://localhost:9090/api/v1/rules | jq '.data.groups | length' +# Expected: 7 groups + +# 2. Check current alerts (should be empty if healthy) +curl -sf http://localhost:9090/api/v1/alerts | jq '.data.alerts | length' + +# 3. Check Alertmanager is receiving from Prometheus +curl -sf http://localhost:9093/api/v2/status | jq '.cluster' + +# 4. 
Verify Alertmanager config is loaded +curl -sf http://localhost:9093/api/v2/status | jq '.config' +``` + +### DR Validation Checklist + +Use this checklist during quarterly DR reviews: + +- [ ] Latest backup is < 25 hours old +- [ ] Automated verification report shows all checks passed +- [ ] Manual restore to test DB succeeds with correct row counts +- [ ] Full service restart completes within RTO target (< 30 min) +- [ ] All health endpoints respond after restart +- [ ] Prometheus alert rules are loaded (7 groups) +- [ ] Alertmanager is reachable and configured +- [ ] Slack notification channel is receiving test alerts +- [ ] Grafana dashboards show data after restart +- [ ] Typesense search returns results after restart + +### RPO/RTO Summary + +| Metric | Target | Actual (Measured) | Notes | +|--------|--------|-------------------|-------| +| **RPO** | ≀ 24 hours | ~24h (daily at 02:00 UTC) | Reduce with WAL archiving | +| **RTO β€” Local backup** | ≀ 15 minutes | Measure during DR test | Restore + service restart | +| **RTO β€” Off-site backup** | ≀ 30 minutes | Measure during DR test | Add transfer time | +| **RTO β€” Full host recovery** | ≀ 60 minutes | Measure during DR test | New host + restore + deploy | + +--- + ## Appendix: Alert Rules Reference +### API & Error Alerts + | Alert | Expression | Severity | Duration | |-------|-----------|----------|----------| | `ApiLatencyP99High` | p99 > 1s | Warning | 5 min | | `ApiEndpointLatencyP99High` | Per-route p99 > 2s | Warning | 5 min | | `ApiLatencyP99Critical` | p99 > 3s (SLO breach) | Critical | 3 min | | `ApiErrorRate5xxHigh` | 5xx rate > 1% | Warning | 5 min | +| `ApiErrorRate5xxCritical` | 5xx rate > 5% | Critical | 3 min | +| `ApiNoTraffic` | Request rate = 0 | Warning | 10 min | + +### Database Alerts + +| Alert | Expression | Severity | Duration | +|-------|-----------|----------|----------| +| `PostgresActiveConnectionsHigh` | Active connections > 15 | Warning | 5 min | +| 
`PostgresConnectionPoolCritical` | Total connections > 180 | Critical | 2 min | +| `PostgresSlowQueries` | Lock-waiting queries > 5 | Warning | 5 min | +| `PostgresDown` | API scrape target down | Critical | 1 min | + +### Redis Alerts + +| Alert | Expression | Severity | Duration | +|-------|-----------|----------|----------| +| `RedisMemoryHigh` | Memory usage > 80% | Warning | 5 min | +| `RedisMemoryCritical` | Memory usage > 95% | Critical | 2 min | +| `RedisConnectedClientsHigh` | Clients > 150 | Warning | 5 min | +| `RedisRejectedConnections` | Rejected connections > 0 | Critical | 1 min | + +### Container Resource Alerts + +| Alert | Expression | Severity | Duration | +|-------|-----------|----------|----------| +| `ContainerRestartLoop` | > 3 restarts in 15 min | Critical | 5 min | +| `ContainerMemoryHigh` | Memory > 85% of limit | Warning | 5 min | +| `ContainerCPUThrottled` | CPU throttle rate > 0.5s/s | Warning | 10 min | + +### Disk & Infrastructure Alerts + +| Alert | Expression | Severity | Duration | +|-------|-----------|----------|----------| +| `HostDiskUsageHigh` | Root disk > 80% | Warning | 10 min | +| `HostDiskUsageCritical` | Root disk > 90% | Critical | 5 min | +| `ApiHealthCheckFailing` | Health probe fails | Critical | 2 min | +| `PrometheusTargetDown` | Scrape target down | Warning | 5 min | + +### Backup Alerts + +| Alert | Expression | Severity | Duration | +|-------|-----------|----------|----------| +| `BackupTooOld` | Last backup > 25 hours ago | Warning | 5 min | +| `BackupVerificationFailed` | Verify result = fail | Warning | 1 min | + +### Alert Routing + +Alerts are routed via Alertmanager (`monitoring/alertmanager/alertmanager.yml`): + +| Channel | Routes | Repeat Interval | +|---------|--------|-----------------| +| `#sre-oncall` (Slack) | All warning alerts | 4 hours | +| `#sre-oncall` (Slack) | All critical alerts (priority) | 1 hour | +| `#infrastructure` (Slack) | Backup-related alerts | 6 hours | + +**Inhibition:** Warning 
alerts are suppressed when a critical alert for the same service is already firing. Alert rules are defined in `monitoring/prometheus/alert-rules.yml` and evaluated every 15 seconds. diff --git a/monitoring/alertmanager/alertmanager.yml b/monitoring/alertmanager/alertmanager.yml new file mode 100644 index 0000000..2c14de7 --- /dev/null +++ b/monitoring/alertmanager/alertmanager.yml @@ -0,0 +1,90 @@ +# GoodGo Platform — Alertmanager Configuration +# Routes alerts from Prometheus to notification channels. +# +# Environment variables (set in .env): +# SLACK_WEBHOOK_URL — Slack incoming webhook for alert notifications (NOTE: Alertmanager does not expand ${VAR} placeholders in this file; render it with envsubst before startup or use slack_api_url_file) +# ALERTMANAGER_SMTP_* — SMTP settings for email alerts (optional) + +global: + resolve_timeout: 5m + slack_api_url: '${SLACK_WEBHOOK_URL}' + +# ── Notification Templates ──────────────────────────────────────────────────── +templates: + - '/etc/alertmanager/templates/*.tmpl' + +# ── Inhibition Rules ────────────────────────────────────────────────────────── +# Suppress warning alerts when a critical alert is already firing for the same service +inhibit_rules: + - source_matchers: + - severity = critical + target_matchers: + - severity = warning + equal: ['service'] + +# ── Routing Tree ────────────────────────────────────────────────────────────── +route: + receiver: 'slack-sre' + group_by: ['alertname', 'service'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + + routes: + # Critical alerts — immediate notification, shorter repeat + - matchers: + - severity = critical + receiver: 'slack-critical' + group_wait: 10s + repeat_interval: 1h + continue: false + + # Backup alerts — route to infrastructure channel + - matchers: + - alertname =~ "Backup.*" + receiver: 'slack-infrastructure' + group_wait: 1m + repeat_interval: 6h + +# ── Receivers ───────────────────────────────────────────────────────────────── +receivers: + - name: 'slack-sre' + slack_configs: + - channel: '#sre-oncall' + send_resolved: true + title: '{{ if eq .Status 
"firing" }}πŸ”₯{{ else }}βœ…{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}' + text: >- + *Service:* {{ .CommonLabels.service }} + *Severity:* {{ .CommonLabels.severity }} + {{ range .Alerts }} + *Summary:* {{ .Annotations.summary }} + *Description:* {{ .Annotations.description }} + {{ if .Annotations.runbook_url }}*Runbook:* {{ .Annotations.runbook_url }}{{ end }} + {{ if .Annotations.dashboard }}*Dashboard:* {{ .Annotations.dashboard }}{{ end }} + {{ end }} + + - name: 'slack-critical' + slack_configs: + - channel: '#sre-oncall' + send_resolved: true + title: '{{ if eq .Status "firing" }}🚨 CRITICAL{{ else }}βœ… RESOLVED{{ end }} {{ .CommonLabels.alertname }}' + text: >- + *Service:* {{ .CommonLabels.service }} + *Severity:* CRITICAL β€” Immediate action required + {{ range .Alerts }} + *Summary:* {{ .Annotations.summary }} + *Description:* {{ .Annotations.description }} + {{ if .Annotations.runbook_url }}*Runbook:* {{ .Annotations.runbook_url }}{{ end }} + {{ end }} + + - name: 'slack-infrastructure' + slack_configs: + - channel: '#infrastructure' + send_resolved: true + title: '{{ if eq .Status "firing" }}⚠️{{ else }}βœ…{{ end }} [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}' + text: >- + *Service:* {{ .CommonLabels.service }} + {{ range .Alerts }} + *Summary:* {{ .Annotations.summary }} + *Description:* {{ .Annotations.description }} + {{ end }} diff --git a/monitoring/grafana/provisioning/datasources/datasource.yml b/monitoring/grafana/provisioning/datasources/datasource.yml index 25a0841..1b0a6ee 100644 --- a/monitoring/grafana/provisioning/datasources/datasource.yml +++ b/monitoring/grafana/provisioning/datasources/datasource.yml @@ -21,3 +21,12 @@ datasources: matcherRegex: 'correlationId":"([^"]+)' name: correlationId url: '$${__value.raw}' + + - name: Alertmanager + uid: alertmanager + type: alertmanager + access: proxy + url: http://alertmanager:9093 + editable: true + jsonData: + implementation: prometheus diff --git 
a/monitoring/prometheus/alert-rules.yml b/monitoring/prometheus/alert-rules.yml index d2995a3..36bffa9 100644 --- a/monitoring/prometheus/alert-rules.yml +++ b/monitoring/prometheus/alert-rules.yml @@ -1,4 +1,5 @@ groups: + # ── API Latency & Error Alerts ─────────────────────────────────────────────── - name: goodgo_api_latency rules: # ── p99 latency alert ── @@ -79,3 +80,324 @@ groups: The 5xx error rate for the GoodGo API has been above 1% for the last 5 minutes. Current value: {{ $value | printf "%.2f" }}%. dashboard: "/d/goodgo-api-latency/goodgo-api-latency" + + # ── 5xx error rate critical ── + - alert: ApiErrorRate5xxCritical + expr: > + ( + sum(rate(http_requests_total{job="goodgo-api", status_code=~"5.."}[5m])) + / + sum(rate(http_requests_total{job="goodgo-api"}[5m])) + ) * 100 > 5 + for: 3m + labels: + severity: critical + team: sre + service: goodgo-api + annotations: + summary: "CRITICAL: API 5xx error rate above 5%" + description: > + The 5xx error rate for the GoodGo API has been above 5% for the last 3 minutes. + This indicates a major incident. Immediate investigation required. + Current value: {{ $value | printf "%.2f" }}%. + dashboard: "/d/goodgo-api-latency/goodgo-api-latency" + runbook_url: "https://docs.goodgo.vn/runbooks/5xx-critical" + + # ── No traffic (possible downtime) ── + - alert: ApiNoTraffic + expr: > + sum(rate(http_requests_total{job="goodgo-api"}[5m])) == 0 + for: 10m + labels: + severity: warning + team: sre + service: goodgo-api + annotations: + summary: "API receiving zero traffic for 10 minutes" + description: > + The GoodGo API has received no requests in the last 10 minutes. + This may indicate the service is down or unreachable. 
+ + # ── Database Alerts ────────────────────────────────────────────────────────── + - name: goodgo_database + rules: + # ── PostgreSQL active connections high ── + - alert: PostgresActiveConnectionsHigh + expr: > + pg_stat_activity_count{datname="goodgo", state="active"} > 15 + for: 5m + labels: + severity: warning + team: sre + service: postgres + annotations: + summary: "PostgreSQL active connections above 15" + description: > + The number of active PostgreSQL connections has been above 15 for 5 minutes. + Pool size is 20. Current value: {{ $value }}. + Check for long-running queries or connection leaks. + runbook_url: "https://docs.goodgo.vn/runbooks/db-connections" + + # ── PostgreSQL connection pool near exhaustion (summed across all connection states) ── + - alert: PostgresConnectionPoolCritical + expr: > + sum(pg_stat_activity_count{datname="goodgo"}) > 180 + for: 2m + labels: + severity: critical + team: sre + service: postgres + annotations: + summary: "CRITICAL: PostgreSQL connections near limit (>180/200)" + description: > + Total PostgreSQL connections have exceeded 180 (max client connections: 200). + PgBouncer pool may be exhausted. Immediate action required. + Current value: {{ $value }}. + runbook_url: "https://docs.goodgo.vn/runbooks/db-pool-exhaustion" + + # ── PostgreSQL slow queries ── + - alert: PostgresSlowQueries + expr: > + pg_stat_activity_count{datname="goodgo", state="active", wait_event_type="Lock"} > 5 + for: 5m + labels: + severity: warning + team: sre + service: postgres + annotations: + summary: "Multiple PostgreSQL queries waiting on locks" + description: > + More than 5 queries are blocked waiting on locks for 5+ minutes. + This may indicate lock contention or deadlocks. + Current value: {{ $value }}.
+ + # ── PostgreSQL down (postgres_exporter pg_up; assumes the exporter that provides pg_stat_activity_count is scraped) ── + - alert: PostgresDown + expr: > + pg_up == 0 + for: 1m + labels: + severity: critical + team: sre + service: postgres + annotations: + summary: "CRITICAL: PostgreSQL is down or unreachable" + description: > + The PostgreSQL exporter reports pg_up == 0: it cannot connect to the database. + PostgreSQL is down or refusing connections; the API will fail until it recovers. + + # ── Redis Alerts ───────────────────────────────────────────────────────────── + - name: goodgo_redis + rules: + # ── Redis memory usage high ── + - alert: RedisMemoryHigh + expr: > + redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80 + for: 5m + labels: + severity: warning + team: sre + service: redis + annotations: + summary: "Redis memory usage above 80%" + description: > + Redis memory usage has exceeded 80% of the configured maximum (512 MB). + Eviction policy (allkeys-lru) is active but high usage may indicate a problem. + Current usage: {{ $value | printf "%.1f" }}%. + + # ── Redis memory critical ── + - alert: RedisMemoryCritical + expr: > + redis_memory_used_bytes / redis_memory_max_bytes * 100 > 95 + for: 2m + labels: + severity: critical + team: sre + service: redis + annotations: + summary: "CRITICAL: Redis memory usage above 95%" + description: > + Redis memory usage has exceeded 95% of the configured maximum. + Heavy eviction is occurring. Consider increasing maxmemory or investigating cache patterns. + Current usage: {{ $value | printf "%.1f" }}%. + + # ── Redis connected clients high ── + - alert: RedisConnectedClientsHigh + expr: > + redis_connected_clients > 150 + for: 5m + labels: + severity: warning + team: sre + service: redis + annotations: + summary: "Redis connected clients above 150" + description: > + The number of connected Redis clients has exceeded 150 for 5+ minutes. + Current value: {{ $value }}.
+ + # ── Redis rejected connections ── + - alert: RedisRejectedConnections + expr: > + increase(redis_rejected_connections_total[5m]) > 0 + for: 1m + labels: + severity: critical + team: sre + service: redis + annotations: + summary: "Redis is rejecting connections" + description: > + Redis has rejected {{ $value }} connection(s) in the last 5 minutes. + This indicates maxclients has been reached. + + # ── Container Resource Alerts ──────────────────────────────────────────────── + - name: goodgo_containers + rules: + # ── Container restart loop ── + - alert: ContainerRestartLoop + expr: > + increase(container_restart_count{name=~"goodgo-.*"}[15m]) > 3 + for: 5m + labels: + severity: critical + team: sre + service: "{{ $labels.name }}" + annotations: + summary: "Container {{ $labels.name }} restart loop" + description: > + Container {{ $labels.name }} has restarted more than 3 times in the last 15 minutes. + This indicates a crash loop. Check container logs immediately. + + # ── Container memory near limit (containers with no limit report 0 and are skipped) ── + - alert: ContainerMemoryHigh + expr: > + (container_memory_usage_bytes{name=~"goodgo-.*"} + / (container_spec_memory_limit_bytes{name=~"goodgo-.*"} > 0)) * 100 > 85 + for: 5m + labels: + severity: warning + team: sre + service: "{{ $labels.name }}" + annotations: + summary: "Container {{ $labels.name }} memory usage above 85%" + description: > + Container {{ $labels.name }} is using more than 85% of its memory limit. + Current usage: {{ $value | printf "%.1f" }}%. + Risk of OOM kill if usage continues to grow. + + # ── Container CPU throttled ── + - alert: ContainerCPUThrottled + expr: > + rate(container_cpu_cfs_throttled_seconds_total{name=~"goodgo-.*"}[5m]) > 0.5 + for: 10m + labels: + severity: warning + team: sre + service: "{{ $labels.name }}" + annotations: + summary: "Container {{ $labels.name }} CPU is being throttled" + description: > + Container {{ $labels.name }} has been CPU-throttled for 10+ minutes.
+ Current throttle rate: {{ $value | printf "%.2f" }}s/s. + Consider increasing CPU limits. + + # ── Disk & Volume Alerts ───────────────────────────────────────────────────── + - name: goodgo_disk + rules: + # ── Host disk usage high ── + - alert: HostDiskUsageHigh + expr: > + (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80 + for: 10m + labels: + severity: warning + team: sre + service: host + annotations: + summary: "Host root disk usage above 80%" + description: > + The root filesystem is {{ $value | printf "%.1f" }}% full. + Clean up Docker artifacts, old backups, or logs to free space. + runbook_url: "https://docs.goodgo.vn/runbooks/disk-space" + + # ── Host disk usage critical ── + - alert: HostDiskUsageCritical + expr: > + (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 90 + for: 5m + labels: + severity: critical + team: sre + service: host + annotations: + summary: "CRITICAL: Host root disk usage above 90%" + description: > + The root filesystem is {{ $value | printf "%.1f" }}% full. + Services may fail. Immediate cleanup required. + runbook_url: "https://docs.goodgo.vn/runbooks/disk-space-critical" + + # ── Service Health Alerts ──────────────────────────────────────────────────── + - name: goodgo_services + rules: + # ── API health check failing ── + - alert: ApiHealthCheckFailing + expr: > + probe_success{job="goodgo-api-health"} == 0 + for: 2m + labels: + severity: critical + team: sre + service: goodgo-api + annotations: + summary: "CRITICAL: API health check is failing" + description: > + The GoodGo API health endpoint has been unreachable for 2+ minutes. + The service may be down or unresponsive.
+ + # ── Prometheus target down ── + - alert: PrometheusTargetDown + expr: > + up == 0 + for: 5m + labels: + severity: warning + team: sre + annotations: + summary: "Prometheus target {{ $labels.job }} is down" + description: > + Prometheus cannot scrape {{ $labels.instance }} (job: {{ $labels.job }}) for 5+ minutes. + + # ── Backup Monitoring Alerts ───────────────────────────────────────────────── + - name: goodgo_backups + rules: + # ── Backup age too old (no recent backup) ── + - alert: BackupTooOld + expr: > + (time() - goodgo_backup_last_success_timestamp_seconds) > 90000 + for: 5m + labels: + severity: warning + team: sre + service: pg-backup + annotations: + summary: "PostgreSQL backup is more than 25 hours old" + description: > + The last successful PostgreSQL backup was {{ $value | humanizeDuration }} ago. + Daily backups run at 02:00 UTC. The backup job may have failed. + Check: docker logs goodgo-pg-backup + + # ── Backup verification failed ── + - alert: BackupVerificationFailed + expr: > + goodgo_backup_verify_result == 0 + for: 1m + labels: + severity: warning + team: sre + service: pg-backup + annotations: + summary: "PostgreSQL backup verification failed" + description: > + The automated backup verification check has failed. + Check: docker exec goodgo-pg-backup cat /backups/verify-latest.json diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml index 3c55b5a..3deee0f 100644 --- a/monitoring/prometheus/prometheus.yml +++ b/monitoring/prometheus/prometheus.yml @@ -5,6 +5,11 @@ global: rule_files: - 'alert-rules.yml' +alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + scrape_configs: - job_name: 'goodgo-api' metrics_path: '/metrics'