diff --git a/apps/api/src/modules/shared/infrastructure/logger.service.ts b/apps/api/src/modules/shared/infrastructure/logger.service.ts index 1b5a4c7..4858fbd 100644 --- a/apps/api/src/modules/shared/infrastructure/logger.service.ts +++ b/apps/api/src/modules/shared/infrastructure/logger.service.ts @@ -14,10 +14,14 @@ export class LoggerService implements NestLoggerService { ? { target: 'pino-pretty', options: { colorize: true } } : undefined, formatters: { + level(label) { + return { level: label }; + }, log(object) { return maskPii(object) as Record<string, unknown>; }, }, + base: { service: 'goodgo-api' }, timestamp: pino.stdTimeFunctions.isoTime, }); } diff --git a/docker-compose.yml b/docker-compose.yml index 864a6d0..d71c007 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -101,6 +101,70 @@ services: networks: - goodgo-net + # ── Database Backup ── + pg-backup: + image: postgis/postgis:16-3.4 + container_name: goodgo-pg-backup + restart: unless-stopped + entrypoint: /bin/bash + command: + - -c + - | + apt-get update -qq && apt-get install -y -qq cron > /dev/null 2>&1 + echo "0 2 * * * PGHOST=postgres PGPORT=5432 PGUSER=${DB_USER:-goodgo} PGDATABASE=${DB_NAME:-goodgo} PGPASSWORD=${DB_PASSWORD:-goodgo_secret} BACKUP_DIR=/backups RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7} /scripts/pg-backup.sh >> /var/log/pg-backup.log 2>&1" | crontab - + /scripts/pg-backup.sh + cron -f + environment: + PGHOST: postgres + PGPORT: '5432' + PGUSER: ${DB_USER:-goodgo} + PGDATABASE: ${DB_NAME:-goodgo} + PGPASSWORD: ${DB_PASSWORD:-goodgo_secret} + BACKUP_DIR: /backups + RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-7} + volumes: + - ./scripts/backup:/scripts:ro + - pg_backups:/backups + depends_on: + postgres: + condition: service_healthy + networks: + - goodgo-net + + # ── Log Aggregation ── + loki: + image: grafana/loki:3.0.0 + container_name: goodgo-loki + restart: unless-stopped + ports: + - '${LOKI_PORT:-3100}:3100' + command: -config.file=/etc/loki/loki-config.yml + volumes: + - 
./monitoring/loki/loki-config.yml:/etc/loki/loki-config.yml:ro + - loki_data:/loki + healthcheck: + test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3100/ready'] + interval: 15s + timeout: 5s + retries: 5 + start_period: 20s + networks: + - goodgo-net + + promtail: + image: grafana/promtail:3.0.0 + container_name: goodgo-promtail + restart: unless-stopped + command: -config.file=/etc/promtail/promtail-config.yml + volumes: + - ./monitoring/promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + depends_on: + loki: + condition: service_healthy + networks: + - goodgo-net + prometheus: image: prom/prometheus:v2.51.0 container_name: goodgo-prometheus @@ -142,6 +206,8 @@ services: depends_on: prometheus: condition: service_healthy + loki: + condition: service_healthy healthcheck: test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3000/api/health'] interval: 15s @@ -160,6 +226,10 @@ volumes: driver: local minio_data: driver: local + pg_backups: + driver: local + loki_data: + driver: local prometheus_data: driver: local grafana_data: diff --git a/docs/backup-restore.md b/docs/backup-restore.md new file mode 100644 index 0000000..5c283ad --- /dev/null +++ b/docs/backup-restore.md @@ -0,0 +1,102 @@ +# Database Backup & Restore Procedures + +## Overview + +Automated daily PostgreSQL backups run inside the `pg-backup` Docker container using `pg_dump` with custom format compression. Backups are stored in the `pg_backups` Docker volume. 
+ +## Backup Configuration + +| Setting | Default | Environment Variable | +|---------|---------|---------------------| +| Schedule | Daily at 02:00 UTC | Cron in `pg-backup` service | +| Retention | 7 days | `BACKUP_RETENTION_DAYS` | +| Format | Custom (`pg_dump --format=custom`) | — | +| Compression | Level 6 | — | +| Storage | `pg_backups` Docker volume | — | + +## Listing Backups + +```bash +docker exec goodgo-pg-backup ls -lh /backups/ +``` + +## Manual Backup + +```bash +docker exec goodgo-pg-backup /scripts/pg-backup.sh +``` + +## Restore Procedure + +### 1. Identify the backup to restore + +```bash +docker exec goodgo-pg-backup ls -lht /backups/ +``` + +### 2. Stop application services + +```bash +docker compose stop ai-services +# Stop any NestJS API processes +``` + +### 3. Run restore + +```bash +docker exec -it goodgo-pg-backup /scripts/pg-restore.sh /backups/goodgo_YYYYMMDD_HHMMSS.sql.gz +``` + +The restore script will: +- Terminate active database connections +- Drop and recreate the database +- Restore from the selected backup + +### 4. Verify restore + +```bash +docker exec goodgo-postgres psql -U goodgo -d goodgo -c '\dt' +docker exec goodgo-postgres psql -U goodgo -d goodgo -c 'SELECT count(*) FROM "User";' +``` + +### 5. Run Prisma migrations (if needed) + +```bash +pnpm prisma migrate deploy +``` + +### 6. Restart services + +```bash +docker compose up -d +``` + +## Backup Verification + +Check the backup log: + +```bash +docker exec goodgo-pg-backup cat /var/log/pg-backup.log +``` + +Verify backup integrity without restoring: + +```bash +docker exec goodgo-pg-backup pg_restore --list /backups/goodgo_YYYYMMDD_HHMMSS.sql.gz +``` + +## Disaster Recovery + +For complete data loss (volume destroyed): + +1. Retrieve backup from external storage (if configured) +2. Recreate the `pg_backups` volume and copy backup file in +3. 
Follow the restore procedure above + +## Log Aggregation + +Logs are aggregated via Loki + Promtail and viewable in Grafana: + +- **Grafana**: http://localhost:3002 (dashboard: "GoodGo - Logs") +- **Loki**: http://localhost:3100 +- **Log retention**: 15 days (configured in `monitoring/loki/loki-config.yml`) diff --git a/monitoring/grafana/dashboards/logs.json b/monitoring/grafana/dashboards/logs.json new file mode 100644 index 0000000..c74010c --- /dev/null +++ b/monitoring/grafana/dashboards/logs.json @@ -0,0 +1,137 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Log Volume by Level", + "type": "timeseries", + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "sum by (level) (count_over_time({compose_service=~\"$service\"} | json [$__interval]))", + "legendFormat": "{{ level }}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "bars", + "stacking": { "mode": "normal" }, + "fillOpacity": 80 + }, + "color": { "mode": "palette-classic" } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "fatal" }, "properties": [{ "id": "color", "value": { "fixedColor": "dark-red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "warn" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "debug" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] } + ] + } + }, + { + "title": "Error Rate", + "type": "stat", + 
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 6 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "sum(count_over_time({compose_service=~\"$service\"} | json | level = `error` or level = `fatal` [$__range]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "red", "value": 50 } + ] + } + } + } + }, + { + "title": "Errors by Service", + "type": "piechart", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 6 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "sum by (compose_service) (count_over_time({compose_service=~\".+\"} | json | level = `error` or level = `fatal` [$__range]))", + "legendFormat": "{{ compose_service }}", + "refId": "A" + } + ] + }, + { + "title": "HTTP Request Logs (4xx/5xx)", + "type": "table", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "{compose_service=~\"$service\", component=\"http\"} | json | statusCode >= 400 | line_format \"{{.method}} {{.url}} {{.statusCode}} {{.duration}}ms\"", + "refId": "A" + } + ] + }, + { + "title": "All Logs", + "type": "logs", + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "{compose_service=~\"$service\"} | json |= `$search`", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + } + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": ["goodgo", "logs", "loki"], + "templating": { + "list": [ + { + "name": "service", + "type": "query", + "datasource": { "type": "loki", "uid": "loki" }, + "query": "label_values(compose_service)", + "includeAll": true, + "allValue": 
".+", + "current": { "text": "All", "value": "$__all" }, + "refresh": 2, + "multi": true + }, + { + "name": "search", + "type": "textbox", + "current": { "text": "", "value": "" }, + "label": "Search" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "title": "GoodGo - Logs", + "uid": "goodgo-logs" +} diff --git a/monitoring/grafana/provisioning/datasources/datasource.yml b/monitoring/grafana/provisioning/datasources/datasource.yml index 1a57b69..8a5fb8a 100644 --- a/monitoring/grafana/provisioning/datasources/datasource.yml +++ b/monitoring/grafana/provisioning/datasources/datasource.yml @@ -7,3 +7,16 @@ datasources: url: http://prometheus:9090 isDefault: true editable: true + + - name: Loki + type: loki + uid: loki # must match the datasource uid referenced by every panel in dashboards/logs.json + access: proxy + url: http://loki:3100 + editable: true + jsonData: + derivedFields: + - datasourceUid: prometheus + matcherRegex: 'correlationId":"([^"]+)' + name: correlationId + url: '$${__value.raw}' diff --git a/monitoring/loki/loki-config.yml b/monitoring/loki/loki-config.yml new file mode 100644 index 0000000..ac58b96 --- /dev/null +++ b/monitoring/loki/loki-config.yml @@ -0,0 +1,59 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 360h # 15 days; applied by the compactor because retention_enabled is true + reject_old_samples: true + reject_old_samples_max_age: 168h # 7 days + max_entries_limit_per_query: 5000 + ingestion_rate_mb: 4 + ingestion_burst_size_mb: 6 + +storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + 
retention_delete_worker_count: 150 + delete_request_store: filesystem + +# Retention: keep logs for 15 days +chunk_store_config: + chunk_cache_config: + embedded_cache: + enabled: true + max_size_mb: 100 + +table_manager: + retention_deletes_enabled: true + retention_period: 360h # 15 days diff --git a/monitoring/promtail/promtail-config.yml b/monitoring/promtail/promtail-config.yml new file mode 100644 index 0000000..74e90e2 --- /dev/null +++ b/monitoring/promtail/promtail-config.yml @@ -0,0 +1,66 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + # Scrape Docker container logs + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: network + values: ["goodgo-net"] + relabel_configs: + # Use container name as label + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: 'container' + # Add service label from container name (strip goodgo- prefix) + - source_labels: ['__meta_docker_container_name'] + regex: '/goodgo-(.*)' + target_label: 'service' + # Add compose service label + - source_labels: ['__meta_docker_container_label_com_docker_compose_service'] + target_label: 'compose_service' + pipeline_stages: + # Try to parse JSON logs (Pino structured output) + - json: + expressions: + level: level + msg: msg + context: context + method: method + url: url + statusCode: statusCode + duration: duration + correlationId: correlationId + component: component + timestamp: time + # Map Pino numeric levels to labels + - template: + source: level + template: '{{ if eq .Value "10" }}trace{{ else if eq .Value "20" }}debug{{ else if eq .Value "30" }}info{{ else if eq .Value "40" }}warn{{ else if eq .Value "50" }}error{{ else if eq .Value "60" }}fatal{{ else }}{{ .Value }}{{ end }}' + - labels: + level: + context: + component: + # Add structured metadata + 
- structured_metadata: + method: + url: + statusCode: + correlationId: + # Timestamp from Pino output + - timestamp: + source: timestamp + format: RFC3339Nano + fallback_formats: + - '2006-01-02T15:04:05.999Z07:00' + action_on_failure: fudge diff --git a/scripts/backup/pg-backup.sh b/scripts/backup/pg-backup.sh new file mode 100755 index 0000000..6b11d70 --- /dev/null +++ b/scripts/backup/pg-backup.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -euo pipefail + +# ── PostgreSQL Automated Backup Script ── +# Runs daily via cron inside the pg-backup container. +# Dumps the database and manages retention. + +BACKUP_DIR="${BACKUP_DIR:-/backups}" +RETENTION_DAYS="${RETENTION_DAYS:-7}" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +BACKUP_FILE="${BACKUP_DIR}/goodgo_${TIMESTAMP}.sql.gz" +PARTIAL_FILE="${BACKUP_FILE}.partial" + +echo "[backup] Starting PostgreSQL backup at $(date -Iseconds)" + +# Ensure backup directory exists +mkdir -p "${BACKUP_DIR}" + +# Run pg_dump with compression into a temporary name; it is renamed only after pg_dump succeeds, so a failed dump never leaves a truncated file that looks like a valid backup +trap 'rm -f "${PARTIAL_FILE}"' EXIT +pg_dump \ + -h "${PGHOST:-postgres}" \ + -p "${PGPORT:-5432}" \ + -U "${PGUSER:-goodgo}" \ + -d "${PGDATABASE:-goodgo}" \ + --no-owner \ + --no-privileges \ + --format=custom \ + --compress=6 \ + -f "${PARTIAL_FILE}" +mv "${PARTIAL_FILE}" "${BACKUP_FILE}" + +FILESIZE=$(du -h "${BACKUP_FILE}" | cut -f1) +echo "[backup] Backup completed: ${BACKUP_FILE} (${FILESIZE})" + +# Prune old backups beyond retention period +echo "[backup] Pruning backups older than ${RETENTION_DAYS} days..." 
+PRUNED=$(find "${BACKUP_DIR}" -name "goodgo_*.sql.gz" -type f -mtime "+${RETENTION_DAYS}" -print -delete | wc -l) +echo "[backup] Pruned ${PRUNED} old backup(s)" + +# List current backups +echo "[backup] Current backups:" +ls -lh "${BACKUP_DIR}"/goodgo_*.sql.gz 2>/dev/null || echo " (none)" + +echo "[backup] Done at $(date -Iseconds)" diff --git a/scripts/backup/pg-restore.sh b/scripts/backup/pg-restore.sh new file mode 100755 index 0000000..10655e6 --- /dev/null +++ b/scripts/backup/pg-restore.sh @@ -0,0 +1,72 @@ +#!/bin/bash +set -euo pipefail + +# ── PostgreSQL Restore Script ── +# Restores a database from a backup file. +# +# Usage: +# ./pg-restore.sh <backup-file> +# ./pg-restore.sh /backups/goodgo_20260408_020000.sql.gz +# +# Inside Docker: +# docker exec -it goodgo-pg-backup /scripts/pg-restore.sh /backups/<backup-file> + +BACKUP_FILE="${1:-}" + +if [ -z "${BACKUP_FILE}" ]; then + echo "Usage: $0 <backup-file>" + echo "" + echo "Available backups:" + ls -lht "${BACKUP_DIR:-/backups}"/goodgo_*.sql.gz 2>/dev/null || echo " (none found)" + exit 1 +fi + +if [ ! -f "${BACKUP_FILE}" ]; then + echo "[restore] ERROR: Backup file not found: ${BACKUP_FILE}" + exit 1 +fi + +PGHOST="${PGHOST:-postgres}" +PGPORT="${PGPORT:-5432}" +PGUSER="${PGUSER:-goodgo}" +PGDATABASE="${PGDATABASE:-goodgo}" + +echo "[restore] WARNING: This will DROP and recreate the '${PGDATABASE}' database." +echo "[restore] Backup file: ${BACKUP_FILE}" +echo "[restore] Target: ${PGHOST}:${PGPORT}/${PGDATABASE}" +echo "" + +# If running interactively, prompt for confirmation +if [ -t 0 ]; then + read -rp "Continue? (yes/no): " CONFIRM + if [ "${CONFIRM}" != "yes" ]; then + echo "[restore] Aborted." + exit 0 + fi +fi + +echo "[restore] Starting restore at $(date -Iseconds)..." 
+ +# Terminate existing connections +psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d postgres -c \ + "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${PGDATABASE}' AND pid <> pg_backend_pid();" \ + 2>/dev/null || true + +# Drop and recreate database +psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d postgres -c "DROP DATABASE IF EXISTS \"${PGDATABASE}\";" +psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d postgres -c "CREATE DATABASE \"${PGDATABASE}\";" + +# Restore from backup; a non-zero pg_restore status is reported and propagated instead of being announced as a completed restore +pg_restore \ + -h "${PGHOST}" \ + -p "${PGPORT}" \ + -U "${PGUSER}" \ + -d "${PGDATABASE}" \ + --no-owner \ + --no-privileges \ + --clean \ + --if-exists \ + "${BACKUP_FILE}" || { STATUS=$?; echo "[restore] ERROR: pg_restore exited with status ${STATUS}; review the output above, database '${PGDATABASE}' may be incomplete" >&2; exit "${STATUS}"; } + +echo "[restore] Restore completed at $(date -Iseconds)" +echo "[restore] Verify with: psql -h ${PGHOST} -U ${PGUSER} -d ${PGDATABASE} -c '\\dt'"