feat(ops): add database backup strategy and log aggregation stack

- Add pg-backup container with daily automated pg_dump (02:00 UTC) and 7-day retention
- Add backup/restore scripts with documented recovery procedure
- Add Loki + Promtail for centralized log aggregation from all Docker containers
- Add Loki as Grafana datasource with correlation ID derived fields
- Add Grafana logs dashboard with volume, error rate, HTTP request, and log viewer panels
- Configure Promtail to parse Pino structured JSON logs with level/context labels
- Enhance LoggerService with string-level formatter and service base field
- Configure 15-day log retention in Loki

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Ho Ngoc Hai
2026-04-08 04:04:32 +07:00
parent 7c9f682046
commit 775eb7b374
9 changed files with 563 additions and 0 deletions

View File

@@ -101,6 +101,70 @@ services:
networks:
- goodgo-net
# ── Database Backup ──
# Sidecar that runs /scripts/pg-backup.sh once at start-up and then on the
# cron schedule `0 2 * * *` (daily at 02:00), writing into the pg_backups
# volume mounted at /backups.
pg-backup:
  image: postgis/postgis:16-3.4
  container_name: goodgo-pg-backup
  restart: unless-stopped
  entrypoint: /bin/bash
  command:
    - -c
    - |
      # Fail fast: a broken bootstrap (e.g. no network for apt) should be
      # visible in `docker logs` and trigger the restart policy instead of
      # being swallowed.
      set -euo pipefail
      apt-get update -qq
      apt-get install -y -qq --no-install-recommends cron > /dev/null

      # cron starts jobs with a minimal environment, so snapshot the
      # connection settings from the `environment:` section below into a
      # root-only file. `export -p` shell-quotes every value, which keeps
      # passwords containing %, quotes or spaces intact; nothing secret is
      # written into the crontab itself.
      (
        umask 077
        export -p \
          | grep -E '^declare -x (PGHOST|PGPORT|PGUSER|PGDATABASE|PGPASSWORD|BACKUP_DIR|RETENTION_DAYS)=' \
          > /etc/pg-backup.env
      )

      # SHELL=/bin/bash is required because the env file uses `declare -x`.
      # Job output is sent to PID 1's stdout/stderr so scheduled runs show up
      # in the container log stream rather than in a file inside the container.
      printf '%s\n' \
        'SHELL=/bin/bash' \
        '0 2 * * * . /etc/pg-backup.env && /scripts/pg-backup.sh > /proc/1/fd/1 2> /proc/1/fd/2' \
        | crontab -

      # Take one backup immediately; a failure here is reported but must not
      # stop the scheduler from starting (same best-effort behaviour as before).
      /scripts/pg-backup.sh \
        || echo 'pg-backup: initial backup failed; the scheduled run will retry' >&2

      # exec so cron becomes PID 1 and receives the stop signal directly.
      exec cron -f
  environment:
    # Single source of truth for the backup job's settings; the bootstrap
    # script above copies these into /etc/pg-backup.env for cron.
    PGHOST: postgres
    PGPORT: '5432'
    PGUSER: ${DB_USER:-goodgo}
    PGDATABASE: ${DB_NAME:-goodgo}
    # NOTE(review): the fallback password is a development default; set
    # DB_PASSWORD explicitly in any shared or production environment.
    PGPASSWORD: ${DB_PASSWORD:-goodgo_secret}
    BACKUP_DIR: /backups
    RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-7}
  volumes:
    - ./scripts/backup:/scripts:ro
    - pg_backups:/backups
  depends_on:
    postgres:
      condition: service_healthy
  networks:
    - goodgo-net
# ── Log Aggregation ──
# Loki server. Reads its configuration from the bind-mounted
# loki-config.yml and keeps its data in the loki_data volume at /loki.
loki:
  container_name: "goodgo-loki"
  image: "grafana/loki:3.0.0"
  restart: "unless-stopped"
  command: "-config.file=/etc/loki/loki-config.yml"
  networks:
    - "goodgo-net"
  ports:
    # Host port is overridable through LOKI_PORT; the container side is 3100.
    - "${LOKI_PORT:-3100}:3100"
  volumes:
    - "./monitoring/loki/loki-config.yml:/etc/loki/loki-config.yml:ro"
    - "loki_data:/loki"
  healthcheck:
    # Probes the /ready endpoint; services that declare
    # `condition: service_healthy` on loki wait for this to pass.
    test:
      - "CMD"
      - "wget"
      - "--spider"
      - "-q"
      - "http://localhost:3100/ready"
    start_period: 20s
    interval: 15s
    timeout: 5s
    retries: 5
# Promtail agent. Started only after loki reports healthy.
promtail:
  container_name: "goodgo-promtail"
  image: "grafana/promtail:3.0.0"
  restart: "unless-stopped"
  command: "-config.file=/etc/promtail/promtail-config.yml"
  depends_on:
    loki:
      condition: service_healthy
  networks:
    - "goodgo-net"
  volumes:
    - "./monitoring/promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro"
    # Docker socket, mounted read-only. Presumably used by the Promtail
    # config for container discovery — confirm in promtail-config.yml.
    - "/var/run/docker.sock:/var/run/docker.sock:ro"
prometheus:
image: prom/prometheus:v2.51.0
container_name: goodgo-prometheus
@@ -142,6 +206,8 @@ services:
depends_on:
prometheus:
condition: service_healthy
loki:
condition: service_healthy
healthcheck:
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3000/api/health']
interval: 15s
@@ -160,6 +226,10 @@ volumes:
driver: local
minio_data:
driver: local
# Mounted at /backups in the pg-backup service, which is that service's
# BACKUP_DIR.
pg_backups:
driver: local
# Mounted at /loki in the loki service.
loki_data:
driver: local
prometheus_data:
driver: local
grafana_data: