Files
goodgo-platform/scripts/backup/pg-verify-backup.sh
Ho Ngoc Hai 8cdfe17205 feat(ops): add automated backup restore verification
Adds pg-verify-backup.sh that restores the latest backup to an isolated
test database and verifies integrity (table existence, row counts, key
checksums, PostGIS extension, indexes, enum types). Reports pass/fail
with optional JSON output.

- Cron schedule: daily at 04:00 UTC (2h after backup)
- On-demand: docker compose run --rm pg-verify-backup
- CI: weekly GitHub Actions workflow with artifact upload

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-10 23:19:43 +07:00

422 lines
14 KiB
Bash
Executable File

#!/bin/bash
set -euo pipefail
# ── PostgreSQL Backup Restore Verification Script ──
# Restores the latest backup to an isolated test database, verifies data
# integrity (table existence, row counts, key checksums), and reports pass/fail.
#
# SAFETY: Never touches the production database. Creates a temporary
# "goodgo_verify_<timestamp>" database and drops it on exit.
#
# Usage:
# ./pg-verify-backup.sh # Verify latest backup
# ./pg-verify-backup.sh <backup-file> # Verify specific backup
# SKIP_CLEANUP=1 ./pg-verify-backup.sh # Keep test DB for inspection
#
# Environment variables:
# BACKUP_DIR — Directory containing backups (default: /backups)
# PGHOST — PostgreSQL host (default: postgres)
# PGPORT — PostgreSQL port (default: 5432)
# PGUSER — PostgreSQL user (default: goodgo)
# PGPASSWORD — PostgreSQL password (from environment)
# PGDATABASE — Source/production database name (default: goodgo)
# SKIP_CLEANUP — Set to 1 to keep test database after verification
# REPORT_FILE — Path for JSON report output (optional)
#
# Exit codes:
# 0 — All checks passed
# 1 — One or more checks failed
# 2 — Setup error (no backups found, restore failed, etc.)
# ── Configuration ──
BACKUP_DIR="${BACKUP_DIR:-/backups}"
PGHOST="${PGHOST:-postgres}"
PGPORT="${PGPORT:-5432}"
PGUSER="${PGUSER:-goodgo}"
PGDATABASE="${PGDATABASE:-goodgo}"
SKIP_CLEANUP="${SKIP_CLEANUP:-0}"
REPORT_FILE="${REPORT_FILE:-}"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
VERIFY_DB="goodgo_verify_${TIMESTAMP}"
PASSED=0
FAILED=0
WARNINGS=0
RESULTS=()
# ── Color output (if terminal) ──
if [ -t 1 ]; then
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m'
else
GREEN=''
RED=''
YELLOW=''
BLUE=''
NC=''
fi
log_pass() {
PASSED=$((PASSED + 1))
RESULTS+=("{\"check\":\"$1\",\"status\":\"pass\",\"detail\":\"$2\"}")
echo -e "${GREEN}[PASS]${NC} $1: $2"
}
log_fail() {
FAILED=$((FAILED + 1))
RESULTS+=("{\"check\":\"$1\",\"status\":\"fail\",\"detail\":\"$2\"}")
echo -e "${RED}[FAIL]${NC} $1: $2"
}
log_warn() {
WARNINGS=$((WARNINGS + 1))
RESULTS+=("{\"check\":\"$1\",\"status\":\"warn\",\"detail\":\"$2\"}")
echo -e "${YELLOW}[WARN]${NC} $1: $2"
}
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
# ── Cleanup function ──
cleanup() {
local exit_code=$?
if [ "${SKIP_CLEANUP}" = "1" ]; then
log_info "SKIP_CLEANUP=1 — keeping test database '${VERIFY_DB}' for inspection"
log_info "Drop manually: psql -h ${PGHOST} -U ${PGUSER} -d postgres -c 'DROP DATABASE IF EXISTS \"${VERIFY_DB}\";'"
else
log_info "Cleaning up test database '${VERIFY_DB}'..."
psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d postgres -c \
"SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${VERIFY_DB}' AND pid <> pg_backend_pid();" \
> /dev/null 2>&1 || true
psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d postgres -c \
"DROP DATABASE IF EXISTS \"${VERIFY_DB}\";" \
> /dev/null 2>&1 || true
log_info "Test database dropped."
fi
return $exit_code
}
trap cleanup EXIT
# ── Determine backup file ──
BACKUP_FILE="${1:-}"
if [ -z "${BACKUP_FILE}" ]; then
log_info "Finding latest backup in ${BACKUP_DIR}..."
BACKUP_FILE=$(ls -t "${BACKUP_DIR}"/goodgo_*.sql.gz 2>/dev/null | head -n1 || true)
if [ -z "${BACKUP_FILE}" ]; then
echo -e "${RED}[ERROR]${NC} No backup files found in ${BACKUP_DIR}"
exit 2
fi
fi
if [ ! -f "${BACKUP_FILE}" ]; then
echo -e "${RED}[ERROR]${NC} Backup file not found: ${BACKUP_FILE}"
exit 2
fi
BACKUP_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1)
BACKUP_MTIME=$(stat -c '%Y' "${BACKUP_FILE}" 2>/dev/null || stat -f '%m' "${BACKUP_FILE}" 2>/dev/null || echo "unknown")
echo ""
echo "================================================================"
echo " GoodGo Backup Restore Verification"
echo "================================================================"
echo " Backup file : ${BACKUP_FILE}"
echo " Backup size : ${BACKUP_SIZE}"
echo " Source DB : ${PGHOST}:${PGPORT}/${PGDATABASE}"
echo " Test DB : ${VERIFY_DB}"
echo " Started at : $(date -Iseconds)"
echo "================================================================"
echo ""
# ── Step 1: Create isolated test database ──
log_info "Step 1/5: Creating isolated test database '${VERIFY_DB}'..."
psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d postgres -c \
"CREATE DATABASE \"${VERIFY_DB}\";" > /dev/null 2>&1
if ! psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d postgres -c \
"SELECT 1 FROM pg_database WHERE datname = '${VERIFY_DB}';" 2>/dev/null | grep -q 1; then
echo -e "${RED}[ERROR]${NC} Failed to create test database"
exit 2
fi
log_pass "Database creation" "Test database '${VERIFY_DB}' created"
# ── Step 2: Restore backup into test database ──
log_info "Step 2/5: Restoring backup to test database..."
RESTORE_START=$(date +%s)
# Enable PostGIS extension before restore (required for geometry columns)
psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${VERIFY_DB}" -c \
"CREATE EXTENSION IF NOT EXISTS postgis;" > /dev/null 2>&1
RESTORE_OUTPUT=$(pg_restore \
-h "${PGHOST}" \
-p "${PGPORT}" \
-U "${PGUSER}" \
-d "${VERIFY_DB}" \
--no-owner \
--no-privileges \
--clean \
--if-exists \
"${BACKUP_FILE}" 2>&1) || true
RESTORE_END=$(date +%s)
RESTORE_DURATION=$((RESTORE_END - RESTORE_START))
# Check if any critical errors occurred (ignore warnings about objects not existing)
RESTORE_ERRORS=$(echo "${RESTORE_OUTPUT}" | grep -ci "ERROR" || true)
if [ "${RESTORE_ERRORS}" -gt 0 ]; then
# Filter out harmless "does not exist" errors from --clean --if-exists
CRITICAL_ERRORS=$(echo "${RESTORE_OUTPUT}" | grep -i "ERROR" | grep -cv "does not exist" || true)
if [ "${CRITICAL_ERRORS}" -gt 0 ]; then
log_fail "Restore" "pg_restore completed with ${CRITICAL_ERRORS} critical error(s) in ${RESTORE_DURATION}s"
echo "${RESTORE_OUTPUT}" | grep -i "ERROR" | grep -v "does not exist" | head -5
else
log_pass "Restore" "pg_restore completed in ${RESTORE_DURATION}s (non-critical warnings only)"
fi
else
log_pass "Restore" "pg_restore completed cleanly in ${RESTORE_DURATION}s"
fi
# ── Step 3: Verify table existence ──
log_info "Step 3/5: Verifying table existence..."
# Expected tables from Prisma schema (22 models + Prisma migration tracking)
EXPECTED_TABLES=(
"User"
"RefreshToken"
"OAuthAccount"
"Agent"
"Property"
"PropertyMedia"
"Listing"
"SavedSearch"
"Transaction"
"Inquiry"
"Lead"
"Payment"
"Plan"
"Subscription"
"UsageRecord"
"Valuation"
"MarketIndex"
"NotificationLog"
"NotificationPreference"
"AdminAuditLog"
"Review"
"_prisma_migrations"
)
ACTUAL_TABLES=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${VERIFY_DB}" \
-t -A -c "SELECT tablename FROM pg_tables WHERE schemaname = 'public' ORDER BY tablename;" 2>/dev/null)
MISSING_TABLES=()
for table in "${EXPECTED_TABLES[@]}"; do
if echo "${ACTUAL_TABLES}" | grep -qx "${table}"; then
: # Table exists
else
MISSING_TABLES+=("${table}")
fi
done
ACTUAL_COUNT=$(echo "${ACTUAL_TABLES}" | grep -c . || true)
EXPECTED_COUNT=${#EXPECTED_TABLES[@]}
if [ ${#MISSING_TABLES[@]} -eq 0 ]; then
log_pass "Table existence" "All ${EXPECTED_COUNT} expected tables present (${ACTUAL_COUNT} total)"
else
log_fail "Table existence" "Missing ${#MISSING_TABLES[@]} table(s): ${MISSING_TABLES[*]}"
fi
# ── Step 4: Row count comparison ──
log_info "Step 4/5: Comparing row counts between source and restored databases..."
ROW_MISMATCHES=0
ROW_REPORT=""
for table in "${EXPECTED_TABLES[@]}"; do
if [ "${table}" = "_prisma_migrations" ]; then
continue # Skip migration tracking table for row comparison
fi
# Skip missing tables
if [[ " ${MISSING_TABLES[*]} " =~ " ${table} " ]]; then
continue
fi
SOURCE_COUNT=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" \
-t -A -c "SELECT count(*) FROM \"${table}\";" 2>/dev/null || echo "ERROR")
VERIFY_COUNT=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${VERIFY_DB}" \
-t -A -c "SELECT count(*) FROM \"${table}\";" 2>/dev/null || echo "ERROR")
if [ "${SOURCE_COUNT}" = "ERROR" ] || [ "${VERIFY_COUNT}" = "ERROR" ]; then
log_warn "Row count: ${table}" "Could not query row count"
continue
fi
ROW_REPORT="${ROW_REPORT}\n ${table}: source=${SOURCE_COUNT} restored=${VERIFY_COUNT}"
if [ "${SOURCE_COUNT}" != "${VERIFY_COUNT}" ]; then
ROW_MISMATCHES=$((ROW_MISMATCHES + 1))
log_fail "Row count: ${table}" "Mismatch — source=${SOURCE_COUNT} restored=${VERIFY_COUNT}"
fi
done
if [ "${ROW_MISMATCHES}" -eq 0 ]; then
log_pass "Row counts" "All tables match between source and restored database"
else
log_fail "Row counts" "${ROW_MISMATCHES} table(s) have row count mismatches"
fi
# ── Step 5: Key data checksums ──
log_info "Step 5/5: Verifying data checksums on critical tables..."
# Checksum key tables by hashing sorted IDs + key fields
verify_checksum() {
local table="$1"
local query="$2"
local label="$3"
# Skip if table is missing
if [[ " ${MISSING_TABLES[*]} " =~ " ${table} " ]]; then
log_warn "Checksum: ${label}" "Table '${table}' missing, skipped"
return
fi
SOURCE_HASH=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" \
-t -A -c "${query}" 2>/dev/null || echo "ERROR")
VERIFY_HASH=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${VERIFY_DB}" \
-t -A -c "${query}" 2>/dev/null || echo "ERROR")
if [ "${SOURCE_HASH}" = "ERROR" ] || [ "${VERIFY_HASH}" = "ERROR" ]; then
log_warn "Checksum: ${label}" "Could not compute checksum"
return
fi
if [ "${SOURCE_HASH}" = "${VERIFY_HASH}" ]; then
log_pass "Checksum: ${label}" "Hashes match (${SOURCE_HASH:0:16}...)"
else
log_fail "Checksum: ${label}" "Hash mismatch — source=${SOURCE_HASH:0:16}... restored=${VERIFY_HASH:0:16}..."
fi
}
# Checksum queries for critical tables
verify_checksum "User" \
"SELECT md5(string_agg(id || email || phone || \"fullName\" || role::text || \"kycStatus\"::text, '|' ORDER BY id)) FROM \"User\";" \
"User identities"
verify_checksum "Property" \
"SELECT md5(string_agg(id || title || district || city || \"propertyType\"::text || \"areaM2\"::text, '|' ORDER BY id)) FROM \"Property\";" \
"Property records"
verify_checksum "Listing" \
"SELECT md5(string_agg(id || \"propertyId\" || \"sellerId\" || status::text || \"priceVND\"::text || \"transactionType\"::text, '|' ORDER BY id)) FROM \"Listing\";" \
"Listing records"
verify_checksum "Payment" \
"SELECT md5(string_agg(id || \"userId\" || provider::text || type::text || \"amountVND\"::text || status::text, '|' ORDER BY id)) FROM \"Payment\";" \
"Payment records"
verify_checksum "Subscription" \
"SELECT md5(string_agg(id || \"userId\" || \"planId\" || status::text, '|' ORDER BY id)) FROM \"Subscription\";" \
"Subscription records"
verify_checksum "Transaction" \
"SELECT md5(string_agg(id || \"listingId\" || \"buyerId\" || status::text, '|' ORDER BY id)) FROM \"Transaction\";" \
"Transaction records"
verify_checksum "Plan" \
"SELECT md5(string_agg(id || tier::text || name || \"priceMonthlyVND\"::text, '|' ORDER BY id)) FROM \"Plan\";" \
"Plan records"
verify_checksum "_prisma_migrations" \
"SELECT md5(string_agg(id || migration_name || checksum, '|' ORDER BY started_at)) FROM \"_prisma_migrations\";" \
"Migration history"
# ── PostGIS extension check ──
POSTGIS_CHECK=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${VERIFY_DB}" \
-t -A -c "SELECT extname FROM pg_extension WHERE extname = 'postgis';" 2>/dev/null || echo "")
if [ "${POSTGIS_CHECK}" = "postgis" ]; then
log_pass "PostGIS extension" "PostGIS is available in restored database"
else
log_fail "PostGIS extension" "PostGIS extension not found in restored database"
fi
# ── Index verification ──
SOURCE_INDEX_COUNT=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" \
-t -A -c "SELECT count(*) FROM pg_indexes WHERE schemaname = 'public';" 2>/dev/null || echo "0")
VERIFY_INDEX_COUNT=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${VERIFY_DB}" \
-t -A -c "SELECT count(*) FROM pg_indexes WHERE schemaname = 'public';" 2>/dev/null || echo "0")
if [ "${SOURCE_INDEX_COUNT}" = "${VERIFY_INDEX_COUNT}" ]; then
log_pass "Index count" "All ${SOURCE_INDEX_COUNT} indexes restored"
else
log_warn "Index count" "Source has ${SOURCE_INDEX_COUNT} indexes, restored has ${VERIFY_INDEX_COUNT}"
fi
# ── Enum type verification ──
SOURCE_ENUM_COUNT=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" \
-t -A -c "SELECT count(*) FROM pg_type WHERE typtype = 'e';" 2>/dev/null || echo "0")
VERIFY_ENUM_COUNT=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${VERIFY_DB}" \
-t -A -c "SELECT count(*) FROM pg_type WHERE typtype = 'e';" 2>/dev/null || echo "0")
if [ "${SOURCE_ENUM_COUNT}" = "${VERIFY_ENUM_COUNT}" ]; then
log_pass "Enum types" "All ${SOURCE_ENUM_COUNT} enum types restored"
else
log_fail "Enum types" "Source has ${SOURCE_ENUM_COUNT} enums, restored has ${VERIFY_ENUM_COUNT}"
fi
# ── Summary ──
echo ""
echo "================================================================"
echo " Verification Summary"
echo "================================================================"
echo " Backup file : ${BACKUP_FILE}"
echo " Restore time : ${RESTORE_DURATION}s"
TOTAL=$((PASSED + FAILED))
echo " Checks : ${TOTAL} total, ${PASSED} passed, ${FAILED} failed, ${WARNINGS} warnings"
echo " Finished at : $(date -Iseconds)"
if [ "${FAILED}" -eq 0 ]; then
echo -e " Result : ${GREEN}ALL CHECKS PASSED${NC}"
echo "================================================================"
else
echo -e " Result : ${RED}${FAILED} CHECK(S) FAILED${NC}"
echo "================================================================"
fi
echo ""
# ── Write JSON report (if requested) ──
if [ -n "${REPORT_FILE}" ]; then
RESULTS_JSON=$(printf '%s,' "${RESULTS[@]}")
RESULTS_JSON="[${RESULTS_JSON%,}]"
cat > "${REPORT_FILE}" << JSONEOF
{
"timestamp": "$(date -Iseconds)",
"backupFile": "${BACKUP_FILE}",
"backupSize": "${BACKUP_SIZE}",
"testDatabase": "${VERIFY_DB}",
"restoreDurationSeconds": ${RESTORE_DURATION},
"passed": ${PASSED},
"failed": ${FAILED},
"warnings": ${WARNINGS},
"result": "$([ ${FAILED} -eq 0 ] && echo "pass" || echo "fail")",
"checks": ${RESULTS_JSON}
}
JSONEOF
log_info "JSON report written to ${REPORT_FILE}"
fi
# Exit with appropriate code
if [ "${FAILED}" -gt 0 ]; then
exit 1
fi
exit 0