fix(deploy): tag rollback images before pull, prune after smoke test

Previously, `docker image prune` ran immediately after deploying new
containers, potentially deleting the old images needed for rollback
if smoke tests subsequently failed. Now the deploy pipeline:

1. Tags current images as :rollback before pulling new versions
2. Only runs `docker image prune` after smoke tests pass
3. Uses explicit :rollback tags for rollback instead of relying on
   the old images still being present in the local Docker image cache
   (which is fragile)

Applied to:
- scripts/deploy-production.sh (manual deploy script)
- .github/workflows/deploy.yml (staging + production CI jobs)
- docs/deployment.md (updated rollback documentation)

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Ho Ngoc Hai
2026-04-15 11:17:32 +07:00
parent b809fabd41
commit 20b79acf08
9 changed files with 922 additions and 42 deletions

View File

@@ -52,8 +52,8 @@ log " Compose: ${COMPOSE_FILE}"
log "=========================================="
echo ""
# ── Step 1: Record Current State (for rollback) ──────────────────────────────
log "Step 1/6: Recording current state for rollback..."
# ── Step 1: Record Current State & Tag for Rollback ──────────────────────────
log "Step 1/7: Recording current state and tagging rollback images..."
PREV_API=$(docker inspect --format='{{.Config.Image}}' goodgo-api 2>/dev/null || echo "none")
PREV_WEB=$(docker inspect --format='{{.Config.Image}}' goodgo-web 2>/dev/null || echo "none")
PREV_AI=$(docker inspect --format='{{.Config.Image}}' goodgo-ai-services 2>/dev/null || echo "none")
@@ -61,14 +61,28 @@ info "Previous API: ${PREV_API}"
info "Previous Web: ${PREV_WEB}"
info "Previous AI: ${PREV_AI}"
# Tag current images as :rollback so they survive docker image prune.
# A service whose previous image was recorded as "none" is skipped. Success is
# only logged when `docker tag` actually succeeded; otherwise a warning is
# emitted and the deploy continues (tagging is best-effort).
if [ "$PREV_API" != "none" ]; then
  if docker tag "$PREV_API" goodgo-api:rollback 2>/dev/null; then
    info "Tagged API rollback: goodgo-api:rollback"
  else
    warn "Could not tag API rollback image"
  fi
fi
if [ "$PREV_WEB" != "none" ]; then
  if docker tag "$PREV_WEB" goodgo-web:rollback 2>/dev/null; then
    info "Tagged Web rollback: goodgo-web:rollback"
  else
    warn "Could not tag Web rollback image"
  fi
fi
if [ "$PREV_AI" != "none" ]; then
  if docker tag "$PREV_AI" goodgo-ai-services:rollback 2>/dev/null; then
    info "Tagged AI rollback: goodgo-ai-services:rollback"
  else
    warn "Could not tag AI rollback image"
  fi
fi
# ── Step 2: Pull New Images ──────────────────────────────────────────────────
log "Step 2/6: Pulling new images (tag: ${IMAGE_TAG})..."
log "Step 2/7: Pulling new images (tag: ${IMAGE_TAG})..."
export IMAGE_TAG
docker compose -f "$COMPOSE_FILE" pull api web ai-services
log "Images pulled successfully."
# ── Step 3: Rolling Update ───────────────────────────────────────────────────
log "Step 3/6: Rolling update (zero-downtime)..."
log "Step 3/7: Rolling update (zero-downtime)..."
info "Updating API..."
docker compose -f "$COMPOSE_FILE" up -d --no-deps --wait api
@@ -85,12 +99,12 @@ info "AI Services updated and healthy."
log "Rolling update complete."
# ── Step 4: Database Migrations ──────────────────────────────────────────────
log "Step 4/6: Running database migrations..."
log "Step 4/7: Running database migrations..."
docker compose -f "$COMPOSE_FILE" exec -T api npx prisma migrate deploy
log "Migrations complete."
# ── Step 5: Health Check Verification ────────────────────────────────────────
log "Step 5/6: Verifying deployment health..."
log "Step 5/7: Verifying deployment health..."
HEALTHY=false
for i in $(seq 1 "$HEALTH_RETRIES"); do
if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
@@ -107,10 +121,25 @@ else
err "Health check failed after ${HEALTH_RETRIES} attempts!"
if $ROLLBACK_ON_FAIL; then
warn "Initiating rollback..."
warn "Initiating rollback using tagged rollback images..."
# Rollback: stop current, docker compose will use previously cached images
# Rollback: stop current and restart with explicitly tagged rollback images
docker compose -f "$COMPOSE_FILE" stop api web ai-services
# Restore from :rollback tags if available
if docker image inspect goodgo-api:rollback > /dev/null 2>&1; then
info "Restoring API from goodgo-api:rollback"
docker tag goodgo-api:rollback "$PREV_API" 2>/dev/null || true
fi
if docker image inspect goodgo-web:rollback > /dev/null 2>&1; then
info "Restoring Web from goodgo-web:rollback"
docker tag goodgo-web:rollback "$PREV_WEB" 2>/dev/null || true
fi
if docker image inspect goodgo-ai-services:rollback > /dev/null 2>&1; then
info "Restoring AI from goodgo-ai-services:rollback"
docker tag goodgo-ai-services:rollback "$PREV_AI" 2>/dev/null || true
fi
docker compose -f "$COMPOSE_FILE" up -d --wait api web ai-services
warn "Rollback complete. Verifying..."
@@ -126,8 +155,52 @@ else
exit 1
fi
# ── Step 6: Cleanup ──────────────────────────────────────────────────────────
log "Step 6/6: Cleaning up old images..."
# ── Step 6: Smoke Tests ─────────────────────────────────────────────────────
# Runs smoke-test.sh (located next to this script) against
# http://127.0.0.1:3001. On failure, and when ROLLBACK_ON_FAIL is true, the
# services are restarted after restoring the :rollback image tags; the script
# then exits 1 either way.
# SMOKE_PASSED stays "false" unless the smoke tests actually ran and passed, so
# a skipped run is never reported as PASSED in the final summary.
log "Step 6/7: Running smoke tests..."
SMOKE_PASSED=false
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [ -x "$SCRIPT_DIR/smoke-test.sh" ]; then
  if "$SCRIPT_DIR/smoke-test.sh" "http://127.0.0.1:3001"; then
    SMOKE_PASSED=true
    log "Smoke tests passed!"
  else
    err "Smoke tests FAILED!"
    if $ROLLBACK_ON_FAIL; then
      warn "Initiating rollback due to smoke test failure..."
      docker compose -f "$COMPOSE_FILE" stop api web ai-services
      # Point the previous image names back at the :rollback images. A service
      # whose previous image was recorded as "none" is skipped: retagging a
      # leftover :rollback image as "none" would only create a bogus image name.
      if [ "$PREV_API" != "none" ] && docker image inspect goodgo-api:rollback > /dev/null 2>&1; then
        docker tag goodgo-api:rollback "$PREV_API" 2>/dev/null || warn "Could not restore API image from goodgo-api:rollback"
      fi
      if [ "$PREV_WEB" != "none" ] && docker image inspect goodgo-web:rollback > /dev/null 2>&1; then
        docker tag goodgo-web:rollback "$PREV_WEB" 2>/dev/null || warn "Could not restore Web image from goodgo-web:rollback"
      fi
      if [ "$PREV_AI" != "none" ] && docker image inspect goodgo-ai-services:rollback > /dev/null 2>&1; then
        docker tag goodgo-ai-services:rollback "$PREV_AI" 2>/dev/null || warn "Could not restore AI image from goodgo-ai-services:rollback"
      fi
      docker compose -f "$COMPOSE_FILE" up -d --wait api web ai-services
      warn "Rollback complete. Verifying..."
      # Short grace period before the single post-rollback health probe.
      sleep 5
      if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
        warn "Services recovered after rollback."
      else
        err "CRITICAL: Services still unhealthy after rollback!"
      fi
    fi
    # A failed smoke test always fails the deploy, rolled back or not.
    exit 1
  fi
else
  # Skipped is not the same as passed: leave SMOKE_PASSED=false so the summary
  # tells the operator to run the smoke tests manually.
  warn "Smoke test script not found at $SCRIPT_DIR/smoke-test.sh — skipping."
  warn "Run manually: ./scripts/smoke-test.sh https://api.goodgo.vn"
fi
# ── Step 7: Cleanup (only after smoke tests pass) ───────────────────────────
log "Step 7/7: Cleaning up old images..."
# The deploy succeeded, so the :rollback tags are no longer needed. Drop each
# one on a best-effort basis (errors are ignored) before running the prune.
for rollback_image in goodgo-api:rollback goodgo-web:rollback goodgo-ai-services:rollback; do
  docker rmi "$rollback_image" 2>/dev/null || true
done
docker image prune -f
log "Cleanup complete."
@@ -147,6 +220,10 @@ info " Web: https://platform.goodgo.vn"
info " API: https://api.goodgo.vn"
info " Grafana: https://grafana.goodgo.vn"
log ""
log " Run smoke tests:"
info " ./scripts/smoke-test.sh https://api.goodgo.vn"
# Report the smoke test outcome in the deploy summary.
# SMOKE_PASSED holds the literal word "true" or "false", so it is executed
# directly as the condition command.
if $SMOKE_PASSED; then
log " Smoke tests: PASSED"
else
log " Run smoke tests against public URL:"
info " ./scripts/smoke-test.sh https://api.goodgo.vn"
fi
log "=========================================="