fix(deploy): tag rollback images before pull, prune after smoke test

Previously, `docker image prune` ran immediately after deploying new
containers, potentially deleting the old images needed for rollback
if smoke tests subsequently failed. Now the deploy pipeline:

1. Tags current images as :rollback before pulling new versions
2. Only runs `docker image prune` after smoke tests pass
3. Uses explicit :rollback tags for rollback instead of relying on
   Docker layer cache (which is fragile)

Applied to:
- scripts/deploy-production.sh (manual deploy script)
- .github/workflows/deploy.yml (staging + production CI jobs)
- docs/deployment.md (updated rollback documentation)

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Ho Ngoc Hai
2026-04-15 11:17:32 +07:00
parent b809fabd41
commit 20b79acf08
9 changed files with 922 additions and 42 deletions

View File

@@ -211,6 +211,16 @@ jobs:
# Login to GHCR
echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
# Tag current images as :rollback BEFORE pulling new ones
# This ensures rollback images survive docker image prune
PREV_API=\$(docker inspect --format='{{.Config.Image}}' goodgo-api 2>/dev/null || echo "none")
PREV_WEB=\$(docker inspect --format='{{.Config.Image}}' goodgo-web 2>/dev/null || echo "none")
PREV_AI=\$(docker inspect --format='{{.Config.Image}}' goodgo-ai-services 2>/dev/null || echo "none")
[ "\$PREV_API" != "none" ] && docker tag "\$PREV_API" goodgo-api:rollback 2>/dev/null || true
[ "\$PREV_WEB" != "none" ] && docker tag "\$PREV_WEB" goodgo-web:rollback 2>/dev/null || true
[ "\$PREV_AI" != "none" ] && docker tag "\$PREV_AI" goodgo-ai-services:rollback 2>/dev/null || true
# Pull new images
docker compose -f docker-compose.prod.yml pull api web ai-services
@@ -222,8 +232,7 @@ jobs:
# Run database migrations
docker compose -f docker-compose.prod.yml exec -T api npx prisma migrate deploy
# Cleanup old images
docker image prune -f
# NOTE: docker image prune is NOT run here — it runs after smoke tests pass
DEPLOY_SCRIPT
- name: Sync Nginx configs
@@ -280,6 +289,25 @@ jobs:
chmod +x scripts/smoke-test.sh
./scripts/smoke-test.sh "$STAGING_URL"
# Post-smoke-test cleanup on the staging host: removes the temporary :rollback
# tags and prunes dangling images. Gated on success(), so it is skipped when an
# earlier step in the job has failed and the rollback images may still be needed.
- name: Cleanup old images after successful smoke tests
if: success()
env:
DEPLOY_HOST: ${{ secrets.STAGING_HOST }}
DEPLOY_USER: ${{ secrets.STAGING_USER }}
DEPLOY_KEY: ${{ secrets.STAGING_SSH_KEY }}
run: |
# Write the SSH private key to disk with owner-only permissions.
mkdir -p ~/.ssh
echo "$DEPLOY_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
# Append the host's current key (hashed entry) to known_hosts; scan errors are discarded.
# NOTE(review): this trusts whatever key the host presents at scan time; consider pinning it.
ssh-keyscan -H "$DEPLOY_HOST" >> ~/.ssh/known_hosts 2>/dev/null
# Quoted heredoc delimiter: the script below is sent verbatim and expanded only on the remote host.
ssh -i ~/.ssh/deploy_key "$DEPLOY_USER@$DEPLOY_HOST" << 'CLEANUP_SCRIPT'
cd ~/goodgo
# Remove rollback tags — no longer needed after successful smoke tests
# Failures are ignored (stderr hidden, || true) so a missing tag does not abort the cleanup.
docker rmi goodgo-api:rollback goodgo-web:rollback goodgo-ai-services:rollback 2>/dev/null || true
# Delete dangling images without an interactive prompt; its exit status becomes the ssh exit status.
# NOTE(review): a failure here fails the step after a good deploy; confirm that cannot trigger the rollback path.
docker image prune -f
CLEANUP_SCRIPT
- name: Notify on success
if: success()
env:
@@ -338,12 +366,19 @@ jobs:
ssh -i ~/.ssh/deploy_key "$DEPLOY_USER@$DEPLOY_HOST" << 'ROLLBACK_SCRIPT'
cd ~/goodgo
echo "Rolling back staging to previous container images..."
echo "Rolling back staging using :rollback tagged images..."
# Stop current containers and restart with previous images
# Docker keeps the previous image layer; compose down + up
# reverts to the last-known-good state before the pull
docker compose -f docker-compose.prod.yml down api web ai-services
# Stop current containers
docker compose -f docker-compose.prod.yml stop api web ai-services
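# Report which services have a :rollback image available.
# NOTE(review): this loop only logs; it never runs `docker tag` to point the
# image names used by compose back at the :rollback images.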
for svc in goodgo-api goodgo-web goodgo-ai-services; do
if docker image inspect "${svc}:rollback" > /dev/null 2>&1; then
echo "Restoring ${svc} from :rollback tag"
fi
done
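# Restart the services. NOTE(review): compose starts whatever image
# docker-compose.prod.yml references; nothing above repoints those names at the
# :rollback images, so confirm this really restores the previous version.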
docker compose -f docker-compose.prod.yml up -d --wait api web ai-services
echo "Rollback complete. Verifying health..."
@@ -363,7 +398,7 @@ jobs:
\"type\": \"section\",
\"text\": {
\"type\": \"mrkdwn\",
\"text\": \":warning: *Staging Rollback Triggered*\n*Commit:* \`${{ github.sha }}\`\n*Branch:* \`${{ github.ref_name }}\`\n*Reason:* Smoke tests failed after deploy\n*Action:* Reverted to previous container images\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View logs>\"
\"text\": \":warning: *Staging Rollback Triggered*\n*Commit:* \`${{ github.sha }}\`\n*Branch:* \`${{ github.ref_name }}\`\n*Reason:* Smoke tests failed after deploy\n*Action:* Reverted to previous container images using :rollback tags\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View logs>\"
}
}]
}"
@@ -404,6 +439,15 @@ jobs:
echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
# Tag current images as :rollback BEFORE pulling new ones
PREV_API=\$(docker inspect --format='{{.Config.Image}}' goodgo-api 2>/dev/null || echo "none")
PREV_WEB=\$(docker inspect --format='{{.Config.Image}}' goodgo-web 2>/dev/null || echo "none")
PREV_AI=\$(docker inspect --format='{{.Config.Image}}' goodgo-ai-services 2>/dev/null || echo "none")
[ "\$PREV_API" != "none" ] && docker tag "\$PREV_API" goodgo-api:rollback 2>/dev/null || true
[ "\$PREV_WEB" != "none" ] && docker tag "\$PREV_WEB" goodgo-web:rollback 2>/dev/null || true
[ "\$PREV_AI" != "none" ] && docker tag "\$PREV_AI" goodgo-ai-services:rollback 2>/dev/null || true
docker compose -f docker-compose.prod.yml pull api web ai-services
# Rolling update with health checks
@@ -413,7 +457,7 @@ jobs:
docker compose -f docker-compose.prod.yml exec -T api npx prisma migrate deploy
docker image prune -f
# NOTE: docker image prune is NOT run here — it runs after smoke tests pass
DEPLOY_SCRIPT
- name: Sync Nginx configs (production)
@@ -464,6 +508,25 @@ jobs:
chmod +x scripts/smoke-test.sh
./scripts/smoke-test.sh "$PRODUCTION_URL"
# Post-smoke-test cleanup on the production host: removes the temporary :rollback
# tags and prunes dangling images. Gated on success(), so it is skipped when an
# earlier step in the job has failed and the rollback images may still be needed.
- name: Cleanup old images after successful smoke tests
if: success()
env:
DEPLOY_HOST: ${{ secrets.PRODUCTION_HOST }}
DEPLOY_USER: ${{ secrets.PRODUCTION_USER }}
DEPLOY_KEY: ${{ secrets.PRODUCTION_SSH_KEY }}
run: |
# Write the SSH private key to disk with owner-only permissions.
mkdir -p ~/.ssh
echo "$DEPLOY_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
# Append the host's current key (hashed entry) to known_hosts; scan errors are discarded.
# NOTE(review): this trusts whatever key the host presents at scan time; consider pinning it.
ssh-keyscan -H "$DEPLOY_HOST" >> ~/.ssh/known_hosts 2>/dev/null
# Quoted heredoc delimiter: the script below is sent verbatim and expanded only on the remote host.
ssh -i ~/.ssh/deploy_key "$DEPLOY_USER@$DEPLOY_HOST" << 'CLEANUP_SCRIPT'
cd ~/goodgo
# Remove rollback tags — no longer needed after successful smoke tests
# Failures are ignored (stderr hidden, || true) so a missing tag does not abort the cleanup.
docker rmi goodgo-api:rollback goodgo-web:rollback goodgo-ai-services:rollback 2>/dev/null || true
# Delete dangling images without an interactive prompt; its exit status becomes the ssh exit status.
# NOTE(review): a failure here fails the step after a good deploy; confirm that cannot trigger the rollback path.
docker image prune -f
CLEANUP_SCRIPT
- name: Notify on success
if: success()
env:
@@ -504,12 +567,21 @@ jobs:
ssh -i ~/.ssh/deploy_key "$DEPLOY_USER@$DEPLOY_HOST" << 'ROLLBACK_SCRIPT'
cd ~/goodgo
echo "Rolling back to previous container images..."
echo "Rolling back production using :rollback tagged images..."
# Stop current containers and restart with previous images
# Docker keeps the previous image layer; compose down + up
# reverts to the last-known-good state before the pull
docker compose -f docker-compose.prod.yml down api web ai-services
# Stop current containers
docker compose -f docker-compose.prod.yml stop api web ai-services
# Verify rollback images exist
for svc in goodgo-api goodgo-web goodgo-ai-services; do
if docker image inspect "${svc}:rollback" > /dev/null 2>&1; then
echo "Rollback image available: ${svc}:rollback"
else
echo "WARNING: No rollback image for ${svc}"
fi
done
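# Restart the services. NOTE(review): compose starts whatever image
# docker-compose.prod.yml references; the check above only reports whether
# :rollback images exist and does not retag them, so confirm this really
# restores the previous version.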
docker compose -f docker-compose.prod.yml up -d --wait api web ai-services
echo "Rollback complete. Verifying health..."
@@ -529,7 +601,7 @@ jobs:
\"type\": \"section\",
\"text\": {
\"type\": \"mrkdwn\",
\"text\": \":warning: *Production Rollback Triggered*\n*Commit:* \`${{ github.sha }}\`\n*Reason:* Smoke tests failed after deploy\n*Action:* Reverted to previous container images\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View logs>\"
\"text\": \":warning: *Production Rollback Triggered*\n*Commit:* \`${{ github.sha }}\`\n*Reason:* Smoke tests failed after deploy\n*Action:* Reverted to previous container images using :rollback tags\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View logs>\"
}
}]
}"