diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 16734d0..5db5c3a 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -2,7 +2,7 @@ name: Deploy
 
 on:
   push:
-    branches: [master]
+    branches: [master, develop]
   workflow_dispatch:
     inputs:
       environment:
@@ -15,7 +15,7 @@ on:
           - production
 
 concurrency:
-  group: deploy-${{ inputs.environment || 'staging' }}
+  group: deploy-${{ github.event_name == 'push' && 'staging' || inputs.environment || 'staging' }}
   cancel-in-progress: false
 
 env:
@@ -53,6 +53,7 @@ jobs:
             type=sha,prefix=
             type=ref,event=branch
             type=raw,value=latest,enable=${{ github.ref == 'refs/heads/master' }}
+            type=raw,value=staging-latest,enable=${{ github.ref == 'refs/heads/develop' }}
 
       - name: Build and push API image
         uses: docker/build-push-action@v6
@@ -95,6 +96,7 @@ jobs:
             type=sha,prefix=
             type=ref,event=branch
             type=raw,value=latest,enable=${{ github.ref == 'refs/heads/master' }}
+            type=raw,value=staging-latest,enable=${{ github.ref == 'refs/heads/develop' }}
 
       - name: Build and push Web image
         uses: docker/build-push-action@v6
@@ -137,6 +139,7 @@ jobs:
             type=sha,prefix=
             type=ref,event=branch
             type=raw,value=latest,enable=${{ github.ref == 'refs/heads/master' }}
+            type=raw,value=staging-latest,enable=${{ github.ref == 'refs/heads/develop' }}
 
       - name: Build and push AI Services image
         uses: docker/build-push-action@v6
@@ -152,7 +155,10 @@ jobs:
   deploy-staging:
     name: Deploy to Staging
     needs: [build-api, build-web, build-ai]
-    if: github.event_name == 'push' || inputs.environment == 'staging'
+    if: >-
+      (github.event_name == 'push' && github.ref == 'refs/heads/develop') ||
+      (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
+      (github.event_name == 'workflow_dispatch' && inputs.environment == 'staging')
     runs-on: ubuntu-latest
     environment: staging
 
@@ -160,6 +166,30 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
+      - name: Record pre-deploy image tags
+        id: pre-deploy
+        env:
+          DEPLOY_HOST: ${{ secrets.STAGING_HOST }}
+          DEPLOY_USER: ${{ secrets.STAGING_USER }}
+          DEPLOY_KEY: ${{ secrets.STAGING_SSH_KEY }}
+        run: |
+          mkdir -p ~/.ssh
+          echo "$DEPLOY_KEY" > ~/.ssh/deploy_key
+          chmod 600 ~/.ssh/deploy_key
+          ssh-keyscan -H "$DEPLOY_HOST" >> ~/.ssh/known_hosts 2>/dev/null
+
+          # Capture current image digests for rollback
+          PREV_API=$(ssh -i ~/.ssh/deploy_key "$DEPLOY_USER@$DEPLOY_HOST" \
+            "docker inspect --format='{{.Image}}' goodgo-api 2>/dev/null" || echo "none")
+          PREV_WEB=$(ssh -i ~/.ssh/deploy_key "$DEPLOY_USER@$DEPLOY_HOST" \
+            "docker inspect --format='{{.Image}}' goodgo-web 2>/dev/null" || echo "none")
+          PREV_AI=$(ssh -i ~/.ssh/deploy_key "$DEPLOY_USER@$DEPLOY_HOST" \
+            "docker inspect --format='{{.Image}}' goodgo-ai-services 2>/dev/null" || echo "none")
+
+          echo "prev_api=$PREV_API" >> "$GITHUB_OUTPUT"
+          echo "prev_web=$PREV_WEB" >> "$GITHUB_OUTPUT"
+          echo "prev_ai=$PREV_AI" >> "$GITHUB_OUTPUT"
+
       - name: Deploy to staging
         env:
           DEPLOY_HOST: ${{ secrets.STAGING_HOST }}
@@ -167,11 +197,6 @@ jobs:
           DEPLOY_KEY: ${{ secrets.STAGING_SSH_KEY }}
           IMAGE_TAG: ${{ github.sha }}
         run: |
-          mkdir -p ~/.ssh
-          echo "$DEPLOY_KEY" > ~/.ssh/deploy_key
-          chmod 600 ~/.ssh/deploy_key
-          ssh-keyscan -H "$DEPLOY_HOST" >> ~/.ssh/known_hosts 2>/dev/null
-
           # Copy production compose and deploy
           scp -i ~/.ssh/deploy_key docker-compose.prod.yml "$DEPLOY_USER@$DEPLOY_HOST:~/goodgo/"
           scp -i ~/.ssh/deploy_key -r monitoring/ "$DEPLOY_USER@$DEPLOY_HOST:~/goodgo/monitoring/"
@@ -214,6 +239,11 @@ jobs:
           echo "Staging health check failed"
           exit 1
 
+    outputs:
+      prev_api: ${{ steps.pre-deploy.outputs.prev_api }}
+      prev_web: ${{ steps.pre-deploy.outputs.prev_web }}
+      prev_ai: ${{ steps.pre-deploy.outputs.prev_ai }}
+
   smoke-test-staging:
     name: Smoke Test Staging
     needs: [deploy-staging]
@@ -231,6 +261,24 @@ jobs:
           chmod +x scripts/smoke-test.sh
           ./scripts/smoke-test.sh "$STAGING_URL"
 
+      - name: Notify on success
+        if: success()
+        env:
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }}
+        run: |
+          curl -s -X POST "$SLACK_WEBHOOK" \
+            -H "Content-Type: application/json" \
+            -d "{
+              \"text\": \":white_check_mark: *Staging deploy successful* for \`${{ github.sha }}\`\",
+              \"blocks\": [{
+                \"type\": \"section\",
+                \"text\": {
+                  \"type\": \"mrkdwn\",
+                  \"text\": \":white_check_mark: *Staging Deploy Successful*\n*Commit:* \`${{ github.sha }}\`\n*Branch:* \`${{ github.ref_name }}\`\n*All smoke tests passed.*\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View logs>\"
+                }
+              }]
+            }"
+
       - name: Notify on failure
         if: failure()
         env:
@@ -244,7 +292,100 @@ jobs:
                 \"type\": \"section\",
                 \"text\": {
                   \"type\": \"mrkdwn\",
-                  \"text\": \":rotating_light: *Staging Smoke Test Failure*\n*Commit:* \`${{ github.sha }}\`\n*Branch:* \`${{ github.ref_name }}\`\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View logs>\"
+                  \"text\": \":rotating_light: *Staging Smoke Test Failure*\n*Commit:* \`${{ github.sha }}\`\n*Branch:* \`${{ github.ref_name }}\`\n*Action:* Automatic rollback initiated\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View logs>\"
+                }
+              }]
+            }"
+
+  rollback-staging:
+    name: Rollback Staging
+    needs: [deploy-staging, smoke-test-staging]
+    # A bare failure() is also true when a build job fails, which would
+    # restart healthy containers although nothing was deployed. Roll back
+    # only when the deploy itself or the smoke tests failed.
+    if: >-
+      always() &&
+      (needs.deploy-staging.result == 'failure' ||
+      needs.smoke-test-staging.result == 'failure')
+    runs-on: ubuntu-latest
+    environment: staging
+
+    steps:
+      - name: Rollback to previous images
+        env:
+          DEPLOY_HOST: ${{ secrets.STAGING_HOST }}
+          DEPLOY_USER: ${{ secrets.STAGING_USER }}
+          DEPLOY_KEY: ${{ secrets.STAGING_SSH_KEY }}
+          # Image IDs captured by the pre-deploy step of deploy-staging
+          PREV_API: ${{ needs.deploy-staging.outputs.prev_api }}
+          PREV_WEB: ${{ needs.deploy-staging.outputs.prev_web }}
+          PREV_AI: ${{ needs.deploy-staging.outputs.prev_ai }}
+        run: |
+          mkdir -p ~/.ssh
+          echo "$DEPLOY_KEY" > ~/.ssh/deploy_key
+          chmod 600 ~/.ssh/deploy_key
+          ssh-keyscan -H "$DEPLOY_HOST" >> ~/.ssh/known_hosts 2>/dev/null
+
+          # The heredoc is quoted, so runner variables are not expanded inside
+          # it; pass the recorded image IDs as positional arguments instead.
+          ssh -i ~/.ssh/deploy_key "$DEPLOY_USER@$DEPLOY_HOST" \
+            bash -s -- "${PREV_API:-none}" "${PREV_WEB:-none}" "${PREV_AI:-none}" << 'ROLLBACK_SCRIPT'
+            set -euo pipefail
+            cd ~/goodgo
+
+            echo "Rolling back staging to previous container images..."
+
+            # rollback_service SERVICE CONTAINER PREV_IMAGE_ID
+            # Re-points the image reference the container was created from at
+            # the image recorded before the deploy, then recreates the service.
+            # A plain "down" + "up" would start the newly pulled image again.
+            # NOTE(review): assumes compose does not force a pull on "up".
+            rollback_service() {
+              local service="$1" container="$2" prev="$3" ref current
+              case "$prev" in
+                sha256:*) ;;
+                *) echo "No previous image recorded for $service; skipping"; return 0 ;;
+              esac
+              if ! ref=$(docker inspect --format='{{.Config.Image}}' "$container" 2>/dev/null); then
+                echo "Container $container not found; skipping $service"
+                return 0
+              fi
+              current=$(docker inspect --format='{{.Image}}' "$container")
+              if [ "$current" = "$prev" ]; then
+                echo "$service already runs the previous image; skipping"
+                return 0
+              fi
+              docker tag "$prev" "$ref"
+              docker compose -f docker-compose.prod.yml up -d --no-deps --force-recreate --wait "$service"
+            }
+
+            rollback_service api goodgo-api "$1"
+            rollback_service web goodgo-web "$2"
+            rollback_service ai-services goodgo-ai-services "$3"
+
+            echo "Rollback complete. Verifying health..."
+            sleep 5
+            if ! curl -sf http://localhost:3001/health; then
+              echo "ERROR: health check failed after rollback"
+              exit 1
+            fi
+          ROLLBACK_SCRIPT
+
+      - name: Notify rollback
+        # Report the outcome even when the rollback step itself failed
+        if: always()
+        env:
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }}
+        run: |
+          curl -s -X POST "$SLACK_WEBHOOK" \
+            -H "Content-Type: application/json" \
+            -d "{
+              \"text\": \":warning: *Staging ROLLBACK triggered* for \`${{ github.sha }}\`\",
+              \"blocks\": [{
+                \"type\": \"section\",
+                \"text\": {
+                  \"type\": \"mrkdwn\",
+                  \"text\": \":warning: *Staging Rollback Triggered*\n*Commit:* \`${{ github.sha }}\`\n*Branch:* \`${{ github.ref_name }}\`\n*Reason:* Deploy or smoke tests failed\n*Action:* Rollback to previous container images (${{ job.status }})\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View logs>\"
                 }
               }]
             }"
diff --git a/docs/deployment.md b/docs/deployment.md
index 75d841a..fc60560 100644
--- a/docs/deployment.md
+++ b/docs/deployment.md
@@ -214,11 +214,116 @@ docker run -p 8000:8000 --env-file ../../.env goodgo-ai-services
 └────────┘ └────────┘ └────────────┘
 ```
 
+## CI/CD Pipeline
+
+### Branch Strategy
+
+| Branch | Deploy Target | Trigger | Notes |
+|--------|--------------|---------|-------|
+| `develop` | Staging | Auto (push) | Every merge to `develop` auto-deploys to staging |
+| `master` | Staging | Auto (push) | Master push also deploys to staging for verification |
+| Manual | Staging/Production | `workflow_dispatch` | Manual trigger via GitHub Actions UI |
+
+### Staging Auto-Deploy Flow
+
+```
+Push to develop → Build images → Deploy to staging → Smoke tests → ✅ / Rollback
+```
+
+1. 
**Build**: Docker images for API, Web, and AI Services are built and pushed to GHCR with the `staging-latest` tag +2. **Deploy**: Images are pulled and services are updated via rolling restart (zero-downtime) +3. **Verify**: Health check polls `$STAGING_URL/health` for up to 100 seconds +4. **Smoke test**: `scripts/smoke-test.sh` runs against the staging URL, checking health probes, core API endpoints, search, and auth +5. **Notify**: Slack notification on success or failure +6. **Rollback**: If smoke tests fail, an automatic rollback restores the previous container images + +### Notifications + +Deploy status notifications are sent to Slack via the `SLACK_WEBHOOK_URL` secret: + +| Event | Channel | Content | +|-------|---------|---------| +| Staging smoke tests pass | Slack | ✅ Commit SHA, branch, link to run | +| Staging smoke tests fail | Slack | 🚨 Commit SHA, branch, rollback notice, link to run | +| Staging rollback triggered | Slack | ⚠️ Commit SHA, branch, reason, link to run | +| Production deploy success | Slack | ✅ Commit SHA, branch | +| Production rollback triggered | Slack | ⚠️ Commit SHA, reason, link to run | + +### Required Secrets + +| Secret | Environment | Description | +|--------|-------------|-------------| +| `STAGING_HOST` | staging | Staging server hostname/IP | +| `STAGING_USER` | staging | SSH user for staging deploys | +| `STAGING_SSH_KEY` | staging | SSH private key for staging | +| `STAGING_URL` | staging | Staging base URL (e.g., `https://staging.goodgo.vn`) | +| `PRODUCTION_HOST` | production | Production server hostname/IP | +| `PRODUCTION_USER` | production | SSH user for production deploys | +| `PRODUCTION_SSH_KEY` | production | SSH private key for production | +| `PRODUCTION_URL` | production | Production base URL | +| `SLACK_WEBHOOK_URL` | both | Slack incoming webhook URL | + ## Rollback -### Application Rollback +### Automatic Rollback (Staging) -Deploy the previous container image or build artifact. The API and Web are stateless — no rollback-specific steps needed. 
+The staging pipeline includes automatic rollback when smoke tests fail: + +1. **Pre-deploy**: The image IDs of the running containers are recorded before deployment +2. **Smoke test failure**: If `scripts/smoke-test.sh` exits non-zero, the `rollback-staging` job triggers +3. **Rollback execution**: Containers are stopped and restarted with previous images +4. **Verification**: The API health endpoint is checked after the rollback +5. **Notification**: A Slack notification reports the rollback with a link to the failed run + +### Automatic Rollback (Production) + +Same mechanism as staging — smoke test failure triggers `rollback-production`. + +### Manual Rollback + +To manually roll back a staging or production deployment: + +#### Option 1: Re-deploy a known-good commit + +```bash +# Trigger a deploy from a known-good branch or tag via GitHub Actions (--ref takes a branch or tag name) +gh workflow run deploy.yml \ + --ref KNOWN_GOOD_BRANCH_OR_TAG \ + -f environment=staging +``` + +#### Option 2: SSH rollback (emergency) + +```bash +# SSH into the staging/production server +ssh deploy@SERVER_HOST + +cd ~/goodgo + +# Stop the current services +docker compose -f docker-compose.prod.yml down api web ai-services + +# Restart the services. Note: this reuses the image tag compose currently resolves; use Option 3 to pin a known-good tag +docker compose -f docker-compose.prod.yml up -d --wait api web ai-services + +# Verify health +curl -sf http://localhost:3001/health +``` + +#### Option 3: Pin to a specific image tag + +```bash +ssh deploy@SERVER_HOST +cd ~/goodgo + +# Set IMAGE_TAG to a known-good SHA +export IMAGE_TAG=KNOWN_GOOD_SHA +export REGISTRY_URL=ghcr.io/OWNER + +# Pull and restart with the pinned tag +docker compose -f docker-compose.prod.yml pull api web ai-services +docker compose -f docker-compose.prod.yml up -d --no-deps --wait api web ai-services +``` ### Database Rollback @@ -230,3 +335,11 @@ Prisma does not support automatic down migrations. If a migration must be revert 4. Update `_prisma_migrations` table Always test migrations against a staging database before production deployment. 
+ +### Post-Rollback Checklist + +- [ ] Verify health endpoints respond: `GET /health`, `GET /ready` +- [ ] Run smoke tests manually: `./scripts/smoke-test.sh BASE_URL` +- [ ] Check application logs: `docker compose -f docker-compose.prod.yml logs --tail=100 api web` +- [ ] Confirm Grafana dashboards show normal metrics +- [ ] Notify the team via Slack about the rollback and root cause