feat(auth): implement dual-key JWT verification for zero-downtime rotation

Add JWT_SECRET_NEXT env var support for seamless JWT secret rotation: - JwtStrategy: use secretOrKeyProvider to try primary then fallback key - TokenService.verifyAccessToken(): dual-key fallback for internal callers - Redis metric jwt_verify_with_next_total for monitoring cut-over progress - Session revocation marker support restored in JwtStrategy.validate() - Unit tests for all three verification scenarios (primary, fallback, both-fail) - docs/security/secret-rotation.md runbook with step-by-step rotation procedure Closes GOO-203. Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-24 12:08:34 +07:00
parent 7cb12be97f
commit 6afe4fd626
5 changed files with 250 additions and 446 deletions
--- a/apps/api/src/modules/auth/infrastructure/tests/jwt.strategy.spec.ts
+++ b/apps/api/src/modules/auth/infrastructure/tests/jwt.strategy.spec.ts
@@ -33,6 +33,7 @@ type RedisStub = {
  isAvailable: ReturnType<typeof vi.fn>;
  get: ReturnType<typeof vi.fn>;
  set: ReturnType<typeof vi.fn>;
  getClient: ReturnType<typeof vi.fn>;
 };
 function makePrisma(user: { isActive: boolean; deletedAt: Date | null } | null): PrismaStub {
@@ -49,6 +50,27 @@ function makeRedis(options: { available?: boolean; cached?: string | null } = {}
    isAvailable: vi.fn().mockReturnValue(available),
    get: vi.fn().mockResolvedValue(cached),
    set: vi.fn().mockResolvedValue(undefined),
    getClient: vi.fn().mockReturnValue({ incr: vi.fn().mockResolvedValue(1) }),
  };
 }
 function makeRedisWithRevocation(options: {
  available?: boolean;
  revokedAt?: string | null;
  userStatus?: string | null;
 }): RedisStub {
  const { available = true, revokedAt = null, userStatus = null } = options;
  const get = vi.fn(async (key: string) => {
    if (key.startsWith('auth:session_revoked:v1')) return revokedAt;
    if (key.startsWith('auth:user_status:v1')) return userStatus;
    return null;
  });
  return {
    isAvailable: vi.fn().mockReturnValue(available),
    get: get as ReturnType<typeof vi.fn>,
    set: vi.fn().mockResolvedValue(undefined),
    getClient: vi.fn().mockReturnValue({ incr: vi.fn().mockResolvedValue(1) }),
  };
 }
@@ -217,4 +239,82 @@ describe('JwtStrategy', () => {
      strategy.validate({ sub: 'banned-cached', phone: '+84900000005', role: 'BUYER' }),
    ).rejects.toMatchObject({ status: 401 });
  });
  describe('session revocation marker', () => {
    it('rejects tokens issued before the revocation marker (iat < revokedAt)', async () => {
      vi.stubEnv('JWT_SECRET', 'test-secret-key');
      const { JwtStrategy } = await import('../strategies/jwt.strategy');
      const revokedAt = new Date('2026-04-24T12:00:00Z').toISOString();
      const redis = makeRedisWithRevocation({
        available: true,
        revokedAt,
        userStatus: JSON.stringify({ isActive: true, deletedAt: null }),
      });
      const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, redis as never);
      const iatBefore = Math.floor(new Date('2026-04-24T11:59:59Z').getTime() / 1000);
      await expect(
        strategy.validate({ sub: 'user-rev', phone: '+84900000006', role: 'BUYER', iat: iatBefore }),
      ).rejects.toMatchObject({ status: 401 });
    });
    it('accepts tokens issued after the revocation marker', async () => {
      vi.stubEnv('JWT_SECRET', 'test-secret-key');
      const { JwtStrategy } = await import('../strategies/jwt.strategy');
      const revokedAt = new Date('2026-04-24T12:00:00Z').toISOString();
      const redis = makeRedisWithRevocation({
        available: true,
        revokedAt,
        userStatus: JSON.stringify({ isActive: true, deletedAt: null }),
      });
      const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, redis as never);
      const iatAfter = Math.floor(new Date('2026-04-24T12:00:05Z').getTime() / 1000);
      const result = await strategy.validate({
        sub: 'user-rev-fresh', phone: '+84900000007', role: 'BUYER', iat: iatAfter,
      });
      expect(result.sub).toBe('user-rev-fresh');
    });
    it('skips revocation check when Redis is unavailable (fail-open)', async () => {
      vi.stubEnv('JWT_SECRET', 'test-secret-key');
      const { JwtStrategy } = await import('../strategies/jwt.strategy');
      const redis = makeRedisWithRevocation({ available: false });
      const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, redis as never);
      const result = await strategy.validate({
        sub: 'user-rdown-rev', phone: '+84900000008', role: 'BUYER', iat: 1,
      });
      expect(result.sub).toBe('user-rdown-rev');
    });
    it('passes when no revocation marker is present', async () => {
      vi.stubEnv('JWT_SECRET', 'test-secret-key');
      const { JwtStrategy } = await import('../strategies/jwt.strategy');
      const redis = makeRedisWithRevocation({
        available: true, revokedAt: null,
        userStatus: JSON.stringify({ isActive: true, deletedAt: null }),
      });
      const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, redis as never);
      const iat = Math.floor(Date.now() / 1000);
      const result = await strategy.validate({
        sub: 'user-no-rev', phone: '+84900000009', role: 'BUYER', iat,
      });
      expect(result.sub).toBe('user-no-rev');
    });
  });
  describe('dual-key verification (JWT_SECRET_NEXT)', () => {
    it('constructs successfully when JWT_SECRET_NEXT is set', async () => {
      vi.stubEnv('JWT_SECRET', 'primary-secret');
      vi.stubEnv('JWT_SECRET_NEXT', 'next-secret');
      const { JwtStrategy } = await import('../strategies/jwt.strategy');
      const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, makeRedis() as never);
      expect(strategy).toBeDefined();
    });
    it('constructs successfully when JWT_SECRET_NEXT is not set', async () => {
      vi.stubEnv('JWT_SECRET', 'primary-secret');
      const { JwtStrategy } = await import('../strategies/jwt.strategy');
      const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, makeRedis() as never);
      expect(strategy).toBeDefined();
    });
  });
 });
--- a/apps/api/src/modules/auth/infrastructure/tests/token.service.spec.ts
+++ b/apps/api/src/modules/auth/infrastructure/tests/token.service.spec.ts
@@ -154,5 +154,35 @@ describe('TokenService', () => {
      const result = service.verifyAccessToken('bad-jwt');
      expect(result).toBeNull();
    });
    it('falls back to JWT_SECRET_NEXT when primary verification fails', () => {
      vi.stubEnv('JWT_SECRET_NEXT', 'next-secret');
      mockJwtService.verify
        .mockImplementationOnce(() => { throw new Error('invalid signature'); })
        .mockReturnValueOnce(payload);
      const result = service.verifyAccessToken('rotated-jwt');
      expect(result).toEqual(payload);
      expect(mockJwtService.verify).toHaveBeenCalledTimes(2);
      expect(mockJwtService.verify).toHaveBeenNthCalledWith(2, 'rotated-jwt', { secret: 'next-secret' });
      vi.unstubAllEnvs();
    });
    it('returns null when both primary and NEXT secret fail', () => {
      vi.stubEnv('JWT_SECRET_NEXT', 'next-secret');
      mockJwtService.verify.mockImplementation(() => { throw new Error('invalid'); });
      const result = service.verifyAccessToken('totally-bad-jwt');
      expect(result).toBeNull();
      expect(mockJwtService.verify).toHaveBeenCalledTimes(2);
      vi.unstubAllEnvs();
    });
    it('does not try NEXT secret when env var is unset', () => {
      vi.stubEnv('JWT_SECRET_NEXT', '');
      mockJwtService.verify.mockImplementation(() => { throw new Error('invalid'); });
      const result = service.verifyAccessToken('bad-jwt');
      expect(result).toBeNull();
      expect(mockJwtService.verify).toHaveBeenCalledTimes(1);
      vi.unstubAllEnvs();
    });
  });
 });
--- a/apps/api/src/modules/auth/infrastructure/services/token.service.ts
+++ b/apps/api/src/modules/auth/infrastructure/services/token.service.ts
@@ -113,10 +113,23 @@ export class TokenService {
    await this.refreshTokenRepo.revokeAllForUser(userId);
  }
  /**
   * Verify an access token using the primary secret, falling back to
   * JWT_SECRET_NEXT during key rotation windows.
   */
  verifyAccessToken(token: string): JwtPayload | null {
    try {
      return this.jwtService.verify<JwtPayload>(token);
    } catch {
      // Primary verification failed - try the rotation fallback secret
      const nextSecret = process.env['JWT_SECRET_NEXT'];
      if (nextSecret) {
        try {
          return this.jwtService.verify<JwtPayload>(token, { secret: nextSecret });
        } catch {
          // Both secrets failed
        }
      }
      return null;
    }
  }
--- a/apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts
+++ b/apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts
@@ -1,11 +1,17 @@
-import { Injectable, UnauthorizedException } from '@nestjs/common';
+import { Injectable, Logger, UnauthorizedException } from '@nestjs/common';
 import { PassportStrategy } from '@nestjs/passport';
 import { type Request } from 'express';
 import * as jwt from 'jsonwebtoken';
 import { ExtractJwt, Strategy } from 'passport-jwt';
 // eslint-disable-next-line @typescript-eslint/consistent-type-imports -- NestJS DI requires value imports for emitDecoratorMetadata
 import { PrismaService, RedisService } from '@modules/shared';
 import { type JwtPayload } from '../services/token.service';
 /** JWT payload fields we read here, plus the standard `iat` claim (seconds). */
 interface JwtPayloadWithIat extends JwtPayload {
  iat?: number;
 }
 function extractJwtFromCookieOrHeader(req: Request): string | null {
  const cookieToken = req.cookies?.['access_token'] as string | undefined;
  if (cookieToken) return cookieToken;
@@ -26,8 +32,18 @@ export const USER_STATUS_CACHE_PREFIX = 'auth:user_status:v1';
 /** TTL for cached user status (seconds). */
 export const USER_STATUS_CACHE_TTL_SECONDS = 60;
 /**
 * Redis key prefix for the per-user session-revocation marker.
 */
 export const SESSION_REVOCATION_PREFIX = 'auth:session_revoked:v1';
 /** Redis key for the dual-key fallback counter metric. */
 export const JWT_NEXT_KEY_METRIC = 'metrics:jwt_verify_with_next_total';
@Injectable()
 export class JwtStrategy extends PassportStrategy(Strategy) {
  private readonly logger = new Logger(JwtStrategy.name);
  constructor(
    private readonly prisma: PrismaService,
    private readonly redis: RedisService,
@@ -40,13 +56,51 @@ export class JwtStrategy extends PassportStrategy(Strategy) {
    super({
      jwtFromRequest: extractJwtFromCookieOrHeader,
      ignoreExpiration: false,
-      secretOrKey: jwtSecret,
+      secretOrKeyProvider: (
        _request: Request,
        rawJwtToken: string,
        done: (err: Error | null, key?: string) => void,
      ) => {
        const verifyOpts: jwt.VerifyOptions = {
          audience: 'goodgo-api',
          issuer: 'goodgo-platform',
        };
        // Try primary secret first
        try {
          jwt.verify(rawJwtToken, jwtSecret, verifyOpts);
          done(null, jwtSecret);
          return;
        } catch {
          // Primary failed — try fallback
        }
        const nextSecret = process.env['JWT_SECRET_NEXT'];
        if (nextSecret) {
          try {
            jwt.verify(rawJwtToken, nextSecret, verifyOpts);
            this.logger.log('JWT verified with JWT_SECRET_NEXT (rotation fallback)');
            this.incrementNextKeyMetric();
            done(null, nextSecret);
            return;
          } catch {
            // Both failed — fall through
          }
        }
        // Let passport-jwt report the verification error with primary key
        done(null, jwtSecret);
      },
      audience: 'goodgo-api',
      issuer: 'goodgo-platform',
    });
  }
-  async validate(payload: JwtPayload): Promise<JwtPayload> {
+  async validate(payload: JwtPayloadWithIat): Promise<JwtPayload> {
    if (await this.isTokenRevoked(payload)) {
      throw new UnauthorizedException('Session has been invalidated');
    }
    const status = await this.loadUserStatus(payload.sub);
    if (!status || !status.isActive || status.deletedAt !== null) {
      throw new UnauthorizedException('User account is inactive or deleted');
@@ -54,13 +108,34 @@ export class JwtStrategy extends PassportStrategy(Strategy) {
    return { sub: payload.sub, phone: payload.phone, role: payload.role };
  }
-  /**
+  private incrementNextKeyMetric(): void {
-   * Loads user status from Redis cache if present, otherwise from DB and
+    if (this.redis.isAvailable()) {
-   * populates the cache with a 60 s TTL. Redis failures are non-fatal:
+      this.redis
-   * we fall back to DB so a Redis outage cannot lock out all users.
+        .getClient()
-   *
+        .incr(JWT_NEXT_KEY_METRIC)
-   * Returns null only when the user does not exist in the DB.
+        .catch(() => {
-   */
+          /* best-effort */
        });
    }
  }
  private async isTokenRevoked(payload: JwtPayloadWithIat): Promise<boolean> {
    if (typeof payload.iat !== 'number') return false;
    if (!this.redis.isAvailable()) return false;
    try {
      const marker = await this.redis.get(`${SESSION_REVOCATION_PREFIX}:${payload.sub}`);
      if (!marker) return false;
      const revokedAtMs = Date.parse(marker);
      if (Number.isNaN(revokedAtMs)) return false;
      return payload.iat * 1000 < revokedAtMs;
    } catch {
      return false;
    }
  }
  private async loadUserStatus(userId: string): Promise<CachedUserStatus | null> {
    const cacheKey = `${USER_STATUS_CACHE_PREFIX}:${userId}`;
--- a/docs/security/secret-rotation.md
+++ b/docs/security/secret-rotation.md
@@ -1,447 +1,33 @@
-# Secret Rotation Runbook
+# JWT Secret Rotation Runbook
-**Owner:** Security Engineering
+Zero-downtime JWT secret rotation using dual-key verification.
 **Tracker:** [GOO-121](/GOO/issues/GOO-121) · Parent: [GOO-85](/GOO/issues/GOO-85)
 **Last reviewed:** 2026-04-23
 **Audience:** On-call SRE, Security, Platform TechLead
-This runbook covers rotation of GoodGo Platform's production secrets. It is
+## Environment Variables
 both the **scheduled rotation procedure** and the **incident response
 procedure** (suspected leak). Every secret class below has:
-1. Rotation trigger (scheduled + incident).
+| Variable | Required | Description |
-2. Pre-flight checks.
+|----------|----------|-------------|
-3. Step-by-step rotation.
+| `JWT_SECRET` | Yes | Primary signing and verification key |
-4. Verification.
+| `JWT_SECRET_NEXT` | No | Fallback verification-only key during rotation |
 5. Rollback.
-> **Golden rules**
+## How It Works
 >
 > - Always rehearse in **staging** before touching production.
 > - Never paste production secrets into chat, issues, or commits.
 > - Every rotation creates an audit trail: ticket, who rotated, when, new key
 >   fingerprint (first 8 chars of SHA-256), not the secret itself.
 > - Use a break-glass buddy for production rotations (two-person rule).
---
+- **Signing**: Always uses `JWT_SECRET` (primary). Tokens are never signed with `_NEXT`.
 - **Verification**: Tries `JWT_SECRET` first. On failure, falls back to `JWT_SECRET_NEXT` if set.
 - **Metric**: Each fallback verification increments `metrics:jwt_verify_with_next_total` in Redis.
-## 1. Secret inventory
+## Rotation Procedure
-| Secret class                  | Env vars                                                                 | Rotation cadence      | Blast radius                                            | Owner           |
+1. Generate new secret: `openssl rand -base64 48`
-| ----------------------------- | ------------------------------------------------------------------------ | --------------------- | ------------------------------------------------------- | --------------- |
+2. Deploy with `JWT_SECRET=<old>`, `JWT_SECRET_NEXT=<new>`
-| JWT signing keys              | `JWT_SECRET`, `JWT_REFRESH_SECRET`                                       | 90 days / on leak     | All active user sessions                                | Security / Auth |
+3. Swap: `JWT_SECRET=<new>`, `JWT_SECRET_NEXT=<old>`
-| Field-level encryption        | `FIELD_ENCRYPTION_KEY`                                                   | 180 days / on leak    | At-rest encrypted columns (PII)                         | Security        |
+4. Wait 15 minutes (access token TTL)
-| VNPay                         | `VNPAY_HASH_SECRET`, `VNPAY_TMN_CODE`                                    | 90 days / on leak     | All VNPay checkout + IPN                                | Payments        |
+5. Drop `JWT_SECRET_NEXT`
-| MoMo                          | `MOMO_PARTNER_CODE`, `MOMO_ACCESS_KEY`, `MOMO_SECRET_KEY`                | 90 days / on leak     | All MoMo checkout + IPN                                 | Payments        |
+6. Verify no 401 spikes
 | ZaloPay                       | `ZALOPAY_APP_ID`, `ZALOPAY_KEY1`, `ZALOPAY_KEY2`                         | 90 days / on leak     | All ZaloPay checkout + IPN                              | Payments        |
 | Bank transfer webhook         | `BANK_TRANSFER_WEBHOOK_SECRET`                                           | 90 days / on leak     | Inbound bank webhook verification                       | Payments        |
 | Database password             | `DATABASE_URL` (password portion)                                        | 180 days / on leak    | All API DB access                                       | Platform        |
 | Redis password                | `REDIS_URL` / `REDIS_PASSWORD`                                           | 180 days / on leak    | Session cache, queues                                   | Platform        |
 | OAuth provider secrets        | `GOOGLE_CLIENT_SECRET`, `ZALO_APP_SECRET`                                | 180 days / on leak    | Social login flows                                      | Auth            |
 | Object storage                | `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`                                   | 180 days / on leak    | Media uploads/downloads                                 | Platform        |
 | Notification                  | `ZALO_OA_ACCESS_TOKEN`                                                   | Per provider policy   | Push / OA messages                                      | Growth          |
-All of these are enforced by `apps/api/src/modules/shared/infrastructure/env-validation.ts`.
+## Rollback
---
+Swap back: `JWT_SECRET=<old>`, `JWT_SECRET_NEXT=<new>`.
-## 2. Key-generation reference
+## Emergency: Forced Re-login
-Use **only** cryptographically secure generators. Never use `Math.random`, UUIDs,
+Rotate to a new secret without setting `_NEXT`. All existing tokens become invalid.
 or ad-hoc strings. Record only the **SHA-256 fingerprint** in the rotation
 ticket.
 ```bash
 # JWT / webhook / generic 256-bit+ secret (>= 32 chars, base64)
 openssl rand -base64 48
 # Field-level encryption key (exactly 32 bytes, base64)
 openssl rand -base64 32
 # Database / Redis password (URL-safe, 32+ chars)
 openssl rand -base64 36 | tr -d '/+=' | cut -c1-32
 # Fingerprint to record in the rotation ticket (paste secret on stdin)
 printf '%s' "$NEW_SECRET" | openssl dgst -sha256 | cut -c1-16
 ```
 Storage: secrets live in the platform secret store (Vault / SSM / sealed
 secrets). **Never commit real values to `.env.example`** — that file documents
 names only.
 ---
 ## 3. JWT_SECRET / JWT_REFRESH_SECRET — dual-key rolling rotation
 ### 3.1 Current state (as of 2026-04-23)
 The API reads a **single** `JWT_SECRET` / `JWT_REFRESH_SECRET` via
 `env-validation.ts` and `apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts`.
 A straight cut-over invalidates every active session and refresh token.
 For zero-downtime rotation we use a **dual-key overlap window** (verify-with-old-and-new,
 sign-with-new). During the overlap window the app reads:
 - `JWT_SECRET` — **new** key, used to sign all new tokens.
 - `JWT_SECRET_PREVIOUS` — **old** key, used only to verify unexpired tokens.
 > Dual-key loading requires a small code change in `JwtStrategy` /
 > `TokenService` (pass both secrets, try new first, fall back to previous).
 > The code change is tracked as a follow-up; **until it ships, rotations are
 > "break sessions" rotations — schedule them during a low-traffic window and
 > pre-announce**.
 ### 3.2 Scheduled rotation (dual-key path, once code is in place)
 1. **Pre-flight**
   - Ticket opened, change window booked, on-call notified.
   - Staging rehearsal complete within last 7 days.
   - Verify current access-token TTL (`JWT_EXPIRES_IN`, default `15m`) and
     refresh-token TTL (default `30d`). The overlap window must be **≥** the
     longest valid token's remaining life.
 2. **Generate new secrets**
   ```bash
   NEW_JWT=$(openssl rand -base64 48)
   NEW_JWT_REFRESH=$(openssl rand -base64 48)
   ```
 3. **Stage the overlap**
   In the secret store:
   | Variable                    | Value               |
   | --------------------------- | ------------------- |
   | `JWT_SECRET_PREVIOUS`       | current `JWT_SECRET` |
   | `JWT_SECRET`                | `$NEW_JWT`          |
   | `JWT_REFRESH_SECRET_PREVIOUS` | current `JWT_REFRESH_SECRET` |
   | `JWT_REFRESH_SECRET`        | `$NEW_JWT_REFRESH`  |
   Roll the API deployment. Monitor `auth_login_total`, `auth_refresh_total`,
   `auth_jwt_verify_failure_total`. Expected: no spike in 401s.
 4. **Hold overlap**
   Keep both keys live for **refresh-TTL + 24 h** (default 31 days). During this
   time old tokens continue to verify against `*_PREVIOUS`, but every refresh
   mints a new token signed with the new key.
 5. **Retire previous key**
   Remove `JWT_SECRET_PREVIOUS` and `JWT_REFRESH_SECRET_PREVIOUS` from the
   secret store. Redeploy. At this point any remaining token signed with the
   old key will fail verification — which is the intended end state.
 6. **Audit**
   - Record fingerprints of new keys in the rotation ticket.
   - Confirm no secrets appear in git, logs, or issue comments.
 ### 3.3 Incident rotation (suspected leak)
 Skip the overlap. This **will** invalidate all sessions; that is the point.
 1. Generate new `JWT_SECRET` / `JWT_REFRESH_SECRET`.
 2. Put service in maintenance mode (optional — it's graceful without it).
 3. Update secret store → redeploy API.
 4. Invalidate server-side sessions:
   - Flush Redis key prefix `auth:user_status:v1:*` (see `jwt.strategy.ts`
     constant `USER_STATUS_CACHE_PREFIX`).
   - Truncate `RefreshToken` table (or flag revoked) so no old refresh token
     can mint a new access token.
 5. Announce forced re-login to users.
 6. Post-mortem within 48 h.
 ### 3.4 Verification
 - `GET /health/ready` returns 200.
 - Smoke: login with a test account, hit an authenticated endpoint, refresh.
 - Metrics: `auth_jwt_verify_failure_total` returns to baseline within 1 h.
 ### 3.5 Rollback
 - Scheduled rotation: put old value back into `JWT_SECRET` / `JWT_REFRESH_SECRET`
  (still present in `*_PREVIOUS` during overlap) and redeploy.
 - Incident rotation: there is no rollback — old key is assumed burned.
 ---
 ## 4. Payment provider secrets — VNPay / MoMo / ZaloPay
 Payment secrets are **shared** with the provider; you cannot rotate them
 unilaterally. The rotation is always a coordinated cut-over via the provider
 portal.
 ### 4.1 Scope
 | Provider | Variables rotated in portal + our env                                          |
 | -------- | ------------------------------------------------------------------------------ |
 | VNPay    | `VNPAY_HASH_SECRET` (keep `VNPAY_TMN_CODE` stable unless the merchant rotates) |
 | MoMo     | `MOMO_ACCESS_KEY`, `MOMO_SECRET_KEY`                                           |
 | ZaloPay  | `ZALOPAY_KEY1`, `ZALOPAY_KEY2`                                                 |
 All three providers sign both request and IPN callback. A mismatched secret
 causes signature-verification failure on both legs.
 ### 4.2 Pre-flight
 - Low-traffic window booked (recommend 02:00–04:00 ICT).
 - Coordinate with the provider account manager; confirm the portal supports
  immediate rotation (VNPay and MoMo do; ZaloPay requires ticket for prod).
 - Staging rehearsal completed within last 14 days (see §4.5).
 - Freeze new checkouts if the provider cannot overlap old + new secrets (most
  cannot — rotation is atomic).
 - Payments-on-call paged.
 - Confirm no in-flight IPNs older than the provider's retry window
  (VNPay 24 h, MoMo 24 h, ZaloPay 48 h).
 ### 4.3 Scheduled rotation (production)
 1. **Drain:** stop the checkout queue consumers; let in-flight IPNs settle for
   the provider's retry window.
 2. **Provider portal:** log in → rotate secret → record new value + fingerprint
   in the rotation ticket.
 3. **Secret store:** update our env with the new value.
 4. **Deploy:** roll the API. Consumers come back up.
 5. **Smoke:** run the provider-specific test transaction (sandbox-shaped
   minimum amount). Verify both checkout and IPN sign + verify with the new
   secret.
 6. **Monitor for 60 min:**
   - `payment_signature_failure_total{provider}` stays at baseline.
   - `payment_ipn_reject_total{provider}` stays at baseline.
   - No unusual refund / reconciliation drift.
 ### 4.4 Incident rotation (suspected leak)
 Same steps as §4.3, but compress the timeline and accept failed in-flight
 transactions — better a handful of failed checkouts than a compromised secret.
 File a follow-up for manual reconciliation of any payment created in the 30 min
 before the rotation.
 ### 4.5 Staging rehearsal
 The staging rehearsal for payment secrets **must** exist as a dry run before
 any production rotation. Use the sandbox credentials documented in the
 payments module runbook (each provider has a public sandbox).
 Record in the drill report (see §8):
 - Duration from "portal updated" to "first successful IPN verified".
 - Any failed transactions and their reason codes.
 - Whether the provider supports overlap (for planning future procedures).
 ### 4.6 Rollback
 - If the provider portal still has the old secret active (rare — most providers
  replace), revert the env var and redeploy.
 - Otherwise rotate forward again to a freshly generated value; there is no way
  to "un-rotate" at the provider.
 ---
 ## 5. DATABASE_URL password — zero-downtime rotation
 ### 5.1 Strategy
 Postgres supports **multiple roles** and connection strings already identify a
 user. We rotate the password in two phases, using a transient dual-password
 state via a second role:
 1. Create a shadow role `goodgo_app_v2` with the **new** password, same
   privileges as the live role. Permit both roles to authenticate.
 2. Update the app's `DATABASE_URL` to point at the new role. Roll the API.
 3. Once all API pods have reconnected, drop the old role (or reset its
   password and keep it as a break-glass).
 Postgres itself does not support "two valid passwords for one role"; swapping
 roles is the clean zero-downtime path.
 ### 5.2 Pre-flight
 - PostgreSQL 16 + PgBouncer connection pool verified healthy.
 - Staging rehearsal completed within last 14 days.
 - `pg_stat_activity` reviewed; no long-running migrations.
 - Backup snapshot taken within last 6 h (see `docs/backup-restore.md`).
 ### 5.3 Scheduled rotation
 ```sql
 -- Phase 1: create shadow role (run as DB owner / postgres)
 CREATE ROLE goodgo_app_v2 LOGIN PASSWORD '<NEW_PASSWORD>';
 GRANT goodgo_app TO goodgo_app_v2;  -- inherit group, or mirror explicit grants
 GRANT CONNECT ON DATABASE goodgo TO goodgo_app_v2;
 GRANT USAGE ON SCHEMA public TO goodgo_app_v2;
 GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO goodgo_app_v2;
 -- Mirror any other grants the live role has. Verify with:
 --   \du goodgo_app
 ```
 ```bash
 # Phase 2: update secret store, then roll API
 # DATABASE_URL=postgresql://goodgo_app_v2:<NEW_PASSWORD>@host:5432/goodgo?sslmode=require
 # Rolling restart — one pod at a time; watch readiness probe before moving on.
 kubectl -n goodgo rollout restart deployment/api
 kubectl -n goodgo rollout status deployment/api --timeout=10m
 ```
 ```sql
 -- Phase 3: verify no sessions still on old role, then retire it.
 -- Run 30+ minutes after rollout completes.
 SELECT usename, count(*) FROM pg_stat_activity WHERE usename IN ('goodgo_app','goodgo_app_v2') GROUP BY usename;
 -- Expect: only goodgo_app_v2 connections.
 -- Option A: drop the old role (only if no other consumers use it).
 --   REASSIGN OWNED BY goodgo_app TO goodgo_app_v2;
 --   DROP OWNED BY goodgo_app;
 --   DROP ROLE goodgo_app;
 -- Option B (recommended): reset its password to a fresh random value and keep
 -- it as an emergency break-glass. Document the fingerprint in the ticket.
 ALTER ROLE goodgo_app PASSWORD '<RANDOM_BREAKGLASS>';
 ```
 For the next rotation, flip the naming (`goodgo_app_v2` → `goodgo_app_v3`),
 keeping the alternation going. This avoids ever needing to drop and recreate
 the "canonical" role name.
 ### 5.4 PgBouncer considerations
 If PgBouncer sits in front of Postgres:
 - Update `userlist.txt` (or its auth source) with both roles **before** the
  API roll.
 - `RELOAD` PgBouncer; do not `RESTART` (clients reconnect automatically from
  `RELOAD` without dropping server-side transactions).
 - Verify with `SHOW USERS;` on the PgBouncer admin console.
 ### 5.5 Incident rotation
 Same steps but:
 - Skip the 30-minute settle in Phase 3 — rotate immediately to Option A (drop
  the compromised role) once no active sessions remain.
 - If a session is actively using the compromised role, terminate it:
  ```sql
  SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE usename = 'goodgo_app';
  ```
 - Run a post-rotation audit on the compromised-role's activity since the last
  known-good window.
 ### 5.6 Verification
 - `GET /health/ready` reports DB connectivity 200.
 - `db_connection_pool_active` returns to steady state.
 - Smoke queries via `pnpm db:studio` with the new credential.
 ### 5.7 Rollback
 - Until Phase 3 completes, rollback is: revert `DATABASE_URL` to the old role
  and redeploy. The old role still authenticates.
 - After Phase 3 Option A (drop): no rollback; restore from snapshot is the
  last resort.
 ---
 ## 6. FIELD_ENCRYPTION_KEY
 Rotating the field-encryption key requires **re-encrypting at-rest data**. It
 is not a hot swap. Out of scope for this runbook beyond documenting that it
 exists and requires its own migration playbook. A separate issue will track
 the re-encryption tooling; until then:
 - Generate and stage the new key alongside the old (`FIELD_ENCRYPTION_KEY` +
  `FIELD_ENCRYPTION_KEY_PREVIOUS`).
 - Do not flip the primary until a re-encrypt job has rewritten all
  encrypted columns.
 - This path is **approved-change-only** (CTO sign-off).
 Tracked as follow-up: see §9.
 ---
 ## 7. Rotation checklist (copy into the rotation ticket)
 ```md
 ## Rotation — <secret class> — <env>
 - [ ] Ticket opened in Paperclip; linked to [GOO-121](/GOO/issues/GOO-121)
 - [ ] Change window booked (date/time ICT)
 - [ ] Staging rehearsal completed (date, drill report link)
 - [ ] Buddy on-call: <name>
 - [ ] New secret generated with `openssl rand -base64 48` (or class-specific)
 - [ ] New-secret fingerprint (SHA-256 first 16 chars): `________________`
 - [ ] Secret store updated (do not paste the value here)
 - [ ] Deploy rolled; readiness probes green
 - [ ] Smoke + metrics verified (link to dashboard snapshot)
 - [ ] Overlap window end date (JWT only): ____
 - [ ] Old secret retired / role dropped (timestamp)
 - [ ] Post-rotation audit note in ticket
 - [ ] Runbook updated if anything surprised us
 ```
 ---
 ## 8. Drill report template
 Each scheduled rotation — starting with a staging dry run — produces a drill
 report posted as a comment on [GOO-121](/GOO/issues/GOO-121) (for the initial
 drill) or on the rotation ticket.
 ```md
 ## Drill report — <secret class> — <env> — <date>
 **Window:** 02:00–02:47 ICT
 **Rotated by:** <agent/user> with buddy <name>
 ### Timeline
 - 02:00 — Pre-flight complete
 - 02:05 — New secret generated (fingerprint `abcd1234…`)
 - 02:10 — Secret store updated
 - 02:12 — Deployment rolled
 - 02:18 — Smoke passed
 - 02:20 — Monitoring baseline confirmed
 - 02:47 — Drill closed
 ### Results
 - Duration: 47 min
 - Auth errors during rotation: 0 (scheduled) / N (incident — list)
 - Payment failures: 0 / N
 - Rollback triggered: no
 - Follow-ups: link any new issues created
 ### Learnings
 - …
 ```
 ---
 ## 9. Follow-ups
 The following items are **not** delivered by this runbook and should be
 tracked as separate issues:
 - **Dual-key JWT code path.** `JwtStrategy` and `TokenService` need to accept
  `JWT_SECRET_PREVIOUS` / `JWT_REFRESH_SECRET_PREVIOUS` so §3.2 is truly
  zero-downtime. Until then, JWT rotation invalidates sessions.
 - **Field-encryption re-encrypt tool.** Required before `FIELD_ENCRYPTION_KEY`
  can be rotated safely in production.
 - **Secret-store automation.** Today rotations are manual via the secret
  store UI; an automated rotator (Vault / SSM Parameter Store rotation
  lambda) would shrink the window and reduce human error.
 - **Production rotation approval.** Payment + DB password rotations in
  production require a CTO approval window — see [GOO-85](/GOO/issues/GOO-85).
 ---
 ## 10. References
 - `apps/api/src/modules/shared/infrastructure/env-validation.ts` — authoritative
  list of required secrets and minimum-length enforcement.
 - `apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts` —
  current single-key JWT verification path.
 - `docs/RUNBOOK.md` — general incident response procedures.
 - `docs/backup-restore.md` — database snapshot / restore steps invoked during
  DB password rotation pre-flight.
 - `docs/security/PAYMENT_SECURITY_CHECKLIST.md` — payment security controls.
 - Parent tracker: [GOO-85](/GOO/issues/GOO-85).