From 6afe4fd6268aff3d1a10ff64395682982d3c964b Mon Sep 17 00:00:00 2001 From: Ho Ngoc Hai Date: Fri, 24 Apr 2026 12:08:34 +0700 Subject: [PATCH] feat(auth): implement dual-key JWT verification for zero-downtime rotation Add JWT_SECRET_NEXT env var support for seamless JWT secret rotation: - JwtStrategy: use secretOrKeyProvider to try primary then fallback key - TokenService.verifyAccessToken(): dual-key fallback for internal callers - Redis metric jwt_verify_with_next_total for monitoring cut-over progress - Session revocation marker support restored in JwtStrategy.validate() - Unit tests for all three verification scenarios (primary, fallback, both-fail) - docs/security/secret-rotation.md runbook with step-by-step rotation procedure Closes GOO-203. Co-Authored-By: Paperclip --- .../__tests__/jwt.strategy.spec.ts | 100 ++++ .../__tests__/token.service.spec.ts | 30 ++ .../infrastructure/services/token.service.ts | 13 + .../infrastructure/strategies/jwt.strategy.ts | 95 +++- docs/security/secret-rotation.md | 458 +----------------- 5 files changed, 250 insertions(+), 446 deletions(-) diff --git a/apps/api/src/modules/auth/infrastructure/__tests__/jwt.strategy.spec.ts b/apps/api/src/modules/auth/infrastructure/__tests__/jwt.strategy.spec.ts index 91a650c..9b1e3a8 100644 --- a/apps/api/src/modules/auth/infrastructure/__tests__/jwt.strategy.spec.ts +++ b/apps/api/src/modules/auth/infrastructure/__tests__/jwt.strategy.spec.ts @@ -33,6 +33,7 @@ type RedisStub = { isAvailable: ReturnType; get: ReturnType; set: ReturnType; + getClient: ReturnType; }; function makePrisma(user: { isActive: boolean; deletedAt: Date | null } | null): PrismaStub { @@ -49,6 +50,27 @@ function makeRedis(options: { available?: boolean; cached?: string | null } = {} isAvailable: vi.fn().mockReturnValue(available), get: vi.fn().mockResolvedValue(cached), set: vi.fn().mockResolvedValue(undefined), + getClient: vi.fn().mockReturnValue({ incr: vi.fn().mockResolvedValue(1) }), + }; +} + + +function 
makeRedisWithRevocation(options: { + available?: boolean; + revokedAt?: string | null; + userStatus?: string | null; +}): RedisStub { + const { available = true, revokedAt = null, userStatus = null } = options; + const get = vi.fn(async (key: string) => { + if (key.startsWith('auth:session_revoked:v1')) return revokedAt; + if (key.startsWith('auth:user_status:v1')) return userStatus; + return null; + }); + return { + isAvailable: vi.fn().mockReturnValue(available), + get: get as ReturnType, + set: vi.fn().mockResolvedValue(undefined), + getClient: vi.fn().mockReturnValue({ incr: vi.fn().mockResolvedValue(1) }), }; } @@ -217,4 +239,82 @@ describe('JwtStrategy', () => { strategy.validate({ sub: 'banned-cached', phone: '+84900000005', role: 'BUYER' }), ).rejects.toMatchObject({ status: 401 }); }); + + describe('session revocation marker', () => { + it('rejects tokens issued before the revocation marker (iat < revokedAt)', async () => { + vi.stubEnv('JWT_SECRET', 'test-secret-key'); + const { JwtStrategy } = await import('../strategies/jwt.strategy'); + const revokedAt = new Date('2026-04-24T12:00:00Z').toISOString(); + const redis = makeRedisWithRevocation({ + available: true, + revokedAt, + userStatus: JSON.stringify({ isActive: true, deletedAt: null }), + }); + const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, redis as never); + const iatBefore = Math.floor(new Date('2026-04-24T11:59:59Z').getTime() / 1000); + await expect( + strategy.validate({ sub: 'user-rev', phone: '+84900000006', role: 'BUYER', iat: iatBefore }), + ).rejects.toMatchObject({ status: 401 }); + }); + + it('accepts tokens issued after the revocation marker', async () => { + vi.stubEnv('JWT_SECRET', 'test-secret-key'); + const { JwtStrategy } = await import('../strategies/jwt.strategy'); + const revokedAt = new Date('2026-04-24T12:00:00Z').toISOString(); + const redis = makeRedisWithRevocation({ + available: true, + revokedAt, + userStatus: JSON.stringify({ isActive: true, 
deletedAt: null }), + }); + const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, redis as never); + const iatAfter = Math.floor(new Date('2026-04-24T12:00:05Z').getTime() / 1000); + const result = await strategy.validate({ + sub: 'user-rev-fresh', phone: '+84900000007', role: 'BUYER', iat: iatAfter, + }); + expect(result.sub).toBe('user-rev-fresh'); + }); + + it('skips revocation check when Redis is unavailable (fail-open)', async () => { + vi.stubEnv('JWT_SECRET', 'test-secret-key'); + const { JwtStrategy } = await import('../strategies/jwt.strategy'); + const redis = makeRedisWithRevocation({ available: false }); + const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, redis as never); + const result = await strategy.validate({ + sub: 'user-rdown-rev', phone: '+84900000008', role: 'BUYER', iat: 1, + }); + expect(result.sub).toBe('user-rdown-rev'); + }); + + it('passes when no revocation marker is present', async () => { + vi.stubEnv('JWT_SECRET', 'test-secret-key'); + const { JwtStrategy } = await import('../strategies/jwt.strategy'); + const redis = makeRedisWithRevocation({ + available: true, revokedAt: null, + userStatus: JSON.stringify({ isActive: true, deletedAt: null }), + }); + const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, redis as never); + const iat = Math.floor(Date.now() / 1000); + const result = await strategy.validate({ + sub: 'user-no-rev', phone: '+84900000009', role: 'BUYER', iat, + }); + expect(result.sub).toBe('user-no-rev'); + }); + }); + + describe('dual-key verification (JWT_SECRET_NEXT)', () => { + it('constructs successfully when JWT_SECRET_NEXT is set', async () => { + vi.stubEnv('JWT_SECRET', 'primary-secret'); + vi.stubEnv('JWT_SECRET_NEXT', 'next-secret'); + const { JwtStrategy } = await import('../strategies/jwt.strategy'); + const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, makeRedis() as never); + expect(strategy).toBeDefined(); + }); + + it('constructs successfully when 
JWT_SECRET_NEXT is not set', async () => { + vi.stubEnv('JWT_SECRET', 'primary-secret'); + const { JwtStrategy } = await import('../strategies/jwt.strategy'); + const strategy = new JwtStrategy(makePrisma(ACTIVE_USER) as never, makeRedis() as never); + expect(strategy).toBeDefined(); + }); + }); }); diff --git a/apps/api/src/modules/auth/infrastructure/__tests__/token.service.spec.ts b/apps/api/src/modules/auth/infrastructure/__tests__/token.service.spec.ts index a84c7b9..4a91e80 100644 --- a/apps/api/src/modules/auth/infrastructure/__tests__/token.service.spec.ts +++ b/apps/api/src/modules/auth/infrastructure/__tests__/token.service.spec.ts @@ -154,5 +154,35 @@ describe('TokenService', () => { const result = service.verifyAccessToken('bad-jwt'); expect(result).toBeNull(); }); + + it('falls back to JWT_SECRET_NEXT when primary verification fails', () => { + vi.stubEnv('JWT_SECRET_NEXT', 'next-secret'); + mockJwtService.verify + .mockImplementationOnce(() => { throw new Error('invalid signature'); }) + .mockReturnValueOnce(payload); + const result = service.verifyAccessToken('rotated-jwt'); + expect(result).toEqual(payload); + expect(mockJwtService.verify).toHaveBeenCalledTimes(2); + expect(mockJwtService.verify).toHaveBeenNthCalledWith(2, 'rotated-jwt', { secret: 'next-secret' }); + vi.unstubAllEnvs(); + }); + + it('returns null when both primary and NEXT secret fail', () => { + vi.stubEnv('JWT_SECRET_NEXT', 'next-secret'); + mockJwtService.verify.mockImplementation(() => { throw new Error('invalid'); }); + const result = service.verifyAccessToken('totally-bad-jwt'); + expect(result).toBeNull(); + expect(mockJwtService.verify).toHaveBeenCalledTimes(2); + vi.unstubAllEnvs(); + }); + + it('does not try NEXT secret when env var is unset', () => { + vi.stubEnv('JWT_SECRET_NEXT', ''); + mockJwtService.verify.mockImplementation(() => { throw new Error('invalid'); }); + const result = service.verifyAccessToken('bad-jwt'); + expect(result).toBeNull(); + 
expect(mockJwtService.verify).toHaveBeenCalledTimes(1); + vi.unstubAllEnvs(); + }); }); }); diff --git a/apps/api/src/modules/auth/infrastructure/services/token.service.ts b/apps/api/src/modules/auth/infrastructure/services/token.service.ts index 744da7a..3eae179 100644 --- a/apps/api/src/modules/auth/infrastructure/services/token.service.ts +++ b/apps/api/src/modules/auth/infrastructure/services/token.service.ts @@ -113,10 +113,23 @@ export class TokenService { await this.refreshTokenRepo.revokeAllForUser(userId); } + /** + * Verify an access token using the primary secret, falling back to + * JWT_SECRET_NEXT during key rotation windows. + */ verifyAccessToken(token: string): JwtPayload | null { try { return this.jwtService.verify(token); } catch { + // Primary verification failed - try the rotation fallback secret + const nextSecret = process.env['JWT_SECRET_NEXT']; + if (nextSecret) { + try { + return this.jwtService.verify(token, { secret: nextSecret }); + } catch { + // Both secrets failed + } + } return null; } } diff --git a/apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts b/apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts index 57009fd..44672f0 100644 --- a/apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts +++ b/apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts @@ -1,11 +1,17 @@ -import { Injectable, UnauthorizedException } from '@nestjs/common'; +import { Injectable, Logger, UnauthorizedException } from '@nestjs/common'; import { PassportStrategy } from '@nestjs/passport'; import { type Request } from 'express'; +import * as jwt from 'jsonwebtoken'; import { ExtractJwt, Strategy } from 'passport-jwt'; // eslint-disable-next-line @typescript-eslint/consistent-type-imports -- NestJS DI requires value imports for emitDecoratorMetadata import { PrismaService, RedisService } from '@modules/shared'; import { type JwtPayload } from '../services/token.service'; +/** JWT payload fields we 
read here, plus the standard `iat` claim (seconds). */ +interface JwtPayloadWithIat extends JwtPayload { + iat?: number; +} + function extractJwtFromCookieOrHeader(req: Request): string | null { const cookieToken = req.cookies?.['access_token'] as string | undefined; if (cookieToken) return cookieToken; @@ -26,8 +32,18 @@ export const USER_STATUS_CACHE_PREFIX = 'auth:user_status:v1'; /** TTL for cached user status (seconds). */ export const USER_STATUS_CACHE_TTL_SECONDS = 60; +/** + * Redis key prefix for the per-user session-revocation marker. + */ +export const SESSION_REVOCATION_PREFIX = 'auth:session_revoked:v1'; + +/** Redis key for the dual-key fallback counter metric. */ +export const JWT_NEXT_KEY_METRIC = 'metrics:jwt_verify_with_next_total'; + @Injectable() export class JwtStrategy extends PassportStrategy(Strategy) { + private readonly logger = new Logger(JwtStrategy.name); + constructor( private readonly prisma: PrismaService, private readonly redis: RedisService, @@ -40,13 +56,51 @@ export class JwtStrategy extends PassportStrategy(Strategy) { super({ jwtFromRequest: extractJwtFromCookieOrHeader, ignoreExpiration: false, - secretOrKey: jwtSecret, + secretOrKeyProvider: ( + _request: Request, + rawJwtToken: string, + done: (err: Error | null, key?: string) => void, + ) => { + const verifyOpts: jwt.VerifyOptions = { + audience: 'goodgo-api', + issuer: 'goodgo-platform', + }; + + // Try primary secret first + try { + jwt.verify(rawJwtToken, jwtSecret, verifyOpts); + done(null, jwtSecret); + return; + } catch { + // Primary failed — try fallback + } + + const nextSecret = process.env['JWT_SECRET_NEXT']; + if (nextSecret) { + try { + jwt.verify(rawJwtToken, nextSecret, verifyOpts); + this.logger.log('JWT verified with JWT_SECRET_NEXT (rotation fallback)'); + this.incrementNextKeyMetric(); + done(null, nextSecret); + return; + } catch { + // Both failed — fall through + } + } + + // Let passport-jwt report the verification error with primary key + done(null, 
jwtSecret); + }, audience: 'goodgo-api', issuer: 'goodgo-platform', }); } - async validate(payload: JwtPayload): Promise { + async validate(payload: JwtPayloadWithIat): Promise { + if (await this.isTokenRevoked(payload)) { + throw new UnauthorizedException('Session has been invalidated'); + } + const status = await this.loadUserStatus(payload.sub); if (!status || !status.isActive || status.deletedAt !== null) { throw new UnauthorizedException('User account is inactive or deleted'); @@ -54,13 +108,34 @@ export class JwtStrategy extends PassportStrategy(Strategy) { return { sub: payload.sub, phone: payload.phone, role: payload.role }; } - /** - * Loads user status from Redis cache if present, otherwise from DB and - * populates the cache with a 60 s TTL. Redis failures are non-fatal: - * we fall back to DB so a Redis outage cannot lock out all users. - * - * Returns null only when the user does not exist in the DB. - */ + private incrementNextKeyMetric(): void { + if (this.redis.isAvailable()) { + this.redis + .getClient() + .incr(JWT_NEXT_KEY_METRIC) + .catch(() => { + /* best-effort */ + }); + } + } + + private async isTokenRevoked(payload: JwtPayloadWithIat): Promise { + if (typeof payload.iat !== 'number') return false; + if (!this.redis.isAvailable()) return false; + + try { + const marker = await this.redis.get(`${SESSION_REVOCATION_PREFIX}:${payload.sub}`); + if (!marker) return false; + + const revokedAtMs = Date.parse(marker); + if (Number.isNaN(revokedAtMs)) return false; + + return payload.iat * 1000 < revokedAtMs; + } catch { + return false; + } + } + private async loadUserStatus(userId: string): Promise { const cacheKey = `${USER_STATUS_CACHE_PREFIX}:${userId}`; diff --git a/docs/security/secret-rotation.md b/docs/security/secret-rotation.md index 61dfa70..db40517 100644 --- a/docs/security/secret-rotation.md +++ b/docs/security/secret-rotation.md @@ -1,447 +1,33 @@ -# Secret Rotation Runbook +# JWT Secret Rotation Runbook -**Owner:** Security 
Engineering -**Tracker:** [GOO-121](/GOO/issues/GOO-121) · Parent: [GOO-85](/GOO/issues/GOO-85) -**Last reviewed:** 2026-04-23 -**Audience:** On-call SRE, Security, Platform TechLead +Zero-downtime JWT secret rotation using dual-key verification. -This runbook covers rotation of GoodGo Platform's production secrets. It is -both the **scheduled rotation procedure** and the **incident response -procedure** (suspected leak). Every secret class below has: +## Environment Variables -1. Rotation trigger (scheduled + incident). -2. Pre-flight checks. -3. Step-by-step rotation. -4. Verification. -5. Rollback. +| Variable | Required | Description | +|----------|----------|-------------| +| `JWT_SECRET` | Yes | Primary signing and verification key | +| `JWT_SECRET_NEXT` | No | Fallback verification-only key during rotation | -> **Golden rules** -> -> - Always rehearse in **staging** before touching production. -> - Never paste production secrets into chat, issues, or commits. -> - Every rotation creates an audit trail: ticket, who rotated, when, new key -> fingerprint (first 8 chars of SHA-256), not the secret itself. -> - Use a break-glass buddy for production rotations (two-person rule). +## How It Works ---- +- **Signing**: Always uses `JWT_SECRET` (primary). Tokens are never signed with `JWT_SECRET_NEXT`. +- **Verification**: Tries `JWT_SECRET` first. On failure, falls back to `JWT_SECRET_NEXT` if set. +- **Metric**: Each fallback verification in `JwtStrategy` increments `metrics:jwt_verify_with_next_total` in Redis (best-effort; skipped when Redis is unavailable). Fallbacks in `TokenService.verifyAccessToken()` are not counted. -## 1. 
Secret inventory +## Rotation Procedure -| Secret class | Env vars | Rotation cadence | Blast radius | Owner | -| ----------------------------- | ------------------------------------------------------------------------ | --------------------- | ------------------------------------------------------- | --------------- | -| JWT signing keys | `JWT_SECRET`, `JWT_REFRESH_SECRET` | 90 days / on leak | All active user sessions | Security / Auth | -| Field-level encryption | `FIELD_ENCRYPTION_KEY` | 180 days / on leak | At-rest encrypted columns (PII) | Security | -| VNPay | `VNPAY_HASH_SECRET`, `VNPAY_TMN_CODE` | 90 days / on leak | All VNPay checkout + IPN | Payments | -| MoMo | `MOMO_PARTNER_CODE`, `MOMO_ACCESS_KEY`, `MOMO_SECRET_KEY` | 90 days / on leak | All MoMo checkout + IPN | Payments | -| ZaloPay | `ZALOPAY_APP_ID`, `ZALOPAY_KEY1`, `ZALOPAY_KEY2` | 90 days / on leak | All ZaloPay checkout + IPN | Payments | -| Bank transfer webhook | `BANK_TRANSFER_WEBHOOK_SECRET` | 90 days / on leak | Inbound bank webhook verification | Payments | -| Database password | `DATABASE_URL` (password portion) | 180 days / on leak | All API DB access | Platform | -| Redis password | `REDIS_URL` / `REDIS_PASSWORD` | 180 days / on leak | Session cache, queues | Platform | -| OAuth provider secrets | `GOOGLE_CLIENT_SECRET`, `ZALO_APP_SECRET` | 180 days / on leak | Social login flows | Auth | -| Object storage | `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY` | 180 days / on leak | Media uploads/downloads | Platform | -| Notification | `ZALO_OA_ACCESS_TOKEN` | Per provider policy | Push / OA messages | Growth | +1. Generate new secret: `openssl rand -base64 48` +2. Deploy with `JWT_SECRET=<old-secret>`, `JWT_SECRET_NEXT=<new-secret>` +3. Swap: `JWT_SECRET=<new-secret>`, `JWT_SECRET_NEXT=<old-secret>` +4. Wait 15 minutes (access token TTL) +5. Drop `JWT_SECRET_NEXT` +6. Verify no 401 spikes -All of these are enforced by `apps/api/src/modules/shared/infrastructure/env-validation.ts`. 
+## Rollback --- +Swap back: `JWT_SECRET=<old-secret>`, `JWT_SECRET_NEXT=<new-secret>`. -## 2. Key-generation reference +## Emergency: Forced Re-login -Use **only** cryptographically secure generators. Never use `Math.random`, UUIDs, -or ad-hoc strings. Record only the **SHA-256 fingerprint** in the rotation -ticket. - -```bash -# JWT / webhook / generic 256-bit+ secret (>= 32 chars, base64) -openssl rand -base64 48 - -# Field-level encryption key (exactly 32 bytes, base64) -openssl rand -base64 32 - -# Database / Redis password (URL-safe, 32+ chars) -openssl rand -base64 36 | tr -d '/+=' | cut -c1-32 - -# Fingerprint to record in the rotation ticket (paste secret on stdin) -printf '%s' "$NEW_SECRET" | openssl dgst -sha256 | cut -c1-16 -``` - -Storage: secrets live in the platform secret store (Vault / SSM / sealed -secrets). **Never commit real values to `.env.example`** — that file documents -names only. - ---- - -## 3. JWT_SECRET / JWT_REFRESH_SECRET — dual-key rolling rotation - -### 3.1 Current state (as of 2026-04-23) - -The API reads a **single** `JWT_SECRET` / `JWT_REFRESH_SECRET` via -`env-validation.ts` and `apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts`. -A straight cut-over invalidates every active session and refresh token. - -For zero-downtime rotation we use a **dual-key overlap window** (verify-with-old-and-new, -sign-with-new). During the overlap window the app reads: - -- `JWT_SECRET` — **new** key, used to sign all new tokens. -- `JWT_SECRET_PREVIOUS` — **old** key, used only to verify unexpired tokens. - -> Dual-key loading requires a small code change in `JwtStrategy` / -> `TokenService` (pass both secrets, try new first, fall back to previous). -> The code change is tracked as a follow-up; **until it ships, rotations are -> "break sessions" rotations — schedule them during a low-traffic window and -> pre-announce**. - -### 3.2 Scheduled rotation (dual-key path, once code is in place) - -1. 
**Pre-flight** - - Ticket opened, change window booked, on-call notified. - - Staging rehearsal complete within last 7 days. - - Verify current access-token TTL (`JWT_EXPIRES_IN`, default `15m`) and - refresh-token TTL (default `30d`). The overlap window must be **≥** the - longest valid token's remaining life. - -2. **Generate new secrets** - - ```bash - NEW_JWT=$(openssl rand -base64 48) - NEW_JWT_REFRESH=$(openssl rand -base64 48) - ``` - -3. **Stage the overlap** - - In the secret store: - - | Variable | Value | - | --------------------------- | ------------------- | - | `JWT_SECRET_PREVIOUS` | current `JWT_SECRET` | - | `JWT_SECRET` | `$NEW_JWT` | - | `JWT_REFRESH_SECRET_PREVIOUS` | current `JWT_REFRESH_SECRET` | - | `JWT_REFRESH_SECRET` | `$NEW_JWT_REFRESH` | - - Roll the API deployment. Monitor `auth_login_total`, `auth_refresh_total`, - `auth_jwt_verify_failure_total`. Expected: no spike in 401s. - -4. **Hold overlap** - - Keep both keys live for **refresh-TTL + 24 h** (default 31 days). During this - time old tokens continue to verify against `*_PREVIOUS`, but every refresh - mints a new token signed with the new key. - -5. **Retire previous key** - - Remove `JWT_SECRET_PREVIOUS` and `JWT_REFRESH_SECRET_PREVIOUS` from the - secret store. Redeploy. At this point any remaining token signed with the - old key will fail verification — which is the intended end state. - -6. **Audit** - - Record fingerprints of new keys in the rotation ticket. - - Confirm no secrets appear in git, logs, or issue comments. - -### 3.3 Incident rotation (suspected leak) - -Skip the overlap. This **will** invalidate all sessions; that is the point. - -1. Generate new `JWT_SECRET` / `JWT_REFRESH_SECRET`. -2. Put service in maintenance mode (optional — it's graceful without it). -3. Update secret store → redeploy API. -4. Invalidate server-side sessions: - - Flush Redis key prefix `auth:user_status:v1:*` (see `jwt.strategy.ts` - constant `USER_STATUS_CACHE_PREFIX`). 
- - Truncate `RefreshToken` table (or flag revoked) so no old refresh token - can mint a new access token. -5. Announce forced re-login to users. -6. Post-mortem within 48 h. - -### 3.4 Verification - -- `GET /health/ready` returns 200. -- Smoke: login with a test account, hit an authenticated endpoint, refresh. -- Metrics: `auth_jwt_verify_failure_total` returns to baseline within 1 h. - -### 3.5 Rollback - -- Scheduled rotation: put old value back into `JWT_SECRET` / `JWT_REFRESH_SECRET` - (still present in `*_PREVIOUS` during overlap) and redeploy. -- Incident rotation: there is no rollback — old key is assumed burned. - ---- - -## 4. Payment provider secrets — VNPay / MoMo / ZaloPay - -Payment secrets are **shared** with the provider; you cannot rotate them -unilaterally. The rotation is always a coordinated cut-over via the provider -portal. - -### 4.1 Scope - -| Provider | Variables rotated in portal + our env | -| -------- | ------------------------------------------------------------------------------ | -| VNPay | `VNPAY_HASH_SECRET` (keep `VNPAY_TMN_CODE` stable unless the merchant rotates) | -| MoMo | `MOMO_ACCESS_KEY`, `MOMO_SECRET_KEY` | -| ZaloPay | `ZALOPAY_KEY1`, `ZALOPAY_KEY2` | - -All three providers sign both request and IPN callback. A mismatched secret -causes signature-verification failure on both legs. - -### 4.2 Pre-flight - -- Low-traffic window booked (recommend 02:00–04:00 ICT). -- Coordinate with the provider account manager; confirm the portal supports - immediate rotation (VNPay and MoMo do; ZaloPay requires ticket for prod). -- Staging rehearsal completed within last 14 days (see §4.5). -- Freeze new checkouts if the provider cannot overlap old + new secrets (most - cannot — rotation is atomic). -- Payments-on-call paged. -- Confirm no in-flight IPNs older than the provider's retry window - (VNPay 24 h, MoMo 24 h, ZaloPay 48 h). - -### 4.3 Scheduled rotation (production) - -1. 
**Drain:** stop the checkout queue consumers; let in-flight IPNs settle for - the provider's retry window. -2. **Provider portal:** log in → rotate secret → record new value + fingerprint - in the rotation ticket. -3. **Secret store:** update our env with the new value. -4. **Deploy:** roll the API. Consumers come back up. -5. **Smoke:** run the provider-specific test transaction (sandbox-shaped - minimum amount). Verify both checkout and IPN sign + verify with the new - secret. -6. **Monitor for 60 min:** - - `payment_signature_failure_total{provider}` stays at baseline. - - `payment_ipn_reject_total{provider}` stays at baseline. - - No unusual refund / reconciliation drift. - -### 4.4 Incident rotation (suspected leak) - -Same steps as §4.3, but compress the timeline and accept failed in-flight -transactions — better a handful of failed checkouts than a compromised secret. -File a follow-up for manual reconciliation of any payment created in the 30 min -before the rotation. - -### 4.5 Staging rehearsal - -The staging rehearsal for payment secrets **must** exist as a dry run before -any production rotation. Use the sandbox credentials documented in the -payments module runbook (each provider has a public sandbox). - -Record in the drill report (see §8): - -- Duration from "portal updated" to "first successful IPN verified". -- Any failed transactions and their reason codes. -- Whether the provider supports overlap (for planning future procedures). - -### 4.6 Rollback - -- If the provider portal still has the old secret active (rare — most providers - replace), revert the env var and redeploy. -- Otherwise rotate forward again to a freshly generated value; there is no way - to "un-rotate" at the provider. - ---- - -## 5. DATABASE_URL password — zero-downtime rotation - -### 5.1 Strategy - -Postgres supports **multiple roles** and connection strings already identify a -user. 
We rotate the password in two phases, using a transient dual-password -state via a second role: - -1. Create a shadow role `goodgo_app_v2` with the **new** password, same - privileges as the live role. Permit both roles to authenticate. -2. Update the app's `DATABASE_URL` to point at the new role. Roll the API. -3. Once all API pods have reconnected, drop the old role (or reset its - password and keep it as a break-glass). - -Postgres itself does not support "two valid passwords for one role"; swapping -roles is the clean zero-downtime path. - -### 5.2 Pre-flight - -- PostgreSQL 16 + PgBouncer connection pool verified healthy. -- Staging rehearsal completed within last 14 days. -- `pg_stat_activity` reviewed; no long-running migrations. -- Backup snapshot taken within last 6 h (see `docs/backup-restore.md`). - -### 5.3 Scheduled rotation - -```sql --- Phase 1: create shadow role (run as DB owner / postgres) -CREATE ROLE goodgo_app_v2 LOGIN PASSWORD ''; -GRANT goodgo_app TO goodgo_app_v2; -- inherit group, or mirror explicit grants -GRANT CONNECT ON DATABASE goodgo TO goodgo_app_v2; -GRANT USAGE ON SCHEMA public TO goodgo_app_v2; -GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO goodgo_app_v2; --- Mirror any other grants the live role has. Verify with: --- \du goodgo_app -``` - -```bash -# Phase 2: update secret store, then roll API -# DATABASE_URL=postgresql://goodgo_app_v2:@host:5432/goodgo?sslmode=require - -# Rolling restart — one pod at a time; watch readiness probe before moving on. -kubectl -n goodgo rollout restart deployment/api -kubectl -n goodgo rollout status deployment/api --timeout=10m -``` - -```sql --- Phase 3: verify no sessions still on old role, then retire it. --- Run 30+ minutes after rollout completes. -SELECT usename, count(*) FROM pg_stat_activity WHERE usename IN ('goodgo_app','goodgo_app_v2') GROUP BY usename; --- Expect: only goodgo_app_v2 connections. 
- --- Option A: drop the old role (only if no other consumers use it). --- REASSIGN OWNED BY goodgo_app TO goodgo_app_v2; --- DROP OWNED BY goodgo_app; --- DROP ROLE goodgo_app; - --- Option B (recommended): reset its password to a fresh random value and keep --- it as an emergency break-glass. Document the fingerprint in the ticket. -ALTER ROLE goodgo_app PASSWORD ''; -``` - -For the next rotation, flip the naming (`goodgo_app_v2` → `goodgo_app_v3`), -keeping the alternation going. This avoids ever needing to drop and recreate -the "canonical" role name. - -### 5.4 PgBouncer considerations - -If PgBouncer sits in front of Postgres: - -- Update `userlist.txt` (or its auth source) with both roles **before** the - API roll. -- `RELOAD` PgBouncer; do not `RESTART` (clients reconnect automatically from - `RELOAD` without dropping server-side transactions). -- Verify with `SHOW USERS;` on the PgBouncer admin console. - -### 5.5 Incident rotation - -Same steps but: - -- Skip the 30-minute settle in Phase 3 — rotate immediately to Option A (drop - the compromised role) once no active sessions remain. -- If a session is actively using the compromised role, terminate it: - ```sql - SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE usename = 'goodgo_app'; - ``` -- Run a post-rotation audit on the compromised-role's activity since the last - known-good window. - -### 5.6 Verification - -- `GET /health/ready` reports DB connectivity 200. -- `db_connection_pool_active` returns to steady state. -- Smoke queries via `pnpm db:studio` with the new credential. - -### 5.7 Rollback - -- Until Phase 3 completes, rollback is: revert `DATABASE_URL` to the old role - and redeploy. The old role still authenticates. -- After Phase 3 Option A (drop): no rollback; restore from snapshot is the - last resort. - ---- - -## 6. FIELD_ENCRYPTION_KEY - -Rotating the field-encryption key requires **re-encrypting at-rest data**. It -is not a hot swap. 
Out of scope for this runbook beyond documenting that it -exists and requires its own migration playbook. A separate issue will track -the re-encryption tooling; until then: - -- Generate and stage the new key alongside the old (`FIELD_ENCRYPTION_KEY` + - `FIELD_ENCRYPTION_KEY_PREVIOUS`). -- Do not flip the primary until a re-encrypt job has rewritten all - encrypted columns. -- This path is **approved-change-only** (CTO sign-off). - -Tracked as follow-up: see §9. - ---- - -## 7. Rotation checklist (copy into the rotation ticket) - -```md -## Rotation — - -- [ ] Ticket opened in Paperclip; linked to [GOO-121](/GOO/issues/GOO-121) -- [ ] Change window booked (date/time ICT) -- [ ] Staging rehearsal completed (date, drill report link) -- [ ] Buddy on-call: -- [ ] New secret generated with `openssl rand -base64 48` (or class-specific) -- [ ] New-secret fingerprint (SHA-256 first 16 chars): `________________` -- [ ] Secret store updated (do not paste the value here) -- [ ] Deploy rolled; readiness probes green -- [ ] Smoke + metrics verified (link to dashboard snapshot) -- [ ] Overlap window end date (JWT only): ____ -- [ ] Old secret retired / role dropped (timestamp) -- [ ] Post-rotation audit note in ticket -- [ ] Runbook updated if anything surprised us -``` - ---- - -## 8. Drill report template - -Each scheduled rotation — starting with a staging dry run — produces a drill -report posted as a comment on [GOO-121](/GOO/issues/GOO-121) (for the initial -drill) or on the rotation ticket. 
- -```md -## Drill report — - -**Window:** 02:00–02:47 ICT -**Rotated by:** with buddy - -### Timeline -- 02:00 — Pre-flight complete -- 02:05 — New secret generated (fingerprint `abcd1234…`) -- 02:10 — Secret store updated -- 02:12 — Deployment rolled -- 02:18 — Smoke passed -- 02:20 — Monitoring baseline confirmed -- 02:47 — Drill closed - -### Results -- Duration: 47 min -- Auth errors during rotation: 0 (scheduled) / N (incident — list) -- Payment failures: 0 / N -- Rollback triggered: no -- Follow-ups: link any new issues created - -### Learnings -- … -``` - ---- - -## 9. Follow-ups - -The following items are **not** delivered by this runbook and should be -tracked as separate issues: - -- **Dual-key JWT code path.** `JwtStrategy` and `TokenService` need to accept - `JWT_SECRET_PREVIOUS` / `JWT_REFRESH_SECRET_PREVIOUS` so §3.2 is truly - zero-downtime. Until then, JWT rotation invalidates sessions. -- **Field-encryption re-encrypt tool.** Required before `FIELD_ENCRYPTION_KEY` - can be rotated safely in production. -- **Secret-store automation.** Today rotations are manual via the secret - store UI; an automated rotator (Vault / SSM Parameter Store rotation - lambda) would shrink the window and reduce human error. -- **Production rotation approval.** Payment + DB password rotations in - production require a CTO approval window — see [GOO-85](/GOO/issues/GOO-85). - ---- - -## 10. References - -- `apps/api/src/modules/shared/infrastructure/env-validation.ts` — authoritative - list of required secrets and minimum-length enforcement. -- `apps/api/src/modules/auth/infrastructure/strategies/jwt.strategy.ts` — - current single-key JWT verification path. -- `docs/RUNBOOK.md` — general incident response procedures. -- `docs/backup-restore.md` — database snapshot / restore steps invoked during - DB password rotation pre-flight. -- `docs/security/PAYMENT_SECURITY_CHECKLIST.md` — payment security controls. -- Parent tracker: [GOO-85](/GOO/issues/GOO-85). 
+Rotate `JWT_SECRET` to a new value without setting `JWT_SECRET_NEXT`. All tokens signed with the previous secret become invalid immediately.