Files
pos-system/.agent/rules/resilience-patterns.md

422 lines
10 KiB
Markdown

---
trigger: always_on
---
# Resilience Patterns
## When to Use This Skill
Use this skill when:
- Implementing circuit breaker patterns for external services
- Adding retry logic for transient failures
- Setting timeout handling for long-running operations
- Implementing graceful degradation strategies
- Handling external service failures
- Improving system fault tolerance
## Core Concepts
### Resilience Patterns
1. **Circuit Breaker**: Prevents cascading failures by stopping calls to failing services
2. **Retry**: Automatically retries failed operations with backoff
3. **Timeout**: Sets maximum time limits for operations
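4. **Bulkhead**: Isolates resources (for example, a separate concurrency-limited queue per dependency) so that a failure or slowdown in one dependency cannot spread to the others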
5. **Graceful Degradation**: Provides fallback behavior when services fail
## Patterns
### Circuit Breaker Pattern
Protects against cascading failures:
```typescript
import CircuitBreaker from 'opossum';
import { logger } from '@goodgo/logger';
/**
 * Builds an opossum circuit breaker around `action` and logs each state
 * transition (open, half-open, close) under the given name.
 *
 * @param action - Async function the breaker guards.
 * @param name - Label used in the log messages and passed to the breaker as its `name` option.
 * @param options - Overrides for the defaults (timeout 3000, errorThresholdPercentage 50, resetTimeout 30000).
 * @returns The configured breaker.
 */
export const createCircuitBreaker = <TArgs extends any[], TResult>(
  action: (...args: TArgs) => Promise<TResult>,
  name: string,
  options: Partial<CircuitBreaker.Options> = {}
): CircuitBreaker<TArgs, TResult> => {
  // Defaults first, caller overrides next, `name` last so the explicit argument always wins.
  const settings = {
    timeout: 3000,
    errorThresholdPercentage: 50,
    resetTimeout: 30000,
    ...options,
    name,
  };
  const breaker = new CircuitBreaker(action, settings);

  // Single place that formats the transition message for every state.
  const describe = (state: string): string => `Circuit Breaker ${state}: ${name}`;

  breaker.on('open', () => logger.warn(describe('OPEN')));
  breaker.on('halfOpen', () => logger.info(describe('HALF-OPEN')));
  breaker.on('close', () => logger.info(describe('CLOSED')));

  return breaker;
};
// Usage: build the breaker once around the external call, then invoke the call through it.
const externalApiBreaker = createCircuitBreaker(
async (data) => await externalApi.call(data),
'external-api'
);
try {
// fire() is given the request payload that the wrapped action receives as `data`.
const result = await externalApiBreaker.fire(requestData);
} catch (error) {
// Handle circuit breaker error or fallback
}
```
### Retry Pattern
Retry transient failures with exponential backoff:
```typescript
/**
 * Runs `fn`, retrying failed attempts with exponential backoff.
 *
 * The wait after the attempt with zero-based index `i` fails is
 * `baseDelay * 2^i` milliseconds (1s, 2s, 4s, ... with the defaults).
 * `fn` is always attempted at least once, and when retries run out the error
 * from the last attempt is rethrown unchanged.
 *
 * @param fn - Operation to run; called again for every retry.
 * @param maxRetries - Maximum number of retries after the first attempt.
 *   Negative values and NaN behave like 0.
 * @param baseDelay - Delay in milliseconds before the first retry.
 * @param shouldRetry - Optional predicate deciding whether a given error is
 *   worth retrying (for example, only transient errors). Defaults to retrying
 *   every error.
 * @returns The value produced by the first successful attempt.
 * @throws The error of the last attempt, or the first error rejected by `shouldRetry`.
 */
async function retryWithBackoff<T>(
  fn: () => Promise<T>,
  maxRetries: number = 3,
  baseDelay: number = 1000,
  shouldRetry: (error: unknown) => boolean = () => true
): Promise<T> {
  for (let attempt = 0; ; attempt++) {
    try {
      return await fn();
    } catch (error) {
      // Written as `!(attempt < maxRetries)` so that NaN also stops the loop
      // (`attempt >= NaN` is false and would retry forever).
      if (!(attempt < maxRetries) || !shouldRetry(error)) throw error;
      const delay = baseDelay * 2 ** attempt;
      await new Promise<void>(resolve => setTimeout(resolve, delay));
    }
  }
}
```
### Timeout Pattern
Set maximum time limits:
```typescript
/**
 * Races `promise` against a deadline.
 *
 * The deadline timer is cleared as soon as the race settles, so a call that
 * finishes early does not leave a pending timer behind that would keep the
 * process alive until the deadline.
 *
 * Note that the underlying operation is not cancelled when the deadline wins;
 * only the returned promise rejects.
 *
 * @param promise - The in-flight operation.
 * @param timeoutMs - Deadline in milliseconds.
 * @returns The value of `promise` if it settles before the deadline.
 * @throws Error with message 'Operation timeout' when the deadline elapses first,
 *   or whatever `promise` rejects with.
 */
async function withTimeout<T>(
  promise: Promise<T>,
  timeoutMs: number
): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error('Operation timeout')), timeoutMs);
  });
  try {
    return await Promise.race([promise, deadline]);
  } finally {
    // Runs on success, failure, and timeout alike.
    clearTimeout(timer);
  }
}
// Usage
try {
  const result = await withTimeout(
    externalService.call(),
    5000 // 5 second timeout
  );
} catch (error) {
  // A caught value is `unknown` under strict compiler settings (and may not be
  // an Error at all), so narrow it before reading `.message`.
  if (error instanceof Error && error.message === 'Operation timeout') {
    // Handle timeout
  }
}
```
### Graceful Degradation
Provide fallback behavior:
```typescript
/**
 * Reads from the primary data source and, if that fails, logs a warning and
 * reads from the fallback source instead.
 *
 * @returns Data from the primary source, or from the fallback source when the primary throws.
 * @throws Whatever the fallback source throws when it fails as well.
 */
async function getDataWithFallback() {
  try {
    const primaryData = await primaryDataSource.get();
    return primaryData;
  } catch (primaryError) {
    logger.warn('Primary source failed, using fallback', { error: primaryError });
  }
  // Only reached when the primary source failed.
  return await fallbackDataSource.get();
}
```
### Bulkhead Pattern
Isolate failures to prevent spread:
```typescript
import PQueue from 'p-queue';
// Create separate queues for different operations
// Each dependency gets its own queue, so its concurrency cap is independent of the other's.
const externalApiQueue = new PQueue({
concurrency: 10, // Max 10 concurrent calls
timeout: 30000 // 30 second timeout per operation
// NOTE(review): confirm how the installed p-queue version reports a timed-out task (rejects vs. resolves) - not visible here.
});
const databaseQueue = new PQueue({
concurrency: 20 // Max 20 concurrent operations; no timeout is configured for this queue
});
// Usage - operations are isolated
/**
 * Fetches external data for `id` through the external-API queue, so the call
 * counts against that queue's concurrency limit only.
 *
 * @param id - Identifier passed to `externalApi.getData`.
 * @returns Whatever the queue's `add` returns for the scheduled task.
 */
async function fetchExternalData(id: string) {
  const task = async () => externalApi.getData(id);
  return externalApiQueue.add(task);
}
/**
 * Executes `query` through the database queue, so the call counts against
 * that queue's concurrency limit only.
 *
 * @param query - Query passed to `database.execute`.
 * @returns Whatever the queue's `add` returns for the scheduled task.
 */
async function queryDatabase(query: string) {
  const task = async () => database.execute(query);
  return databaseQueue.add(task);
}
```
### Combined Resilience Service
```typescript
// src/core/resilience/resilience.service.ts
import CircuitBreaker from 'opossum';
import { logger } from '@goodgo/logger';
/** Per-call settings accepted by `ResilienceService.execute`. */
interface ResilienceOptions {
  /** Time limit in milliseconds for each individual attempt (default 5000). */
  timeout?: number;
  /** Number of retries after the first attempt (default 3). */
  maxRetries?: number;
  /** Whether the call is routed through a circuit breaker (default true). */
  circuitBreaker?: boolean;
  /**
   * Called when the operation ultimately fails; its result is returned
   * instead of rethrowing. The `Promise<any>` signature is kept unchanged so
   * existing callers keep compiling.
   */
  fallback?: () => Promise<any>;
}

/** Shape of the work item handed to a shared circuit breaker. */
type GuardedTask = () => Promise<unknown>;

/**
 * Combines timeout, retry with exponential backoff, circuit breaker, and
 * fallback around a single async operation.
 *
 * Layering, from the inside out: every attempt is limited by `timeout`;
 * failed attempts are retried up to `maxRetries` times; the whole retry
 * sequence runs inside the circuit breaker (so the breaker records one
 * failure per exhausted sequence); the fallback, if any, handles whatever
 * error is left.
 */
export class ResilienceService {
  /**
   * One breaker per operation name. Reusing the instance lets failure
   * statistics accumulate across `execute` calls; a breaker created per call
   * only ever sees a single result and therefore can never open.
   */
  private readonly breakers = new Map<string, CircuitBreaker<[GuardedTask], unknown>>();

  /**
   * Runs `operation` with the configured resilience layers.
   *
   * @param operation - Factory for the async work; invoked once per attempt.
   * @param name - Identifies the operation in logs and selects its circuit breaker.
   * @param options - See `ResilienceOptions`.
   * @returns The operation's result, or the fallback's result when the operation fails.
   * @throws The final error when the operation fails and no fallback is given.
   */
  async execute<T>(
    operation: () => Promise<T>,
    name: string,
    options: ResilienceOptions = {}
  ): Promise<T> {
    const {
      timeout = 5000,
      maxRetries = 3,
      circuitBreaker = true,
      fallback
    } = options;

    // Each layer gets its own binding. Reassigning one shared variable would
    // make the retry wrapper capture (and endlessly call) itself.
    const attemptOnce = (): Promise<T> => this.withTimeout(operation(), timeout);
    const withRetries = (): Promise<T> => this.retryWithBackoff(attemptOnce, maxRetries);

    try {
      if (circuitBreaker) {
        // The breaker resolves with whatever the task resolves with, so the
        // result really is a T even though the shared breaker is typed loosely.
        return (await this.getCircuitBreaker(name).fire(withRetries)) as T;
      }
      return await withRetries();
    } catch (error) {
      if (!fallback) throw error;
      // A thrown value is not guaranteed to be an Error instance.
      const reason = error instanceof Error ? error.message : String(error);
      logger.warn(`${name}: Using fallback`, { error: reason });
      return await fallback();
    }
  }

  /**
   * Races `promise` against a deadline of `ms` milliseconds and clears the
   * deadline timer once the race settles, so finished calls leave no pending
   * timers behind.
   */
  private async withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const deadline = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error('Operation timeout')), ms);
    });
    try {
      return await Promise.race([promise, deadline]);
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Calls `fn` until it succeeds or `maxRetries` retries have been used,
   * waiting 1s, 2s, 4s, ... between attempts. `fn` is always attempted at
   * least once and the last attempt's error is rethrown unchanged.
   */
  private async retryWithBackoff<T>(
    fn: () => Promise<T>,
    maxRetries: number
  ): Promise<T> {
    for (let attempt = 0; ; attempt++) {
      try {
        return await fn();
      } catch (error) {
        // `!(a < b)` instead of `a >= b` so a NaN limit stops instead of looping forever.
        if (!(attempt < maxRetries)) throw error;
        const delay = 1000 * 2 ** attempt;
        await new Promise<void>(resolve => setTimeout(resolve, delay));
      }
    }
  }

  /**
   * Returns the breaker registered for `name`, creating it on first use.
   *
   * The breaker's own timeout is disabled: every attempt is already limited
   * by `withTimeout`, and a fixed breaker timeout shorter than the retry
   * sequence would reject the call while retries are still running.
   */
  private getCircuitBreaker(name: string): CircuitBreaker<[GuardedTask], unknown> {
    let breaker = this.breakers.get(name);
    if (!breaker) {
      // The action simply runs the task passed to fire(), which lets one
      // breaker serve every call made under the same name.
      breaker = new CircuitBreaker((task: GuardedTask) => task(), {
        timeout: false,
        errorThresholdPercentage: 50,
        resetTimeout: 30000,
        name
      });
      this.breakers.set(name, breaker);
    }
    return breaker;
  }
}
// Usage: fetch a user with a per-attempt timeout, limited retries, and a placeholder fallback.
const resilience = new ResilienceService();
const result = await resilience.execute(
() => externalApi.fetchUser(userId),
'fetch-user', // name used for the circuit breaker and in the fallback log message
{
timeout: 3000, // limit for each attempt, in milliseconds
maxRetries: 2, // retries after the first attempt
// Returned instead of throwing when the operation ultimately fails.
fallback: () => Promise.resolve({ id: userId, name: 'Unknown' })
}
);
```
### Health Check with Resilience
```typescript
// src/health/health.controller.ts
export class HealthController {
  /**
   * Probes the database, Redis, and the external API concurrently and
   * summarizes the outcome.
   *
   * A failed database or Redis probe is reported as 'unhealthy' and makes the
   * overall status 'unhealthy'. A failed external API probe is reported as
   * 'degraded' and does not affect the overall status.
   *
   * @returns Overall status plus the per-dependency results.
   */
  async checkDependencies(): Promise<HealthStatus> {
    const [database, redis, externalApi] = await Promise.allSettled([
      this.checkDatabase(),
      this.checkRedis(),
      this.checkExternalApi()
    ]);

    const succeeded = (outcome: PromiseSettledResult<unknown>): boolean =>
      outcome.status === 'fulfilled';

    const dependencies = {
      database: succeeded(database) ? 'healthy' : 'unhealthy',
      redis: succeeded(redis) ? 'healthy' : 'unhealthy',
      externalApi: succeeded(externalApi) ? 'healthy' : 'degraded'
    };

    // Service is healthy even if external API is down (graceful degradation)
    const coreDependenciesUp = succeeded(database) && succeeded(redis);

    return {
      status: coreDependenciesUp ? 'healthy' : 'unhealthy',
      dependencies
    };
  }
}
```
## Best Practices
1. **Circuit Breaker**: Use for external service calls
2. **Retry**: Retry only transient failures (network, timeout)
3. **Timeout**: Set appropriate timeouts for all external calls
4. **Fallback**: Always provide fallback behavior
5. **Monitoring**: Monitor circuit breaker states and retry rates
6. **Logging**: Log all resilience actions for debugging
## Common Mistakes
1. **Retrying Non-Retryable Errors**: Retrying 4xx errors (client errors)
```typescript
// ❌ BAD: Retry all errors
catch (error) {
await retry(operation);
}
// ✅ GOOD: Only retry transient errors
catch (error) {
if (isTransientError(error)) {
await retry(operation);
} else {
throw error;
}
}
```
2. **No Timeout**: Missing timeouts on external calls
```typescript
// ❌ BAD: No timeout
const data = await externalApi.fetch();
// ✅ GOOD: With timeout
const data = await withTimeout(externalApi.fetch(), 5000);
```
3. **No Fallback**: No graceful degradation strategy
```typescript
// ❌ BAD: Service crashes if dependency fails
const user = await userService.get(id);
// ✅ GOOD: Fallback to cached/default data
const user = await userService.get(id).catch(() => cachedUser);
```
4. **Too Many Retries**: Excessive retries causing performance issues
```typescript
// ❌ BAD: Too many retries with short delay
retry(fn, { maxRetries: 10, delay: 100 });
// ✅ GOOD: Limited retries with exponential backoff
retry(fn, { maxRetries: 3, baseDelay: 1000, exponential: true });
```
5. **Circuit Breaker Misconfiguration**: Wrong thresholds
```typescript
// ❌ BAD: Circuit opens too easily or never
{ errorThresholdPercentage: 5 } // Opens after 5% errors
{ errorThresholdPercentage: 99 } // Almost never opens
// ✅ GOOD: Balanced threshold
{ errorThresholdPercentage: 50, resetTimeout: 30000 }
```
## Quick Reference
| Pattern | Use Case | Key Config |
|---------|----------|------------|
| **Circuit Breaker** | External API calls | threshold: 50%, reset: 30s |
| **Retry** | Transient failures | max: 3, exponential backoff |
| **Timeout** | All external calls | 3-5s for API, 30s for batch |
| **Bulkhead** | Resource isolation | 10-20 concurrent ops |
| **Fallback** | Critical operations | Cache, default, or degraded |
**Opossum Circuit Breaker States:**
```
CLOSED → (errors exceed threshold) → OPEN
OPEN → (reset timeout expires) → HALF-OPEN
HALF-OPEN → (success) → CLOSED
HALF-OPEN → (failure) → OPEN
```
**Retry Delays (Exponential Backoff):**
```
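After attempt 1 fails: wait 1s
After attempt 2 fails: wait 2s
After attempt 3 fails: wait 4s
After attempt 4 fails: no wait with the default maxRetries of 3 (the error is thrown); 8s only if more retries are allowed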
```
**Essential Imports:**
```typescript
import CircuitBreaker from 'opossum';
import PQueue from 'p-queue';
import { logger } from '@goodgo/logger';
```
## Resources
- [Opossum Documentation](https://nodeshift.dev/opossum/) - Circuit breaker library
- [Microsoft Resilience Patterns](https://docs.microsoft.com/en-us/azure/architecture/patterns/category/resiliency)
- [API Gateway Advanced](../api-gateway-advanced/SKILL.md) - Gateway circuit breaker
- [Observability & Monitoring](../observability-monitoring/SKILL.md) - Health checks, metrics
- [Event-Driven Architecture](../event-driven-architecture/SKILL.md) - Event retry patterns
- [Error Handling Patterns](../error-handling-patterns/SKILL.md) - Error handling
- [Project Rules](../project-rules/SKILL.md) - GoodGo standards