---
trigger: always_on
---
|
|
|
|
# Resilience Patterns
|
|
|
|
## When to Use This Skill
|
|
|
|
Use this skill when:

- Implementing circuit breaker patterns for external services
- Adding retry logic for transient failures
- Setting timeout handling for long-running operations
- Implementing graceful degradation strategies
- Handling external service failures
- Improving system fault tolerance
|
|
|
|
## Core Concepts
|
|
|
|
### Resilience Patterns
|
|
|
|
1. **Circuit Breaker**: Prevents cascading failures by stopping calls to failing services
2. **Retry**: Automatically retries failed operations with backoff
3. **Timeout**: Sets maximum time limits for operations
4. **Bulkhead**: Isolates failures to prevent spread
5. **Graceful Degradation**: Provides fallback behavior when services fail
|
|
|
|
## Patterns
|
|
|
|
### Circuit Breaker Pattern
|
|
|
|
Protects against cascading failures:
|
|
|
|
```typescript
|
|
import CircuitBreaker from 'opossum';
|
|
import { logger } from '@goodgo/logger';
|
|
|
|
/**
 * Wraps `action` in an opossum circuit breaker and logs each state transition.
 *
 * Settings applied before `options`: `timeout` 3000, `errorThresholdPercentage` 50,
 * `resetTimeout` 30000. Anything in `options` overrides those values; `name` is
 * spread last, so it always takes precedence over an `options.name`.
 *
 * @param action - Async function the breaker guards.
 * @param name - Breaker name; also appended to every log line emitted here.
 * @param options - Optional opossum settings layered over the defaults above.
 * @returns The configured breaker; run `action` through it with `.fire(...args)`.
 */
export const createCircuitBreaker = <TArgs extends any[], TResult>(
  action: (...args: TArgs) => Promise<TResult>,
  name: string,
  options: Partial<CircuitBreaker.Options> = {}
): CircuitBreaker<TArgs, TResult> => {
  const settings: CircuitBreaker.Options = {
    timeout: 3000,
    errorThresholdPercentage: 50,
    resetTimeout: 30000,
    ...options,
    name,
  };
  const guarded = new CircuitBreaker(action, settings);

  // Opening is the only transition logged at warn level; the others are informational.
  guarded.on('open', () => {
    logger.warn(`Circuit Breaker OPEN: ${name}`);
  });
  guarded.on('halfOpen', () => {
    logger.info(`Circuit Breaker HALF-OPEN: ${name}`);
  });
  guarded.on('close', () => {
    logger.info(`Circuit Breaker CLOSED: ${name}`);
  });

  return guarded;
};
|
|
|
|
// Usage: build the breaker once, then send every request through it
const externalApiBreaker = createCircuitBreaker(
  async (data) => externalApi.call(data),
  'external-api'
);

try {
  const result = await externalApiBreaker.fire(requestData);
} catch (error) {
  // Handle circuit breaker error or fallback
}
|
|
```
|
|
|
|
### Retry Pattern
|
|
|
|
Retry transient failures with exponential backoff:
|
|
|
|
```typescript
|
|
/**
 * Runs `fn`, retrying failed attempts with exponential backoff.
 *
 * The wait after failed attempt `n` (0-based) is `baseDelay * 2^n` milliseconds,
 * i.e. 1s, 2s, 4s, ... with the default `baseDelay`.
 *
 * @param fn - Operation to attempt; it is invoked again for every retry.
 * @param maxRetries - Retries allowed after the first attempt (at most `maxRetries + 1` calls).
 * @param baseDelay - Delay in milliseconds before the first retry; doubles on each later retry.
 * @param shouldRetry - Decides whether a given error is worth retrying. Defaults to
 *   retrying everything; pass a predicate to fail fast on non-transient errors
 *   (for example 4xx responses).
 * @returns The result of the first successful call to `fn`.
 * @throws The error from `fn` once retries are used up or `shouldRetry` returns false.
 *   `Error('Retry exhausted')` is thrown only when `maxRetries` is negative, in which
 *   case `fn` is never called.
 */
async function retryWithBackoff<T>(
  fn: () => Promise<T>,
  maxRetries: number = 3,
  baseDelay: number = 1000,
  shouldRetry: (error: unknown) => boolean = () => true
): Promise<T> {
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      // Give up immediately on the last attempt or on errors the caller marks as permanent.
      if (attempt === maxRetries || !shouldRetry(error)) throw error;

      const delay = baseDelay * 2 ** attempt;
      await new Promise<void>((resolve) => setTimeout(resolve, delay));
    }
  }
  throw new Error('Retry exhausted');
}
|
|
```
|
|
|
|
### Timeout Pattern
|
|
|
|
Set maximum time limits:
|
|
|
|
```typescript
|
|
/**
 * Settles with `promise`, or rejects with `Error('Operation timeout')` if `promise`
 * has not settled within `timeoutMs` milliseconds.
 *
 * The underlying operation is not cancelled on timeout; only the wait is abandoned.
 *
 * @param promise - The in-flight operation to bound.
 * @param timeoutMs - Maximum time to wait, in milliseconds.
 * @returns The value `promise` resolves with.
 * @throws `Error('Operation timeout')` on timeout, or whatever `promise` rejects with.
 */
async function withTimeout<T>(
  promise: Promise<T>,
  timeoutMs: number
): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;

  const timeout = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error('Operation timeout')), timeoutMs);
  });

  // Always clear the timer once the race settles; a pending timer would otherwise
  // keep the event loop alive for the full `timeoutMs` after a fast result.
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}
|
|
|
|
// Usage
try {
  const result = await withTimeout(
    externalService.call(),
    5000 // 5 second timeout
  );
} catch (error) {
  // Catch variables are `unknown` under `strict`, so narrow before reading `.message`.
  if (error instanceof Error && error.message === 'Operation timeout') {
    // Handle timeout
  }
}
|
|
```
|
|
|
|
### Graceful Degradation
|
|
|
|
Provide fallback behavior:
|
|
|
|
```typescript
|
|
/**
 * Reads from the primary data source; if that read fails, logs a warning and
 * reads from the fallback source instead.
 *
 * A failure of the fallback read is not caught here and propagates to the caller.
 */
async function getDataWithFallback() {
  try {
    const primaryData = await primaryDataSource.get();
    return primaryData;
  } catch (primaryFailure) {
    logger.warn('Primary source failed, using fallback', { error: primaryFailure });
    const fallbackData = await fallbackDataSource.get();
    return fallbackData;
  }
}
|
|
```
|
|
|
|
### Bulkhead Pattern
|
|
|
|
Isolate failures to prevent spread:
|
|
|
|
```typescript
|
|
import PQueue from 'p-queue';
|
|
|
|
// One queue per dependency: each has its own concurrency budget
const externalApiQueue = new PQueue({
  timeout: 30000, // 30 second timeout per operation
  concurrency: 10, // Max 10 concurrent calls
});

const databaseQueue = new PQueue({ concurrency: 20 });
|
|
|
|
// Usage - operations are isolated
|
|
/** Schedules `externalApi.getData(id)` on `externalApiQueue` and returns the queued result. */
async function fetchExternalData(id: string) {
  const task = async () => externalApi.getData(id);
  return externalApiQueue.add(task);
}
|
|
|
|
/** Schedules `database.execute(query)` on `databaseQueue` and returns the queued result. */
async function queryDatabase(query: string) {
  const task = async () => database.execute(query);
  return databaseQueue.add(task);
}
|
|
```
|
|
|
|
### Combined Resilience Service
|
|
|
|
```typescript
|
|
// src/core/resilience/resilience.service.ts
|
|
import CircuitBreaker from 'opossum';
|
|
import { logger } from '@goodgo/logger';
|
|
|
|
interface ResilienceOptions {
  /** Time limit for each individual attempt, in milliseconds (default 5000). */
  timeout?: number;
  /** Retries allowed after the first attempt (default 3). */
  maxRetries?: number;
  /** Delay in milliseconds before the first retry; doubles on each later retry (default 1000). */
  baseDelay?: number;
  /** Route the call through the circuit breaker registered under `name` (default true). */
  circuitBreaker?: boolean;
  /** Supplies a substitute result when the operation ultimately fails. */
  fallback?: () => Promise<any>;
}

/** Breaker whose action simply runs the task handed to `fire()`. */
type TaskBreaker = CircuitBreaker<[() => Promise<unknown>], unknown>;

/**
 * Composes timeout, retry, circuit breaker and fallback around an async operation.
 *
 * Layering, innermost first: per-attempt timeout -> retry with exponential backoff ->
 * circuit breaker -> fallback. One `execute()` call, including all of its retries,
 * is a single `fire()` on the breaker.
 */
export class ResilienceService {
  /**
   * Breakers keyed by operation name. They are reused across calls so that failure
   * statistics accumulate; a breaker created per call could never open.
   * Use a bounded set of names (not per-request ids), since entries are never evicted.
   */
  private readonly breakers = new Map<string, TaskBreaker>();

  /**
   * Runs `operation` with the configured resilience layers.
   *
   * @param operation - Factory for the async work; called once per attempt.
   * @param name - Identifies the circuit breaker and prefixes the fallback log line.
   * @param options - See {@link ResilienceOptions}.
   * @returns The operation's result, or the fallback's result if the operation
   *   ultimately fails and a fallback was supplied.
   * @throws The final error when no fallback is supplied, or whatever the fallback throws.
   */
  async execute<T>(
    operation: () => Promise<T>,
    name: string,
    options: ResilienceOptions = {}
  ): Promise<T> {
    const {
      timeout = 5000,
      maxRetries = 3,
      baseDelay = 1000,
      circuitBreaker = true,
      fallback
    } = options;

    // Each layer is its own `const`. Reassigning a single variable would make the
    // retry closure capture itself and recurse without ever reaching `operation`.
    const attempt = () => this.withTimeout(operation(), timeout);
    const withRetries = () => this.retryWithBackoff(attempt, maxRetries, baseDelay);

    try {
      if (circuitBreaker) {
        // The breaker is typed for arbitrary tasks, so the result type is restored here.
        return (await this.getCircuitBreaker(name).fire(withRetries)) as T;
      }
      return await withRetries();
    } catch (error) {
      if (!fallback) throw error;

      logger.warn(`${name}: Using fallback`, {
        error: error instanceof Error ? error.message : String(error)
      });
      return await fallback();
    }
  }

  /** Races `promise` against a timer; the timer is cleared as soon as the race settles. */
  private withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeout = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error('Operation timeout')), ms);
    });
    return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
  }

  /**
   * Calls `fn` up to `maxRetries + 1` times, waiting `baseDelay * 2^attempt`
   * milliseconds after each failed attempt, and rethrows the last error.
   */
  private async retryWithBackoff<T>(
    fn: () => Promise<T>,
    maxRetries: number,
    baseDelay: number = 1000
  ): Promise<T> {
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return await fn();
      } catch (error) {
        if (attempt === maxRetries) throw error;
        const delay = baseDelay * 2 ** attempt;
        await new Promise<void>((resolve) => setTimeout(resolve, delay));
      }
    }
    // Only reachable when `maxRetries` is negative, in which case `fn` never ran.
    throw new Error('Retry exhausted');
  }

  /** Returns the breaker registered under `name`, creating it on first use. */
  private getCircuitBreaker(name: string): TaskBreaker {
    let breaker = this.breakers.get(name);
    if (!breaker) {
      breaker = new CircuitBreaker((task: () => Promise<unknown>) => task(), {
        // Timeouts are enforced per attempt by `withTimeout`; a breaker-level timeout
        // would cut the retry sequence short, so it is disabled here.
        timeout: false,
        errorThresholdPercentage: 50,
        resetTimeout: 30000,
        name
      });
      this.breakers.set(name, breaker);
    }
    return breaker;
  }
}
|
|
|
|
// Usage: one shared service instance; per-call behaviour comes from the options
const resilience = new ResilienceService();

const result = await resilience.execute(
  () => externalApi.fetchUser(userId),
  'fetch-user',
  {
    maxRetries: 2,
    timeout: 3000,
    fallback: async () => ({ id: userId, name: 'Unknown' })
  }
);
|
|
```
|
|
|
|
### Health Check with Resilience
|
|
|
|
```typescript
|
|
// src/health/health.controller.ts
|
|
export class HealthController {
  /**
   * Runs the database, Redis and external API checks concurrently and summarises them.
   *
   * A rejected database or Redis check is reported as 'unhealthy'; a rejected
   * external API check is reported as 'degraded'. The overall status depends only
   * on the database and Redis results, so an external API outage alone does not
   * make the service unhealthy.
   *
   * @returns The overall status together with the per-dependency results.
   */
  async checkDependencies(): Promise<HealthStatus> {
    const [database, redis, externalApi] = await Promise.allSettled([
      this.checkDatabase(),
      this.checkRedis(),
      this.checkExternalApi()
    ]);

    const succeeded = (outcome: PromiseSettledResult<unknown>): boolean =>
      outcome.status === 'fulfilled';

    const dependencies = {
      database: succeeded(database) ? 'healthy' : 'unhealthy',
      redis: succeeded(redis) ? 'healthy' : 'unhealthy',
      externalApi: succeeded(externalApi) ? 'healthy' : 'degraded'
    };

    // Service is healthy even if external API is down (graceful degradation)
    const coreDependenciesUp =
      dependencies.database === 'healthy' && dependencies.redis === 'healthy';

    return {
      status: coreDependenciesUp ? 'healthy' : 'unhealthy',
      dependencies
    };
  }
}
|
|
```
|
|
|
|
## Best Practices
|
|
|
|
1. **Circuit Breaker**: Use for external service calls
2. **Retry**: Retry only transient failures (network, timeout)
3. **Timeout**: Set appropriate timeouts for all external calls
4. **Fallback**: Always provide fallback behavior
5. **Monitoring**: Monitor circuit breaker states and retry rates
6. **Logging**: Log all resilience actions for debugging
|
|
|
|
## Common Mistakes
|
|
|
|
1. **Retrying Non-Retryable Errors**: Retrying 4xx errors (client errors)
|
|
```typescript
|
|
// ❌ BAD: Retry all errors
|
|
catch (error) {
|
|
await retry(operation);
|
|
}
|
|
|
|
// ✅ GOOD: Only retry transient errors
|
|
catch (error) {
|
|
if (isTransientError(error)) {
|
|
await retry(operation);
|
|
} else {
|
|
throw error;
|
|
}
|
|
}
|
|
```
|
|
|
|
2. **No Timeout**: Missing timeouts on external calls
|
|
```typescript
|
|
// ❌ BAD: No timeout
|
|
const data = await externalApi.fetch();
|
|
|
|
// ✅ GOOD: With timeout
|
|
const data = await withTimeout(externalApi.fetch(), 5000);
|
|
```
|
|
|
|
3. **No Fallback**: No graceful degradation strategy
|
|
```typescript
|
|
// ❌ BAD: Service crashes if dependency fails
|
|
const user = await userService.get(id);
|
|
|
|
// ✅ GOOD: Fallback to cached/default data
|
|
const user = await userService.get(id).catch(() => cachedUser);
|
|
```
|
|
|
|
4. **Too Many Retries**: Excessive retries causing performance issues
|
|
```typescript
|
|
// ❌ BAD: Too many retries with short delay
|
|
retry(fn, { maxRetries: 10, delay: 100 });
|
|
|
|
// ✅ GOOD: Limited retries with exponential backoff
|
|
retry(fn, { maxRetries: 3, baseDelay: 1000, exponential: true });
|
|
```
|
|
|
|
5. **Circuit Breaker Misconfiguration**: Wrong thresholds
|
|
```typescript
|
|
// ❌ BAD: Circuit opens too easily or never
|
|
{ errorThresholdPercentage: 5 } // Opens after 5% errors
|
|
{ errorThresholdPercentage: 99 } // Almost never opens
|
|
|
|
// ✅ GOOD: Balanced threshold
|
|
{ errorThresholdPercentage: 50, resetTimeout: 30000 }
|
|
```
|
|
|
|
## Quick Reference
|
|
|
|
| Pattern | Use Case | Key Config |
|---------|----------|------------|
| **Circuit Breaker** | External API calls | threshold: 50%, reset: 30s |
| **Retry** | Transient failures | max: 3, exponential backoff |
| **Timeout** | All external calls | 3-5s for API, 30s for batch |
| **Bulkhead** | Resource isolation | 10-20 concurrent ops |
| **Fallback** | Critical operations | Cache, default, or degraded |
|
|
|
|
**Opossum Circuit Breaker States:**
|
|
```
CLOSED → (errors exceed threshold) → OPEN
OPEN → (reset timeout expires) → HALF-OPEN
HALF-OPEN → (success) → CLOSED
HALF-OPEN → (failure) → OPEN
```
|
|
|
|
**Retry Delays (Exponential Backoff):**
|
|
```
# delay before retry k = baseDelay × 2^(k − 1), with baseDelay = 1s
# (the first attempt runs immediately)
Retry 1: wait 1s
Retry 2: wait 2s
Retry 3: wait 4s
Retry 4: wait 8s
```
|
|
|
|
**Essential Imports:**
|
|
```typescript
|
|
import CircuitBreaker from 'opossum';
|
|
import PQueue from 'p-queue';
|
|
import { logger } from '@goodgo/logger';
|
|
```
|
|
|
|
## Resources
|
|
|
|
- [Opossum Documentation](https://nodeshift.dev/opossum/) - Circuit breaker library
- [Microsoft Resilience Patterns](https://docs.microsoft.com/en-us/azure/architecture/patterns/category/resiliency)
- [API Gateway Advanced](../api-gateway-advanced/SKILL.md) - Gateway circuit breaker
- [Observability & Monitoring](../observability-monitoring/SKILL.md) - Health checks, metrics
- [Event-Driven Architecture](../event-driven-architecture/SKILL.md) - Event retry patterns
- [Error Handling Patterns](../error-handling-patterns/SKILL.md) - Error handling
- [Project Rules](../project-rules/SKILL.md) - GoodGo standards
|