diff --git a/docs/en/architecture/caching-architecture.md b/docs/en/architecture/caching-architecture.md index d8f5e6a3..b18a7341 100644 --- a/docs/en/architecture/caching-architecture.md +++ b/docs/en/architecture/caching-architecture.md @@ -1,8 +1,8 @@ -# Caching Architecture +# Kiến trúc Caching -> Multi-layer caching strategy for optimal performance +> Chiến lược caching nhiều tầng để tối ưu hiệu suất -## Overview Diagram +## Sơ đồ Tổng quan ```mermaid graph TD @@ -18,16 +18,21 @@ graph TD DB --> StoreL2[Store L2 + L1] StoreL2 --> Return3[Return
< 50ms] - style L1 fill:#d4edda - style L2 fill:#fff4e1 - style DB fill:#f0e1ff + classDef memory fill:#1b5e20,stroke:#2e7d32,color:#fff + classDef redis fill:#e65100,stroke:#ef6c00,color:#fff + classDef db fill:#212121,stroke:#424242,color:#fff + classDef default fill:#202020,stroke:#505050,color:#fff + + class L1,Return1,WarmL1 memory + class L2,Return2,StoreL2 redis + class DB,Return3 db ``` -## System Context +## Bối cảnh Hệ thống ```mermaid C4Context - title Caching System Context + title Sơ đồ Bối cảnh Hệ thống Caching System(service, "Microservice", "Client service using cache") System_Ext(db, "Neon PostgreSQL", "Primary database") @@ -41,33 +46,38 @@ C4Context Rel(service, l2, "Reads/Writes", "Redis Protocol") Rel(l1, l2, "Fills from", "On miss") Rel(l2, db, "Cache aside", "On miss") + + UpdateElementStyle(service, $fontColor="white", $bgColor="#1a237e", $borderColor="#3949ab") + UpdateElementStyle(db, $fontColor="white", $bgColor="#212121", $borderColor="#424242") + UpdateElementStyle(l1, $fontColor="white", $bgColor="#1b5e20", $borderColor="#2e7d32") + UpdateElementStyle(l2, $fontColor="white", $bgColor="#e65100", $borderColor="#ef6c00") ``` -### Context Description -- **Service**: Communicates directly with L1 Cache (in-memory) for lowest latency. -- **L1 Cache**: Local cache, not shared, automatic expiration (short TTL). -- **L2 Cache**: Shared Redis cluster, holds data longer and syncs across instances. -- **Database**: Source of truth, accessed only on cache miss. +### Mô tả Bối cảnh +- **Service**: Giao tiếp trực tiếp với L1 Cache (in-memory) để đạt độ trễ thấp nhất. +- **L1 Cache**: Cache cục bộ, không chia sẻ, tự động hết hạn (TTL ngắn). +- **L2 Cache**: Redis cluster chia sẻ, giữ dữ liệu lâu dài hơn và đồng bộ giữa các instances. +- **Database**: Nguồn dữ liệu gốc (source of truth), chỉ được truy cập khi cache miss. 
-## Architecture Description +## Mô tả Kiến trúc -### Multi-Layer Caching +### Caching Nhiều Tầng -GoodGo platform uses 2-layer caching for performance: +Nền tảng GoodGo sử dụng caching 2 tầng để tối ưu hiệu suất: **L1 Cache (Memory)**: -- In-memory cache per service instance -- Very fast access (< 1ms) -- Limited capacity (10k keys default) -- Short TTL (60 seconds default, max 5 minutes) -- Not shared across instances +- In-memory cache trên mỗi service instance +- Truy cập rất nhanh (< 1ms) +- Dung lượng giới hạn (10k keys mặc định) +- TTL ngắn (60 giây mặc định, tối đa 5 phút) +- Không share giữa instances **L2 Cache (Redis)**: - Shared distributed cache -- Fast access (< 5ms) -- Large capacity -- Longer TTL (configurable, typically 5-15 minutes) -- Shared across all service instances +- Truy cập nhanh (< 5ms) +- Dung lượng lớn +- TTL dài hơn (configurable, thường 5-15 phút) +- Share giữa tất cả service instances **Cache Flow**: ``` @@ -77,97 +87,99 @@ Request → L1 → L2 → Database hit rate hit rate rate ``` -## Cache Implementation +## Triển khai Cache -### Multi-Layer Cache Service +### Multi-Layer Cache Service (.NET) -```typescript -export class MultiLayerCache { - private l1Cache: NodeCache; - private l2Cache: Redis; - - constructor() { - // L1: Memory cache - this.l1Cache = new NodeCache({ - stdTTL: 60, // 60 seconds default - maxKeys: 10000, // Max 10k keys - checkperiod: 120 // Check for expired keys every 2min - }); +```csharp +// EN: Multi-layer cache implementation +// VI: Triển khai cache đa lớp +public class MultiLayerCacheService : ICacheService +{ + private readonly IMemoryCache _l1Cache; + private readonly IConnectionMultiplexer _redis; + private readonly IDatabase _l2Cache; + private readonly ILogger _logger; - // L2: Redis cache - this.l2Cache = new Redis({ - host: process.env.REDIS_HOST, - port: parseInt(process.env.REDIS_PORT), - db: 0 - }); - } - - async get(key: string): Promise { - // Try L1 first - const l1Value = this.l1Cache.get(key); - 
if (l1Value) { - logger.debug('L1 cache hit', { key }); - return l1Value; + public MultiLayerCacheService( + IMemoryCache l1Cache, + IConnectionMultiplexer redis, + ILogger logger) + { + _l1Cache = l1Cache; + _redis = redis; + _l2Cache = redis.GetDatabase(); + _logger = logger; } - // Try L2 - const l2Value = await this.l2Cache.get(key); - if (l2Value) { - logger.debug('L2 cache hit', { key }); - const parsed = JSON.parse(l2Value) as T; - - // Warm L1 cache - this.l1Cache.set(key, parsed); - return parsed; + public async Task GetAsync(string key, CancellationToken ct = default) + { + // L1: Memory cache check + if (_l1Cache.TryGetValue(key, out T? l1Value)) + { + _logger.LogDebug("L1 cache hit for key: {Key}", key); + return l1Value; + } + + // L2: Redis cache check + var l2Value = await _l2Cache.StringGetAsync(key); + if (!l2Value.IsNullOrEmpty) + { + _logger.LogDebug("L2 cache hit for key: {Key}", key); + var parsed = JsonSerializer.Deserialize(l2Value!); + + // Warm L1 cache + _l1Cache.Set(key, parsed, TimeSpan.FromMinutes(1)); + return parsed; + } + + _logger.LogDebug("Cache miss for key: {Key}", key); + return default; } - logger.debug('Cache miss', { key }); - return null; - } - - async set(key: string, value: any, ttl: number = 300): Promise { - // Store in both L1 and L2 - this.l1Cache.set(key, value, Math.min(ttl, 300)); // L1 max 5min - await this.l2Cache.setex(key, ttl, JSON.stringify(value)); - } - - async del(key: string): Promise { - this.l1Cache.del(key); - await this.l2Cache.del(key); - } - - async invalidatePattern(pattern: string): Promise { - // L1: Clear all (simple approach) - this.l1Cache.flushAll(); - - // L2: Delete by pattern - const keys = await this.l2Cache.keys(pattern); - if (keys.length > 0) { - await this.l2Cache.del(...keys); + public async Task SetAsync(string key, T value, TimeSpan? ttl = null, CancellationToken ct = default) + { + var expiry = ttl ?? 
TimeSpan.FromMinutes(5); + var l1Expiry = TimeSpan.FromMinutes(Math.Min(expiry.TotalMinutes, 5)); + + // L1: Memory cache (max 5 min) + _l1Cache.Set(key, value, l1Expiry); + + // L2: Redis cache + var json = JsonSerializer.Serialize(value); + await _l2Cache.StringSetAsync(key, json, expiry); + } + + public async Task RemoveAsync(string key, CancellationToken ct = default) + { + _l1Cache.Remove(key); + await _l2Cache.KeyDeleteAsync(key); } - } } ``` -### Cache Key Naming +### Quy ước Đặt tên Key **Pattern**: `{service}:{entity}:{identifier}:{sub-resource}` -**Examples**: -```typescript -const keys = { - user: (userId: string) => `iam:user:${userId}`, - userPermissions: (userId: string) => `iam:user:${userId}:permissions`, - userRoles: (userId: string) => `iam:user:${userId}:roles`, - session: (sessionId: string) => `iam:session:${sessionId}`, -}; +**Ví dụ (C#)**: +```csharp +// Cache key constants +public static class CacheKeys +{ + public static string User(string userId) => $"iam:user:{userId}"; + public static string UserPermissions(string userId) => $"iam:user:{userId}:permissions"; + public static string UserRoles(string userId) => $"iam:user:{userId}:roles"; + public static string Session(string sessionId) => $"iam:session:{sessionId}"; + public static string UserQuota(string userId) => $"storage:quota:{userId}"; +} -// Usage -const user = await cache.get(keys.user('user_123')); -const permissions = await cache.get(keys.userPermissions('user_123')); +// Sử dụng +var user = await _cache.GetAsync(CacheKeys.User("user_123")); +var permissions = await _cache.GetAsync>(CacheKeys.UserPermissions("user_123")); ``` -## TTL Strategies +## Chiến lược TTL ```mermaid graph LR @@ -186,22 +198,28 @@ graph LR Long --> Config[Static Config] Long --> RefData[Reference Data] - style Short fill:#f8d7da - style Medium fill:#fff3cd - style Long fill:#d4edda + classDef tier fill:#202020,stroke:#505050,color:#fff + classDef short fill:#b71c1c,stroke:#f44336,color:#fff + classDef 
medium fill:#e65100,stroke:#ef6c00,color:#fff + classDef long fill:#1b5e20,stroke:#2e7d32,color:#fff + + class Short short + class Medium medium + class Long long + class Permissions,Sessions,UserProfiles,OrgData,Config,RefData tier ``` -**TTL Guidelines**: -| Data Type | TTL | Reason | -|-----------|-----|--------| -| User permissions | 5 min | Security-sensitive | -| Session data | Varies | Based on session length | -| User profiles | 10 min | Moderate update frequency | -| Organization data | 15 min | Infrequent updates | -| Static config | 30-60 min | Very stable | -| Reference data | 1-2 hours | Almost never changes | +**Hướng dẫn TTL**: +| Loại Dữ liệu | TTL | Lý do | +|---------------------------|-----|----------------| +| User permissions | 5 min | Nhạy cảm bảo mật | +| Session data | Varies | Dựa trên độ dài session | +| User profiles | 10 min | Tần suất cập nhật vừa phải | +| Organization data | 15 min | Cập nhật không thường xuyên | +| Static config | 30-60 min | Rất ổn định | +| Reference data | 1-2 hours | Hầu như không thay đổi | -## Cache Invalidation +## Vô hiệu hóa Cache ```mermaid sequenceDiagram @@ -224,39 +242,39 @@ sequenceDiagram Note over Service,Cache: Next request will fetch fresh data ``` -**Invalidation Strategies**: +**Chiến lược Invalidation**: ```typescript -// 1. Single key invalidation +// 1. Invalidation single key async updateUser(userId: string, data: UpdateUserDto): Promise { const user = await userRepository.update(userId, data); - // Invalidate user cache + // Vô hiệu hóa user cache await cache.del(cacheKeys.user(userId)); return user; } -// 2. Pattern-based invalidation +// 2. Invalidation theo pattern async updateUserRole(userId: string, roleId: string): Promise { await userRoleRepository.assign(userId, roleId); - // Invalidate all user-related cache + // Vô hiệu hóa tất cả cache liên quan đến user await cache.invalidatePattern(`iam:user:${userId}:*`); } -// 3. 
Time-based invalidation (TTL expiry) -// Automatically handled by cache +// 3. Invalidation theo thời gian (TTL expiry) +// Tự động xử lý bởi cache ``` -## Cache Warming +## Làm ấm Cache ```typescript -// Preload frequently accessed data +// Preload dữ liệu thường xuyên truy cập async warmCache(): Promise { logger.info('Starting cache warming'); - // Warm user permissions for active users + // Làm ấm user permissions cho active users const activeUsers = await userRepository.findActive({ limit: 1000 }); for (const user of activeUsers) { @@ -265,118 +283,354 @@ async warmCache(): Promise { await cache.set( cacheKeys.userPermissions(user.id), permissions, - 300 // 5 minutes + 300 // 5 phút ); } logger.info('Cache warming completed', { count: activeUsers.length }); } -// Run on service startup +// Chạy khi service khởi động warmCache().catch(err => logger.error('Cache warming failed', { err })); ``` -## Design Decisions +## Quyết định Thiết kế -### Decision 1: Multi-layer Caching (L1 + L2) +### Quyết định 1: Multi-layer Caching (L1 + L2) -**Context**: Need to reduce load on Redis and achieve ultra-low latency for hot data. -**Decision**: Use combination of L1 (NodeCache) and L2 (Redis). -**Consequences**: -- ✅ Latency < 1ms for 40-50% requests. -- ✅ Reduced network traffic to Redis. -- ❌ Synchronization complexity (L1 might be stale for short duration). +**Bối cảnh**: Cần giảm tải cho Redis và đạt độ trễ cực thấp cho dữ liệu hot. +**Quyết định**: Sử dụng kết hợp L1 (IMemoryCache) và L2 (Redis). +**Hệ quả**: +- ✅ Độ trễ < 1ms cho 40-50% requests. +- ✅ Giảm network traffic tới Redis. +- ❌ Phức tạp trong đồng bộ (L1 có thể stale trong thời gian ngắn). 
-## Performance Characteristics +## Đặc điểm Hiệu suất -### Performance Targets -| Metric | Target | Notes | -|--------|--------|-------| +### Mục tiêu Hiệu suất +| Chỉ số | Mục tiêu | Ghi chú | +|-----------------|-------------------|-----------------| | **L1 Hit Latency** | < 0.5ms | In-memory lookup | | **L2 Hit Latency** | < 5ms | Network RTT + Redis processing | | **Combine Hit Rate** | > 90% | L1 + L2 combined | | **L1 Capacity** | 10k items | Per instance limit to protect heap | | **Cache Warmup Time** | < 30s | At service startup | -## Security Considerations +## Cân nhắc Bảo mật -### Cache Security -- **Encryption**: Sensitive data (PII) MUST be encrypted before storing in L2 Redis (AES-256). L1 can store plaintext as it is in process memory (unless memory dump). -- **Isolation**: Redis instance protected by password and Network Policy (allow internal K8s traffic only). -- **TLS**: Connect to Redis via TLS 1.2+. -- **Data Sanitization**: Do not cache entire user objects if they contain password hashes or secrets. +### Bảo mật Cache +- **Encryption**: Dữ liệu nhạy cảm (PII) PHẢI được mã hóa trước khi lưu vào L2 Redis (AES-256). L1 có thể lưu plaintext vì nằm trong memory process (trừ khi memory dump). +- **Isolation**: Redis instance được bảo vệ bằng mật khẩu và Network Policy (chỉ allow traffic từ nội bộ K8s). +- **TLS**: Kết nối tới Redis qua TLS 1.2+. +- **Data Sanitization**: Không cache toàn bộ user object nếu chứa password hash hoặc secrets. -## Deployment +## Triển khai ```mermaid graph TD - subgraph "Kubernetes Pod" - Service[Microservice Container] - L1[L1 Cache (RAM)] - Service --- L1 + subgraph "Redis Cluster" + subgraph "Masters" + M1[Redis Master 1
Slots: 0-5460] + M2[Redis Master 2
Slots: 5461-10922] + M3[Redis Master 3
Slots: 10923-16383] + end + + subgraph "Slaves" + S1[Redis Slave 1
Replica of M1] + S2[Redis Slave 2
Replica of M2] + S3[Redis Slave 3
Replica of M3] + end + + M1 --> S1 + M2 --> S2 + M3 --> S3 + + Sentinel[Redis Sentinel
3 nodes] + + Sentinel -.->|Monitor| M1 + Sentinel -.->|Monitor| M2 + Sentinel -.->|Monitor| M3 end - - subgraph "Infrastructure" - RedisMaster[Redis Master] - RedisSlave1[Redis Slave 1] - RedisSlave2[Redis Slave 2] + + subgraph "Services" + Service1[Service A] + Service2[Service B] + Service3[Service C] end - - Service -->|Write| RedisMaster - Service -->|Read| RedisSlave1 - Service -->|Read| RedisSlave2 - - RedisMaster -.->|Replication| RedisSlave1 - RedisMaster -.->|Replication| RedisSlave2 - - style Service fill:#e1f5ff - style L1 fill:#d4edda - style RedisMaster fill:#fff4e1 + + Service1 --> M1 + Service1 --> M2 + Service1 --> M3 + + Service2 --> M1 + Service2 --> M2 + Service2 --> M3 + + Service3 --> M1 + Service3 --> M2 + Service3 --> M3 + + classDef master fill:#e65100,stroke:#ef6c00,color:#fff + classDef slave fill:#f57c00,stroke:#e65100,color:#fff + classDef sentinel fill:#4a148c,stroke:#7b1fa2,color:#fff + classDef service fill:#1a237e,stroke:#3949ab,color:#fff + classDef default fill:#202020,stroke:#505050,color:#fff + + class M1,M2,M3 master + class S1,S2,S3 slave + class Sentinel sentinel + class Service1,Service2,Service3 service ``` -**Deployment Description**: -- **L1**: Embedded directly in Microservice process, scales with number of Pods. -- **L2**: Redis Cluster (or Sentinel) with at least 3 nodes for High Availability. -- **Connection Pooling**: Use ioredis with connection pooling for efficient connection management. +### Chiến lược Triển khai -## Monitoring & Observability +**Redis Cluster Configuration**: +- **Mode**: Cluster mode với 3 masters + 3 slaves +- **Replication**: Mỗi master có 1 slave cho high availability +- **Sentinel**: 3-node Sentinel ensemble cho automatic failover +- **Sharding**: 16384 hash slots phân chia đều giữa 3 masters +- **Persistence**: RDB snapshots mỗi 5 phút, AOF disabled (performance) -### Monitoring Metrics -- **Metrics**: Prometheus metrics for hit rate, miss rate, latency, memory usage. 
-- **Logs**: Log cache miss/hit at debug level (sampled), log connection errors at error level. -- **Health Checks**: Readiness probe checks connection to Redis. +**Resource Allocation**: +| Component | CPU | Memory | Disk | Replicas | +|-----------|-----|--------|------|----------| +| **Redis Master** | 1 core | 2GB | 10GB SSD | 3 | +| **Redis Slave** | 1 core | 2GB | 10GB SSD | 3 | +| **Sentinel** | 500m | 512MB | 5GB | 3 | -### Monitoring Code +**Redis Configuration**: +```yaml +# redis.conf +maxmemory 2gb +maxmemory-policy allkeys-lru # Evict least recently used keys +timeout 300 # Close idle connections after 5min +tcp-keepalive 60 +save 300 10 # RDB snapshot every 5min if 10+ keys changed +appendonly no # Disable AOF for performance -**Cache Hit Rates**: +# Cluster config +cluster-enabled yes +cluster-node-timeout 5000 +cluster-replica-validity-factor 0 +``` + +**High Availability**: +- Automatic failover với Redis Sentinel +- Slave promotion khi master fails +- Client-side retry logic +- Connection pooling (max 50 connections per service) + +**Scaling Strategy**: +- **Vertical**: Tăng memory per node (2GB → 4GB → 8GB) +- **Horizontal**: Thêm master nodes (3 → 5 → 7) +- **Read Scaling**: Route reads to slaves +- **Monitoring**: Auto-alert khi memory usage > 80% + +## Giám sát & Khả năng quan sát + +### Chỉ số Chính + +**Cache Performance Metrics**: ```typescript -// Track cache performance -export class CacheMetrics { - // ... Prometheus Implementation ... 
+// Custom metrics cho cache performance +import { Counter, Histogram, Gauge } from 'prom-client'; + +export const cacheHits = new Counter({ + name: 'cache_hits_total', + labelNames: ['layer', 'key_prefix'] // layer: l1/l2, key_prefix: user/session/etc +}); + +export const cacheMisses = new Counter({ + name: 'cache_misses_total', + help: 'Tổng số cache misses', + labelNames: ['key_prefix'] +}); + +export const cacheLatency = new Histogram({ + name: 'cache_operation_duration_seconds', + help: 'Thời gian thực hiện cache operation', + labelNames: ['operation', 'layer'], // operation: get/set/del + buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1] +}); + +export const cacheSize = new Gauge({ + name: 'cache_size_bytes', + help: 'Kích thước cache (bytes)', + labelNames: ['layer'] +}); + +export const cacheEvictions = new Counter({ + name: 'cache_evictions_total', + help: 'Tổng số cache evictions', + labelNames: ['layer', 'reason'] // reason: ttl_expired/memory_full +}); +``` + +**Redis Metrics**: +- `redis_connected_clients` - Connected clients +- `redis_used_memory_bytes` - Memory usage +- `redis_memory_fragmentation_ratio` - Memory fragmentation +- `redis_keyspace_hits_total` - Cache hits +- `redis_keyspace_misses_total` - Cache misses +- `redis_evicted_keys_total` - Evicted keys +- `redis_expired_keys_total` - Expired keys +- `redis_commands_processed_total` - Commands processed + +**Calculated Metrics**: +```promql +# Cache hit rate +rate(cache_hits_total[5m]) / (rate(cache_hits_total[5m]) + rate(cache_misses_total[5m])) + +# L1 hit rate +rate(cache_hits_total{layer="l1"}[5m]) / rate(cache_hits_total[5m]) + +# L2 hit rate +rate(cache_hits_total{layer="l2"}[5m]) / rate(cache_hits_total[5m]) + +# P95 cache latency +histogram_quantile(0.95, cache_operation_duration_seconds_bucket) + +# Memory usage percentage +redis_used_memory_bytes / redis_maxmemory_bytes * 100 +``` + +**Alerting Rules**: +```yaml +# Quy tắc cảnh báo cho cache +groups: + - name: cache_alerts + 
interval: 30s + rules: + # Low cache hit rate + - alert: LowCacheHitRate + expr: | + rate(cache_hits_total[5m]) / + (rate(cache_hits_total[5m]) + rate(cache_misses_total[5m])) < 0.5 + for: 10m + labels: + severity: warning + annotations: + summary: "Tỷ lệ cache hit thấp" + description: "Tỷ lệ cache hit là {{ $value | humanizePercentage }}" + + # High memory usage + - alert: HighRedisMemoryUsage + expr: redis_used_memory_bytes / redis_maxmemory_bytes > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: "Sử dụng bộ nhớ Redis cao" + description: "Bộ nhớ Redis sử dụng là {{ $value | humanizePercentage }}" + + # High eviction rate + - alert: HighEvictionRate + expr: rate(redis_evicted_keys_total[5m]) > 100 + for: 5m + labels: + severity: warning + annotations: + summary: "Tỷ lệ cache eviction cao" + description: "Tỷ lệ eviction là {{ $value }}/giây" + + # Redis down + - alert: RedisDown + expr: redis_up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Redis bị down" + + # High replication lag + - alert: HighReplicationLag + expr: redis_replication_lag_seconds > 5 + for: 2m + labels: + severity: warning + annotations: + summary: "Độ trễ replication cao" + description: "Độ trễ replication là {{ $value }}s" +``` + +**Dashboards**: +- **Cache Overview**: Hit rate, miss rate, latency, size +- **Redis Cluster**: Memory usage, connections, commands/sec +- **Performance**: L1 vs L2 hit rates, operation latency +- **Evictions**: Eviction rate, reasons, trends + +**Logging**: +```typescript +// Structured logging cho cache operations +logger.debug('Cache operation', { + operation: 'get', + layer: 'l1', + key: cacheKey, + hit: true, + latency: duration, + correlationId: req.correlationId +}); + +logger.warn('Cache eviction', { + layer: 'l2', + reason: 'memory_full', + evictedKeys: count, + memoryUsage: usagePercent +}); + +logger.error('Cache error', { + operation: 'set', + layer: 'l2', + error: error.message, + key: cacheKey +}); +``` + 
+**Health Checks**: +```typescript +// Health check cho Redis +async function checkRedisHealth(): Promise { + try { + await redis.ping(); + const info = await redis.info('memory'); + const memoryUsage = parseMemoryUsage(info); + + return memoryUsage < 0.9; // Healthy if < 90% memory + } catch (error) { + logger.error('Redis health check failed', { error }); + return false; + } } ``` -**Expected Performance**: -| Metric | L1 Cache | L2 Cache | Database | -|--------|----------|----------|----------| -| Latency | < 1ms | < 5ms | < 50ms | -| Hit Rate | 40-50% | 80-90% | - | -| Capacity | 10k keys | Unlimited | - | -## Best Practices +## Tài liệu Liên quan -**DO**: -- ✅ Use cache for frequently accessed data -- ✅ Set appropriate TTLs based on data change frequency -- ✅ Invalidate cache on data updates -- ✅ Use cache key namespacing -- ✅ Monitor cache hit rates -- ✅ Warm cache on startup for critical data +- [System Design](./system-design.md) - Kiến trúc tổng thể với caching +- [Data Consistency Patterns](./data-consistency-patterns.md) - Cache invalidation patterns -**DON'T**: -- ❌ Cache data that changes very frequently -- ❌ Set TTL too long (stale data risk) -- ❌ Set TTL too short (negates cache benefit) -- ❌ Cache sensitive data without encryption -- ❌ Ignore cache invalidation on updates -- ❌ Use cache as primary data store +--- + +**Cập nhật Lần cuối**: 2026-01-14 +**Tác giả**: GoodGo Architecture Team + +## Quick Tips + +### Mermaid Common Issues +- **Arrow Syntax**: Use `-->` for solid arrows, `-.->` for dotted arrows. +- **Node IDs**: Avoid spaces/special chars in IDs (e.g., `Node-A` not `Node A`). +- **Subgraphs**: Ensure `subgraph` names are unique and descriptive. 
+ +### Color Pattern Quick Reference +| Element | Dark Color | Text Color | +|---------|------------|------------| +| **Service (Blue)** | `#1a237e` | `#ffffff` | +| **Storage (Gray)** | `#212121` | `#ffffff` | +| **Cache L2 (Orange)** | `#e65100` | `#ffffff` | +| **Cache L1 (Green)** | `#1b5e20` | `#ffffff` | +| **Monitoring (Purple)** | `#4a148c` | `#ffffff` | + +### Visual Indicators +- ✅ **Recommended / Khuyên dùng** +- ❌ **Not Recommended / Không khuyên dùng** +- ⚠️ **Warning / Cảnh báo** diff --git a/docs/en/architecture/data-consistency-patterns.md b/docs/en/architecture/data-consistency-patterns.md index 8e8ac26b..d0a30e52 100644 --- a/docs/en/architecture/data-consistency-patterns.md +++ b/docs/en/architecture/data-consistency-patterns.md @@ -1,8 +1,8 @@ -# Data Consistency Patterns +# Kiến trúc Patterns Đồng bộ Dữ liệu -> Patterns for maintaining data consistency in distributed microservices architecture +> Các patterns để duy trì tính nhất quán dữ liệu trong kiến trúc microservices phân tán -## Overview Diagram +## Sơ đồ Tổng quan ```mermaid graph TD @@ -24,32 +24,39 @@ graph TD OptimisticLock --> StrongConsistency[Strong Consistency] CQRS --> EventualConsistency - style Saga fill:#e1f5ff - style Outbox fill:#fff4e1 - style Idempotency fill:#f0e1ff - style CQRS fill:#d4edda + %% Dark color palette with white text + style Saga fill:#1d4ed8,stroke:#3b82f6,color:#ffffff + style Outbox fill:#b45309,stroke:#f59e0b,color:#ffffff + style Idempotency fill:#7e22ce,stroke:#a855f7,color:#ffffff + style OptimisticLock fill:#15803d,stroke:#22c55e,color:#ffffff + style CQRS fill:#15803d,stroke:#22c55e,color:#ffffff + style EventualConsistency fill:#374151,stroke:#6b7280,color:#ffffff + style StrongConsistency fill:#374151,stroke:#6b7280,color:#ffffff + style Service1 fill:#4527a0,stroke:#7c4dff,color:#ffffff + style Service2 fill:#4527a0,stroke:#7c4dff,color:#ffffff + style Service3 fill:#4527a0,stroke:#7c4dff,color:#ffffff ``` -## Architecture Description +## Mô tả 
Kiến trúc -### Architecture Overview +### Tổng quan Kiến trúc -GoodGo platform uses multiple consistency patterns to handle distributed data: +Nền tảng GoodGo sử dụng nhiều consistency patterns để xử lý dữ liệu phân tán: -**Core Challenges**: -- No distributed transactions (2PC too slow) -- Services own their data (database per service) -- Network failures can cause partial completion -- Need to maintain data integrity across services +**Thách thức Cốt lõi**: +- Không có distributed transactions (2PC quá chậm) +- Services sở hữu dữ liệu riêng (database per service) +- Network failures có thể gây partial completion +- Cần maintain data integrity giữa các services -**Pattern Selection**: -- **Saga**: For multi-service workflows -- **Outbox**: For guaranteed event publishing -- **Idempotency**: For safe retries -- **Optimistic Locking**: For concurrent updates -- **CQRS**: For read/write optimization +**Lựa chọn Pattern**: +- **Saga**: Cho workflows nhiều services +- **Outbox**: Cho event publishing đảm bảo +- **Idempotency**: Cho retries an toàn +- **Optimistic Locking**: Cho concurrent updates +- **CQRS**: Cho tối ưu read/write -## System Context +## Bối cảnh Hệ thống ```mermaid C4Context @@ -88,9 +95,9 @@ C4Context UpdateRelStyle(saga_orchestrator, inventory_service, $lineColor="red", $textColor="red") ``` -The GoodGo platform uses a database-per-service architecture where each service owns its data. Data consistency across services is achieved through patterns like Saga (for coordinated workflows), Outbox (for reliable event publishing), Idempotency (for safe retries), and Optimistic Locking (for concurrent updates). These patterns enable eventual consistency while maintaining data integrity. +Nền tảng GoodGo sử dụng kiến trúc database-per-service nơi mỗi service sở hữu dữ liệu riêng. 
Tính nhất quán dữ liệu giữa các services đạt được thông qua các patterns như Saga (cho workflows phối hợp), Outbox (cho event publishing đáng tin cậy), Idempotency (cho retries an toàn), và Optimistic Locking (cho concurrent updates). Các patterns này cho phép eventual consistency đồng thời duy trì data integrity. -## Saga Pattern +## Pattern Saga ```mermaid sequenceDiagram @@ -119,9 +126,9 @@ sequenceDiagram end ``` -**Description**: Saga manages distributed transactions as sequence of local transactions with compensation. +**Mô tả**: Saga quản lý distributed transactions dưới dạng chuỗi local transactions với compensation. -**Implementation**: +**Triển khai**: ```typescript // Saga orchestrator class OrderSaga { @@ -133,19 +140,19 @@ class OrderSaga { }; try { - // Step 1: Create order + // Bước 1: Tạo đơn hàng sagaContext.orderId = await orderService.create(orderData); - // Step 2: Process payment + // Bước 2: Xử lý thanh toán sagaContext.paymentId = await paymentService.process(orderData.payment); - // Step 3: Reserve inventory + // Bước 3: Đặt trước kho sagaContext.inventoryId = await inventoryService.reserve(orderData.items); - // All success - commit + // Tất cả thành công - commit await this.completeSaga(sagaContext); } catch (error) { - // Compensate in reverse order + // Compensate theo thứ tự ngược lại await this.compensate(sagaContext, error); throw error; } @@ -165,7 +172,7 @@ class OrderSaga { } ``` -## Outbox Pattern +## Pattern Outbox ```mermaid sequenceDiagram @@ -189,57 +196,54 @@ sequenceDiagram end ``` -**Description**: Guarantees event publishing by storing events in database within same transaction as business data. +**Mô tả**: Đảm bảo event publishing bằng cách lưu events trong database cùng transaction với business data. 
-**Implementation**: -```typescript -// Store event in outbox -async createUser(userData: CreateUserDto): Promise { - return await prisma.$transaction(async (tx) => { - // Business operation - const user = await tx.user.create({ data: userData }); +**Triển khai (.NET với EF Core)**: +```csharp +// EN: Save event in outbox with business data +// VI: Lưu event trong outbox cùng với business data +public async Task CreateUserAsync(CreateUserDto dto, CancellationToken ct) +{ + await using var transaction = await _context.Database.BeginTransactionAsync(ct); - // Store event in outbox (same transaction) - await tx.outbox.create({ - data: { - aggregateId: user.id, - aggregateType: 'User', - eventType: 'user.created.v1', - payload: JSON.stringify(user), - createdAt: new Date() - } - }); - - return user; - }); -} - -// Outbox processor (runs periodically) -async processOutbox(): Promise { - const events = await prisma.outbox.findMany({ - where: { publishedAt: null }, - take: 100 - }); - - for (const event of events) { - try { - await kafkaProducer.send({ - topic: event.eventType, - messages: [{ value: event.payload }] - }); - - await prisma.outbox.update({ - where: { id: event.id }, - data: { publishedAt: new Date() } - }); - } catch (error) { - logger.error('Failed to publish event', { event, error }); + try + { + // Business operation + var user = new User + { + Id = Guid.NewGuid(), + Email = dto.Email, + FirstName = dto.FirstName, + LastName = dto.LastName + }; + _context.Users.Add(user); + + // Lưu event trong outbox (cùng transaction) + var outboxEvent = new OutboxMessage + { + Id = Guid.NewGuid(), + AggregateId = user.Id.ToString(), + AggregateType = nameof(User), + EventType = "user.created.v1", + Payload = JsonSerializer.Serialize(user), + CreatedAt = DateTime.UtcNow + }; + _context.OutboxMessages.Add(outboxEvent); + + await _context.SaveChangesAsync(ct); + await transaction.CommitAsync(ct); + + return user; + } + catch + { + await transaction.RollbackAsync(ct); + 
throw; } - } } ``` -## Idempotency Pattern +## Pattern Idempotency ```mermaid graph LR @@ -256,36 +260,43 @@ graph LR Request2 --> Check Return --> Response2[Same Response] - style Check fill:#fff3cd - style Store fill:#d4edda + %% Dark color palette with white text + style Request1 fill:#374151,stroke:#6b7280,color:#ffffff + style Request2 fill:#374151,stroke:#6b7280,color:#ffffff + style Check fill:#b45309,stroke:#f59e0b,color:#ffffff + style Process fill:#1d4ed8,stroke:#3b82f6,color:#ffffff + style Store fill:#15803d,stroke:#22c55e,color:#ffffff + style Return fill:#7e22ce,stroke:#a855f7,color:#ffffff + style Response1 fill:#15803d,stroke:#22c55e,color:#ffffff + style Response2 fill:#15803d,stroke:#22c55e,color:#ffffff ``` -**Description**: Ensures operations can be safely retried without side effects by using idempotency keys. +**Mô tả**: Đảm bảo operations có thể retry an toàn mà không có side effects bằng cách sử dụng idempotency keys. -**Implementation**: +**Triển khai**: ```typescript // Idempotency middleware async function idempotentOperation( key: string, operation: () => Promise, - ttl: number = 86400 // 24 hours + ttl: number = 86400 ): Promise { - // Check if already processed + // Kiểm tra đã xử lý chưa const cached = await redis.get(`idempotency:${key}`); if (cached) { return JSON.parse(cached); } - // Process operation + // Xử lý operation const result = await operation(); - // Store result + // Lưu kết quả await redis.setex(`idempotency:${key}`, ttl, JSON.stringify(result)); return result; } -// Usage in controller +// Sử dụng trong controller async createPayment(req: Request, res: Response): Promise { const idempotencyKey = req.headers['idempotency-key'] as string; @@ -302,7 +313,7 @@ async createPayment(req: Request, res: Response): Promise { } ``` -## Optimistic Locking +## Khóa Lạc quan (Optimistic Locking) ```mermaid sequenceDiagram @@ -328,38 +339,49 @@ sequenceDiagram Service-->>User2: Success ``` -**Description**: Prevents lost updates by 
checking version on update. +**Mô tả**: Ngăn chặn lost updates bằng cách kiểm tra version khi update. -**Implementation**: -```prisma -// Prisma schema -model User { - id String @id @default(cuid()) - email String @unique - name String - version Int @default(1) // Version field +**Triển khai (.NET với EF Core)**: +```csharp +// EN: Entity with concurrency token +// VI: Entity với concurrency token +public class User +{ + public Guid Id { get; set; } + public string Email { get; set; } = default!; + public string Name { get; set; } = default!; + + [ConcurrencyCheck] + public int Version { get; set; } = 1; + + // Or use RowVersion for SQL Server + // [Timestamp] + // public byte[] RowVersion { get; set; } } -``` -```typescript -// Update with optimistic locking -async updateUser(userId: string, data: UpdateUserDto, currentVersion: number): Promise { - const result = await prisma.user.updateMany({ - where: { - id: userId, - version: currentVersion // Check version - }, - data: { - ...data, - version: { increment: 1 } // Increment version +// EN: Update with optimistic locking +// VI: Update với optimistic locking +public async Task UpdateUserAsync( + Guid userId, + UpdateUserDto dto, + CancellationToken ct) +{ + var user = await _context.Users.FindAsync([userId], ct) + ?? throw new UserNotFoundException(userId); + + user.Name = dto.Name; + user.Version++; // Increment version + + try + { + await _context.SaveChangesAsync(ct); + return user; + } + catch (DbUpdateConcurrencyException) + { + throw new ConcurrencyConflictException( + "Data was modified by another user. Please refresh and try again."); } - }); - - if (result.count === 0) { - throw new ConflictError('Version mismatch - data was modified by another user'); - } - - return await prisma.user.findUnique({ where: { id: userId } }); } ``` @@ -381,97 +403,104 @@ graph LR WriteModel --> DB1[(Write DB)] ReadModel --> DB2[(Read DB
Optimized)] - style WriteModel fill:#f0e1ff - style ReadModel fill:#d4edda + %% Dark color palette with white text + style Command fill:#1d4ed8,stroke:#3b82f6,color:#ffffff + style WriteModel fill:#7e22ce,stroke:#a855f7,color:#ffffff + style Events fill:#b45309,stroke:#f59e0b,color:#ffffff + style Projection fill:#1d4ed8,stroke:#3b82f6,color:#ffffff + style ReadModel fill:#15803d,stroke:#22c55e,color:#ffffff + style Query fill:#15803d,stroke:#22c55e,color:#ffffff + style DB1 fill:#374151,stroke:#6b7280,color:#ffffff + style DB2 fill:#374151,stroke:#6b7280,color:#ffffff ``` -**Description**: Separates read and write models for optimal performance. +**Mô tả**: Tách biệt read và write models để tối ưu hiệu suất. -## Performance Characteristics +## Đặc điểm Hiệu suất -Performance metrics and optimization strategies for data consistency patterns. +Chỉ số hiệu suất và chiến lược tối ưu cho patterns đồng bộ dữ liệu. -| Pattern | Latency Impact | Throughput | Notes | -|---------|----------------|------------|-------| -| **Saga Execution** | 500ms - 2s | 100-500 sagas/s | Depends on number of steps and compensation | -| **Outbox Processing** | < 100ms | 10,000 events/s | Async processing, minimal user impact | -| **Idempotency Check** | < 10ms | 50,000 checks/s | Redis lookup, very fast | -| **Optimistic Lock Update** | < 50ms | 5,000 updates/s | Single DB operation with version check | -| **CQRS Projection** | 100ms - 1s | 1,000 events/s | Event processing to read model | -| **Compensation Execution** | 200ms - 1s | Varies | Rollback operations in saga | +| Pattern | Tác động Độ trễ | Thông lượng | Ghi chú | +|---------|-----------------|-------------|---------| +| **Thực thi Saga** | 500ms - 2s | 100-500 sagas/s | Phụ thuộc số bước và compensation | +| **Xử lý Outbox** | < 100ms | 10,000 events/s | Xử lý bất đồng bộ, tác động tối thiểu | +| **Kiểm tra Idempotency** | < 10ms | 50,000 checks/s | Redis lookup, rất nhanh | +| **Cập nhật Optimistic Lock** | < 50ms | 5,000 
updates/s | Single DB operation với version check | +| **CQRS Projection** | 100ms - 1s | 1,000 events/s | Xử lý event sang read model | +| **Thực thi Compensation** | 200ms - 1s | Varies | Rollback operations trong saga | -### Performance Optimization Strategies +### Chiến lược Tối ưu Hiệu suất **Saga Pattern**: -- Minimize number of steps (< 5 steps ideal) -- Parallel execution where possible +- Giảm thiểu số bước (< 5 bước lý tưởng) +- Thực thi song song khi có thể - Cache service responses -- Set appropriate timeouts (30s default) +- Đặt timeouts phù hợp (30s mặc định) **Outbox Pattern**: -- Batch process outbox events (100-500 per batch) -- Index `publishedAt` column for performance -- Archive processed events periodically -- Use connection pooling for Kafka +- Batch process outbox events (100-500 mỗi batch) +- Index cột `publishedAt` cho hiệu suất +- Archive processed events định kỳ +- Sử dụng connection pooling cho Kafka **Idempotency**: -- Use Redis for fast key lookups -- Set TTL to 24-48 hours +- Sử dụng Redis cho fast key lookups +- Đặt TTL 24-48 giờ - Hash long idempotency keys -- Clean expired keys regularly +- Clean expired keys thường xuyên **Optimistic Locking**: -- Works best for low-contention scenarios -- Implement retry with exponential backoff -- Monitor conflict rates (should be < 5%) -- Consider pessimistic locking if conflicts > 10% +- Hoạt động tốt nhất cho low-contention scenarios +- Triển khai retry với exponential backoff +- Giám sát conflict rates (nên < 5%) +- Cân nhắc pessimistic locking nếu conflicts > 10% -## Security Considerations +## Cân nhắc Bảo mật -Security measures for protecting data consistency operations. +Biện pháp bảo mật để bảo vệ các operations đồng bộ dữ liệu. 
-### Saga Security +### Bảo mật Saga -**Compensation Protection**: -- Validate saga execution permissions at each step -- Encrypt sensitive data in saga context -- Log all saga executions for audit -- Implement timeout to prevent hanging sagas +**Bảo vệ Compensation**: +- Xác thực saga execution permissions ở mỗi bước +- Mã hóa sensitive data trong saga context +- Log tất cả saga executions cho audit +- Triển khai timeout để ngăn hanging sagas ```typescript -// Secure saga context +// Saga context bảo mật interface SecureSagaContext { sagaId: string; - userId: string; // User who initiated - permissions: string[]; // Required permissions - encryptedData: string; // Encrypted sensitive data - auditLog: AuditEntry[]; // Audit trail + userId: string; + permissions: string[]; + encryptedData: string; + auditLog: AuditEntry[]; } ``` -### Outbox Security +### Bảo mật Outbox -**Event Payload Encryption**: -- Encrypt PII (Personally Identifiable Information) before storing in outbox -- Use AES-256-GCM for event payload encryption -- Decrypt only when publishing to Kafka -- Rotate encryption keys quarterly +**Mã hóa Event Payload**: +- Mã hóa PII (Personally Identifiable Information) trước khi lưu trong outbox +- Sử dụng AES-256-GCM cho event payload encryption +- Giải mã chỉ khi publishing sang Kafka +- Rotate encryption keys hàng quý -**Access Control**: -- Restrict outbox table access to outbox processor only -- Use database roles and permissions -- Monitor outbox table access patterns +**Kiểm soát Truy cập**: +- Hạn chế truy cập outbox table chỉ cho outbox processor +- Sử dụng database roles và permissions +- Giám sát outbox table access patterns -### Idempotency Security +### Bảo mật Idempotency -**Key Security**: -- Use cryptographic hashing for idempotency keys (SHA-256) -- Include user context in key generation -- Validate key ownership before processing -- Clear keys on user logout for sensitive operations +**Bảo mật Key**: +- Sử dụng cryptographic hashing cho 
idempotency keys (SHA-256) +- Bao gồm user context trong key generation +- Xác thực key ownership trước khi xử lý +- Clear keys khi user logout cho sensitive operations ```typescript -// Secure idempotency key generation +// Tạo idempotency key bảo mật function generateIdempotencyKey( operation: string, userId: string, @@ -482,17 +511,17 @@ function generateIdempotencyKey( } ``` -### Optimistic Locking Security +### Bảo mật Optimistic Lock -**Version Tampering Prevention**: -- Validate version field on server-side only -- Never accept version from client directly -- Log version conflicts for security monitoring +**Ngăn chặn Giả mạo Version**: +- Xác thực version field chỉ ở server-side +- Không bao giờ chấp nhận version từ client trực tiếp +- Log version conflicts cho security monitoring - Rate limit update attempts per user -## Deployment +## Triển khai -How data consistency patterns are deployed and scaled. +Cách các patterns đồng bộ dữ liệu được triển khai và mở rộng. ```mermaid graph TD @@ -523,76 +552,83 @@ graph TD OP1 & OP2 --> Kafka[Kafka Cluster\n5 brokers] end - style SO1 fill:#e1f5ff - style SO2 fill:#e1f5ff - style OP1 fill:#fff4e1 - style OP2 fill:#fff4e1 - style DB fill:#d4edda - style Kafka fill:#ffe1e1 + %% Dark color palette with white text + style OS1 fill:#4527a0,stroke:#7c4dff,color:#ffffff + style OS2 fill:#4527a0,stroke:#7c4dff,color:#ffffff + style OS3 fill:#4527a0,stroke:#7c4dff,color:#ffffff + style SO1 fill:#1d4ed8,stroke:#3b82f6,color:#ffffff + style SO2 fill:#1d4ed8,stroke:#3b82f6,color:#ffffff + style OP1 fill:#b45309,stroke:#f59e0b,color:#ffffff + style OP2 fill:#b45309,stroke:#f59e0b,color:#ffffff + style DB fill:#15803d,stroke:#22c55e,color:#ffffff + style Redis fill:#7e22ce,stroke:#a855f7,color:#ffffff + style Kafka fill:#b91c1c,stroke:#ef4444,color:#ffffff + style PS fill:#374151,stroke:#6b7280,color:#ffffff + style IS fill:#374151,stroke:#6b7280,color:#ffffff ``` -### Deployment Configuration +### Cấu hình Triển khai -| Component 
| Replicas | Resources | HA Strategy | -|-----------|----------|-----------|-------------| -| **Saga Orchestrator** | 2-3 | 512Mi RAM, 500m CPU | Leader election with etcd | +| Thành phần | Replicas | Resources | HA Strategy | +|------------|----------|-----------|-------------| +| **Saga Orchestrator** | 2-3 | 512Mi RAM, 500m CPU | Leader election với etcd | | **Outbox Processor** | 2-5 | 256Mi RAM, 250m CPU | Distributed lock per event batch | -| **Services with Outbox** | 3+ | Varies | Standard service scaling | -| **Redis (Idempotency)** | 3 nodes | 1Gi RAM each | Redis Cluster with replication | +| **Services với Outbox** | 3+ | Varies | Standard service scaling | +| **Redis (Idempotency)** | 3 nodes | 1Gi RAM each | Redis Cluster với replication | -### Scaling Strategy +### Chiến lược Mở rộng **Saga Orchestrator**: -- Scale based on pending saga count -- Use queue-based load distribution -- Monitor saga execution duration +- Scale dựa trên pending saga count +- Sử dụng queue-based load distribution +- Giám sát saga execution duration **Outbox Processor**: -- Scale with database sharding (1 processor per shard) -- Increase batch size before adding replicas -- Monitor outbox table size and age +- Scale với database sharding (1 processor per shard) +- Tăng batch size trước khi thêm replicas +- Giám sát outbox table size và age **Idempotency Store (Redis)**: - Scale Redis cluster horizontally -- Use consistent hashing for key distribution -- Monitor memory usage (should be < 70%) +- Sử dụng consistent hashing cho key distribution +- Giám sát memory usage (nên < 70%) -## Monitoring & Observability +## Giám sát & Khả năng quan sát -Monitoring strategies for data consistency patterns. +Chiến lược giám sát cho patterns đồng bộ dữ liệu. 
-### Key Metrics +### Chỉ số Chính **Saga Metrics**: -- `saga_executions_total` - Total saga executions (success/failure) +- `saga_executions_total` - Tổng saga executions (success/failure) - `saga_duration_seconds` - Saga execution time histogram -- `saga_compensations_total` - Total compensation executions -- `saga_timeout_total` - Sagas that timed out -- `saga_pending_count` - Sagas currently executing +- `saga_compensations_total` - Tổng compensation executions +- `saga_timeout_total` - Sagas timeout +- `saga_pending_count` - Sagas đang thực thi **Outbox Metrics**: -- `outbox_events_total` - Events written to outbox -- `outbox_published_total` - Events published to Kafka -- `outbox_processing_lag_seconds` - Time from write to publish -- `outbox_table_size` - Outbox table row count +- `outbox_events_total` - Events ghi vào outbox +- `outbox_published_total` - Events published sang Kafka +- `outbox_processing_lag_seconds` - Thời gian từ write đến publish +- `outbox_table_size` - Số dòng outbox table - `outbox_failed_events_total` - Failed event publications **Idempotency Metrics**: -- `idempotency_checks_total` - Total idempotency checks +- `idempotency_checks_total` - Tổng idempotency checks - `idempotency_hits_total` - Duplicate requests prevented - `idempotency_key_ttl_seconds` - Average key TTL - `idempotency_redis_errors_total` - Redis failures **Optimistic Lock Metrics**: - `optimistic_lock_conflicts_total` - Version conflicts detected -- `optimistic_lock_retries_total` - Retry attempts after conflict +- `optimistic_lock_retries_total` - Retry attempts sau conflict - `optimistic_lock_success_rate` - Update success percentage -### Alerts +### Cảnh báo **Critical Alerts**: ```yaml -# Saga timeout rate too high +# Saga timeout rate quá cao alert: HighSagaTimeoutRate expr: rate(saga_timeout_total[5m]) > 0.05 for: 5m @@ -611,23 +647,23 @@ for: 5m severity: warning ``` -### Monitoring Dashboard +### Dashboard Giám sát **Grafana Panels**: -1. 
**Saga Orchestration Overview**: +1. **Tổng quan Saga Orchestration**: - Saga execution rate (success/failure) - Average saga duration - Compensation rate - Pending saga count -2. **Outbox Processing Health**: +2. **Sức khỏe Outbox Processing**: - Outbox publishing rate - Processing lag (P95, P99) - Failed events - Table size trend -3. **Idempotency Effectiveness**: +3. **Hiệu quả Idempotency**: - Duplicate prevention rate - Redis hit rate - Key distribution @@ -637,11 +673,11 @@ severity: warning - Mean time to consistency (MTTC) - Conflict resolution success rate -### Distributed Tracing +### Tracing Phân tán **Trace Saga Execution**: ```typescript -// Traced saga step +// Saga step được trace async function executeStepWithTracing( step: SagaStep, context: SagaContext @@ -668,17 +704,48 @@ async function executeStepWithTracing( } ``` -## Related Documentation +## Tài liệu Liên quan -- [Event-Driven Architecture](./event-driven-architecture.md) - Event sourcing and Kafka -- [System Design](./system-design.md) - Overall architecture -- [Microservices Communication](./microservices-communication.md) - Service communication patterns -- [Resilience Patterns](../skills/resilience-patterns.md) - Circuit breaker, retry for saga steps -- [Caching Patterns](../skills/caching-patterns.md) - Caching for idempotency keys -- [Database Prisma](../skills/database-prisma.md) - Prisma transactions for outbox pattern +- [Event-Driven Architecture](./event-driven-architecture.md) - Event sourcing và Kafka +- [System Design](./system-design.md) - Kiến trúc tổng thể +- [Microservices Communication](./microservices-communication.md) - Patterns giao tiếp service +- [Resilience Patterns](../skills/resilience-patterns.md) - Circuit breaker, retry cho saga steps +- [Caching Patterns](../skills/caching-patterns.md) - Caching cho idempotency keys +- [Database Prisma](../skills/database-prisma.md) - Prisma transactions cho outbox pattern --- -**Last Updated**: 2026-01-07 -**Author**: VelikHo 
(hongochai10@icloud.com) -**Reviewers**: To be assigned +**Cập nhật Lần cuối**: 2026-01-14 +**Tác giả**: GoodGo Architecture Team + +## Quick Tips + +### Mermaid Common Issues +- ⚠️ **Syntax Error**: Kiểm tra dấu `(` `)` `[` `]` `{` `}` +- ⚠️ **Render Error**: Kiểm tra `graph` vs `flowchart`, sử dụng `graph` cho compatibility +- ⚠️ **Arrow Direction**: Sử dụng `-->` (solid) hoặc `-.->` (dashed) +- ✅ **Color**: Luôn sử dụng dark palette với white text + +### Color Palette Reference + +| Color | Fill | Stroke | Use Case | +|-------|------|--------|----------| +| **Blue** | `#1d4ed8` | `#3b82f6` | Primary Components, Saga | +| **Green** | `#15803d` | `#22c55e` | Success, DB, Stable States | +| **Purple** | `#7e22ce` | `#a855f7` | Feature, Logic, Idempotency | +| **Orange** | `#b45309` | `#f59e0b` | Warning, External, Outbox | +| **Red** | `#b91c1c` | `#ef4444` | Error, Failure, Critical | +| **Gray** | `#374151` | `#6b7280` | Background, Secondary | + +**Pattern áp dụng**: +``` +style NodeName fill:#1d4ed8,stroke:#3b82f6,color:#ffffff +``` + +### Visual Indicators + +- ✅ **Recommended**: Best practices, khuyến nghị sử dụng +- ⚠️ **Warning**: Cần chú ý, có điều kiện +- ❌ **Avoid**: Anti-patterns, tránh sử dụng +- 🔒 **Security**: Liên quan đến bảo mật +- ⚡ **Performance**: Liên quan đến hiệu suất diff --git a/docs/en/architecture/event-driven-architecture.md b/docs/en/architecture/event-driven-architecture.md index ea563d0c..e1703bc3 100644 --- a/docs/en/architecture/event-driven-architecture.md +++ b/docs/en/architecture/event-driven-architecture.md @@ -1,8 +1,8 @@ -# Event-Driven Architecture +# Kiến trúc Hướng Sự kiện -> Event-driven architecture for asynchronous communication using Apache Kafka +> Kiến trúc hướng sự kiện cho giao tiếp bất đồng bộ sử dụng Apache Kafka -## Overview Diagram +## Sơ đồ Tổng quan ```mermaid graph TD @@ -27,28 +27,36 @@ graph TD Topics -->|Subscribe| Consumer1 Topics -->|Subscribe| Consumer2 - style Kafka fill:#e1f5ff - style Topics 
fill:#fff4e1 + style IAM fill:#5E35B1,stroke:#4527A0,color:#ffffff + style Service1 fill:#5E35B1,stroke:#4527A0,color:#ffffff + style Kafka fill:#1E88E5,stroke:#1565C0,color:#ffffff + style Topics fill:#FB8C00,stroke:#EF6C00,color:#ffffff + style Consumer1 fill:#43A047,stroke:#2E7D32,color:#ffffff + style Consumer2 fill:#43A047,stroke:#2E7D32,color:#ffffff ``` -## Architecture Description +## Mô tả Kiến trúc -The GoodGo platform implements Event-Driven Architecture (EDA) for asynchronous communication between microservices. +Nền tảng GoodGo triển khai Kiến trúc Hướng Sự kiện (EDA) cho giao tiếp bất đồng bộ giữa microservices. -**Core Principles**: -1. **Event-First Design**: All state changes emit domain events -2. **Loose Coupling**: Services communicate through events -3. **Eventual Consistency**: Accept temporary inconsistency -4. **Event Sourcing**: Store changes as event sequence -5. **CQRS Pattern**: Separate read/write operations +> [!IMPORTANT] +> **Trạng thái hiện tại**: Event Bus với RabbitMQ/Kafka chưa được triển khai. Hiện tại sử dụng MediatR cho domain events nội bộ trong mỗi service. -**Technology Stack**: -- Apache Kafka - Event streaming platform -- Schema Registry - Avro schemas for validation -- KafkaJS - Node.js client library -- Event Sourcing - Custom implementation in IAM +**Nguyên tắc Cốt lõi**: +1. **Event-First Design**: Mọi thay đổi trạng thái phát ra domain events +2. **Loose Coupling**: Services giao tiếp qua events (nội bộ hoặc message broker) +3. **Eventual Consistency**: Chấp nhận inconsistency tạm thời +4. 
**CQRS Pattern**: Tách biệt read/write operations với MediatR -## Event Flow +**Công nghệ Hiện tại**: +- **MediatR** - Domain events nội bộ service +- **Entity Framework Core** - Domain event dispatch qua DbContext + +**Công nghệ Planned**: +- **RabbitMQ + MassTransit** - Inter-service events (Roadmap) +- **Outbox Pattern** - Reliable event publishing + +## Luồng Sự kiện ```mermaid sequenceDiagram @@ -62,33 +70,46 @@ sequenceDiagram Consumer-->>Kafka: Acknowledge ``` -**Steps**: Publish → Distribute → Consume → Retry (if failed) → DLQ (after max retries) → Acknowledge +**Các Bước**: Publish → Distribute → Consume → Retry (nếu thất bại) → DLQ (sau retry tối đa) → Acknowledge -## Event Structure +## Cấu trúc Sự kiện -```typescript -interface BaseEvent { - eventId: string; // UUID - eventType: string; // user.created.v1 - eventVersion: string; // 1.0.0 - timestamp: string; // ISO 8601 - source: string; // iam-service - correlationId?: string; // Request correlation - data: unknown; // Event payload -} -``` +### Domain Events (MediatR - Hiện tại) -**Example**: -```json +```csharp +// EN: Base domain event interface +// VI: Interface domain event cơ bản +public interface IDomainEvent : INotification { - "eventId": "550e8400-e29b-41d4-a716-446655440000", - "eventType": "user.created.v1", - "timestamp": "2024-01-15T10:30:00Z", - "source": "iam-service", - "data": { - "userId": "user_123", - "email": "user@example.com" - } + Guid EventId { get; } + DateTime OccurredOn { get; } +} + +// EN: Example domain event +// VI: Ví dụ domain event +public record UserCreatedEvent : IDomainEvent +{ + public Guid EventId { get; } = Guid.NewGuid(); + public DateTime OccurredOn { get; } = DateTime.UtcNow; + + public required Guid UserId { get; init; } + public required string Email { get; init; } + public required string FirstName { get; init; } +} + +// EN: Event handler +// VI: Handler xử lý event +public class UserCreatedEventHandler : INotificationHandler +{ + private readonly ILogger 
_logger; + + public async Task Handle(UserCreatedEvent notification, CancellationToken ct) + { + _logger.LogInformation("User created: {UserId}, {Email}", + notification.UserId, notification.Email); + + // TODO: Send welcome email, create profile, etc. + } } ``` @@ -100,19 +121,19 @@ graph LR AuthLogin[auth.login.success
Partitions: 5] AuditEvents[audit.events
Partitions: 10] - style UserCreated fill:#e1f5ff - style AuthLogin fill:#fff4e1 - style AuditEvents fill:#f8d7da + style UserCreated fill:#1E88E5,stroke:#1565C0,color:#ffffff + style AuthLogin fill:#43A047,stroke:#2E7D32,color:#ffffff + style AuditEvents fill:#E53935,stroke:#C62828,color:#ffffff ``` -**Naming Convention**: `{domain}.{action}.{version}` +**Quy ước Đặt tên**: `{domain}.{action}.{version}` -**Examples**: +**Ví dụ**: - `user.created.v1` - `auth.login.success.v1` - `audit.event.logged.v1` -## Error Handling +## Xử lý Lỗi ```mermaid graph TD @@ -121,19 +142,26 @@ graph TD Process -->|Failure| Retry[Retry 3x] Retry -->|Max Retries| DLQ[Dead Letter Queue] DLQ --> Alert[Alert Team] + + style Event fill:#757575,stroke:#616161,color:#ffffff + style Process fill:#1E88E5,stroke:#1565C0,color:#ffffff + style Ack fill:#43A047,stroke:#2E7D32,color:#ffffff + style Retry fill:#FB8C00,stroke:#EF6C00,color:#ffffff + style DLQ fill:#E53935,stroke:#C62828,color:#ffffff + style Alert fill:#E53935,stroke:#C62828,color:#ffffff ``` -**Strategy**: -1. Retry with exponential backoff (100ms → 200ms → 400ms) -2. Max 3 attempts -3. Move to DLQ after max retries -4. Manual review and reprocess +**Chiến lược**: +1. Retry với exponential backoff (100ms → 200ms → 400ms) +2. Tối đa 3 lần thử +3. Chuyển sang DLQ sau retry tối đa +4. 
Xem xét thủ công và xử lý lại -## System Context +## Bối cảnh Hệ thống ```mermaid C4Context - title Event-Driven Architecture Context + title Sơ đồ Bối cảnh Event-Driven Architecture System(iam, "IAM Service", "Event producer") System(service_a, "Service A", "Event producer") @@ -152,17 +180,17 @@ C4Context Rel(kafka, monitoring, "Sends metrics", "JMX") ``` -**Context Description**: -- **Producers**: IAM Service and other services publish domain events -- **Kafka**: Central event broker, manages topics and partitions -- **Consumers**: Notification and Audit services consume events -- **Schema Registry**: Manages and validates Avro schemas -- **Monitoring**: Collects metrics from Kafka cluster +**Mô tả Các Thành phần**: +- **Producers**: IAM Service và các services khác publish domain events +- **Kafka**: Event broker trung tâm, quản lý topics và partitions +- **Consumers**: Notification và Audit services consume events +- **Schema Registry**: Quản lý và validate Avro schemas +- **Monitoring**: Thu thập metrics từ Kafka cluster -## Performance Characteristics +## Đặc điểm Hiệu suất -| Metric | Target | Notes | -|--------|--------|-------| +| Chỉ số | Mục tiêu | Ghi chú | +|-----------------|-------------------|-----------------| | **Event Publish Latency (P95)** | < 10ms | Fire-and-forget, async | | **Event Delivery Latency (P95)** | < 100ms | End-to-end from publish to consume | | **Throughput** | 10,000 events/s | Per topic, scalable with partitions | @@ -171,43 +199,43 @@ C4Context | **Retention** | 7 days | Default, configurable per topic | | **Replication Factor** | 3 | For fault tolerance | -**Performance Optimizations**: -- **Batch Publishing**: Group multiple events to reduce network overhead -- **Compression**: Use Snappy or LZ4 compression -- **Partitioning**: Divide topics into multiple partitions for parallel processing -- **Consumer Groups**: Multiple consumers in same group for horizontal scaling -- **Async Publishing**: Fire-and-forget pattern, don't 
block request handlers +**Tối ưu hóa Hiệu suất**: +- **Batch Publishing**: Group multiple events để giảm network overhead +- **Compression**: Sử dụng Snappy hoặc LZ4 compression +- **Partitioning**: Phân chia topics thành multiple partitions cho parallel processing +- **Consumer Groups**: Multiple consumers trong cùng group để scale horizontally +- **Async Publishing**: Fire-and-forget pattern, không block request handlers -## Security Considerations +## Cân nhắc Bảo mật -**Event Encryption**: -- TLS in-transit for all Kafka connections -- Optional payload encryption for sensitive data -- End-to-end encryption with custom encryption layer +**Mã hóa Sự kiện**: +- TLS in-transit cho tất cả Kafka connections +- Optional payload encryption cho sensitive data +- End-to-end encryption với custom encryption layer -**Access Control**: +**Kiểm soát Truy cập**: - Kafka ACLs (Access Control Lists) per topic -- SASL/SCRAM authentication for producers and consumers -- Separate credentials per service -- Principle of least privilege - grant only necessary permissions +- SASL/SCRAM authentication cho producers và consumers +- Separate credentials cho mỗi service +- Principle of least privilege - chỉ grant quyền cần thiết -**Schema Validation**: -- Avro schemas in Schema Registry -- Schema evolution with backward/forward compatibility -- Reject events that don't match schema +**Xác thực Schema**: +- Avro schemas trong Schema Registry +- Schema evolution với backward/forward compatibility +- Reject events không match schema -**Audit**: -- Log all event publishes and consumes -- Correlation IDs to trace event flow -- Retention policy for audit logs (7 years) +**Kiểm toán**: +- Log tất cả event publishes và consumes +- Correlation IDs để trace event flow +- Retention policy cho audit logs (7 years) -**Data Retention**: +**Lưu trữ Dữ liệu**: - Default 7 days retention - Configurable per topic -- Automatic deletion after retention period -- GDPR compliance (right to erasure) +- 
Automatic deletion sau retention period +- Compliance với GDPR (right to erasure) -## Deployment +## Triển khai ```mermaid graph TD @@ -253,27 +281,33 @@ graph TD Broker2 --> Audit Broker3 --> Audit - style Broker1 fill:#e1f5ff - style Broker2 fill:#fff4e1 - style Broker3 fill:#d4edda - style ZK fill:#f0e1ff + style Broker1 fill:#1E88E5,stroke:#1565C0,color:#ffffff + style Broker2 fill:#1E88E5,stroke:#1565C0,color:#ffffff + style Broker3 fill:#1E88E5,stroke:#1565C0,color:#ffffff + style ZK fill:#8E24AA,stroke:#7B1FA2,color:#ffffff + style IAM fill:#5E35B1,stroke:#4527A0,color:#ffffff + style ServiceA fill:#5E35B1,stroke:#4527A0,color:#ffffff + style Notification fill:#43A047,stroke:#2E7D32,color:#ffffff + style Audit fill:#43A047,stroke:#2E7D32,color:#ffffff ``` -**Kafka Cluster Configuration**: +### Chiến lược Triển khai + +**Cấu hình Kafka Cluster**: - **Brokers**: 3 brokers minimum (5 for production) - **Replication Factor**: 3 (for fault tolerance) - **Min In-Sync Replicas**: 2 (ensure data durability) - **Partitions**: 3-10 per topic (based on throughput needs) - **Zookeeper**: 3-node ensemble (for coordination) -**Resource Allocation**: +**Phân bổ Tài nguyên**: | Component | CPU | Memory | Disk | |-----------|-----|--------|------| | **Kafka Broker** | 2 cores | 4GB RAM | 100GB SSD | | **Zookeeper** | 1 core | 2GB RAM | 20GB SSD | | **Schema Registry** | 500m | 1GB RAM | 10GB | -**Topic Configuration**: +**Cấu hình Topic**: ```yaml user.created: partitions: 3 @@ -294,15 +328,15 @@ audit.events: compression-type: lz4 ``` -**High Availability**: -- Multiple brokers with partition replication -- Automatic leader election when broker fails +**Tính Sẵn sàng Cao**: +- Multiple brokers với partition replication +- Automatic leader election khi broker fails - Consumer group rebalancing -- Monitoring and alerting for broker health +- Monitoring và alerting cho broker health -## Monitoring & Observability +## Giám sát & Khả năng quan sát -**Key Metrics**: +### Chỉ số 
Chính **Kafka Broker Metrics**: - `kafka_server_brokertopicmetrics_messagesinpersec` - Messages in/sec @@ -323,7 +357,7 @@ audit.events: **Application Metrics**: ```typescript -// Custom metrics for event processing +// Custom metrics cho event processing const eventPublished = new Counter({ name: 'events_published_total', help: 'Total events published', @@ -344,6 +378,42 @@ const eventProcessingDuration = new Histogram({ }); ``` +**Quy tắc Cảnh báo**: +```yaml +# High consumer lag +- alert: HighConsumerLag + expr: kafka_consumer_fetch_manager_records_lag_max > 10000 + for: 5m + severity: warning + annotations: + summary: "High consumer lag detected" + description: "Consumer lag is {{ $value }} messages" + +# Broker down +- alert: KafkaBrokerDown + expr: kafka_server_kafkaserver_brokerstate != 3 + for: 1m + severity: critical + annotations: + summary: "Kafka broker is down" + +# Under-replicated partitions +- alert: UnderReplicatedPartitions + expr: kafka_server_replicamanager_underreplicatedpartitions > 0 + for: 5m + severity: warning + annotations: + summary: "Under-replicated partitions detected" + +# Offline partitions +- alert: OfflinePartitions + expr: kafka_controller_kafkacontroller_offlinepartitionscount > 0 + for: 1m + severity: critical + annotations: + summary: "Offline partitions detected" +``` + **Dashboards**: - Kafka Cluster Overview (brokers, topics, partitions) - Producer Performance (throughput, latency, errors) @@ -352,7 +422,7 @@ const eventProcessingDuration = new Histogram({ **Logging**: ```typescript -// Structured logging for events +// Structured logging cho events logger.info('Event published', { eventId: event.eventId, eventType: event.eventType, @@ -369,7 +439,33 @@ logger.info('Event consumed', { }); ``` -## Related Documentation -- [System Design](./system-design.md) - Overall architecture -- [IAM Architecture](./iam-proposal.md) - Event sourcing implementation +## Tài liệu Liên quan + +- [System Design](./system-design.md) - Kiến 
trúc tổng thể +- [IAM Architecture](./iam-proposal.md) - Triển khai Event sourcing + +--- + +**Cập nhật Lần cuối**: 2026-01-14 +**Tác giả**: GoodGo Architecture Team + +## Mẹo Nhanh + +### Bảng Màu Mermaid + +| Loại Node | Màu Nền | Màu Viền | Màu Chữ | Sử dụng | +|-----------|------------|--------------|------------|-------| +| **Core/Broker** | `#1E88E5` (Blue) | `#1565C0` | `#ffffff` | Kafka Brokers, Main Components | +| **Topic/Data** | `#FB8C00` (Orange) | `#EF6C00` | `#ffffff` | Topics, Queues, Data Stores | +| **Success/Safe** | `#43A047` (Green) | `#2E7D32` | `#ffffff` | Successful flows, Safe states | +| **Error/Danger** | `#E53935` (Red) | `#C62828` | `#ffffff` | Errors, DLQ, Critical issues | +| **Coordination** | `#8E24AA` (Purple) | `#7B1FA2` | `#ffffff` | Zookeeper, Orchestrators | + +### Các Chỉ báo Trực quan + +- 🔄 **Retry Loop**: Chỉ báo thử lại tự động +- ⚠️ **DLQ/Warning**: Đường dẫn xử lý lỗi +- 📝 **Log/Audit**: Điểm ghi log +- 🔐 **Lock/Auth**: Kiểm tra bảo mật + diff --git a/docs/en/architecture/iam-proposal.md b/docs/en/architecture/iam-proposal.md index e54d8dee..5d8a6d0d 100644 --- a/docs/en/architecture/iam-proposal.md +++ b/docs/en/architecture/iam-proposal.md @@ -4,17 +4,15 @@ Tài liệu này mô tả đề xuất kiến trúc cho IAM Service (Identity an ## Tổng Quan: Auth Service → IAM Service -**Auth Service hiện tại** tập trung vào: -- Authentication (xác thực) -- Authorization (phân quyền) -- Session & Token management -- RBAC/ABAC +**IAM Service** cung cấp: +- **OAuth2/OpenID Connect** với OpenIddict +- **ASP.NET Core Identity** cho user management +- **Role-Based Access Control (RBAC)** +- **JWT Tokens** (Access 15min, Refresh 7 days) +- **MFA Support** (TOTP) -**IAM Service** mở rộng thêm: -- **Identity Management** (quản lý danh tính toàn diện) -- **Access Governance** (quản trị truy cập) -- **Compliance & Reporting** (tuân thủ và báo cáo) -- **Lifecycle Management** (quản lý vòng đời tài khoản) +> [!NOTE] +> IAM Service đã được triển khai với 
.NET 10, Clean Architecture tại `services/iam-service-net/` --- @@ -90,56 +88,90 @@ Tài liệu này mô tả đề xuất kiến trúc cho IAM Service (Identity an --- -## 2. Kiến Trúc Module Structure +## 2. Kiến Trúc Module Structure (Thực Tế) ``` -services/iam-service/ +services/iam-service-net/ ├── src/ -│ ├── config/ # Configuration files -│ ├── core/ -│ │ ├── cache/ # Multi-layer cache -│ │ ├── security/ # Zero-trust, encryption -│ │ ├── events/ # Event sourcing -│ │ └── workflows/ # Workflow engine (NEW) -│ ├── modules/ -│ │ ├── auth/ # ✅ Core authentication -│ │ ├── rbac/ # ✅ RBAC system -│ │ ├── social/ # ✅ Social authentication -│ │ ├── oidc/ # ✅ OIDC implementation -│ │ ├── token/ # ✅ JWT & Cookie management -│ │ ├── session/ # ✅ Session management -│ │ ├── mfa/ # ✅ Multi-factor auth -│ │ │ -│ │ ├── identity/ # 🆕 Identity Management -│ │ │ ├── user/ # User lifecycle -│ │ │ ├── profile/ # Profile management -│ │ │ ├── verification/ # Identity verification -│ │ │ └── organization/ # Organizations & groups -│ │ │ -│ │ ├── access/ # 🆕 Access Management -│ │ │ ├── request/ # Access requests -│ │ │ ├── review/ # Access reviews -│ │ │ ├── pam/ # Privileged access -│ │ │ └── analytics/ # Access analytics -│ │ │ -│ │ ├── governance/ # 🆕 Governance & Compliance -│ │ │ ├── compliance/ # Compliance reporting -│ │ │ ├── policy/ # Policy governance -│ │ │ ├── risk/ # Risk management -│ │ │ └── reporting/ # Reporting & dashboards -│ │ │ -│ │ └── workflow/ # 🆕 Workflow Engine -│ │ ├── engine/ # Workflow engine -│ │ ├── approval/ # Approval workflows -│ │ └── automation/ # Automated workflows -│ │ -│ ├── middlewares/ # Express middlewares -│ ├── repositories/ # Data access layer -│ └── routes/ # Route definitions -└── prisma/ - └── schema.prisma # Database schema (mở rộng) +│ ├── IamService.API/ # Presentation Layer +│ │ ├── Controllers/ # AuthController, UsersController, RolesController +│ │ ├── Application/ # CQRS Commands, Queries, Handlers +│ │ │ ├── Commands/ # 
RegisterUserCommand, ChangePasswordCommand +│ │ │ ├── Queries/ # GetUserQuery, GetUsersQuery +│ │ │ └── Validators/ # FluentValidation validators +│ │ └── Program.cs # App entry point +│ ├── IamService.Domain/ # Domain Layer +│ │ ├── AggregatesModel/ # ApplicationUser, ApplicationRole +│ │ ├── Events/ # UserCreatedEvent, UserDeletedEvent +│ │ ├── Exceptions/ # UserNotFoundException, InvalidCredentialsException +│ │ └── SeedWork/ # Entity, IAggregateRoot, IRepository +│ └── IamService.Infrastructure/ # Infrastructure Layer +│ ├── IamServiceContext.cs # DbContext with Identity + OpenIddict +│ ├── Repositories/ # UserRepository, RoleRepository +│ └── Services/ # EmailService, TokenService +├── tests/ +│ ├── IamService.UnitTests/ +│ └── IamService.FunctionalTests/ +├── docs/ +├── Dockerfile +└── IamService.slnx ``` +### Sơ Đồ Kiến Trúc Clean Architecture + +```mermaid +graph TD + %% Styling Configuration + classDef base fill:#202020,stroke:#505050,color:#fff,stroke-width:1px; + classDef core fill:#1a237e,stroke:#3949ab,color:#fff,stroke-width:1px; + classDef newModule fill:#1b5e20,stroke:#43a047,color:#fff,stroke-width:1px; + classDef database fill:#4a148c,stroke:#7b1fa2,color:#fff,stroke-width:1px; + + %% Main Service Node + IAM[IAM Service]:::core + + %% Identity Management Subgraph + subgraph Identity [Identity Management] + direction TB + User[User Lifecycle]:::newModule + Profile[Profile Mgmt]:::newModule + Verify[Verification]:::newModule + Org[Org & Groups]:::newModule + end + + %% Access Management Subgraph + subgraph Access [Access Management] + direction TB + Req[Access Requests]:::newModule + Review[Access Reviews]:::newModule + PAM[PAM]:::newModule + Analytics[Analytics]:::newModule + end + + %% Governance Subgraph + subgraph Governance [Governance & Compliance] + direction TB + Comp[Compliance]:::newModule + Policy[Policy Gov]:::newModule + Risk[Risk Mgmt]:::newModule + end + + %% Database + DB[(Neon Database)]:::database + + %% Relationships + IAM --> 
Identity + IAM --> Access + IAM --> Governance + + Identity -.-> DB + Access -.-> DB + Governance -.-> DB + + %% Internal Dependencies + Access --> Identity + Governance ---> Access +``` --- ## 3. Database Schema Mở Rộng @@ -168,39 +200,35 @@ services/iam-service/ --- -## 4. API Endpoints Mở Rộng +## 4. API Endpoints (Thực Tế) -### 4.1 Identity Management APIs +### 4.1 Authentication APIs -``` -# User Management -GET /api/v1/identity/users -POST /api/v1/identity/users -GET /api/v1/identity/users/:id -PUT /api/v1/identity/users/:id -DELETE /api/v1/identity/users/:id -POST /api/v1/identity/users/bulk-import -GET /api/v1/identity/users/bulk-export +| Method | Endpoint | Mô tả | Auth | +|--------|----------|-------|------| +| `POST` | `/api/v1/auth/register` | Đăng ký user mới | ❌ | +| `POST` | `/connect/token` | OAuth2 token endpoint (login, refresh) | ❌ | +| `POST` | `/api/v1/auth/change-password` | Đổi mật khẩu | ✅ | +| `POST` | `/api/v1/auth/logout` | Đăng xuất (revoke tokens) | ✅ | -# Profile Management -GET /api/v1/identity/users/:id/profile -PUT /api/v1/identity/users/:id/profile -POST /api/v1/identity/users/:id/profile/avatar +### 4.2 User Management APIs -# Identity Verification -POST /api/v1/identity/verification/email/request -POST /api/v1/identity/verification/email/verify -POST /api/v1/identity/verification/phone/request -POST /api/v1/identity/verification/phone/verify +| Method | Endpoint | Mô tả | Auth | +|--------|----------|-------|------| +| `GET` | `/api/v1/users` | Danh sách users (paginated) | ✅ | +| `GET` | `/api/v1/users/me` | Thông tin user hiện tại | ✅ | +| `GET` | `/api/v1/users/{id}` | Lấy user theo ID | ✅ | +| `PUT` | `/api/v1/users/{id}` | Cập nhật user | ✅ | +| `DELETE` | `/api/v1/users/{id}` | Xóa user (soft delete) | ✅ | -# Organizations & Groups -GET /api/v1/identity/organizations -POST /api/v1/identity/organizations -GET /api/v1/identity/organizations/:id/groups -POST /api/v1/identity/organizations/:id/groups -GET 
/api/v1/identity/groups/:id/members -POST /api/v1/identity/groups/:id/members -``` +### 4.3 Role Management APIs + +| Method | Endpoint | Mô tả | Auth | +|--------|----------|-------|------| +| `GET` | `/api/v1/roles` | Danh sách roles | ✅ | +| `POST` | `/api/v1/roles` | Tạo role mới | ✅ Admin | +| `PUT` | `/api/v1/roles/{id}` | Cập nhật role | ✅ Admin | +| `DELETE` | `/api/v1/roles/{id}` | Xóa role | ✅ Admin | ### 4.2 Access Management APIs @@ -337,3 +365,32 @@ GET /api/v1/governance/reports/security-events - **Workflow automation** linh hoạt Điều này biến service từ authentication/authorization cơ bản thành một IAM platform toàn diện, phù hợp cho enterprise. + +--- + +## Quick Tips + +### Mermaid Common Issues + +- **Syntax Error**: Kiểm tra kỹ các dấu ngoặc `[]`, `{}`, `()` trong node label. +- **Connection**: Đảm bảo các mũi tên `-->`, `-.->` đúng cú pháp. +- **Indentation**: Subgraph cần thụt đầu dòng đúng cách. + +### Color Pattern Reference + +| Element | Fill Color | Stroke | Text | Usage | +|---------|------------|--------|------|-------| +| **Base** | `#202020` | `#505050` | `#fff` | Node thông thường | +| **Core** | `#1a237e` | `#3949ab` | `#fff` | Node trung tâm, quan trọng | +| **Module**| `#1b5e20` | `#43a047` | `#fff` | Module, service con | +| **DB** | `#4a148c` | `#7b1fa2` | `#fff` | Database, storage | +| **Warn** | `#b71c1c` | `#f44336` | `#fff` | Cảnh báo, lỗi | + +### Visual Indicators + +| Icon | Meaning | +|------|---------| +| ✅ | Đã hoàn thành / Tốt | +| 🔄 | Đang xử lý / Thay đổi | +| ⚠️ | Cảnh báo / Lưu ý | +| ❌ | Lỗi / Không khuyến khích | diff --git a/docs/en/architecture/microservices-communication.md b/docs/en/architecture/microservices-communication.md index 57f7f96d..ed518fdd 100644 --- a/docs/en/architecture/microservices-communication.md +++ b/docs/en/architecture/microservices-communication.md @@ -1,8 +1,81 @@ -# Microservices Communication +# Kiến trúc Giao tiếp Microservices -> Communication patterns and protocols for 
inter-service communication +> Các patterns và protocols giao tiếp giữa các services -## Overview Diagram +## Quick Overview + +Hướng dẫn nhanh về các patterns giao tiếp cơ bản trong hệ thống GoodGo. + +### Mô hình Giao tiếp Cơ bản + +```mermaid +graph TD + %% Nodes + Client[Web App / Mobile App] + Traefik[Traefik API Gateway] + Auth[Auth Service] + Notify[Notification Service] + + %% Relationships + Client -->|HTTP Request| Traefik + Traefik -->|Routing| Auth + Auth -.->|Internal HTTP| Notify + + %% Styles using dark color palette + style Client fill:#1565c0,stroke:#fff,stroke-width:2px,color:#fff + style Traefik fill:#0f4c81,stroke:#fff,stroke-width:2px,color:#fff + style Auth fill:#283593,stroke:#fff,stroke-width:2px,color:#fff + style Notify fill:#4527a0,stroke:#fff,stroke-width:2px,color:#fff +``` + +### Giao tiếp Đồng bộ (HTTP/REST) + +Các service giao tiếp đồng bộ qua HTTP REST APIs thông qua Traefik API Gateway. + +**Ví dụ Client → Service:** +```typescript +// Web App -> Auth Service +const response = await fetch('http://api.goodgo.vn/api/v1/auth/login', { + method: 'POST', + body: JSON.stringify({ email, password }), +}); +``` + +**Ví dụ Service → Service:** +```typescript +// Auth Service -> Notification Service +const response = await fetch('http://notification-service:5003/api/v1/notifications', { + method: 'POST', + headers: { 'X-Service-Auth': process.env.INTERNAL_API_KEY }, + body: JSON.stringify({ userId, message }), +}); +``` + +### API Gateway Routing + +Traefik định tuyến requests dựa trên: +- **Host header**: `api.goodgo.vn` +- **Path prefix**: `/api/v1/auth`, `/api/v1/users` + +### Format Error Response Chuẩn + +Tất cả services tuân theo định dạng error response nhất quán: + +```json +{ + "success": false, + "error": { + "code": "AUTH_001", + "message": "Invalid credentials", + "details": {} + }, + "timestamp": "2024-01-01T00:00:00.000Z" +} +``` + +--- + +## Sơ đồ Tổng quan ```mermaid graph TD @@ -27,7 +100,7 @@ graph TD class SD green ``` -## 
System Context +## Bối cảnh Hệ thống ```mermaid C4Context @@ -57,11 +130,11 @@ C4Context Rel(services, external_api, "Integrates", "HTTPS") ``` -The GoodGo platform uses a microservices architecture where all client requests flow through an API Gateway (Traefik), which routes them to appropriate microservices. Services communicate synchronously via REST/HTTP for request-response patterns and asynchronously via Kafka for event-driven workflows. Service discovery is handled by Docker DNS in local environments and Kubernetes DNS in production. +Nền tảng GoodGo sử dụng kiến trúc microservices nơi tất cả client requests đi qua API Gateway (Traefik), được route đến các microservices phù hợp. Các services giao tiếp đồng bộ qua REST/HTTP cho patterns request-response và bất đồng bộ qua Kafka cho workflows event-driven. Service discovery được xử lý bởi Docker DNS trong môi trường local và Kubernetes DNS trong production. -## Communication Protocols +## Protocols Giao tiếp -### Protocol Comparison +### So sánh Protocols | Protocol | Latency | Complexity | Use Case | |----------|---------|------------|----------| @@ -70,7 +143,7 @@ The GoodGo platform uses a microservices architecture where all client requests | **Events** | Async | Medium | Decoupled workflows | | **GraphQL** | Medium | Medium | Complex data fetching | -### REST/HTTP Pattern +### Pattern REST/HTTP ```mermaid sequenceDiagram @@ -87,30 +160,53 @@ sequenceDiagram Gateway-->>Client: JSON Response ``` -Synchronous request-response using HTTP/REST. +Request-response đồng bộ sử dụng HTTP/REST. 
-**Implementation**: -```typescript -// Service-to-service HTTP client -import axios from 'axios'; - -export class UserServiceClient { - private client = axios.create({ - baseURL: process.env.USER_SERVICE_URL, - timeout: 5000, - headers: { - 'x-service-auth': process.env.INTERNAL_API_KEY +**Triển khai (.NET với IHttpClientFactory)**: +```csharp +// EN: Service-to-service HTTP client +// VI: HTTP client cho giao tiếp giữa services +public class IamServiceClient : IIamServiceClient +{ + private readonly HttpClient _httpClient; + private readonly ILogger<IamServiceClient> _logger; + + public IamServiceClient(HttpClient httpClient, ILogger<IamServiceClient> logger) + { + _httpClient = httpClient; + _logger = logger; + } + + public async Task<UserDto?> GetUserAsync(Guid userId, CancellationToken ct) + { + try + { + var response = await _httpClient.GetAsync($"/api/v1/users/{userId}", ct); + response.EnsureSuccessStatusCode(); + + return await response.Content.ReadFromJsonAsync<UserDto>(ct); + } + catch (HttpRequestException ex) + { + _logger.LogError(ex, "Failed to get user {UserId}", userId); + throw; + } } - }); - - async getUser(userId: string): Promise { - const response = await this.client.get(`/users/${userId}`); - return response.data; - } } + +// EN: Registration in Program.cs +// VI: Đăng ký trong Program.cs +builder.Services.AddHttpClient<IIamServiceClient, IamServiceClient>(client => +{ + client.BaseAddress = new Uri("http://iam-service-net:8080"); + client.DefaultRequestHeaders.Add("X-Service-Name", "storage-service"); + client.Timeout = TimeSpan.FromSeconds(5); +}) +.AddPolicyHandler(GetRetryPolicy()) +.AddPolicyHandler(GetCircuitBreakerPolicy()); ``` -### Event-Driven Pattern +### Pattern Event-Driven ```mermaid sequenceDiagram @@ -129,9 +225,9 @@ sequenceDiagram end ``` -Asynchronous event-based communication via Kafka. +Giao tiếp bất đồng bộ dựa trên events qua Kafka.
-### Service Discovery +### Khám phá Dịch vụ **Local (Docker Compose)**: ```yaml @@ -147,7 +243,7 @@ http://service-name.namespace.svc.cluster.local http://iam-service.default.svc.cluster.local:3001 ``` -## API Gateway Pattern +## Pattern API Gateway ```mermaid graph LR @@ -166,58 +262,66 @@ graph LR LB --> Service1A[Instance A] LB --> Service1B[Instance B] - classDef blue fill:#253041,stroke:#4b6584,color:#ffffff - class Gateway blue + %% Dark color palette with white text + classDef clientBlue fill:#1565c0,stroke:#fff,stroke-width:2px,color:#fff + classDef gatewayBlue fill:#0f4c81,stroke:#fff,stroke-width:2px,color:#fff + classDef featurePurple fill:#4527a0,stroke:#fff,stroke-width:2px,color:#fff + classDef serviceGreen fill:#1e3a29,stroke:#3c7a52,stroke-width:2px,color:#fff + + class Client clientBlue + class Gateway gatewayBlue + class Route,LB,Auth,Rate,CORS featurePurple + class Service1,Service2,Service1A,Service1B serviceGreen ``` -Single entry point for all client requests with routing, auth, rate limiting. +Điểm vào duy nhất cho tất cả client requests với routing, auth, rate limiting. -## Performance Characteristics +## Đặc điểm Hiệu suất -Performance expectations and optimization strategies for inter-service communication. +Kỳ vọng hiệu suất và chiến lược tối ưu cho giao tiếp giữa các services. 
-| Metric | Target | Notes | -|--------|--------|-------| -| **REST API Response Time** | < 100ms | P95 for internal service-to-service calls | -| **Event Publishing Latency** | < 50ms | Time to publish to Kafka | -| **Service Discovery Lookup** | < 10ms | DNS resolution time | -| **Gateway Routing Overhead** | < 20ms | Additional latency added by Traefik | -| **Throughput** | 10,000 req/s | Per service instance | -| **Kafka Event Processing** | < 500ms | P95 end-to-end event processing | +| Chỉ số | Mục tiêu | Ghi chú | +|------------------|-------------------|-----------------| +| **Thời gian phản hồi REST API** | < 100ms | P95 cho các cuộc gọi service-to-service nội bộ | +| **Độ trễ publish event** | < 50ms | Thời gian publish tới Kafka | +| **Service discovery lookup** | < 10ms | Thời gian phân giải DNS | +| **Chi phí routing của Gateway** | < 20ms | Độ trễ thêm vào bởi Traefik | +| **Thông lượng** | 10,000 req/s | Mỗi service instance | +| **Xử lý Kafka event** | < 500ms | P95 xử lý event end-to-end | -**Optimization Strategies**: -- **Connection Pooling**: Reuse HTTP connections between services -- **Circuit Breaker**: Prevent cascading failures with Opossum library -- **Retry with Backoff**: Exponential backoff for transient failures -- **Compression**: Enable gzip for large payloads -- **Caching**: Cache service discovery results and responses +**Chiến lược Tối ưu**: +- **Connection Pooling**: Tái sử dụng HTTP connections giữa services +- **Circuit Breaker**: Ngăn chặn cascading failures với thư viện Opossum +- **Retry with Backoff**: Exponential backoff cho transient failures +- **Compression**: Bật gzip cho payloads lớn +- **Caching**: Cache kết quả service discovery và responses -## Security Considerations +## Cân nhắc Bảo mật -Security measures for protecting inter-service communication. +Biện pháp bảo mật để bảo vệ giao tiếp giữa các services. 
-### Service-to-Service Authentication +### Xác thực Service-to-Service -- **Internal API Keys**: Services authenticate using `x-service-auth` header -- **JWT Tokens**: For user context propagation between services -- **Mutual TLS (mTLS)**: Optional for production environments (Kubernetes service mesh) +- **Internal API Keys**: Services xác thực sử dụng `x-service-auth` header +- **JWT Tokens**: Để truyền user context giữa services +- **Mutual TLS (mTLS)**: Tùy chọn cho môi trường production (Kubernetes service mesh) -### Network Security +### Bảo mật Mạng -- **Network Policies**: Kubernetes NetworkPolicies restrict service-to-service traffic -- **Service Mesh**: Istio/Linkerd for advanced security policies (optional) -- **Private Networks**: Services communicate within private VPC/cluster network +- **Network Policies**: Kubernetes NetworkPolicies hạn chế traffic service-to-service +- **Service Mesh**: Istio/Linkerd cho security policies nâng cao (tùy chọn) +- **Private Networks**: Services giao tiếp trong private VPC/cluster network -### Data Protection +### Bảo vệ Dữ liệu -- **Encryption in Transit**: TLS 1.2+ for all external communication -- **Event Payload Encryption**: Sensitive data encrypted before publishing to Kafka -- **API Gateway**: Traefik handles SSL termination and request validation +- **Encryption in Transit**: TLS 1.2+ cho mọi external communication +- **Event Payload Encryption**: Dữ liệu nhạy cảm được mã hóa trước khi publish tới Kafka +- **API Gateway**: Xử lý SSL termination và request validation -### Security Best Practices +### Best Practices Bảo mật ```typescript -// Service client with authentication +// Service client với xác thực export class SecureServiceClient { private client = axios.create({ baseURL: process.env.SERVICE_URL, @@ -227,48 +331,53 @@ export class SecureServiceClient { 'x-correlation-id': generateCorrelationId() }, httpsAgent: new https.Agent({ - rejectUnauthorized: true // Verify SSL certificates + rejectUnauthorized: 
true // Xác minh SSL certificates }) }); } ``` -## Deployment +## Triển khai -How microservices communication is deployed and scaled across environments. +Cách giao tiếp microservices được triển khai và mở rộng qua các môi trường. ```mermaid graph TD subgraph "Production Cluster" - LB[Load Balancer] --> Gateway[API Gateway\n3 replicas] + LB[Load Balancer] --> Gateway[API Gateway<br/>3 replicas] - Gateway --> ServiceA1[Service A\nInstance 1] - Gateway --> ServiceA2[Service A\nInstance 2] - Gateway --> ServiceB1[Service B\nInstance 1] - Gateway --> ServiceB2[Service B\nInstance 2] + Gateway --> ServiceA1[Service A<br/>Instance 1] + Gateway --> ServiceA2[Service A<br/>Instance 2] + Gateway --> ServiceB1[Service B<br/>Instance 1] + Gateway --> ServiceB2[Service B<br/>Instance 2] - ServiceA1 & ServiceA2 --> Kafka[Kafka Cluster\n3 brokers] + ServiceA1 & ServiceA2 --> Kafka[Kafka Cluster<br/>3 brokers] ServiceB1 & ServiceB2 --> Kafka - ServiceA1 & ServiceA2 --> DB[(PostgreSQL\nPrimary + Replica)] + ServiceA1 & ServiceA2 --> DB[(PostgreSQL<br/>Primary + Replica)] ServiceB1 & ServiceB2 --> DB - ServiceA1 & ServiceA2 --> Redis[(Redis Cluster\n3 nodes)] + ServiceA1 & ServiceA2 --> Redis[(Redis Cluster<br/>
3 nodes)] ServiceB1 & ServiceB2 --> Redis end - classDef blue fill:#253041,stroke:#4b6584,color:#ffffff - classDef orange fill:#3a2e1e,stroke:#7a5f3c,color:#ffffff - classDef green fill:#1e3a29,stroke:#3c7a52,color:#ffffff - classDef red fill:#3a1e1e,stroke:#7a3c3c,color:#ffffff + %% Dark color palette with white text and white strokes + classDef lbGrey fill:#424242,stroke:#fff,stroke-width:2px,color:#fff + classDef gatewayBlue fill:#0f4c81,stroke:#fff,stroke-width:2px,color:#fff + classDef servicePurple fill:#4527a0,stroke:#fff,stroke-width:2px,color:#fff + classDef kafkaOrange fill:#3a2e1e,stroke:#fff,stroke-width:2px,color:#fff + classDef dbGreen fill:#1e3a29,stroke:#fff,stroke-width:2px,color:#fff + classDef redisRed fill:#3a1e1e,stroke:#fff,stroke-width:2px,color:#fff - class Gateway blue - class Kafka orange - class DB green - class Redis red + class LB lbGrey + class Gateway gatewayBlue + class ServiceA1,ServiceA2,ServiceB1,ServiceB2 servicePurple + class Kafka kafkaOrange + class DB dbGreen + class Redis redisRed ``` -### Deployment Environments +### Môi trường Triển khai | Environment | Gateway | Services | Kafka | Service Discovery | |-------------|---------|----------|-------|-------------------| @@ -276,18 +385,18 @@ graph TD | **Staging** | Traefik (2 replicas) | 2 replicas per service | 3 brokers | Kubernetes DNS | | **Production** | Traefik (3+ replicas) | 3+ replicas per service | 5+ brokers | Kubernetes DNS + Service Mesh | -### Scaling Strategy +### Chiến lược Mở rộng -- **Horizontal Pod Autoscaler (HPA)**: Auto-scale based on CPU/memory -- **Kafka Partitions**: Scale event processing by increasing partitions -- **Load Balancing**: Kubernetes Service load balances across pod replicas -- **Gateway Scaling**: Traefik scales independently from backend services +- **Horizontal Pod Autoscaler (HPA)**: Tự động scale dựa trên CPU/memory +- **Kafka Partitions**: Scale event processing bằng cách tăng partitions +- **Load Balancing**: Cân bằng tải giữa pod 
replicas +- **Gateway Scaling**: Traefik scale độc lập với backend services -## Monitoring & Observability +## Giám sát & Khả năng quan sát -How to monitor and observe microservices communication. +Cách giám sát và quan sát giao tiếp microservices. -### Key Metrics +### Chỉ số Chính **Service-to-Service Metrics**: - `http_request_duration_seconds` - Request latency histogram @@ -305,16 +414,16 @@ How to monitor and observe microservices communication. - `kafka_consumer_lag` - Consumer lag - `kafka_consumer_records_consumed_total` - Events consumed -### Health Checks +### Kiểm tra Sức khỏe **Service Endpoints**: ```typescript -// Liveness - is service running? +// Liveness - service có đang chạy không? app.get('/health/live', (req, res) => { res.json({ status: 'ok', timestamp: new Date().toISOString() }); }); -// Readiness - can service handle traffic? +// Readiness - service có thể xử lý traffic không? app.get('/health/ready', async (req, res) => { const checks = { database: await checkDatabase(), @@ -344,13 +453,13 @@ readinessProbe: periodSeconds: 5 ``` -### Distributed Tracing +### Tracing Phân tán -- **OpenTelemetry**: Instrument all service-to-service calls -- **Jaeger**: Visualize distributed traces -- **Correlation IDs**: Propagate via `x-correlation-id` header for request tracking +- **OpenTelemetry**: Instrument tất cả service-to-service calls +- **Jaeger**: Hiển thị distributed traces +- **Correlation IDs**: Truyền qua `x-correlation-id` header để tracking requests -### Monitoring Dashboard +### Dashboard Giám sát **Grafana Panels**: - Service Communication Overview (request rate, latency, errors) @@ -358,9 +467,9 @@ readinessProbe: - Event Bus Health (Kafka lag, throughput) - Service Dependencies (service map from traces) -## Related Documentation +## Tài liệu Liên quan -- [System Design](./system-design.md) - Overall architecture +- [System Design](./system-design.md) - Kiến trúc tổng thể - [Event-Driven Architecture](./event-driven-architecture.md) - 
Event patterns - [API Gateway Advanced](../skills/api-gateway-advanced.md) - Gateway patterns - [Inter-Service Communication](../skills/inter-service-communication.md) - Communication patterns @@ -393,6 +502,6 @@ readinessProbe: --- -**Last Updated**: 2026-01-07 -**Authors**: GoodGo Architecture Team +**Cập nhật lần cuối / Last Updated**: 2026-01-14 +**Tác giả / Authors**: GoodGo Architecture Team **Reviewers**: To be assigned diff --git a/docs/en/architecture/observability-architecture.md b/docs/en/architecture/observability-architecture.md index 7f29a58e..b8e17466 100644 --- a/docs/en/architecture/observability-architecture.md +++ b/docs/en/architecture/observability-architecture.md @@ -1,8 +1,8 @@ -# Observability Architecture +# Kiến trúc Khả năng Quan sát -> **Note**: Comprehensive observability with metrics, logging, and tracing +> **Note**: Khả năng quan sát toàn diện với metrics, logging và tracing -## Overview Diagram +## Sơ đồ Tổng quan ```mermaid graph TD @@ -42,11 +42,11 @@ graph TD class Grafana,GrafanaLogs dashboard; ``` -## System Context +## Bối cảnh Hệ thống ```mermaid C4Context - title Observability System Context + title Sơ đồ Bối cảnh Khả năng Quan sát Person(dev, "Developer", "Uses dashboards to monitor system") Person(sre, "SRE", "Manages infrastructure & alerts") @@ -68,12 +68,12 @@ C4Context UpdateElementStyle(k8s, $fontColor="white", $bgColor="#4A5568", $borderColor="white") ``` -### Context Description -- **Observability Stack**: Central hub for collecting and displaying data (Prometheus, Loki, Jaeger, Grafana). -- **Microservices**: Send logs, metrics, and traces (OpenTelemetry). -- **Developer/SRE**: Use Grafana to monitor system health and debug. +### Mô tả Bối cảnh +- **Observability Stack**: Trung tâm thu thập và hiển thị dữ liệu (Prometheus, Loki, Jaeger, Grafana). +- **Microservices**: Gửi logs, metrics và traces (OpenTelemetry). +- **Developer/SRE**: Sử dụng Grafana để theo dõi sức khỏe hệ thống và debug. 
-## Three Pillars of Observability +## Ba Trụ cột Khả năng Quan sát ### 1. Metrics (Prometheus + Grafana) @@ -94,9 +94,9 @@ graph LR class Grafana grafana; ``` -**Description**: Numerical measurements over time (requests/sec, latency, errors). +**Mô tả**: Các phép đo số theo thời gian (requests/sec, latency, errors). -**Implementation**: +**Triển khai**: ```typescript import { Counter, Histogram, Gauge } from 'prom-client'; @@ -119,7 +119,7 @@ export const activeRequests = new Gauge({ help: 'Number of active HTTP requests' }); -// Middleware to track metrics +// Middleware để track metrics export function metricsMiddleware(req, res, next) { const start = Date.now(); activeRequests.inc(); @@ -145,19 +145,19 @@ export function metricsMiddleware(req, res, next) { } ``` -### 2. Logging (Winston + Loki) +### 2. Logging (Serilog + Loki) ```mermaid sequenceDiagram participant Service - participant Winston as Winston Logger + participant Serilog as Serilog Logger participant Loki participant Grafana - Service->>Winston: Log event - Winston->>Winston: Format JSON - Winston->>Winston: Add metadata
(correlation ID, trace ID) - Winston->>Loki: Push logs + Service->>Serilog: Log event + Serilog->>Serilog: Format JSON + Serilog->>Serilog: Add metadata
(correlation ID, trace ID) + Serilog->>Loki: Push logs Loki->>Loki: Index & store User->>Grafana: Query logs @@ -165,52 +165,49 @@ sequenceDiagram Loki-->>Grafana: Log results ``` -**Description**: Structured logging with correlation IDs for request tracing. +**Mô tả**: Structured logging với correlation IDs để tracing requests. -**Implementation**: -```typescript -import winston from 'winston'; +**Triển khai (.NET)**: +```csharp +// Program.cs - Serilog configuration +builder.Host.UseSerilog((context, config) => config + .ReadFrom.Configuration(context.Configuration) + .Enrich.FromLogContext() + .Enrich.WithProperty("Service", serviceName) + .Enrich.WithProperty("Environment", environment) + .WriteTo.Console(new JsonFormatter()) + .WriteTo.GrafanaLoki( + "http://loki:3100", + labels: new [] { new LokiLabel { Key = "app", Value = serviceName } } + )); -export const logger = winston.createLogger({ - level: process.env.LOG_LEVEL || 'info', - format: winston.format.combine( - winston.format.timestamp(), - winston.format.errors({ stack: true }), - winston.format.json() - ), - defaultMeta: { - service: process.env.SERVICE_NAME || 'unknown-service', - environment: process.env.NODE_ENV || 'development' - }, - transports: [ - new winston.transports.Console(), - // Loki transport (if configured) - ] -}); - -// Logger middleware -export function loggerMiddleware(req, res, next) { - const correlationId = req.headers['x-correlation-id'] || generateId(); - - req.correlationId = correlationId; - req.logger = logger.child({ correlationId }); - - req.logger.info('Incoming request', { - method: req.method, - path: req.path, - ip: req.ip - }); - - res.on('finish', () => { - req.logger.info('Request completed', { - method: req.method, - path: req.path, - status: res.statusCode, - duration: Date.now() - req.startTime - }); - }); - - next(); +// Middleware - Add correlation ID +public class CorrelationIdMiddleware +{ + private readonly RequestDelegate _next; + private readonly ILogger 
_logger; + + public async Task InvokeAsync(HttpContext context) + { + var correlationId = context.Request.Headers["X-Correlation-Id"].FirstOrDefault() + ?? Guid.NewGuid().ToString(); + + context.Items["CorrelationId"] = correlationId; + context.Response.Headers["X-Correlation-Id"] = correlationId; + + using (LogContext.PushProperty("CorrelationId", correlationId)) + { + _logger.LogInformation("Request started: {Method} {Path}", + context.Request.Method, context.Request.Path); + + var sw = Stopwatch.StartNew(); + await _next(context); + sw.Stop(); + + _logger.LogInformation("Request completed: {StatusCode} in {Duration}ms", + context.Response.StatusCode, sw.ElapsedMilliseconds); + } + } } ``` @@ -238,82 +235,71 @@ graph LR class Jaeger jaeger; ``` -**Description**: Distributed tracing to track requests across services. +**Mô tả**: Distributed tracing để track requests giữa các services. -**Implementation**: -```typescript -import { trace, SpanStatusCode } from '@opentelemetry/api'; +> [!NOTE] +> Distributed Tracing với Jaeger đang trong kế hoạch triển khai. Hiện tại sử dụng correlation IDs cho request tracking. 
-// Create traced function -export function traced( - name: string, - fn: () => Promise -): Promise { - const tracer = trace.getTracer('app'); - const span = tracer.startSpan(name); - - return fn() - .then(result => { - span.setStatus({ code: SpanStatusCode.OK }); - return result; - }) - .catch(error => { - span.setStatus({ - code: SpanStatusCode.ERROR, - message: error.message - }); - span.recordException(error); - throw error; - }) - .finally(() => { - span.end(); - }); -} +**Triển khai (.NET với OpenTelemetry)**: +```csharp +// Program.cs - OpenTelemetry configuration (planned) +builder.Services.AddOpenTelemetry() + .WithTracing(tracing => tracing + .AddAspNetCoreInstrumentation() + .AddHttpClientInstrumentation() + .AddEntityFrameworkCoreInstrumentation() + .AddJaegerExporter(options => + { + options.AgentHost = "jaeger"; + options.AgentPort = 6831; + })); -// Usage -async getUserWithTracing(userId: string): Promise { - return traced('getUserById', async () => { - return await userRepository.findById(userId); - }); +// Manual span creation +public async Task GetUserByIdAsync(Guid userId, CancellationToken ct) +{ + using var activity = ActivitySource.StartActivity("GetUserById"); + activity?.SetTag("user.id", userId.ToString()); + + try + { + var user = await _context.Users.FindAsync([userId], ct); + activity?.SetStatus(ActivityStatusCode.Ok); + return user; + } + catch (Exception ex) + { + activity?.SetStatus(ActivityStatusCode.Error, ex.Message); + throw; + } } ``` -## Health Checks +## Kiểm tra Sức khỏe ```typescript -// Liveness probe - is service running? -app.get('/health/live', (req, res) => { - res.json({ status: 'ok', timestamp: new Date().toISOString() }); +// Health check (.NET) +app.MapHealthChecks("/health", new HealthCheckOptions +{ + ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse }); -// Readiness probe - is service ready for traffic? 
-app.get('/health/ready', async (req, res) => { - const checks = { - database: await checkDatabase(), - redis: await checkRedis(), - disk: await checkDiskSpace() - }; - - const ready = Object.values(checks).every(check => check === true); - - res.status(ready ? 200 : 503).json({ - ready, - checks, - timestamp: new Date().toISOString() - }); +app.MapHealthChecks("/health/live", new HealthCheckOptions +{ + Predicate = _ => false // Liveness - always return healthy }); -async function checkDatabase(): Promise { - try { - await prisma.$queryRaw`SELECT 1`; - return true; - } catch { - return false; - } -} +app.MapHealthChecks("/health/ready", new HealthCheckOptions +{ + Predicate = check => check.Tags.Contains("ready") +}); + +// Health check registration +builder.Services.AddHealthChecks() + .AddNpgSql(connectionString, name: "database", tags: new[] { "ready" }) + .AddRedis(redisConnectionString, name: "redis", tags: new[] { "ready" }); ``` -## Alerting Rules +## Quy tắc Cảnh báo ```yaml # Prometheus alerting rules @@ -321,7 +307,7 @@ groups: - name: service_alerts interval: 30s rules: - # High error rate + # Tỷ lệ lỗi cao - alert: HighErrorRate expr: | rate(http_requests_total{status=~"5.."}[5m]) > 0.05 @@ -332,7 +318,7 @@ groups: summary: "High error rate detected" description: "Error rate is {{ $value }} (> 5%)" - # High latency + # Độ trễ cao - alert: HighLatency expr: | histogram_quantile(0.95, http_request_duration_seconds_bucket) > 1 @@ -353,11 +339,11 @@ groups: summary: "Service is down" ``` -## Performance Targets +## Đặc điểm Hiệu suất -### Performance Goals -| Metric | Target | Notes | -|--------|--------|-------| +### Mục tiêu Hiệu suất +| Chỉ số | Mục tiêu | Ghi chú | +|--------|----------|---------| | **Metric Scrape Interval** | 15s | Critical services | | **Log Ingestion Latency** | < 1s | Time from emit to queryable | | **Trace Sampling Rate** | 10% | Production (100% in Dev/Staging) | @@ -365,15 +351,15 @@ groups: | **Alert Evaluation** | Every 1m | 
Evaluation interval | | **Retention Policy** | 14 days | Logs & Traces (Metrics: 30 days) | -## Security Considerations +## Cân nhắc Bảo mật -### Observability Security -- **Log Scrubbing**: Automatically remove PII (emails, ssn, credit cards) and secrets from logs before ingestion. -- **Access Control**: Grafana integrated with OAuth2/OIDC, with Viewer/Editor/Admin roles. -- **Network Policy**: Only allow traffic from internal namespace to ingestion ports (9090, 3100, 14268). -- **TLS**: Encrypt traffic between agents and collectors. +### Bảo mật Observability +- **Log Scrubbing**: Tự động loại bỏ PII (emails, SSN, credit cards) và secrets khỏi logs trước khi ingestion. +- **Access Control**: Grafana tích hợp với OAuth2/OIDC, phân quyền Viewer/Editor/Admin. +- **Network Policy**: Chỉ cho phép traffic từ namespace nội bộ tới các cổng ingestion (9090, 3100, 14268). +- **TLS**: Mã hóa traffic giữa agents và collectors. -## Deployment +## Triển khai ```mermaid graph TD @@ -415,15 +401,15 @@ graph TD class App,Agent app; ``` -**Deployment Description**: -- **Agent**: Promtail or Grafana Agent runs as DaemonSet or Sidecar to collect logs. -- **Pull Model**: Prometheus scrapes metrics from `/metrics` endpoints. -- **Push Model**: Traces and Logs are pushed to collectors. -- **Resources**: Dedicated nodes for monitoring stack in production to avoid impacting main workload. +**Mô tả Triển khai**: +- **Agent**: Promtail hoặc Grafana Agent chạy như DaemonSet hoặc Sidecar để thu thập logs. +- **Pull Model**: Prometheus scrape metrics từ endpoints `/metrics`. +- **Push Model**: Traces và Logs được push tới collectors. +- **Resources**: Dedicated nodes cho monitoring stack trong production để tránh ảnh hưởng workload chính. 
-## Related Documentation +## Tài liệu Liên quan -- [System Design](./system-design.md) - Overall architecture +- [System Design](./system-design.md) - Kiến trúc tổng thể - [Caching Architecture](./caching-architecture.md) - Cache metrics ## Quick Tips @@ -459,5 +445,5 @@ graph TD --- -**Last Updated**: 2026-01-10 -**Author**: GoodGo Architecture Team +**Cập nhật Lần cuối**: 2026-01-14 +**Tác giả**: GoodGo Architecture Team diff --git a/docs/en/architecture/security-architecture.md b/docs/en/architecture/security-architecture.md index 38f7e7a3..c0c879ff 100644 --- a/docs/en/architecture/security-architecture.md +++ b/docs/en/architecture/security-architecture.md @@ -1,8 +1,9 @@ -# Security Architecture +# Kiến trúc Bảo mật / Security Architecture -> Comprehensive security architecture for GoodGo platform with zero-trust model, RBAC, and compliance +> **VI**: Kiến trúc bảo mật toàn diện cho nền tảng GoodGo với mô hình zero-trust, RBAC và compliance +> **EN**: Comprehensive security architecture for GoodGo platform with zero-trust model, RBAC, and compliance -## Overview Diagram +## Sơ đồ Tổng quan / Overview Diagram ```mermaid graph TD @@ -19,13 +20,34 @@ graph TD Service --> Audit[Audit Logging] Audit --> AuditDB[(Audit Trail
7-year retention)] - style TLS fill:#d4edda - style JWT fill:#e1f5ff - style Encrypt fill:#f8d7da - style Audit fill:#fff4e1 + style TLS fill:#15803d,stroke:#fff,stroke-width:2px,color:#fff + style JWT fill:#1d4ed8,stroke:#fff,stroke-width:2px,color:#fff + style Encrypt fill:#b91c1c,stroke:#fff,stroke-width:2px,color:#fff + style Audit fill:#c2410c,stroke:#fff,stroke-width:2px,color:#fff ``` -## Architecture Description +## Mô tả Kiến trúc / Architecture Description + +### VI: Phần Tiếng Việt + +Kiến trúc Bảo mật GoodGo triển khai defense-in-depth với nhiều tầng bảo mật: + +**Nguyên tắc Bảo mật**: +1. **Zero Trust**: Không bao giờ tin tưởng, luôn xác minh +2. **Least Privilege**: Quyền tối thiểu cần thiết +3. **Defense in Depth**: Nhiều tầng bảo mật +4. **Audit Everything**: Audit trail hoàn chỉnh +5. **Encryption**: Mã hóa dữ liệu at rest và in transit + +**Thành phần Chính**: +- ASP.NET Core Identity (User Management) +- OpenIddict (OAuth2/OIDC Server) +- JWT Authentication (15min access, 7 ngày refresh) +- RBAC Authorization +- MFA Support (TOTP) +- Compliance (GDPR, SOC2, ISO27001) + +### EN: English Section The GoodGo Security Architecture implements defense-in-depth with multiple security layers: @@ -37,16 +59,17 @@ The GoodGo Security Architecture implements defense-in-depth with multiple secur 5. 
**Encryption**: Data encrypted at rest and in transit **Key Components**: +- ASP.NET Core Identity (User Management) +- OpenIddict (OAuth2/OIDC Server) - JWT Authentication (15min access, 7d refresh) -- RBAC + ABAC Authorization -- Zero-Trust Device Validation -- AES-256-GCM Encryption -- Event Sourcing for Audit Trail -- Compliance (GDPR, SOC2, ISO27001, HIPAA) +- RBAC Authorization +- MFA Support (TOTP) +- Compliance (GDPR, SOC2, ISO27001) -## Authentication Flow +## Luồng Xác thực / Authentication Flow ```mermaid +%%{init: {'theme': 'dark'}}%% sequenceDiagram participant Client participant API as API Gateway @@ -71,7 +94,43 @@ sequenceDiagram end ``` -**Authentication Details**: +### VI: Chi tiết Xác thực + +**1. Password Hashing**: +- Thuật toán: ASP.NET Core Identity (PBKDF2 với HMAC-SHA256) +- Cost factor: 100,000 iterations +- Password tối thiểu: 8 ký tự với quy tắc phức tạp + +**2. JWT Tokens (OpenIddict)**: +- Access Token: 15 phút expiry +- Refresh Token: 7 ngày expiry +- Thuật toán: RS256 (asymmetric signing) +- Payload: sub, name, email, roles + +**3. Token Storage**: +- Access: Bearer token trong Authorization header +- Refresh: Database SHA-256 hash (OpenIddict stores) + +**4. MFA Support (Xác thực Hai yếu tố)**: +- TOTP (RFC 6238) cho authenticator apps +- QR code để thiết lập (Google Authenticator, Authy) +- Recovery codes (10 mã dùng một lần) +- Secret key lưu qua UserManager.SetAuthenticationTokenAsync + +**5. Email Verification (Xác minh Email)**: +- Gửi email xác minh qua SMTP (MailKit) +- Token generation: UserManager.GenerateEmailConfirmationTokenAsync +- Link xác minh với token và userId +- Đặt EmailConfirmed = true khi xác nhận + +**6. Social Login (Đăng nhập Mạng xã hội)**: +- Tích hợp Google OAuth 2.0 +- Tích hợp Facebook OAuth +- Liên kết tài khoản cho users hiện có (theo email) +- Tự động xác nhận email cho social logins +- Lưu provider info qua UserManager.AddLoginAsync + +### EN: Authentication Details **1. 
Password Hashing**: - Algorithm: bcrypt with cost factor 12 @@ -108,7 +167,7 @@ sequenceDiagram - Auto email confirmation for social logins - Provider info stored via UserManager.AddLoginAsync -## Authorization Model +## Mô hình Phân quyền / Authorization Model ```mermaid graph TD @@ -127,11 +186,35 @@ graph TD Perm[Permission
resource:action:scope] end - style Check fill:#e1f5ff - style Perm fill:#fff4e1 + style Check fill:#1d4ed8,stroke:#fff,stroke-width:2px,color:#fff + style Perm fill:#c2410c,stroke:#fff,stroke-width:2px,color:#fff ``` -**RBAC (Role-Based Access Control)**: +### VI: RBAC (Role-Based Access Control) + +**1. Cấp bậc Role**: +``` +SuperAdmin > OrgAdmin > Manager > User > Guest +``` + +**2. Format Permission**: `resource:action:scope` +- Resource: `users`, `roles`, `permissions` +- Action: `create`, `read`, `update`, `delete` +- Scope: `own`, `org`, `global` + +**Ví dụ**: +- `users:read:own` - Đọc profile của chính mình +- `users:update:org` - Update users trong organization +- `roles:create:global` - Tạo roles globally + +**3. Permission Caching**: +```typescript +Cache key: user:{userId}:permissions +TTL: 5 phút +Invalidate khi: role change, permission change +``` + +### EN: RBAC (Role-Based Access Control) **1. Role Hierarchy**: ``` @@ -150,12 +233,12 @@ SuperAdmin > OrgAdmin > Manager > User > Guest **3. Permission Caching**: ```typescript -// Cache key: user:{userId}:permissions -// TTL: 5 minutes -// Invalidate on: role change, permission change +Cache key: user:{userId}:permissions +TTL: 5 minutes +Invalidate on: role change, permission change ``` -## Zero-Trust Architecture +## Kiến trúc Zero-Trust / Zero-Trust Architecture ```mermaid graph TD @@ -168,12 +251,36 @@ graph TD Session -->|Suspicious| MFA[Require MFA] Session -->|Anomaly| Block[Block + Alert] - style Block fill:#f8d7da - style MFA fill:#fff3cd - style Allow fill:#d4edda + style Block fill:#b91c1c,stroke:#fff,stroke-width:2px,color:#fff + style MFA fill:#c2410c,stroke:#fff,stroke-width:2px,color:#fff + style Allow fill:#15803d,stroke:#fff,stroke-width:2px,color:#fff ``` -**Zero-Trust Components**: +### VI: Thành phần Zero-Trust + +**1. 
Device Fingerprinting**: +- Browser: User-Agent, Canvas, WebGL +- Screen resolution, timezone, language +- Phát hiện plugin, fonts có sẵn +- Hash fingerprint → Lưu với session + +**2. IP Address Validation**: +- Whitelist IPs đã biết cho user +- Alert với IP mới + require MFA +- Block IPs đáng ngờ (VPN, Tor) + +**3. Behavioral Analysis**: +- Login patterns (time, location) +- API usage patterns +- Failed auth attempts +- Alert với anomalies + +**4. Session Binding**: +- Bind session với device fingerprint +- Bind session với IP address +- Invalidate khi mismatch + +### EN: Zero-Trust Components **1. Device Fingerprinting**: - Browser: User-Agent, Canvas, WebGL @@ -197,9 +304,28 @@ graph TD - Bind session to IP address - Invalidate on mismatch -## Data Protection +## Bảo vệ Dữ liệu / Data Protection -**Encryption Strategy**: +### VI: Chiến lược Mã hóa + +**1. Data at Rest**: +- PII: AES-256-GCM encryption +- Passwords: bcrypt (cost 12) +- Tokens: SHA-256 hash +- Keys: Environment variables + K8s secrets + +**2. Data in Transit**: +- TLS 1.2+ cho mọi giao tiếp +- HTTPS enforcement +- Certificate pinning (mobile clients) + +**3. Key Management**: +- Unique key per encryption operation +- 32+ character ENCRYPTION_KEY +- Rotate keys hàng quý / quarterly +- Không bao giờ hardcode secrets + +### EN: Encryption Strategy **1. Data at Rest**: - PII: AES-256-GCM encryption @@ -218,9 +344,35 @@ graph TD - Rotate keys quarterly - Never hardcode secrets -## Compliance & Audit +## Tuân thủ & Kiểm toán / Compliance & Audit -**Compliance Requirements**: +### VI: Yêu cầu Tuân thủ + +**1. GDPR**: +- Right to erasure (soft delete + hard delete sau 90 ngày) +- Data portability (export dữ liệu user) +- Quản lý consent +- Thông báo breach (72 giờ) + +**2. SOC2**: +- Access controls (RBAC) +- Encryption at rest và in transit +- Audit logging (7 năm retention) +- Incident response plan + +**3. 
Audit Trail**: +```typescript +{ + eventType: 'auth.login.success', + userId: 'user_123', + timestamp: '2024-01-15T10:30:00Z', + ipAddress: '192.168.1.1', + deviceFingerprint: 'fp_xyz', + metadata: {...} +} +``` + +### EN: Compliance Requirements **1. GDPR**: - Right to erasure (soft delete + hard delete after 90 days) @@ -235,7 +387,6 @@ graph TD - Incident response plan ```typescript -// Event sourcing for all auth events { eventType: 'auth.login.success', userId: 'user_123', @@ -246,15 +397,16 @@ graph TD } ``` -## System Context +## Bối cảnh Hệ thống / System Context ```mermaid +%%{init: {'theme': 'dark'}}%% C4Context - title Security Architecture Context + title Sơ đồ Bối cảnh Security Architecture - Person(user, "User", "End user accessing platform") - Person(admin, "Admin", "System administrator") - Person(attacker, "Attacker", "Potential threat actor") + Person(user, "Người dùng / User", "End user accessing platform") + Person(admin, "Quản trị viên / Admin", "System administrator") + Person(attacker, "Kẻ tấn công / Attacker", "Potential threat actor") System(iam, "IAM Service", "Authentication & Authorization") @@ -275,7 +427,15 @@ C4Context Rel(iam, monitoring, "Sends security metrics", "Prometheus + Loki") ``` -**Context Description**: +**VI Mô tả**: +- **IAM Service**: Trung tâm xác thực và phân quyền +- **Database**: Lưu trữ credentials đã mã hóa, sessions, permissions +- **Cache**: Cache permissions và sessions để giảm database load +- **Audit Service**: Nhận và lưu trữ tất cả security events +- **MFA Provider**: External TOTP verification service (Google Authenticator compatible) +- **Security Monitoring**: SIEM (Security Information and Event Management) và alerting + +**EN Description**: - **IAM Service**: Central authentication and authorization - **Database**: Stores encrypted credentials, sessions, permissions - **Cache**: Caches permissions and sessions to reduce database load @@ -283,9 +443,10 @@ C4Context - **MFA Provider**: External TOTP 
verification service (Google Authenticator compatible) - **Security Monitoring**: SIEM (Security Information and Event Management) and alerting -## Database Architecture +## Kiến trúc Database / Database Architecture ```mermaid +%%{init: {'theme': 'dark'}}%% erDiagram User ||--o{ Session : has User ||--o{ UserRole : has @@ -374,7 +535,22 @@ erDiagram } ``` -**Description**: +**VI Mô tả**: +- **User**: Lưu credentials đã hash, MFA settings, organization membership +- **Session**: Lưu refresh tokens đã hash, device fingerprint, IP tracking +- **Role & Permission**: RBAC hierarchy với system roles và custom roles +- **MFADevice**: TOTP secrets (encrypted), backup codes +- **LoginHistory**: Audit trail cho tất cả login attempts (success/failure) +- **DeviceFingerprint**: Trusted device tracking cho zero-trust model + +**Bảo mật Database**: +- Password hashes: bcrypt với cost factor 12 +- Token hashes: SHA-256 +- MFA secrets: AES-256-GCM encryption +- Soft deletes: `deletedAt` field, hard delete sau 90 ngày (GDPR) +- Indexes: email (unique), userId (foreign keys), timestamps + +**EN Description**: - **User**: Stores hashed credentials, MFA settings, organization membership - **Session**: Stores hashed refresh tokens, device fingerprint, IP tracking - **Role & Permission**: RBAC hierarchy with system roles and custom roles @@ -389,15 +565,32 @@ erDiagram - Soft deletes: `deletedAt` field, hard delete after 90 days (GDPR) - Indexes: email (unique), userId (foreign keys), timestamps -## Design Decisions +## Quyết định Thiết kế / Design Decisions -### Decision 1: JWT with RS256 (Asymmetric) +### Quyết định 1: JWT với RS256 (Asymmetric) -**Context**: Need stateless authentication with ability to verify tokens in multiple services +**VI Bối cảnh**: Cần stateless authentication với khả năng verify tokens ở multiple services -**Decision**: Use JWT with RS256 (RSA asymmetric signing) instead of HS256 (HMAC symmetric) +**VI Quyết định**: Sử dụng JWT với RS256 (RSA asymmetric 
signing) thay vì HS256 (HMAC symmetric) -**Consequences**: +**VI Hậu quả**: +- ✅ **Tích cực**: + - Services có thể verify tokens với public key, không cần secret + - Key rotation dễ dàng hơn (chỉ cần distribute public key mới) + - Bảo mật cao hơn (private key chỉ ở IAM service) + - Compliance: Audit trail rõ ràng về ai sign tokens +- ❌ **Tiêu cực**: + - Chậm hơn HS256 một chút (~10-20% slower) + - Phức tạp hơn trong key management + - Public/private key pair phải được bảo vệ cẩn thận + +**VI Các lựa chọn thay thế**: HS256 (symmetric), EdDSA, OAuth 2.0 with Opaque Tokens + +**EN Context**: Need stateless authentication with ability to verify tokens in multiple services + +**EN Decision**: Use JWT with RS256 (RSA asymmetric signing) instead of HS256 (HMAC symmetric) + +**EN Consequences**: - ✅ **Positive**: - Services can verify tokens with public key, don't need secret - Easier key rotation (only distribute new public key) @@ -408,15 +601,35 @@ erDiagram - More complex key management - Public/private key pair must be carefully protected -**Alternatives**: HS256 (symmetric), EdDSA, OAuth 2.0 with Opaque Tokens +**EN Alternatives**: HS256 (symmetric), EdDSA, OAuth 2.0 with Opaque Tokens -### Decision 2: Zero-Trust Model with Device Fingerprinting +--- -**Context**: Need to protect against credential theft, session hijacking, and unauthorized access +### Quyết định 2: Zero-Trust Model với Device Fingerprinting -**Decision**: Implement zero-trust model with device fingerprinting, IP validation, behavioral analysis +**VI Bối cảnh**: Cần bảo vệ chống lại credential theft, session hijacking và unauthorized access -**Consequences**: +**VI Quyết định**: Triển khai zero-trust model với device fingerprinting, IP validation, behavioral analysis + +**VI Hậu quả**: +- ✅ **Tích cực**: + - Phát hiện được anomalies (new device, new IP, unusual behavior) + - Tăng security khi detect và block suspicious activities + - Compliance: SOC2, ISO27001 requirements + - User experience: 
Auto-approve trusted devices +- ❌ **Tiêu cực**: + - Complexity cao hơn + - Potential false positives (legitimate users blocked) + - Performance overhead (fingerprint hash, IP check) + - Privacy concerns (tracking devices, IPs) + +**VI Các lựa chọn thay thế**: Basic authentication only, IP whitelist only, MFA required for all + +**EN Context**: Need to protect against credential theft, session hijacking, and unauthorized access + +**EN Decision**: Implement zero-trust model with device fingerprinting, IP validation, behavioral analysis + +**EN Consequences**: - ✅ **Positive**: - Detect anomalies (new device, new IP, unusual behavior) - Increased security by detecting and blocking suspicious activities @@ -428,15 +641,36 @@ erDiagram - Performance overhead (fingerprint hash, IP check) - Privacy concerns (tracking devices, IPs) -**Alternatives**: Basic authentication only, IP whitelist only, MFA required for all +**EN Alternatives**: Basic authentication only, IP whitelist only, MFA required for all -### Decision 3: Event Sourcing for Audit Trail +--- -**Context**: Need immutable audit trail for compliance (GDPR, SOC2, HIPAA) and security forensics +### Quyết định 3: Event Sourcing cho Audit Trail -**Decision**: Use event sourcing pattern to store all auth/security events +**VI Bối cảnh**: Cần immutable audit trail cho compliance (GDPR, SOC2, HIPAA) và security forensics -**Consequences**: +**VI Quyết định**: Sử dụng event sourcing pattern để lưu tất cả auth/security events + +**VI Hậu quả**: +- ✅ **Tích cực**: + - Immutable audit trail (không thể modify/delete) + - Complete history của tất cả security events + - Compliance: GDPR (7-year retention), SOC2, HIPAA + - Security forensics: Trace back attacks, breaches + - Replay events để reconstruct state +- ❌ **Tiêu cực**: + - Storage cost cao (retain 7 years) + - Complexity trong event schema versioning + - Performance: Event publishing overhead + - Data privacy: Must anonymize PII after retention period + +**VI Các lựa 
chọn thay thế**: Database audit logs only, External SIEM only, No audit trail + +**EN Context**: Need immutable audit trail for compliance (GDPR, SOC2, HIPAA) and security forensics + +**EN Decision**: Use event sourcing pattern to store all auth/security events + +**EN Consequences**: - ✅ **Positive**: - Immutable audit trail (cannot modify/delete) - Complete history of all security events @@ -449,12 +683,12 @@ erDiagram - Performance: Event publishing overhead - Data privacy: Must anonymize PII after retention period -**Alternatives**: Database audit logs only, External SIEM only, No audit trail +**EN Alternatives**: Database audit logs only, External SIEM only, No audit trail -## Performance Characteristics +## Đặc điểm Hiệu suất / Performance Characteristics -| Metric | Target | Notes | -|--------|--------|-------| +| Chỉ số / Metric | Mục tiêu / Target | Ghi chú / Notes | +|-----------------|-------------------|-----------------| | **Login Time (P95)** | < 500ms | Including bcrypt verification | | **Login Time (P99)** | < 1s | Peak load | | **Token Generation (P95)** | < 50ms | JWT sign with RS256 | @@ -468,7 +702,15 @@ erDiagram | **Failed Login Rate Limit** | 5 attempts / 15min | Per user | | **Auth Throughput** | 500 req/s | Per IAM instance | -**Performance Optimizations**: +**VI Tối ưu hóa Hiệu suất**: +- **Permission Caching**: L1 (memory) + L2 (Redis), TTL 5 phút +- **Token Caching**: Cache public key in memory for JWT verification +- **Connection Pooling**: Reuse database connections +- **Async Operations**: Event publishing, audit logging (fire-and-forget) +- **Rate Limiting**: Prevent brute force attacks, reduce load +- **Horizontal Scaling**: Multiple IAM service instances + +**EN Performance Optimizations**: - **Permission Caching**: L1 (memory) + L2 (Redis), TTL 5 minutes - **Token Caching**: Cache public key in memory for JWT verification - **Connection Pooling**: Reuse database connections @@ -476,7 +718,7 @@ erDiagram - **Rate Limiting**: 
Prevent brute force attacks, reduce load - **Horizontal Scaling**: Multiple IAM service instances -## Deployment +## Triển khai / Deployment ```mermaid graph TD @@ -526,15 +768,15 @@ graph TD SIEM -.->|Alerts| Alerts - style LB fill:#d4edda - style WAF fill:#fff3cd - style DB fill:#f0e1ff - style Cache fill:#fff4e1 - style Vault fill:#f8d7da - style SIEM fill:#e1f5ff + style LB fill:#15803d,stroke:#fff,stroke-width:2px,color:#fff + style WAF fill:#c2410c,stroke:#fff,stroke-width:2px,color:#fff + style DB fill:#7e22ce,stroke:#fff,stroke-width:2px,color:#fff + style Cache fill:#1f2937,stroke:#fff,stroke-width:2px,color:#fff + style Vault fill:#b91c1c,stroke:#fff,stroke-width:2px,color:#fff + style SIEM fill:#1d4ed8,stroke:#fff,stroke-width:2px,color:#fff ``` -**Deployment Strategy**: +### VI: Chiến lược Triển khai **Security Deployment**: - **TLS 1.2+ Enforcement**: All connections require TLS @@ -552,7 +794,6 @@ graph TD **Security Configuration**: ```yaml -# K8s Network Policy apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: @@ -580,4 +821,525 @@ spec: ports: - protocol: TCP port: 5432 + - to: + - podSelector: + matchLabels: + app: redis + ports: + - protocol: TCP + port: 6379 ``` + +**Deployment Security Checklist**: +- [ ] TLS 1.2+ enforced +- [ ] Network policies configured +- [ ] Pod security policies applied +- [ ] Secrets encrypted at rest +- [ ] Container images scanned +- [ ] Non-root user in containers +- [ ] Read-only root filesystem +- [ ] Resource limits set +- [ ] Health checks configured +- [ ] Security monitoring enabled + +### EN: Deployment Strategy + +**Security Deployment**: +- **TLS 1.2+ Enforcement**: All connections require TLS +- **Network Policies (K8s)**: Deny all by default, whitelist specific services +- **Pod Security Policies**: Non-root user, read-only filesystem, no privilege escalation +- **Secrets Management**: Kubernetes secrets with encryption at rest +- **Image Scanning**: Trivy/Clair scan before deployment +- 
**RBAC (K8s)**: Least privilege for service accounts + +**Resource Allocation**: +| Component | CPU | Memory | Replicas | +|-----------|-----|--------|----------| +| **IAM Service** | 500m | 1GB | 3-10 (HPA) | +| **Redis** | 1 core | 2GB | 3 masters + 3 slaves | + +**Security Configuration**: +```yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: iam-service-policy +spec: + podSelector: + matchLabels: + app: iam-service + policyTypes: + - Ingress + - Egress + ingress: + - from: + - podSelector: + matchLabels: + app: api-gateway + ports: + - protocol: TCP + port: 5000 + egress: + - to: + - podSelector: + matchLabels: + app: postgresql + ports: + - protocol: TCP + port: 5432 + - to: + - podSelector: + matchLabels: + app: redis + ports: + - protocol: TCP + port: 6379 +``` + +**Deployment Security Checklist**: +- [ ] TLS 1.2+ enforced +- [ ] Network policies configured +- [ ] Pod security policies applied +- [ ] Secrets encrypted at rest +- [ ] Container images scanned +- [ ] Non-root user in containers +- [ ] Read-only root filesystem +- [ ] Resource limits set +- [ ] Health checks configured +- [ ] Security monitoring enabled + +## Giám sát & Khả năng quan sát / Monitoring & Observability + +### VI: Chỉ số Chính + +**Authentication Metrics**: +- `auth_login_attempts_total` - Total login attempts (counter, labels: status=success/failure) +- `auth_login_duration_seconds` - Login duration (histogram) +- `auth_token_generations_total` - Token generations (counter) +- `auth_token_verifications_total` - Token verifications (counter, labels: status=valid/invalid/expired) +- `auth_mfa_verifications_total` - MFA verifications (counter, labels: status=success/failure) + +**Authorization Metrics**: +- `auth_permission_checks_total` - Permission checks (counter, labels: result=granted/denied) +- `auth_permission_cache_hits_total` - Permission cache hits (counter) +- `auth_permission_cache_misses_total` - Permission cache misses (counter) + +**Security 
Metrics**: +- `auth_failed_login_rate` - Failed login rate per user (gauge) +- `auth_account_lockouts_total` - Account lockouts (counter) +- `auth_suspicious_activities_total` - Suspicious activities detected (counter, labels: type) +- `auth_anomalies_detected_total` - Anomalies detected (counter, labels: anomaly_type) +- `auth_password_reset_requests_total` - Password reset requests (counter) + +**Session Metrics**: +- `auth_active_sessions` - Active sessions (gauge) +- `auth_session_creations_total` - Session creations (counter) +- `auth_session_invalidations_total` - Session invalidations (counter, labels: reason) + +**Application Code**: +```typescript +import { Counter, Histogram, Gauge } from 'prom-client'; + +export const loginAttempts = new Counter({ + name: 'auth_login_attempts_total', + help: 'Total login attempts', + labelNames: ['status'] +}); + +export const loginDuration = new Histogram({ + name: 'auth_login_duration_seconds', + help: 'Login duration in seconds', + buckets: [0.1, 0.3, 0.5, 0.7, 1, 2, 5] +}); + +export const permissionChecks = new Counter({ + name: 'auth_permission_checks_total', + help: 'Total permission checks', + labelNames: ['result'] +}); + +export const suspiciousActivities = new Counter({ + name: 'auth_suspicious_activities_total', + help: 'Suspicious activities detected', + labelNames: ['type'] +}); + +loginAttempts.inc({ status: 'success' }); +loginDuration.observe(duration); +permissionChecks.inc({ result: 'granted' }); +suspiciousActivities.inc({ type: 'new_device' }); +``` + +**Alerting Rules**: +```yaml +groups: + - name: security_alerts + interval: 30s + rules: + - alert: HighFailedLoginRate + expr: rate(auth_login_attempts_total{status="failure"}[5m]) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "High failed login rate detected" + description: "Failed login rate is {{ $value }}/sec" + + - alert: BruteForceAttack + expr: | + sum by (user_id) ( + 
rate(auth_login_attempts_total{status="failure"}[1m]) + ) > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Potential brute force attack" + description: "User {{ $labels.user_id }} has > 5 failed logins/min" + + - alert: AccountLockoutSpike + expr: rate(auth_account_lockouts_total[5m]) > 5 + for: 2m + labels: + severity: warning + annotations: + summary: "Account lockout spike detected" + description: "Lockout rate is {{ $value }}/sec" + + - alert: SuspiciousActivity + expr: rate(auth_suspicious_activities_total[5m]) > 10 + for: 1m + labels: + severity: warning + annotations: + summary: "Suspicious activity detected" + description: "Suspicious activity rate: {{ $value }}/sec" + + - alert: AnomalyDetected + expr: auth_anomalies_detected_total > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Security anomaly detected" + description: "{{ $labels.anomaly_type }} detected" + + - alert: PermissionDeniedSpike + expr: rate(auth_permission_checks_total{result="denied"}[5m]) > 50 + for: 2m + labels: + severity: warning + annotations: + summary: "High permission denied rate" + description: "Permission denied rate: {{ $value }}/sec" +``` + +**Security Dashboards**: +- **Authentication Overview**: Login success/failure rate, login duration, MFA usage +- **Authorization Overview**: Permission checks, cache hit rate, denied requests +- **Security Events**: Suspicious activities, anomalies, account lockouts +- **Session Management**: Active sessions, session creations/invalidations +- **Compliance**: Audit trail completeness, retention policy compliance + +**Logging**: +```typescript +logger.info('Login successful', { + eventType: 'auth.login.success', + userId: user.id, + email: user.email, + ipAddress: req.ip, + deviceFingerprint: fingerprint, + mfaUsed: user.mfaEnabled, + correlationId: req.correlationId +}); + +logger.warn('Suspicious activity detected', { + eventType: 'security.suspicious_activity', + userId: user.id, + 
activityType: 'new_device', + ipAddress: req.ip, + deviceFingerprint: newFingerprint, + correlationId: req.correlationId +}); + +logger.error('Login failed', { + eventType: 'auth.login.failure', + email: email, + reason: 'invalid_credentials', + ipAddress: req.ip, + attemptCount: failedAttempts, + correlationId: req.correlationId +}); +``` + +**Audit Trail Monitoring**: +- Event publishing rate and latency +- Event consumption lag +- Audit log completeness (no gaps) +- Retention policy compliance +- Anonymization after retention period + +### EN: Key Metrics + +**Authentication Metrics**: +- `auth_login_attempts_total` - Total login attempts (counter, labels: status=success/failure) +- `auth_login_duration_seconds` - Login duration (histogram) +- `auth_token_generations_total` - Token generations (counter) +- `auth_token_verifications_total` - Token verifications (counter, labels: status=valid/invalid/expired) +- `auth_mfa_verifications_total` - MFA verifications (counter, labels: status=success/failure) + +**Authorization Metrics**: +- `auth_permission_checks_total` - Permission checks (counter, labels: result=granted/denied) +- `auth_permission_cache_hits_total` - Permission cache hits (counter) +- `auth_permission_cache_misses_total` - Permission cache misses (counter) + +**Security Metrics**: +- `auth_failed_login_rate` - Failed login rate per user (gauge) +- `auth_account_lockouts_total` - Account lockouts (counter) +- `auth_suspicious_activities_total` - Suspicious activities detected (counter, labels: type) +- `auth_anomalies_detected_total` - Anomalies detected (counter, labels: anomaly_type) +- `auth_password_reset_requests_total` - Password reset requests (counter) + +**Session Metrics**: +- `auth_active_sessions` - Active sessions (gauge) +- `auth_session_creations_total` - Session creations (counter) +- `auth_session_invalidations_total` - Session invalidations (counter, labels: reason) + +**Application Code**: +```typescript +import { Counter, 
Histogram, Gauge } from 'prom-client'; + +export const loginAttempts = new Counter({ + name: 'auth_login_attempts_total', + help: 'Total login attempts', + labelNames: ['status'] +}); + +export const loginDuration = new Histogram({ + name: 'auth_login_duration_seconds', + help: 'Login duration in seconds', + buckets: [0.1, 0.3, 0.5, 0.7, 1, 2, 5] +}); + +export const permissionChecks = new Counter({ + name: 'auth_permission_checks_total', + help: 'Total permission checks', + labelNames: ['result'] +}); + +export const suspiciousActivities = new Counter({ + name: 'auth_suspicious_activities_total', + help: 'Suspicious activities detected', + labelNames: ['type'] +}); + +loginAttempts.inc({ status: 'success' }); +loginDuration.observe(duration); +permissionChecks.inc({ result: 'granted' }); +suspiciousActivities.inc({ type: 'new_device' }); +``` + +**Alerting Rules**: +```yaml +groups: + - name: security_alerts + interval: 30s + rules: + - alert: HighFailedLoginRate + expr: rate(auth_login_attempts_total{status="failure"}[5m]) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "High failed login rate detected" + description: "Failed login rate is {{ $value }}/sec" + + - alert: BruteForceAttack + expr: | + sum by (user_id) ( + rate(auth_login_attempts_total{status="failure"}[1m]) + ) > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Potential brute force attack" + description: "User {{ $labels.user_id }} has > 5 failed logins/min" + + - alert: AccountLockoutSpike + expr: rate(auth_account_lockouts_total[5m]) > 5 + for: 2m + labels: + severity: warning + annotations: + summary: "Account lockout spike detected" + description: "Lockout rate is {{ $value }}/sec" + + - alert: SuspiciousActivity + expr: rate(auth_suspicious_activities_total[5m]) > 10 + for: 1m + labels: + severity: warning + annotations: + summary: "Suspicious activity detected" + description: "Suspicious activity rate: {{ $value }}/sec" + + - alert: AnomalyDetected 
+ expr: increase(auth_anomalies_detected_total[5m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Security anomaly detected" + description: "{{ $labels.anomaly_type }} detected" + + - alert: PermissionDeniedSpike + expr: rate(auth_permission_checks_total{result="denied"}[5m]) > 50 + for: 2m + labels: + severity: warning + annotations: + summary: "High permission denied rate" + description: "Permission denied rate: {{ $value }}/sec" +``` + +**Security Dashboards**: +- **Authentication Overview**: Login success/failure rate, login duration, MFA usage +- **Authorization Overview**: Permission checks, cache hit rate, denied requests +- **Security Events**: Suspicious activities, anomalies, account lockouts +- **Session Management**: Active sessions, session creations/invalidations +- **Compliance**: Audit trail completeness, retention policy compliance + +**Logging**: +```typescript +logger.info('Login successful', { + eventType: 'auth.login.success', + userId: user.id, + email: user.email, + ipAddress: req.ip, + deviceFingerprint: fingerprint, + mfaUsed: user.mfaEnabled, + correlationId: req.correlationId +}); + +logger.warn('Suspicious activity detected', { + eventType: 'security.suspicious_activity', + userId: user.id, + activityType: 'new_device', + ipAddress: req.ip, + deviceFingerprint: newFingerprint, + correlationId: req.correlationId +}); + +logger.error('Login failed', { + eventType: 'auth.login.failure', + email: email, + reason: 'invalid_credentials', + ipAddress: req.ip, + attemptCount: failedAttempts, + correlationId: req.correlationId +}); +``` + +**Audit Trail Monitoring**: +- Event publishing rate and latency +- Event consumption lag +- Audit log completeness (no gaps) +- Retention policy compliance +- Anonymization after retention period + +## Tài liệu Liên quan / Related Documentation + +- [System Design](./system-design.md) - Kiến trúc tổng thể / Overall architecture +- [IAM Architecture](./iam-proposal.md) - Triển khai IAM service / IAM 
service implementation +- [Event-Driven Architecture](./event-driven-architecture.md) - Audit event streaming + +--- + +**Cập nhật Lần cuối / Last Updated**: 2026-01-07 +**Tác giả / Authors**: GoodGo Security Team + +## Quick Tips + +### 🎨 Color Palette Reference (Dark Theme) + +| Node Type | Color | Hex | Tailwind | Usage | Example | +|-----------|-------|-----|----------|-------|---------| +| **Primary** | Blue | `#1d4ed8` | `bg-blue-700` | Core components, Identity, IAM, Permission Checks | JWT Validation, Auth Services | +| **Secondary**| Purple| `#7e22ce` | `bg-purple-700`| Data stores, Database, Queues | PostgreSQL, Redis | +| **Success** | Green | `#15803d` | `bg-green-700` | Valid, Allowed, Safe, Completed, TLS | Allow Request, Secure Connection | +| **Error** | Red | `#b91c1c` | `bg-red-700` | Blocked, Invalid, Failed, Critical, Encryption Keys | Block + Alert, Vault, Critical Errors | +| **Warning** | Orange| `#c2410c` | `bg-orange-700`| MFA, Suspicious, Latency, Cache, Alerts | Require MFA, WAF, SIEM | +| **Base** | Grey | `#1f2937` | `bg-gray-800` | External systems, Infrastructure, Logs | Cache, Monitoring | + +### 🔧 Mermaid Common Issues + +| Issue | Sign | Solution | +|-------|------|----------| +| **Parse Error** | Unexpected PIPE/character | Check for missing spaces after `graph TD` | +| **Box Not Showing** | Node missing in diagram | Verify node syntax: `Node[Label]` | +| **Color Not Applied** | Node has no color | Add style: `style Node fill:#1d4ed8,stroke:#fff,stroke-width:2px,color:#fff` | +| **Arrow Issues** | Connection not visible | Check arrow syntax: `-->` (solid), `-.->`(dashed) | +| **Text Not Readable** | Dark text on dark bg | Always use `color:#fff` (white text) | +| **Subgraph Issues** | Broken layout | Ensure proper indentation and `end` statement | + +### 📊 Color Pattern Quick Reference + +```mermaid +graph LR + A[Input] --> B[Process] + B --> C{Decision} + C -->|Yes| D[Success] + C -->|No| E[Error] + + style A 
fill:#1f2937,stroke:#fff,stroke-width:2px,color:#fff + style B fill:#1d4ed8,stroke:#fff,stroke-width:2px,color:#fff + style C fill:#c2410c,stroke:#fff,stroke-width:2px,color:#fff + style D fill:#15803d,stroke:#fff,stroke-width:2px,color:#fff + style E fill:#b91c1c,stroke:#fff,stroke-width:2px,color:#fff +``` + +**Pattern Template**: +``` +style NodeName fill:#color,stroke:#fff,stroke-width:2px,color:#fff +``` + +### 🎯 Visual Indicators + +| Emoji | Meaning | Color | Usage | +|-------|---------|-------|-------| +| ✅ | Secure/Allowed/Valid | Green (#15803d) | Successful auth, allowed access | +| ❌ | Blocked/Denied/Invalid | Red (#b91c1c) | Failed login, access denied | +| ⚠️ | Warning/MFA/Alert | Orange (#c2410c) | Require MFA, suspicious activity | +| 🔒 | Encrypted/Secure | Blue/Purple (#1d4ed8, #7e22ce) | Encrypted data, secure channel | +| ☁️ | Cloud/External | Grey (#1f2937) | External services, cloud resources | +| 🔑 | Authentication | Orange (#c2410c) | Auth tokens, keys, credentials | +| 🛡️ | Security Layer | Green (#15803d) | Security controls, protection | +| 📊 | Monitoring | Blue (#1d4ed8) | Metrics, dashboards, logs | + +### 🚀 Diagram Best Practices + +1. **Always use dark palette** with white text (`color:#fff`) +2. **Consistent stroke**: `stroke:#fff,stroke-width:2px` +3. **Logical color mapping**: + - Blue = Core processes + - Green = Success/Allow + - Red = Error/Block + - Orange = Warning/MFA + - Purple = Data stores + - Grey = External systems + +4. **Readable labels**: Use `
` for line breaks in labels +5. **Arrow clarity**: Solid (`-->`) for main flow, dashed (`-.->`) for secondary/async +6. **Subgraph organization**: Group related components + +### 🔍 Mermaid Debugging Checklist + +- [ ] Graph type declared? (`graph TD`, `sequenceDiagram`, `erDiagram`) +- [ ] All nodes have unique IDs? +- [ ] Arrows have proper syntax? (`-->`, `-.->`, `-->>`) +- [ ] Style definitions after graph content? +- [ ] All subgraphs have `end` statement? +- [ ] Labels escaped properly? (use quotes for special chars) +- [ ] Color values correct? (3- or 6-digit hex with #) +- [ ] White text applied? (`color:#fff`) diff --git a/docs/en/architecture/system-design.md b/docs/en/architecture/system-design.md index a8a41966..a610dcac 100644 --- a/docs/en/architecture/system-design.md +++ b/docs/en/architecture/system-design.md @@ -1,929 +1,735 @@ -# System Design / Thiết kế Hệ thống +# Kiến Trúc Thiết Kế Hệ Thống -> **EN**: Comprehensive system architecture for the GoodGo Microservices Platform -> **VI**: Kiến trúc hệ thống toàn diện cho GoodGo Microservices Platform +Kiến trúc tổng thể của nền tảng GoodGo Microservices -## System Overview / Tổng quan Hệ thống +## Sơ đồ Tổng quan ```mermaid +%%{init: {'theme':'base', 'themeVariables': { + 'primaryTextColor':'#000', + 'secondaryTextColor':'#000', + 'tertiaryTextColor':'#000', + 'textColor':'#000', + 'mainBkg':'#fff', + 'secondBkg':'#fff', + 'lineColor':'#333', + 'border1':'#000', + 'border2':'#000', + 'clusterBkg':'#fff', + 'clusterBorder':'#000', + 'titleColor':'#000', + 'edgeLabelBackground':'#fff', + 'nodeTextColor':'#fff' +}}}%% graph TD - subgraph "Client Layer / Tầng Client" - WebApp[Web Application
Next.js 14+] - MobileApp[Mobile Application
Flutter/React Native] + subgraph "Client Layer" + Web[Web App
Next.js] + Mobile[Mobile App
Flutter] end - subgraph "API Gateway Layer / Tầng API Gateway" - Traefik[Traefik Gateway
Load Balancer + Routing] + subgraph "API Gateway Layer" + Traefik[Traefik
API Gateway] end - subgraph "Services Layer / Tầng Services" - IAM[IAM Service
Authentication & Authorization] - Template[Template Service
Example Microservice] - Future1[Future Service 1
TBD] - Future2[Future Service 2
TBD] + subgraph "Services Layer" + IAM[IAM Service
Auth & RBAC] + Future1[Future Service 1] + Future2[Future Service 2] end - subgraph "Data Layer / Tầng Dữ liệu" - PostgreSQL[(PostgreSQL 14+
Primary Database)] - Redis[(Redis 6+
Cache & Sessions)] + subgraph "Infrastructure Layer" + DB[(Neon PostgreSQL
Primary Database)] + Cache[(Redis
Cache & Session)] + Kafka[Apache Kafka
Event Streaming] end - subgraph "Observability / Khả năng quan sát" - Prometheus[Prometheus
Metrics Collection] - Grafana[Grafana
Metrics Visualization] - Loki[Loki
Log Aggregation] - Jaeger[Jaeger
Distributed Tracing] + subgraph "Observability Layer" + Prom[Prometheus
Metrics] + Loki[Loki
Logs] + Jaeger[Jaeger
Tracing] + Grafana[Grafana
Dashboards] end - WebApp --> Traefik - MobileApp --> Traefik + Web --> Traefik + Mobile --> Traefik Traefik --> IAM - Traefik --> Template Traefik --> Future1 Traefik --> Future2 - IAM --> PostgreSQL - Template --> PostgreSQL - Future1 --> PostgreSQL - Future2 --> PostgreSQL + IAM --> DB + IAM --> Cache + IAM --> Kafka - IAM --> Redis - Template --> Redis - Future1 --> Redis - Future2 --> Redis + Future1 --> DB + Future1 --> Cache + Future1 --> Kafka - IAM -.->|Metrics| Prometheus - Template -.->|Metrics| Prometheus - Prometheus --> Grafana + Future2 --> DB + Future2 --> Cache + Future2 --> Kafka - IAM -.->|Logs| Loki - Template -.->|Logs| Loki + IAM -.->|metrics| Prom + Future1 -.->|metrics| Prom + Future2 -.->|metrics| Prom - IAM -.->|Traces| Jaeger - Template -.->|Traces| Jaeger + IAM -.->|logs| Loki + Future1 -.->|logs| Loki + Future2 -.->|logs| Loki - style Traefik fill:#e1f5ff - style PostgreSQL fill:#f0e1ff - style Redis fill:#fff4e1 - style Prometheus fill:#d4edda + IAM -.->|traces| Jaeger + Future1 -.->|traces| Jaeger + Future2 -.->|traces| Jaeger + + Prom --> Grafana + Loki --> Grafana + Jaeger --> Grafana + + style Web fill:#1565c0,stroke:#fff,stroke-width:2px,color:#fff + style Mobile fill:#1565c0,stroke:#fff,stroke-width:2px,color:#fff + style Traefik fill:#0f4c81,stroke:#fff,stroke-width:2px,color:#fff + style IAM fill:#283593,stroke:#fff,stroke-width:2px,color:#fff + style Future1 fill:#4527a0,stroke:#fff,stroke-width:2px,color:#fff + style Future2 fill:#4527a0,stroke:#fff,stroke-width:2px,color:#fff + style DB fill:#5e35b1,stroke:#fff,stroke-width:2px,color:#fff + style Cache fill:#ef6c00,stroke:#fff,stroke-width:2px,color:#fff + style Kafka fill:#2e7d32,stroke:#fff,stroke-width:2px,color:#fff + style Prom fill:#c62828,stroke:#fff,stroke-width:2px,color:#fff + style Loki fill:#d84315,stroke:#fff,stroke-width:2px,color:#fff + style Jaeger fill:#e65100,stroke:#fff,stroke-width:2px,color:#fff + style Grafana 
fill:#b71c1c,stroke:#fff,stroke-width:2px,color:#fff ``` -### EN: Architecture Principles +## Mô tả Kiến trúc -The GoodGo Microservices Platform follows these core principles: +GoodGo Platform được xây dựng theo kiến trúc microservices với các nguyên tắc sau: -1. **Service Independence**: Each microservice: - - Has its own database schema (database per service pattern) - - Can be deployed independently without affecting others - - Owns its data and exposes APIs for data access - - Uses standardized communication patterns +**Nguyên tắc Cốt lõi**: +1. **Độc Lập Service**: Mỗi service có database riêng và có thể deploy độc lập +2. **API Gateway Pattern**: Traefik xử lý routing, load balancing, và cross-cutting concerns +3. **Clean Architecture**: Mỗi service tuân theo Clean Architecture (API, Domain, Infrastructure) +4. **Infrastructure as Code**: Tất cả cấu hình infrastructure được version control +5. **Observability First**: Đầy đủ metrics, logging, và health checks -2. **API Gateway Pattern**: Traefik provides: - - Single entry point for all client requests - - Path-based routing to appropriate services - - Load balancing across service instances - - SSL/TLS termination - - Rate limiting and security headers +**Công nghệ Stack**: +- **Frontend**: Next.js 14+ (App Router), Flutter 3.x +- **Backend**: .NET 10, ASP.NET Core, MediatR (CQRS) +- **Database**: Neon PostgreSQL (serverless), Entity Framework Core +- **Cache**: Redis (StackExchange.Redis) +- **Message Broker**: MediatR Domain Events (RabbitMQ planned) +- **API Gateway**: Traefik v3 +- **Observability**: Prometheus, Grafana, Loki, Serilog -3. **Shared Infrastructure**: Common concerns handled by: - - Shared packages (@goodgo/logger, @goodgo/types, @goodgo/http-client) - - Centralized observability stack - - Distributed caching layer (Redis) - - Common monitoring and alerting - -4. 
**Infrastructure as Code**: All configurations versioned: - - Docker Compose for local development - - Kubernetes manifests for production - - Traefik dynamic configuration - - Database migrations with Prisma - -5. **Observability First**: Built-in monitoring: - - Prometheus metrics from all services - - Structured logging with correlation IDs - - Distributed tracing with OpenTelemetry - - Health check endpoints (liveness/readiness) - -### VI: Nguyên tắc Kiến trúc - -GoodGo Microservices Platform tuân theo các nguyên tắc cốt lõi sau: - -1. **Độc lập Service**: Mỗi microservice: - - Có schema database riêng (pattern database per service) - - Có thể deploy độc lập mà không ảnh hưởng đến các service khác - - Sở hữu dữ liệu của mình và expose APIs để truy cập dữ liệu - - Sử dụng patterns giao tiếp chuẩn hóa - -2. **Pattern API Gateway**: Traefik cung cấp: - - Điểm vào duy nhất cho tất cả client requests - - Routing dựa trên path tới các service phù hợp - - Load balancing giữa các service instances - - SSL/TLS termination - - Rate limiting và security headers - -3. **Infrastructure Chia sẻ**: Các concerns chung được xử lý bởi: - - Shared packages (@goodgo/logger, @goodgo/types, @goodgo/http-client) - - Stack observability tập trung - - Tầng caching phân tán (Redis) - - Monitoring và alerting chung - -4. **Infrastructure as Code**: Tất cả cấu hình được version: - - Docker Compose cho local development - - Kubernetes manifests cho production - - Traefik dynamic configuration - - Database migrations với Prisma - -5. **Observability First**: Monitoring tích hợp sẵn: - - Prometheus metrics từ tất cả services - - Structured logging với correlation IDs - - Distributed tracing với OpenTelemetry - - Health check endpoints (liveness/readiness) - ---- - -## Detailed Component Architecture / Kiến trúc Component Chi tiết - -### 1. 
Client Layer / Tầng Client +## Bối cảnh Hệ thống ```mermaid -graph LR - User((User)) --> WebBrowser[Web Browser] - User --> MobileDevice[Mobile Device] +C4Context + title Sơ đồ Bối cảnh Hệ thống GoodGo Platform - WebBrowser --> NextJS[Next.js App
Port 3000] - MobileDevice --> Flutter[Flutter App
iOS/Android] + Person(user, "Người dùng / User", "End users accessing the platform") + Person(admin, "Quản trị viên / Admin", "System administrators") + Person(developer, "Nhà phát triển / Developer", "Platform developers") - NextJS --> APIClient[@goodgo/http-client] - Flutter --> HTTPPackage[HTTP Package] + System(platform, "GoodGo Platform", "Microservices platform for business applications") - APIClient --> Gateway[API Gateway
localhost or api.goodgo.com] - HTTPPackage --> Gateway + System_Ext(neon, "Neon PostgreSQL", "Serverless PostgreSQL database") + System_Ext(redis, "Redis", "In-memory cache and session store") + System_Ext(kafka, "Apache Kafka", "Event streaming platform") + System_Ext(monitoring, "Monitoring Stack", "Prometheus + Grafana + Loki + Jaeger") - style User fill:#e1f5ff - style Gateway fill:#d4edda -``` - -**EN Components**: -- **Web Application**: Next.js 14+ with App Router - - Server-side rendering (SSR) - - Static site generation (SSG) - - API routes for BFF pattern - - Uses `@goodgo/http-client` for API calls - -- **Mobile Application**: Flutter or React Native - - Cross-platform (iOS + Android) - - Offline-first architecture (future) - - Native HTTP client - -**VI Thành phần**: -- **Web Application**: Next.js 14+ với App Router - - Server-side rendering (SSR) - - Static site generation (SSG) - - API routes cho BFF pattern - - Sử dụng `@goodgo/http-client` cho API calls - -- **Mobile Application**: Flutter hoặc React Native - - Cross-platform (iOS + Android) - - Kiến trúc offline-first (tương lai) - - Native HTTP client - ---- - -### 2. 
API Gateway Layer / Tầng API Gateway - -```mermaid -graph TD - Client[Client Request] --> Traefik + Rel(user, platform, "Uses", "HTTPS") + Rel(admin, platform, "Manages", "HTTPS") + Rel(developer, platform, "Develops & Deploys", "Git, CI/CD") - subgraph "Traefik API Gateway" - Traefik[Traefik Router] --> Middlewares - - subgraph Middlewares - M1[CORS] - M2[Rate Limiting] - M3[Headers] - M4[Compression] - end - - Middlewares --> Router[Dynamic Router] - Router --> LB[Load Balancer] - end - - LB --> Service1[Service Instance 1] - LB --> Service2[Service Instance 2] - LB --> Service3[Service Instance 3] - - style Traefik fill:#e1f5ff - style Router fill:#fff4e1 - style LB fill:#d4edda + Rel(platform, neon, "Stores data", "PostgreSQL Protocol") + Rel(platform, redis, "Caches data", "Redis Protocol") + Rel(platform, kafka, "Publishes/Consumes events", "Kafka Protocol") + Rel(platform, monitoring, "Sends metrics, logs, traces", "HTTP, gRPC") ``` -**EN: Traefik Configuration** +## Thành phần -**Static Configuration** (`infra/traefik/traefik.yml`): -- Entry points (HTTP: 80, HTTPS: 443) -- Docker provider for service discovery -- Certificate resolvers (Let's Encrypt) -- Dashboard configuration (port 8080) +### Frontend Layer -**Dynamic Configuration** (`infra/traefik/dynamic/`): -- Middlewares (CORS, rate limiting, security headers) -- Routes (defined via Docker labels or YAML files) -- Services (load balancing strategies) +#### Web App (Next.js) +**Mô tả**: Ứng dụng web sử dụng Next.js 14+ với App Router -**Routing Pattern**: -```yaml -http: - routers: - iam-service: - rule: "PathPrefix(`/api/v1/auth`)" - service: iam-service - middlewares: - - cors - - rate-limit - - secure-headers -``` +**Tính năng chính**: +- Server-side rendering (SSR) và Static Site Generation (SSG) +- API routes cho BFF (Backend for Frontend) pattern +- Optimized image loading với next/image +- Built-in routing và code splitting -**Service Discovery**: Automatic via Docker labels: -```yaml -labels: 
- - "traefik.enable=true" - - "traefik.http.routers.iam.rule=PathPrefix(`/api/v1/auth`)" - - "traefik.http.services.iam.loadbalancer.server.port=3001" - - "traefik.http.services.iam.loadbalancer.healthcheck.path=/health/live" -``` +**Công nghệ sử dụng**: +- Next.js 14+, React 18+, TypeScript +- Tailwind CSS, Zustand (state management) +- `@goodgo/http-client`, `@goodgo/types` -**VI: Cấu hình Traefik** +**Vị trí File**: [`apps/web-client/`](../../../apps/web-client) -**Cấu hình Tĩnh** (`infra/traefik/traefik.yml`): -- Entry points (HTTP: 80, HTTPS: 443) -- Docker provider cho service discovery -- Certificate resolvers (Let's Encrypt) -- Cấu hình dashboard (port 8080) +#### Mobile App (Flutter) +**Mô tả**: Ứng dụng mobile cross-platform sử dụng Flutter -**Cấu hình Động** (`infra/traefik/dynamic/`): -- Middlewares (CORS, rate limiting, security headers) -- Routes (định nghĩa qua Docker labels hoặc YAML files) -- Services (chiến lược load balancing) +**Tính năng chính**: +- Cross-platform (iOS, Android) +- Native performance +- Provider pattern cho state management +- Offline-first với local storage -**Pattern Routing**: -```yaml -http: - routers: - iam-service: - rule: "PathPrefix(`/api/v1/auth`)" - service: iam-service - middlewares: - - cors - - rate-limit - - secure-headers -``` +**Công nghệ sử dụng**: +- Flutter 3.x, Dart +- Provider, Dio (HTTP client) -**Service Discovery**: Tự động qua Docker labels: -```yaml -labels: - - "traefik.enable=true" - - "traefik.http.routers.iam.rule=PathPrefix(`/api/v1/auth`)" - - "traefik.http.services.iam.loadbalancer.server.port=3001" - - "traefik.http.services.iam.loadbalancer.healthcheck.path=/health/live" -``` +**Vị trí File**: [`apps/mobile-client/`](../../../apps/mobile-client) ---- +### API Gateway Layer -### 3. 
Services Layer / Tầng Services +#### Traefik +**Mô tả**: Reverse proxy và API gateway xử lý routing, load balancing, SSL termination -#### Microservice Template Structure / Cấu trúc Template Microservice +**Tính năng chính**: +- Dynamic service discovery +- Automatic HTTPS với Let's Encrypt +- Load balancing và health checks +- Rate limiting và circuit breaker +- Middleware chains (CORS, auth, logging) -```mermaid -graph TD - subgraph "Microservice (Template Pattern)" - HTTP[HTTP Request] --> MW[Middleware Stack] - - MW --> Routes[Routes] - - subgraph "Feature Module" - Routes --> Controller - Controller --> Service - Service --> Repository - Repository --> Prisma[Prisma ORM] - end - - Service --> Cache[Cache Service] - Cache --> Redis[(Redis)] - - Prisma --> DB[(PostgreSQL)] - - MW --> Metrics[Metrics Middleware] - Metrics --> Prom[Prometheus] - end - - style MW fill:#e1f5ff - style Service fill:#f0e1ff - style Cache fill:#fff4e1 -``` +**Công nghệ sử dụng**: +- Traefik 2.x +- Docker labels cho dynamic configuration -**EN: Standard Microservice Structure** +**Vị trí File**: [`infra/traefik/`](../../../infra/traefik) -Each microservice follows this pattern (from `services/_template/`): +### Services Layer -``` -src/ -├── config/ # Configuration with Zod validation -│ ├── app.config.ts -│ ├── database.config.ts -│ └── redis.config.ts -├── core/ # Core utilities (IAM service only) -│ ├── cache/ # Multi-layer caching -│ ├── events/ # Event sourcing -│ └── security/ # Zero-trust validator -├── middlewares/ # Express middlewares -│ ├── correlation.middleware.ts -│ ├── logger.middleware.ts -│ ├── metrics.middleware.ts -│ └── error.middleware.ts -├── modules/ # Feature modules -│ ├── common/ # Shared (BaseRepository) -│ ├── feature/ # Example feature -│ ├── health/ # Health checks -│ └── metrics/ # Prometheus metrics -├── routes/ # Route definitions -│ └── index.ts -└── main.ts # Application entry point -``` +#### IAM Service (.NET) +**Mô 
tả**: Identity and Access Management service xử lý authentication và authorization -**Middleware Execution Order**: -1. Correlation ID → 2. Logger → 3. Metrics → 4. CORS → 5. Rate Limit → 6. Body Parser → 7. Routes → 8. Error Handler +**Tính năng chính**: +- OAuth2/OpenID Connect với OpenIddict +- JWT authentication (RS256) +- RBAC (Role-Based Access Control) +- ASP.NET Core Identity cho user management +- MFA support (TOTP) -**VI: Cấu trúc Microservice Chuẩn** +**Công nghệ sử dụng**: +- .NET 10, ASP.NET Core, MediatR +- Entity Framework Core, OpenIddict +- Serilog, FluentValidation -Mỗi microservice tuân theo pattern này (từ `services/_template/`): +**Vị trí File**: [`services/iam-service-net/`](../../../services/iam-service-net) -``` -src/ -├── config/ # Configuration với Zod validation -│ ├── app.config.ts -│ ├── database.config.ts -│ └── redis.config.ts -├── core/ # Core utilities (chỉ IAM service) -│ ├── cache/ # Multi-layer caching -│ ├── events/ # Event sourcing -│ └── security/ # Zero-trust validator -├── middlewares/ # Express middlewares -│ ├── correlation.middleware.ts -│ ├── logger.middleware.ts -│ ├── metrics.middleware.ts -│ └── error.middleware.ts -├── modules/ # Feature modules -│ ├── common/ # Shared (BaseRepository) -│ ├── feature/ # Example feature -│ ├── health/ # Health checks -│ └── metrics/ # Prometheus metrics -├── routes/ # Route definitions -│ └── index.ts -└── main.ts # Application entry point -``` +#### Các Services Đã Triển Khai -**Thứ tự Thực thi Middleware**: -1. Correlation ID → 2. Logger → 3. Metrics → 4. CORS → 5. Rate Limit → 6. Body Parser → 7. Routes → 8. 
Error Handler +| Service | Mô tả | Vị trí | +|---------|-------|--------| +| **Storage Service** | File storage với MinIO/Aliyun OSS | `services/storage-service-net/` | +| **Membership Service** | Quản lý membership và subscriptions | `services/membership-service-net/` | +| **Organization Service** | Quản lý tổ chức | `services/organization-service-net/` | +| **Chat Service** | Chat và messaging | `services/chat-service-net/` | +| **Social Service** | Social features | `services/social-service-net/` | +| **Wallet Service** | Ví điện tử | `services/wallet-service-net/` | ---- +### Infrastructure Layer -### 4. Data Layer / Tầng Dữ liệu +#### Neon PostgreSQL +**Mô tả**: Serverless PostgreSQL database với auto-scaling -#### Database Architecture / Kiến trúc Database +**Tính năng chính**: +- Serverless với auto-scaling +- Branching cho development/staging +- Point-in-time recovery +- Connection pooling -```mermaid -graph TD - subgraph "Database Per Service Pattern" - Service1[IAM Service] --> Schema1[(iam_db
30+ tables)] - Service2[Template Service] --> Schema2[(template_db
Example tables)] - Service3[Future Service] --> Schema3[(future_db
TBD)] - end - - subgraph "Shared Infrastructure" - Schema1 -.->|Connection Pool| PG[PostgreSQL 14+
Neon Cloud] - Schema2 -.->|Connection Pool| PG - Schema3 -.->|Connection Pool| PG - end - - subgraph "Cache Layer" - Service1 --> L1_1[L1: Memory
60s TTL] - Service2 --> L1_2[L1: Memory
60s TTL] - - L1_1 --> L2[L2: Redis
5-15min TTL] - L1_2 --> L2 - - L2 -.->|Cache Miss| Schema1 - L2 -.->|Cache Miss| Schema2 - end - - style PG fill:#f0e1ff - style L2 fill:#fff4e1 - style L1_1 fill:#d4edda - style L1_2 fill:#d4edda -``` +**Vị trí File**: Database schemas trong mỗi service (`services/*/prisma/schema.prisma`) -**EN: Data Management** +#### Redis +**Mô tả**: In-memory cache và session store -**Database per Service**: -- Each service has its own database schema -- Services own their data exclusively -- Cross-service data access via APIs only -- Independent scaling and optimization +**Tính năng chính**: +- Multi-layer caching (L1: Memory, L2: Redis) +- Session storage +- Rate limiting counters +- Pub/Sub cho real-time features -**Multi-Layer Caching** (IAM Service): -``` -Request → L1 (Memory, 60s) → L2 (Redis, 5-15min) → L3 (Database) -``` +**Vị trí File**: [`infra/redis/`](../../../infra/redis) -**Cache Hit Rates**: -- L1: ~40-50% (hot data) -- L2: ~80-90% (permissions, user data) -- L3: 10-20% (cache miss, fetch from DB) +#### Apache Kafka +**Mô tả**: Event streaming platform cho asynchronous communication -**Database Technology**: -- **Provider**: Neon (Serverless PostgreSQL) -- **Version**: PostgreSQL 14+ -- **ORM**: Prisma -- **Connection Pooling**: Prisma (10 connections default) -- **Migrations**: Prisma Migrate +**Tính năng chính**: +- Event-driven architecture +- Event sourcing +- Eventual consistency +- Dead letter queue (DLQ) -**VI: Quản lý Dữ liệu** +**Vị trí File**: [`infra/kafka/`](../../../infra/kafka) -**Database per Service**: -- Mỗi service có schema database riêng -- Services sở hữu dữ liệu độc quyền -- Truy cập dữ liệu cross-service chỉ qua APIs -- Scaling và optimization độc lập - -**Multi-Layer Caching** (IAM Service): -``` -Request → L1 (Memory, 60s) → L2 (Redis, 5-15min) → L3 (Database) -``` - -**Tỷ lệ Cache Hit**: -- L1: ~40-50% (hot data) -- L2: ~80-90% (permissions, user data) -- L3: 10-20% 
(cache miss, fetch từ DB) - -**Công nghệ Database**: -- **Provider**: Neon (Serverless PostgreSQL) -- **Version**: PostgreSQL 14+ -- **ORM**: Prisma -- **Connection Pooling**: Prisma (10 connections mặc định) -- **Migrations**: Prisma Migrate - ---- - -## Communication Patterns / Patterns Giao tiếp - -### Request Flow / Luồng Request +## Luồng Dữ liệu ```mermaid sequenceDiagram participant Client - participant Traefik as Traefik Gateway + participant Traefik as API Gateway participant Service - participant Cache as Redis Cache + participant Cache as Redis participant DB as PostgreSQL + participant Kafka + + Client->>Traefik: HTTPS Request + Traefik->>Traefik: Rate Limiting + Traefik->>Traefik: JWT Validation + Traefik->>Service: Route to Service - Client->>Traefik: HTTP Request - Traefik->>Traefik: Apply Middlewares
(CORS, Rate Limit) - Traefik->>Service: Forward Request Service->>Cache: Check Cache - alt Cache Hit - Cache-->>Service: Cached Data - Service-->>Traefik: Response (from cache) + Cache-->>Service: Return Cached Data else Cache Miss - Cache-->>Service: null Service->>DB: Query Database - DB-->>Service: Data - Service->>Cache: Store in Cache - Service-->>Traefik: Response (from DB) + DB-->>Service: Return Data + Service->>Cache: Store in Cache (TTL: 5min) end - Traefik-->>Client: HTTP Response + Service->>Service: Process Business Logic + Service->>DB: Update Data (if needed) + Service->>Kafka: Publish Event (async) + + Service-->>Traefik: Response + Traefik-->>Client: HTTPS Response + + Note over Kafka: Event consumers process asynchronously ``` -**EN: Communication Patterns** +**Giải thích chi tiết**: +1. **Request**: Client gửi HTTPS request đến Traefik +2. **Gateway Processing**: Traefik thực hiện rate limiting và JWT validation +3. **Routing**: Traefik route request đến service phù hợp +4. **Cache Check**: Service kiểm tra L1 (memory) → L2 (Redis) cache +5. **Database Query**: Nếu cache miss, query từ PostgreSQL +6. **Cache Update**: Lưu kết quả vào cache với TTL phù hợp +7. **Business Logic**: Xử lý logic nghiệp vụ +8. **Event Publishing**: Publish domain events đến Kafka (async) +9. **Response**: Trả về response cho client qua Traefik -1. **Synchronous (HTTP/REST)**: - - Request-response pattern - - RESTful API design - - JSON payload format - - Standard HTTP status codes +## Kiến trúc Database -2. 
**Service-to-Service**: - - Internal HTTP calls via `@goodgo/http-client` - - Service authentication with internal API keys - - Circuit breaker pattern for resilience - - Correlation ID propagation +```mermaid +erDiagram + User ||--o{ Session : has + User ||--o{ UserRole : has + User ||--o{ UserPermission : has + User ||--o{ MFADevice : has + User ||--o{ AuditEvent : triggers + + Role ||--o{ UserRole : assigned_to + Role ||--o{ RolePermission : has + + Permission ||--o{ RolePermission : granted_to + Permission ||--o{ UserPermission : granted_to + + Organization ||--o{ User : contains + Organization ||--o{ Role : defines + + User { + string id PK + string email UK + string passwordHash + string organizationId FK + boolean mfaEnabled + datetime createdAt + datetime updatedAt + } + + Session { + string id PK + string userId FK + string refreshTokenHash + string deviceFingerprint + string ipAddress + datetime expiresAt + datetime createdAt + } + + Role { + string id PK + string name + string organizationId FK + int hierarchy + datetime createdAt + } + + Permission { + string id PK + string resource + string action + string scope + datetime createdAt + } + + AuditEvent { + string id PK + string userId FK + string eventType + json eventData + datetime timestamp + } +``` -3. **Service Discovery**: - - **Local**: Docker DNS (`http://service-name:port`) - - **Kubernetes**: Service DNS (`http://service-name.namespace.svc.cluster.local`) - - **Traefik**: Dynamic configuration via labels +**Mô tả**: +- **Database per Service**: Mỗi service có database schema riêng +- **Shared Database**: Hiện tại sử dụng shared Neon PostgreSQL, schemas isolated bằng Prisma +- **Event Sourcing**: Audit events lưu tất cả thay đổi quan trọng +- **Soft Delete**: Sử dụng `deletedAt` field thay vì hard delete -4. 
**Asynchronous (Future)**: - - Message queues (RabbitMQ/Kafka) - - Event-driven architecture - - Pub/Sub patterns +## Quyết định Thiết kế -**VI: Patterns Giao tiếp** +### Quyết định 1: Microservices Architecture -1. **Đồng bộ (HTTP/REST)**: - - Pattern request-response - - Thiết kế RESTful API - - Format payload JSON - - HTTP status codes chuẩn +**Bối cảnh**: Cần khả năng scale độc lập và deploy riêng biệt cho từng business domain -2. **Service-to-Service**: - - Internal HTTP calls qua `@goodgo/http-client` - - Service authentication với internal API keys - - Circuit breaker pattern cho resilience - - Correlation ID propagation +**Quyết định**: Sử dụng microservices architecture với database per service pattern -3. **Service Discovery**: - - **Local**: Docker DNS (`http://service-name:port`) - - **Kubernetes**: Service DNS (`http://service-name.namespace.svc.cluster.local`) - - **Traefik**: Dynamic configuration qua labels +**Hậu quả**: +- ✅ **Tích cực**: + - Scale độc lập từng service theo nhu cầu + - Deploy riêng biệt, giảm risk khi release + - Fault isolation - lỗi một service không ảnh hưởng toàn bộ + - Technology flexibility - mỗi service có thể dùng tech stack khác +- ❌ **Tiêu cực**: + - Phức tạp hơn monolith (distributed systems challenges) + - Eventual consistency thay vì strong consistency + - Distributed transactions phức tạp (Saga pattern) + - Operational overhead (monitoring, deployment) -4. **Bất đồng bộ (Tương lai)**: - - Message queues (RabbitMQ/Kafka) - - Event-driven architecture - - Pub/Sub patterns +**Các lựa chọn thay thế**: Monolith, Modular Monolith --- -## Security Architecture / Kiến trúc Bảo mật +### Quyết định 2: Traefik as API Gateway -```mermaid -graph TD - Request[Client Request] --> TLS[TLS/HTTPS] - TLS --> RateLimit[Rate Limiting] - RateLimit --> JWT[JWT Validation] - JWT --> RBAC[RBAC Authorization] - RBAC --> ZeroTrust[Zero-Trust Validation] - ZeroTrust --> Service[Service Logic] - - Service --> Encrypt[Data Encryption
AES-256-GCM] - Encrypt --> DB[(Encrypted Data
at Rest)] - - Service --> Audit[Audit Logging
Event Sourcing] - Audit --> AuditDB[(Audit Trail
7-year retention)] - - style TLS fill:#d4edda - style JWT fill:#e1f5ff - style Encrypt fill:#f8d7da - style Audit fill:#fff4e1 -``` +**Bối cảnh**: Cần reverse proxy, load balancing, SSL termination, và service discovery -**EN: Security Layers** +**Quyết định**: Sử dụng Traefik thay vì Kong, NGINX, hoặc AWS API Gateway -1. **Network Security**: - - TLS 1.2+ for all communications - - HTTPS enforcement - - CORS configuration - - Rate limiting (Redis-backed, distributed) +**Hậu quả**: +- ✅ **Tích cực**: + - Auto service discovery với Docker labels + - Dynamic configuration không cần restart + - Built-in Let's Encrypt support + - Native Kubernetes integration + - Built-in metrics và tracing +- ❌ **Tiêu cực**: + - Learning curve cao hơn NGINX + - Plugin ecosystem nhỏ hơn Kong + - Community nhỏ hơn NGINX -2. **Authentication**: - - JWT tokens (15min access, 7 days refresh) - - bcrypt password hashing (cost 12) - - Refresh token rotation - - Multi-factor authentication (TOTP) - -3. **Authorization**: - - Role-Based Access Control (RBAC) - - Attribute-Based Access Control (ABAC) - - Permission model: `resource:action:scope` - - Permission caching (5min TTL) - -4. **Data Protection**: - - AES-256-GCM encryption for PII - - Token hashing (SHA-256) - - Secrets management (environment variables, K8s secrets) - -5. **Zero-Trust**: - - Device fingerprinting - - IP address validation - - Behavioral analysis - - Session binding - -6. **Audit & Compliance**: - - Event sourcing for all auth events - - 7-year retention (GDPR, SOC2) - - Correlation ID tracking - - Compliance reporting (GDPR, SOC2, ISO27001, HIPAA) - -**VI: Các Tầng Bảo mật** - -1. **Network Security**: - - TLS 1.2+ cho mọi giao tiếp - - HTTPS enforcement - - Cấu hình CORS - - Rate limiting (Redis-backed, phân tán) - -2. **Authentication**: - - JWT tokens (15min access, 7 ngày refresh) - - bcrypt password hashing (cost 12) - - Refresh token rotation - - Multi-factor authentication (TOTP) - -3. 
**Authorization**: - - Role-Based Access Control (RBAC) - - Attribute-Based Access Control (ABAC) - - Permission model: `resource:action:scope` - - Permission caching (5min TTL) - -4. **Data Protection**: - - AES-256-GCM encryption cho PII - - Token hashing (SHA-256) - - Secrets management (environment variables, K8s secrets) - -5. **Zero-Trust**: - - Device fingerprinting - - IP address validation - - Behavioral analysis - - Session binding - -6. **Audit & Compliance**: - - Event sourcing cho tất cả auth events - - 7-year retention (GDPR, SOC2) - - Correlation ID tracking - - Compliance reporting (GDPR, SOC2, ISO27001, HIPAA) +**Các lựa chọn thay thế**: Kong, NGINX, AWS API Gateway, Envoy --- -## Observability Stack / Stack Khả năng quan sát +### Quyết định 3: Neon PostgreSQL (Serverless) -```mermaid -graph LR - subgraph "Services" - S1[IAM Service] - S2[Template Service] - end - - subgraph "Metrics" - S1 -->|/metrics| Prom[Prometheus] - S2 -->|/metrics| Prom - Prom --> Grafana[Grafana Dashboard] - end - - subgraph "Logging" - S1 -->|JSON Logs| Loki[Loki] - S2 -->|JSON Logs| Loki - Loki --> GrafanaLog[Grafana Explore] - end - - subgraph "Tracing" - S1 -->|Spans| Jaeger[Jaeger] - S2 -->|Spans| Jaeger - Jaeger --> JaegerUI[Jaeger UI] - end - - style Prom fill:#d4edda - style Loki fill:#fff4e1 - style Jaeger fill:#e1f5ff -``` +**Bối cảnh**: Cần database với auto-scaling, branching, và cost-effective cho development -**EN: Three Pillars of Observability** +**Quyết định**: Sử dụng Neon PostgreSQL (serverless) thay vì self-hosted PostgreSQL hoặc AWS RDS -1. 
**Metrics (Prometheus)**: - - HTTP request duration (histogram) - - HTTP request count (counter) - - Active requests (gauge) - - Cache hit/miss ratio - - Database query duration - - Custom business metrics +**Hậu quả**: +- ✅ **Tích cực**: + - Auto-scaling theo usage + - Database branching cho dev/staging + - Pay-per-use pricing model + - Automatic backups và point-in-time recovery + - No infrastructure management +- ❌ **Tiêu cực**: + - Vendor lock-in + - Cold start latency (mitigated by connection pooling) + - Limited control over database configuration -2. **Logging (Winston + Loki)**: - - Structured JSON logs - - Correlation IDs in every log - - Request/response logging - - Error stack traces (dev only) - - Log levels: error, warn, info, debug +**Các lựa chọn thay thế**: Self-hosted PostgreSQL, AWS RDS, Google Cloud SQL -3. **Tracing (OpenTelemetry + Jaeger)**: - - Distributed tracing across services - - HTTP request spans - - Database query spans - - Cache operation spans - - End-to-end latency tracking +## Đặc điểm Hiệu suất -**Health Checks**: -- `/health` - Overall health status -- `/health/live` - Liveness probe (K8s) -- `/health/ready` - Readiness probe (K8s, checks DB + Redis) +| Chỉ số / Metric | Mục tiêu / Target | Ghi chú / Notes | +|-----------------|-------------------|-----------------| +| **API Response Time (P95)** | < 200ms | Excluding external API calls | +| **API Response Time (P99)** | < 500ms | Peak load scenarios | +| **Throughput** | 1000 req/s | Per service instance | +| **Database Query Time (P95)** | < 50ms | Simple queries with indexes | +| **Cache Hit Rate (L1)** | > 40% | In-memory cache | +| **Cache Hit Rate (L2)** | > 80% | Redis cache | +| **Event Publish Latency (P95)** | < 10ms | Kafka fire-and-forget | +| **Service Availability** | > 99.9% | Monthly uptime target | +| **Error Rate** | < 1% | 4xx + 5xx errors | -**VI: Ba Trụ cột của Khả năng quan sát** +**Tối ưu hóa Hiệu suất**: +- Multi-layer caching (L1: Memory, L2: Redis) +- 
Connection pooling cho database +- Pagination cho list endpoints (max 100 items) +- Database indexes cho frequently queried fields +- Async event publishing (fire-and-forget) +- CDN cho static assets (Next.js) -1. **Metrics (Prometheus)**: - - HTTP request duration (histogram) - - HTTP request count (counter) - - Active requests (gauge) - - Cache hit/miss ratio - - Database query duration - - Custom business metrics +## Cân nhắc Bảo mật -2. **Logging (Winston + Loki)**: - - Structured JSON logs - - Correlation IDs trong mọi log - - Request/response logging - - Error stack traces (chỉ dev) - - Log levels: error, warn, info, debug +**Authentication**: +- JWT với RS256 (asymmetric signing) +- Access token: 15 phút expiry +- Refresh token: 7 ngày expiry, rotation on use +- httpOnly cookies cho token storage +- MFA support (TOTP, backup codes) -3. **Tracing (OpenTelemetry + Jaeger)**: - - Distributed tracing giữa các services - - HTTP request spans - - Database query spans - - Cache operation spans - - End-to-end latency tracking +**Authorization**: +- RBAC (Role-Based Access Control) +- ABAC (Attribute-Based Access Control) +- Permission format: `resource:action:scope` +- Permission caching (5 min TTL) +- Zero-trust device validation -**Health Checks**: -- `/health` - Overall health status -- `/health/live` - Liveness probe (K8s) -- `/health/ready` - Readiness probe (K8s, kiểm tra DB + Redis) +**Network Security**: +- TLS 1.2+ enforcement +- HTTPS-only (HSTS headers) +- Rate limiting: 100 req/15min (standard), 10 req/hour (strict) +- CORS whitelist từ environment variables +- Network policies (Kubernetes) ---- +**Data Protection**: +- AES-256-GCM encryption cho PII at rest +- bcrypt (cost 12) cho password hashing +- SHA-256 hashing cho tokens before storage +- Database encryption at rest (Neon) +- TLS in-transit cho tất cả connections -## Deployment Architecture / Kiến trúc Triển khai +**Secrets Management**: +- Kubernetes secrets cho production +- Environment 
variables validation với Zod +- No hardcoded secrets in code +- Quarterly secret rotation -### Local Development / Phát triển Local - -```mermaid -graph TD - subgraph "Docker Compose (deployments/local)" - Traefik[Traefik
Port 80, 8080] - IAM[IAM Service
Port 3001] - Template[Template Service
Port 5000] - PostgreSQL[PostgreSQL
Port 5432] - Redis[Redis
Port 6379] - - Traefik --> IAM - Traefik --> Template - IAM --> PostgreSQL - IAM --> Redis - Template --> PostgreSQL - Template --> Redis - end - - Dev[Developer] -->|localhost| Traefik - Dev -->|:8080| TraefikDash[Traefik Dashboard] - - style Traefik fill:#e1f5ff - style PostgreSQL fill:#f0e1ff - style Redis fill:#fff4e1 -``` - -### Production Deployment / Triển khai Production +**Audit Trail**: +- Event sourcing cho tất cả auth events +- 7-year retention cho compliance +- Immutable audit logs +- Correlation IDs cho request tracing +## Triển khai ```mermaid graph TD subgraph "Kubernetes Cluster" - Ingress[Ingress Controller
Traefik] - - subgraph "IAM Service" - IAM1[IAM Pod 1] - IAM2[IAM Pod 2] - IAM3[IAM Pod 3] + subgraph "Ingress" + LB[Load Balancer
External IP] + Traefik[Traefik Pods
Replicas: 2] end - subgraph "Template Service" - T1[Template Pod 1] - T2[Template Pod 2] + subgraph "Services" + IAM[IAM Service Pods
Replicas: 2-10 HPA] + Service1[Service 1 Pods
Replicas: 2-10 HPA] + Service2[Service 2 Pods
Replicas: 2-10 HPA] end - Ingress --> IAM1 - Ingress --> IAM2 - Ingress --> IAM3 - Ingress --> T1 - Ingress --> T2 + subgraph "Infrastructure" + Redis[Redis Cluster
3 Masters + 3 Replicas] + Kafka[Kafka Cluster
3 Brokers] + end + + subgraph "Observability" + Prom[Prometheus
Replicas: 2] + Loki[Loki
Replicas: 2] + Jaeger[Jaeger
Replicas: 2] + Grafana[Grafana
Replicas: 2] + end end - subgraph "Managed Services" - Neon[(Neon PostgreSQL
Serverless)] - RedisCloud[(Redis Cloud)] + subgraph "External" + DB[(Neon PostgreSQL
Serverless)] end - IAM1 --> Neon - IAM2 --> Neon - IAM3 --> Neon - T1 --> Neon - T2 --> Neon + LB --> Traefik + Traefik --> IAM + Traefik --> Service1 + Traefik --> Service2 - IAM1 --> RedisCloud - IAM2 --> RedisCloud - T1 --> RedisCloud - T2 --> RedisCloud + IAM --> Redis + IAM --> Kafka + IAM --> DB - style Ingress fill:#e1f5ff - style Neon fill:#f0e1ff - style RedisCloud fill:#fff4e1 + Service1 --> Redis + Service1 --> Kafka + Service1 --> DB + + Service2 --> Redis + Service2 --> Kafka + Service2 --> DB + + IAM -.->|metrics| Prom + Service1 -.->|metrics| Prom + Service2 -.->|metrics| Prom + + IAM -.->|logs| Loki + Service1 -.->|logs| Loki + Service2 -.->|logs| Loki + + IAM -.->|traces| Jaeger + Service1 -.->|traces| Jaeger + Service2 -.->|traces| Jaeger + + Prom --> Grafana + Loki --> Grafana + Jaeger --> Grafana + + style LB fill:#1565c0,stroke:#fff,stroke-width:2px,color:#fff + style Traefik fill:#0f4c81,stroke:#fff,stroke-width:2px,color:#fff + style IAM fill:#283593,stroke:#fff,stroke-width:2px,color:#fff + style Service1 fill:#4527a0,stroke:#fff,stroke-width:2px,color:#fff + style Service2 fill:#4527a0,stroke:#fff,stroke-width:2px,color:#fff + style DB fill:#5e35b1,stroke:#fff,stroke-width:2px,color:#fff + style Redis fill:#ef6c00,stroke:#fff,stroke-width:2px,color:#fff + style Kafka fill:#2e7d32,stroke:#fff,stroke-width:2px,color:#fff + style Prom fill:#c62828,stroke:#fff,stroke-width:2px,color:#fff + style Loki fill:#d84315,stroke:#fff,stroke-width:2px,color:#fff + style Jaeger fill:#e65100,stroke:#fff,stroke-width:2px,color:#fff + style Grafana fill:#b71c1c,stroke:#fff,stroke-width:2px,color:#fff ``` -**EN: Deployment Environments** +### Chiến lược Triển khai -1. 
**Local (Docker Compose)**: - - All services run in Docker containers - - Shared network for service communication - - Local PostgreSQL and Redis - - Traefik for routing - - Hot reload for development +**Deployment Strategy**: +- Rolling updates (maxSurge: 1, maxUnavailable: 0) +- Zero-downtime deployments +- Blue-green deployment cho major releases +- Canary deployment cho high-risk changes -2. **Staging (Kubernetes)**: - - Kubernetes cluster in cloud (GKE/EKS/AKS) - - 2 replicas per service - - Managed PostgreSQL (Neon) - - Managed Redis (Redis Cloud) - - Horizontal Pod Autoscaling (HPA) +**Auto-scaling**: +- Horizontal Pod Autoscaler (HPA) + - Min replicas: 2 + - Max replicas: 10 + - Target CPU: 70% + - Target Memory: 80% -3. **Production (Kubernetes)**: - - Production K8s cluster - - 3+ replicas per service - - Managed databases with backups - - Auto-scaling (HPA + VPA) - - Blue-green deployments - - Rolling updates with health checks +**Resource Allocation**: +| Service | Requests | Limits | +|---------|----------|--------| +| **Microservices** | 256Mi RAM, 250m CPU | 512Mi RAM, 500m CPU | +| **Traefik** | 512Mi RAM, 500m CPU | 1Gi RAM, 1000m CPU | +| **Redis** | 2Gi RAM, 1 CPU | 4Gi RAM, 2 CPU | +| **Prometheus** | 4Gi RAM, 2 CPU | 8Gi RAM, 4 CPU | -**VI: Môi trường Triển khai** +**Health Checks**: +- Liveness probe: `/health/live` (K8s restarts if fails) +- Readiness probe: `/health/ready` (K8s removes from LB if fails) +- Startup probe: `/health/live` (initial delay 30s) -1. **Local (Docker Compose)**: - - Tất cả services chạy trong Docker containers - - Shared network cho service communication - - Local PostgreSQL và Redis - - Traefik cho routing - - Hot reload cho development +**Environments**: +- **Local**: Docker Compose +- **Staging**: Kubernetes cluster (shared) +- **Production**: Kubernetes cluster (dedicated) +## Giám sát & Khả năng quan sát -2. 
**Staging (Kubernetes)**: - - Kubernetes cluster trên cloud (GKE/EKS/AKS) - - 2 replicas mỗi service - - Managed PostgreSQL (Neon) - - Managed Redis (Redis Cloud) - - Horizontal Pod Autoscaling (HPA) +### Chỉ số Chính -3. **Production (Kubernetes)**: - - Production K8s cluster - - 3+ replicas mỗi service - - Managed databases với backups - - Auto-scaling (HPA + VPA) - - Blue-green deployments - - Rolling updates với health checks +**Application Metrics**: +- `http_requests_total` - Total HTTP requests (counter) +- `http_request_duration_seconds` - Request duration (histogram) +- `http_requests_active` - Active requests (gauge) +- `cache_hits_total` / `cache_misses_total` - Cache performance +- `db_query_duration_seconds` - Database query duration + +**Infrastructure Metrics**: +- CPU usage, Memory usage per pod +- Network I/O, Disk I/O +- Pod restart count +- Node resource utilization + +**Business Metrics**: +- User registrations per day +- Login success/failure rate +- API usage by endpoint +- Error rate by service + +**Kiểm tra Sức khỏe**: +- `/health/live` - Liveness probe (service running?) +- `/health/ready` - Readiness probe (ready for traffic?) 
+- `/metrics` - Prometheus metrics endpoint + +**Alerting Rules**: +```yaml +# High error rate +- alert: HighErrorRate + expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05 + for: 2m + severity: warning + +# High latency +- alert: HighLatency + expr: histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) > 0.5 + for: 5m + severity: warning + +# Service down +- alert: ServiceDown + expr: up == 0 + for: 1m + severity: critical + +# High memory usage +- alert: HighMemoryUsage + expr: container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.85 + for: 5m + severity: warning +``` + +**Logging**: +- Structured JSON logging với Winston +- Correlation IDs cho request tracing +- Log levels: error, warn, info, debug +- Log aggregation với Loki +- 7 days retention + +**Distributed Tracing**: +- OpenTelemetry instrumentation +- Jaeger backend +- Trace sampling: 10% in production, 100% in staging +- Span attributes: service, operation, user_id, correlation_id + +## Tài liệu Liên quan + +- [Event-Driven Architecture](./event-driven-architecture.md) - Kiến trúc hướng sự kiện +- [Caching Architecture](./caching-architecture.md) - Chiến lược caching +- [Security Architecture](./security-architecture.md) - Kiến trúc bảo mật +- [Observability Architecture](./observability-architecture.md) - Khả năng quan sát +- [Data Consistency Patterns](./data-consistency-patterns.md) - Mẫu nhất quán dữ liệu +- [Microservices Communication](./microservices-communication.md) - Giao tiếp microservices + +## Tham khảo + +- [Microservices Patterns](https://microservices.io/patterns/index.html) - Microservices pattern catalog +- [Twelve-Factor App](https://12factor.net/) - Best practices for cloud-native apps +- [C4 Model](https://c4model.com/) - Software architecture diagrams +- [Kubernetes Documentation](https://kubernetes.io/docs/) - Kubernetes official docs +- [Traefik Documentation](https://doc.traefik.io/traefik/) - Traefik official docs --- -## Performance Characteristics 
/ Đặc điểm Hiệu suất +**Cập nhật Lần cuối**: 2026-01-14 +**Tác giả**: GoodGo Architecture Team +**Người review**: GoodGo Development Team -**EN: Performance Targets** +## Quick Tips -| Metric | Target | Notes | -|--------|--------|-------| -| **API Response Time (P95)** | < 100ms | Excluding cold starts | -| **API Response Time (P99)** | < 200ms | | -| **Throughput** | 1000 req/s | Per service instance | -| **Cache Hit Rate** | > 80% | Redis cache | -| **Database Query Time (P95)** | < 50ms | Simple queries | -| **Memory Usage** | < 512MB | Per service instance | -| **CPU Usage** | < 60% | Under normal load | +### Mermaid Common Issues +- **Arrow Syntax**: Use `-->` for solid arrows, `-.->` for dotted arrows. +- **Node IDs**: Avoid spaces/special chars in IDs (e.g., `Node-A` not `Node A`). +- **Subgraphs**: Ensure `subgraph` names are unique and descriptive. -**Optimization Strategies**: -- Multi-layer caching (L1: Memory, L2: Redis) -- Database connection pooling -- Query optimization with indexes -- Horizontal scaling with HPA -- CDN for static assets +### Color Pattern Quick Reference +| Element | Dark Color | Text Color | +|---------|------------|------------| +| **Blue (Primary)** | `#0f4c81` | `#ffffff` | +| **Purple (DB)** | `#5e35b1` | `#ffffff` | +| **Orange (Cache)** | `#ef6c00` | `#ffffff` | +| **Green (Success)** | `#2e7d32` | `#ffffff` | +| **Red (Alert)** | `#c62828` | `#ffffff` | -**VI: Mục tiêu Hiệu suất** - -| Metric | Mục tiêu | Ghi chú | -|--------|----------|---------| -| **API Response Time (P95)** | < 100ms | Không bao gồm cold starts | -| **API Response Time (P99)** | < 200ms | | -| **Throughput** | 1000 req/s | Mỗi service instance | -| **Cache Hit Rate** | > 80% | Redis cache | -| **Database Query Time (P95)** | < 50ms | Queries đơn giản | -| **Memory Usage** | < 512MB | Mỗi service instance | -| **CPU Usage** | < 60% | Ở normal load | - -**Chiến lược Tối ưu**: -- Multi-layer caching (L1: Memory, L2: Redis) -- Database connection pooling -- 
Query optimization với indexes -- Horizontal scaling với HPA -- CDN cho static assets - ---- - -## Related Documentation / Tài liệu Liên quan - -- [Service Communication](./service-communication.md) - EN: Detailed inter-service communication patterns / VI: Patterns giao tiếp giữa services chi tiết -- [IAM Proposal](./iam-proposal.md) - EN: IAM service architecture and features / VI: Kiến trúc và tính năng IAM service -- [Deployment Guide](../guides/deployment.md) - EN: Step-by-step deployment instructions / VI: Hướng dẫn triển khai từng bước -- [Local Development](../guides/local-development.md) - EN: Setting up local environment / VI: Thiết lập môi trường local -- [Project Rules](../skills/project-rules.md) - EN: Project structure and conventions / VI: Cấu trúc dự án và quy ước - ---- - -**Last Updated / Cập nhật lần cuối**: 2026-01-06 -**Authors / Tác giả**: DevOps Team -**Reviewers / Người review**: Architecture Team +### Visual Indicators +- ✅ **Khuyên dùng** +- ❌ **Không khuyên dùng** +- ⚠️ **Cảnh báo**