feat(notifications): production-ready WebSocket gateway (TEC-2766)

- Add RedisIoAdapter (shared/infra) for multi-instance Socket.IO fan-out
  with graceful fallback to the in-memory IoAdapter when Redis is
  unreachable.
- Pin Socket.IO heartbeat (pingInterval/pingTimeout/connectTimeout)
  via env-tunable gateway options for reconnect stability.
- Expose Prometheus metrics for the /notifications namespace:
  goodgo_ws_connected_clients (Gauge, labelled by namespace) and
  goodgo_ws_messages_total (Counter, labelled by namespace, event and
  direction). Both are wired through MetricsService and updated on
  connect/disconnect and on emits.
- Unit tests: RedisIoAdapter connect/fallback/close, new MetricsService
  WS helpers, and gateway metric increments/decrements on auth paths.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Ho Ngoc Hai
2026-04-18 15:06:25 +07:00
parent 5d4ecdeb2f
commit 329a821b4a
13 changed files with 410 additions and 5 deletions

View File

@@ -9,6 +9,11 @@ describe('MetricsService', () => {
let mockSearchQueriesCounter: { inc: ReturnType<typeof vi.fn> };
let mockRequestDurationHistogram: { observe: ReturnType<typeof vi.fn> };
let mockHttpRequestsCounter: { inc: ReturnType<typeof vi.fn> };
// Stand-in for the connected-clients Gauge; only the `inc` and `set` spies are provided.
let mockWsConnectedClientsGauge: {
inc: ReturnType<typeof vi.fn>;
set: ReturnType<typeof vi.fn>;
};
// Stand-in for the WebSocket messages Counter; only an `inc` spy is provided.
let mockWsMessagesCounter: { inc: ReturnType<typeof vi.fn> };
beforeEach(() => {
mockListingsCreatedCounter = { inc: vi.fn() };
@@ -17,6 +22,8 @@ describe('MetricsService', () => {
mockSearchQueriesCounter = { inc: vi.fn() };
mockRequestDurationHistogram = { observe: vi.fn() };
mockHttpRequestsCounter = { inc: vi.fn() };
mockWsConnectedClientsGauge = { inc: vi.fn(), set: vi.fn() };
mockWsMessagesCounter = { inc: vi.fn() };
service = new MetricsService(
mockListingsCreatedCounter as unknown as Counter,
@@ -25,6 +32,8 @@ describe('MetricsService', () => {
mockSearchQueriesCounter as unknown as Counter,
mockRequestDurationHistogram as unknown as Histogram,
mockHttpRequestsCounter as unknown as Counter,
mockWsConnectedClientsGauge as unknown as Gauge,
mockWsMessagesCounter as unknown as Counter,
);
});
@@ -102,4 +111,41 @@ describe('MetricsService', () => {
expect.objectContaining({ status_code: '503' }),
);
});
it('recordWsConnection increments the connected-clients gauge with +1 on connect', () => {
  // A connect is reported as a +1 delta; it must reach the gauge's `inc`
  // together with the namespace label.
  const namespace = '/notifications';
  const connectDelta = 1;

  service.recordWsConnection(namespace, connectDelta);

  const gaugeInc = mockWsConnectedClientsGauge.inc;
  expect(gaugeInc).toHaveBeenCalledWith({ namespace }, connectDelta);
});
it('recordWsConnection decrements the connected-clients gauge with -1 on disconnect', () => {
  // A disconnect is reported as a -1 delta; the service forwards it to the
  // gauge's `inc` unchanged rather than calling a separate method.
  const namespace = '/notifications';
  const disconnectDelta = -1;

  service.recordWsConnection(namespace, disconnectDelta);

  const gaugeInc = mockWsConnectedClientsGauge.inc;
  expect(gaugeInc).toHaveBeenCalledWith({ namespace }, disconnectDelta);
});
it('setWsConnectedClients sets the gauge for a namespace', () => {
  // The absolute value (here zero) must be handed to the gauge's `set`
  // with the namespace label.
  const namespace = '/notifications';
  const absoluteCount = 0;

  service.setWsConnectedClients(namespace, absoluteCount);

  const gaugeSet = mockWsConnectedClientsGauge.set;
  expect(gaugeSet).toHaveBeenCalledWith({ namespace }, absoluteCount);
});
it('recordWsMessage increments the messages counter with namespace/event/direction', () => {
  // All three arguments must end up as labels on a single counter increment.
  const namespace = '/notifications';
  const event = 'notification:new';
  const direction = 'out';

  service.recordWsMessage(namespace, event, direction);

  const counterInc = mockWsMessagesCounter.inc;
  expect(counterInc).toHaveBeenCalledWith({ namespace, event, direction });
});
});

View File

@@ -8,6 +8,8 @@ import {
GOODGO_SEARCH_QUERIES_TOTAL,
GOODGO_API_REQUEST_DURATION,
HTTP_REQUESTS_TOTAL,
GOODGO_WS_CONNECTED_CLIENTS,
GOODGO_WS_MESSAGES_TOTAL,
WEB_VITALS_LCP,
WEB_VITALS_FCP,
WEB_VITALS_CLS,
@@ -31,6 +33,10 @@ export class MetricsService {
private readonly requestDurationHistogram: Histogram,
@InjectMetric(HTTP_REQUESTS_TOTAL)
private readonly httpRequestsCounter: Counter,
@InjectMetric(GOODGO_WS_CONNECTED_CLIENTS)
private readonly wsConnectedClientsGauge: Gauge,
@InjectMetric(GOODGO_WS_MESSAGES_TOTAL)
private readonly wsMessagesCounter: Counter,
@InjectMetric(WEB_VITALS_LCP)
private readonly lcpHistogram: Histogram,
@InjectMetric(WEB_VITALS_FCP)
@@ -81,6 +87,25 @@ export class MetricsService {
this.httpRequestsCounter.inc(labels);
}
/**
 * Adjust the connected-clients gauge for one WebSocket namespace.
 *
 * @param namespace - Value used for the gauge's `namespace` label.
 * @param delta - `1` when a client connects, `-1` when it disconnects.
 */
recordWsConnection(namespace: string, delta: 1 | -1): void {
  const labels = { namespace };
  // The signed delta is passed to `inc` as-is, so a single call site
  // covers both connect and disconnect.
  this.wsConnectedClientsGauge.inc(labels, delta);
}
/**
 * Overwrite the connected-clients gauge for a namespace with an absolute
 * value (e.g. to reset it on shutdown).
 *
 * @param namespace - Value used for the gauge's `namespace` label.
 * @param count - Value to store in the gauge.
 */
setWsConnectedClients(namespace: string, count: number): void {
  const labels = { namespace };
  this.wsConnectedClientsGauge.set(labels, count);
}
/**
 * Count one WebSocket message on the messages counter.
 *
 * @param namespace - Value used for the counter's `namespace` label.
 * @param event - Event name, used for the `event` label.
 * @param direction - `'in'` for a received message, `'out'` for an emitted one.
 */
recordWsMessage(
  namespace: string,
  event: string,
  direction: 'in' | 'out',
): void {
  const labels = { namespace, event, direction };
  this.wsMessagesCounter.inc(labels);
}
/**
 * Lookup table from a metric name to the histogram for it.
 * Starts empty here; a name with no entry reads as `undefined`.
 * NOTE(review): the code that fills this table is not visible in this hunk.
 */
private readonly vitalHistograms: Record<string, Histogram | undefined> = {};

View File

@@ -11,6 +11,10 @@ export const DB_QUERY_DURATION = 'db_query_duration_seconds';
export const DB_POOL_ACTIVE_CONNECTIONS = 'db_pool_active_connections';
export const SEARCH_QUERY_DURATION = 'search_query_duration_seconds';
// ── WebSocket Metrics ──
// Name of the gauge counting active WebSocket clients (labelled by `namespace`).
export const GOODGO_WS_CONNECTED_CLIENTS = 'goodgo_ws_connected_clients';
// Name of the counter of WebSocket messages emitted/received
// (labelled by `namespace`, `event` and `direction`).
export const GOODGO_WS_MESSAGES_TOTAL = 'goodgo_ws_messages_total';
// ── Web Vitals / RUM Metrics ──
export const WEB_VITALS_LCP = 'goodgo_web_vitals_lcp_seconds';
export const WEB_VITALS_FCP = 'goodgo_web_vitals_fcp_seconds';

View File

@@ -15,6 +15,8 @@ import {
DB_QUERY_DURATION,
DB_POOL_ACTIVE_CONNECTIONS,
SEARCH_QUERY_DURATION,
GOODGO_WS_CONNECTED_CLIENTS,
GOODGO_WS_MESSAGES_TOTAL,
WEB_VITALS_LCP,
WEB_VITALS_FCP,
WEB_VITALS_CLS,
@@ -83,6 +85,18 @@ import { HttpMetricsInterceptor } from './presentation/interceptors/http-metrics
labelNames: ['plan'],
}),
// ── WebSocket Metrics ──
// Gauge of active WebSocket clients, one series per `namespace` label value.
makeGaugeProvider({
name: GOODGO_WS_CONNECTED_CLIENTS,
help: 'Number of active WebSocket clients',
labelNames: ['namespace'],
}),
// Counter of WebSocket messages, split by `namespace`, `event` and `direction`.
makeCounterProvider({
name: GOODGO_WS_MESSAGES_TOTAL,
help: 'Total number of WebSocket messages emitted/received',
labelNames: ['namespace', 'event', 'direction'],
}),
// ── Services & Interceptors ──
MetricsService,
HttpMetricsInterceptor,