// Source: Re_Backend/src/middlewares/metrics.middleware.ts
// (292 lines, 8.6 KiB, TypeScript)

/**
* Prometheus Metrics Middleware
* Exposes application metrics for monitoring with Prometheus/Grafana
*
* Metrics exposed:
* - http_requests_total: Total number of HTTP requests
* - http_request_duration_seconds: HTTP request latency histogram
* - http_request_errors_total: Total number of HTTP errors (4xx and 5xx)
* - http_active_connections: Number of active HTTP connections
* - nodejs_*: Node.js runtime metrics (memory, event loop, etc.)
* - Custom business metrics (TAT breaches, workflow counts, etc.)
*/
import { Request, Response, NextFunction, Router } from 'express';
import client from 'prom-client';
// ============================================================================
// REGISTRY SETUP
// ============================================================================
// Dedicated registry for this module: every metric declared in this file is
// registered here instead of on prom-client's global default registry.
const register = new client.Registry();

// Collect prom-client's default Node.js runtime metrics (memory, CPU, event
// loop, GC, etc.) into the dedicated registry. Each default metric name is
// prefixed with "nodejs_" and carries the static app/service labels.
client.collectDefaultMetrics({
  prefix: 'nodejs_',
  labels: { app: 're-workflow', service: 'backend' },
  register,
});
// ============================================================================
// HTTP METRICS
// ============================================================================
/** Counter: one increment per completed HTTP request, labelled by method, route and status code. */
const httpRequestsTotal = new client.Counter({
  registers: [register],
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
});

/** Histogram: HTTP request latency in seconds, with buckets from 10 ms up to 10 s. */
const httpRequestDuration = new client.Histogram({
  registers: [register],
  name: 'http_request_duration_seconds',
  help: 'HTTP request latency in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
});

/** Counter: HTTP responses with a 4xx or 5xx status, additionally labelled with an error type. */
const httpRequestErrors = new client.Counter({
  registers: [register],
  name: 'http_request_errors_total',
  help: 'Total number of HTTP errors (4xx and 5xx)',
  labelNames: ['method', 'route', 'status_code', 'error_type'],
});

/** Gauge: number of requests currently being handled (no labels). */
const activeConnections = new client.Gauge({
  registers: [register],
  name: 'http_active_connections',
  help: 'Number of active HTTP connections',
});
// ============================================================================
// BUSINESS METRICS
// ============================================================================
/** Counter: TAT breaches, labelled by department, workflow type and breach level. */
export const tatBreachesTotal = new client.Counter({
  registers: [register],
  name: 'tat_breaches_total',
  help: 'Total number of TAT breaches',
  labelNames: ['department', 'workflow_type', 'breach_level'],
});

/** Gauge: current number of pending workflows per department and status. */
export const pendingWorkflowsCount = new client.Gauge({
  registers: [register],
  name: 'pending_workflows_count',
  help: 'Current number of pending workflows',
  labelNames: ['department', 'status'],
});

/** Counter: workflow operations, labelled by operation name and outcome status. */
export const workflowOperationsTotal = new client.Counter({
  registers: [register],
  name: 'workflow_operations_total',
  help: 'Total number of workflow operations',
  labelNames: ['operation', 'status'],
});

/** Counter: user authentication events, labelled by event type and success flag. */
export const authEventsTotal = new client.Counter({
  registers: [register],
  name: 'auth_events_total',
  help: 'Total number of authentication events',
  labelNames: ['event_type', 'success'],
});

/** Histogram: database query latency in seconds, with buckets from 1 ms up to 2.5 s. */
export const dbQueryDuration = new client.Histogram({
  registers: [register],
  name: 'db_query_duration_seconds',
  help: 'Database query latency in seconds',
  labelNames: ['operation', 'table'],
  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
});

/** Counter: Redis operations, labelled by operation name and outcome status. */
export const redisOperationsTotal = new client.Counter({
  registers: [register],
  name: 'redis_operations_total',
  help: 'Total number of Redis operations',
  labelNames: ['operation', 'status'],
});

/** Counter: AI service calls, labelled by provider, operation and outcome status. */
export const aiServiceCalls = new client.Counter({
  registers: [register],
  name: 'ai_service_calls_total',
  help: 'Total number of AI service calls',
  labelNames: ['provider', 'operation', 'status'],
});

/** Histogram: AI service call latency in seconds, with buckets from 0.5 s up to 60 s. */
export const aiServiceDuration = new client.Histogram({
  registers: [register],
  name: 'ai_service_duration_seconds',
  help: 'AI service call latency in seconds',
  labelNames: ['provider', 'operation'],
  buckets: [0.5, 1, 2, 5, 10, 30, 60],
});
// ============================================================================
// MIDDLEWARE
// ============================================================================
/**
 * Normalize a request path so it can be used as a low-cardinality metrics label.
 *
 * Dynamic segments are replaced with placeholders:
 * - UUIDs become `:id`
 * - purely numeric path segments become `:id`
 * - request IDs in `REQ-<digits>-<digits>` form become `:requestId`
 *
 * Runs of consecutive slashes are collapsed into a single slash.
 *
 * @param path - URL path or route pattern to normalize
 * @returns the path with dynamic segments replaced by placeholders
 */
function normalizeRoutePath(path: string): string {
  return path
    // UUIDs first, so their hex groups are never touched by the later rules.
    .replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, ':id')
    // Numeric IDs: only when the WHOLE segment is digits. The lookahead keeps
    // static segments that merely start with a digit (e.g. "/2fa") intact
    // instead of mangling them into "/:idfa".
    .replace(/\/\d+(?=\/|$)/g, '/:id')
    // Request IDs (REQ-XXXX-XXX format)
    .replace(/REQ-\d+-\d+/gi, ':requestId')
    // Collapse repeated slashes
    .replace(/\/+/g, '/');
}
/**
 * HTTP metrics middleware.
 *
 * For every request except the `/metrics` endpoint itself it:
 * - increments the active-connections gauge while the request is in flight,
 * - on response `finish`, records the request count, the latency in seconds
 *   and, for status codes >= 400, an error count tagged `client_error` (4xx)
 *   or `server_error` (5xx).
 *
 * @param req - incoming Express request
 * @param res - Express response whose lifecycle events are observed
 * @param next - passes control to the next middleware
 */
export function metricsMiddleware(req: Request, res: Response, next: NextFunction): void {
  // Skip metrics endpoint itself
  if (req.path === '/metrics') {
    return next();
  }

  const startTime = Date.now();
  activeConnections.inc();

  // The gauge must drop exactly once per request. 'finish' is not emitted when
  // the client disconnects before the response completes, and more than one of
  // finish/close/error can fire for the same response, so the decrement is
  // guarded and attached to all three events.
  let released = false;
  const releaseConnection = (): void => {
    if (released) {
      return;
    }
    released = true;
    activeConnections.dec();
  };

  // Record request metrics once the response has been fully handed off.
  res.on('finish', () => {
    releaseConnection();

    const duration = (Date.now() - startTime) / 1000; // Convert to seconds
    // req.route.path may be a string, a RegExp or an array of patterns;
    // coerce to a string so normalization cannot throw inside this listener.
    // NOTE(review): req.route.path is relative to the router the route was
    // registered on; consider prefixing req.baseUrl if routes from different
    // mounted routers should be told apart - confirm against dashboards first.
    const rawRoute: unknown = req.route?.path || req.path;
    const route = normalizeRoutePath(typeof rawRoute === 'string' ? rawRoute : String(rawRoute));
    const statusCode = res.statusCode.toString();
    const method = req.method;

    // Record request count
    httpRequestsTotal.inc({ method, route, status_code: statusCode });

    // Record request duration
    httpRequestDuration.observe({ method, route, status_code: statusCode }, duration);

    // Record errors (4xx and 5xx)
    if (res.statusCode >= 400) {
      const errorType = res.statusCode >= 500 ? 'server_error' : 'client_error';
      httpRequestErrors.inc({
        method,
        route,
        status_code: statusCode,
        error_type: errorType,
      });
    }
  });

  // Aborted connections and stream errors still release the gauge.
  res.on('close', releaseConnection);
  res.on('error', releaseConnection);

  next();
}
/**
 * Metrics endpoint handler.
 *
 * Serializes every metric in the module registry and sends it with the
 * registry's content type. If collection fails, the error is logged and a
 * 500 response with a plain error message is sent instead.
 *
 * @param _req - unused Express request
 * @param res - Express response the metrics payload is written to
 */
export async function metricsHandler(_req: Request, res: Response): Promise<void> {
  try {
    // Collect before touching the response so a failure leaves no
    // metrics-specific headers on the error reply.
    const metrics = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(metrics);
  } catch (error: unknown) {
    // Surface the cause instead of discarding it; the scraper only sees the 500.
    console.error('Error collecting metrics:', error);
    res.status(500).end('Error collecting metrics');
  }
}
/**
 * Build an Express router that serves the metrics endpoint.
 *
 * @returns a new router with `GET /metrics` bound to `metricsHandler`
 */
export function createMetricsRouter(): Router {
  const metricsRouter = Router();

  // GET /metrics -> Prometheus-formatted metrics
  metricsRouter.get('/metrics', metricsHandler);

  return metricsRouter;
}
// ============================================================================
// HELPER FUNCTIONS FOR RECORDING METRICS
// ============================================================================
/**
 * Count one TAT breach.
 *
 * @param department - value for the `department` label
 * @param workflowType - value for the `workflow_type` label
 * @param breachLevel - value for the `breach_level` label; defaults to `'warning'`
 */
export function recordTATBreach(department: string, workflowType: string, breachLevel: string = 'warning'): void {
  const labels = {
    department,
    workflow_type: workflowType,
    breach_level: breachLevel,
  };
  tatBreachesTotal.inc(labels);
}
/**
 * Set the pending-workflows gauge for one department/status pair.
 *
 * @param department - value for the `department` label
 * @param status - value for the `status` label
 * @param count - value the gauge is set to
 */
export function updatePendingWorkflows(department: string, status: string, count: number): void {
  const labels = { department, status };
  pendingWorkflowsCount.set(labels, count);
}
/**
 * Count one workflow operation.
 *
 * @param operation - value for the `operation` label
 * @param success - recorded as status `'success'` when truthy, otherwise `'failure'`
 */
export function recordWorkflowOperation(operation: string, success: boolean): void {
  let status = 'failure';
  if (success) {
    status = 'success';
  }
  workflowOperationsTotal.inc({ operation, status });
}
/**
 * Count one authentication event.
 *
 * @param eventType - value for the `event_type` label
 * @param success - recorded in the `success` label as `'true'` or `'false'`
 */
export function recordAuthEvent(eventType: string, success: boolean): void {
  let successLabel = 'false';
  if (success) {
    successLabel = 'true';
  }
  authEventsTotal.inc({ event_type: eventType, success: successLabel });
}
/**
 * Observe the duration of one database query.
 *
 * @param operation - value for the `operation` label
 * @param table - value for the `table` label
 * @param durationMs - query duration in milliseconds; converted to seconds before it is observed
 */
export function recordDBQuery(operation: string, table: string, durationMs: number): void {
  const durationSeconds = durationMs / 1000;
  dbQueryDuration.observe({ operation, table }, durationSeconds);
}
/**
 * Count one Redis operation.
 *
 * @param operation - value for the `operation` label
 * @param success - recorded as status `'success'` when truthy, otherwise `'failure'`
 */
export function recordRedisOperation(operation: string, success: boolean): void {
  let status = 'failure';
  if (success) {
    status = 'success';
  }
  redisOperationsTotal.inc({ operation, status });
}
/**
 * Count one AI service call and, when a duration is supplied, observe its latency.
 *
 * @param provider - value for the `provider` label
 * @param operation - value for the `operation` label
 * @param success - recorded as status `'success'` when truthy, otherwise `'failure'`
 * @param durationMs - optional call duration in milliseconds; converted to seconds before it is observed
 */
export function recordAIServiceCall(provider: string, operation: string, success: boolean, durationMs?: number): void {
  const status = success ? 'success' : 'failure';
  aiServiceCalls.inc({ provider, operation, status });

  // No duration supplied: only the call counter is updated.
  if (durationMs === undefined) {
    return;
  }

  const durationSeconds = durationMs / 1000;
  aiServiceDuration.observe({ provider, operation }, durationSeconds);
}
// Export the registry for advanced use cases: this is the same Registry
// instance that every metric declared in this module is registered on.
export { register };