/**
 * Prometheus Metrics Middleware
 * Exposes application metrics for monitoring with Prometheus/Grafana
 *
 * Metrics exposed:
 * - http_requests_total: Total number of HTTP requests
 * - http_request_duration_seconds: HTTP request latency histogram
 * - http_request_errors_total: Total number of HTTP errors
 * - http_active_connections: Requests currently being processed
 * - nodejs_*: Node.js runtime metrics (memory, event loop, etc.)
 * - Custom business metrics (TAT breaches, workflow counts, etc.)
 */

import { Request, Response, NextFunction, Router } from 'express';
import client from 'prom-client';

// ============================================================================
// REGISTRY SETUP
// ============================================================================

// Create a custom registry to avoid conflicts with default metrics
const register = new client.Registry();

// Add default Node.js metrics (memory, CPU, event loop, GC, etc.)
client.collectDefaultMetrics({
  register,
  prefix: 'nodejs_',
  labels: { app: 're-workflow', service: 'backend' },
});

// ============================================================================
// HTTP METRICS
// ============================================================================

// Total HTTP requests counter
const httpRequestsTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
  registers: [register],
});

// HTTP request duration histogram
const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request latency in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [register],
});

// HTTP errors counter
const httpRequestErrors = new client.Counter({
  name: 'http_request_errors_total',
  help: 'Total number of HTTP errors (4xx and 5xx)',
  labelNames: ['method', 'route', 'status_code', 'error_type'],
  registers: [register],
});

// Active HTTP connections gauge
const activeConnections = new client.Gauge({
  name: 'http_active_connections',
  help: 'Number of active HTTP connections',
  registers: [register],
});

// ============================================================================
// BUSINESS METRICS
// ============================================================================

// TAT breaches counter
export const tatBreachesTotal = new client.Counter({
  name: 'tat_breaches_total',
  help: 'Total number of TAT breaches',
  labelNames: ['department', 'workflow_type', 'breach_level'],
  registers: [register],
});

// Pending workflows gauge
export const pendingWorkflowsCount = new client.Gauge({
  name: 'pending_workflows_count',
  help: 'Current number of pending workflows',
  labelNames: ['department', 'status'],
  registers: [register],
});

// Workflow operations counter
export const workflowOperationsTotal = new client.Counter({
  name: 'workflow_operations_total',
  help: 'Total number of workflow operations',
  labelNames: ['operation', 'status'],
  registers: [register],
});

// User authentication events counter
export const authEventsTotal = new client.Counter({
  name: 'auth_events_total',
  help: 'Total number of authentication events',
  labelNames: ['event_type', 'success'],
  registers: [register],
});

// Database query duration histogram
export const dbQueryDuration = new client.Histogram({
  name: 'db_query_duration_seconds',
  help: 'Database query latency in seconds',
  labelNames: ['operation', 'table'],
  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
  registers: [register],
});

// Redis operations counter
export const redisOperationsTotal = new client.Counter({
  name: 'redis_operations_total',
  help: 'Total number of Redis operations',
  labelNames: ['operation', 'status'],
  registers: [register],
});

// AI service calls counter
export const aiServiceCalls = new client.Counter({
  name: 'ai_service_calls_total',
  help: 'Total number of AI service calls',
  labelNames: ['provider', 'operation', 'status'],
  registers: [register],
});

// AI service call duration histogram
export const aiServiceDuration = new client.Histogram({
  name: 'ai_service_duration_seconds',
  help: 'AI service call latency in seconds',
  labelNames: ['provider', 'operation'],
  buckets: [0.5, 1, 2, 5, 10, 30, 60],
  registers: [register],
});

// ============================================================================
// MIDDLEWARE
// ============================================================================

/**
 * Normalize a route path for use as a metrics label.
 *
 * Replaces dynamic segments (UUIDs, numeric IDs, REQ-n-n request IDs) with
 * placeholders so that distinct resource IDs do not each produce their own
 * label value, then collapses repeated slashes.
 *
 * @param path - Route pattern or concrete request path.
 * @returns The path with dynamic segments replaced by `:id` / `:requestId`.
 */
function normalizeRoutePath(path: string): string {
  return path
    // Replace UUIDs
    .replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, ':id')
    // Replace numeric IDs
    .replace(/\/\d+/g, '/:id')
    // Replace request IDs (REQ-XXXX-XXX format)
    .replace(/REQ-\d+-\d+/gi, ':requestId')
    // Clean up multiple slashes
    .replace(/\/+/g, '/');
}

/**
 * HTTP metrics middleware.
 *
 * Tracks request counts, durations and errors, and maintains the
 * active-connections gauge for the lifetime of each request. Requests whose
 * path is exactly `/metrics` are passed through without being recorded.
 *
 * @param req - Incoming Express request.
 * @param res - Express response; listeners are attached to its lifecycle events.
 * @param next - Continues the middleware chain.
 */
export function metricsMiddleware(req: Request, res: Response, next: NextFunction): void {
  // Skip metrics endpoint itself
  if (req.path === '/metrics') {
    return next();
  }

  const startTime = Date.now();
  activeConnections.inc();

  // The gauge must be decremented exactly once per request. 'finish' is not
  // emitted when the client disconnects before the response completes, and
  // several of these events can fire for the same response, so every exit
  // path funnels through this guarded release.
  let released = false;
  const releaseConnection = (): void => {
    if (released) {
      return;
    }
    released = true;
    activeConnections.dec();
  };

  // Capture response on finish
  res.on('finish', () => {
    try {
      const duration = (Date.now() - startTime) / 1000; // Convert to seconds

      // Express route paths may be a RegExp or an array; only a non-empty
      // string pattern can be normalized, otherwise use the request path.
      const routePattern: unknown = req.route?.path;
      const route = normalizeRoutePath(
        typeof routePattern === 'string' && routePattern ? routePattern : req.path
      );
      const statusCode = res.statusCode.toString();
      const method = req.method;

      // Record request count
      httpRequestsTotal.inc({ method, route, status_code: statusCode });

      // Record request duration
      httpRequestDuration.observe({ method, route, status_code: statusCode }, duration);

      // Record errors (4xx and 5xx)
      if (res.statusCode >= 400) {
        const errorType = res.statusCode >= 500 ? 'server_error' : 'client_error';
        httpRequestErrors.inc({
          method,
          route,
          status_code: statusCode,
          error_type: errorType,
        });
      }
    } finally {
      releaseConnection();
    }
  });

  // Handle aborted connections: 'close' also fires when the underlying
  // connection is terminated before the response completed.
  res.on('close', releaseConnection);

  // Handle connection errors
  res.on('error', releaseConnection);

  next();
}

/**
 * Metrics endpoint handler.
 *
 * Responds with the registry's metrics in Prometheus text format, or with a
 * 500 status and a plain error message if collection fails.
 *
 * @param _req - Unused.
 * @param res - Express response the metrics are written to.
 */
export async function metricsHandler(_req: Request, res: Response): Promise<void> {
  try {
    // Collect first so the Prometheus content type is only set on success.
    const metrics = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(metrics);
  } catch (error) {
    res.status(500).end('Error collecting metrics');
  }
}

/**
 * Create metrics router.
 *
 * @returns A router exposing `GET /metrics`, served by {@link metricsHandler}.
 */
export function createMetricsRouter(): Router {
  const router = Router();

  // Metrics endpoint (GET /metrics)
  router.get('/metrics', metricsHandler);

  return router;
}

// ============================================================================
// HELPER FUNCTIONS FOR RECORDING METRICS
// ============================================================================

/**
 * Record a TAT breach event.
 *
 * @param department - Value for the `department` label.
 * @param workflowType - Value for the `workflow_type` label.
 * @param breachLevel - Value for the `breach_level` label; defaults to `'warning'`.
 */
export function recordTATBreach(
  department: string,
  workflowType: string,
  breachLevel: string = 'warning'
): void {
  tatBreachesTotal.inc({ department, workflow_type: workflowType, breach_level: breachLevel });
}

/**
 * Update pending workflows count.
 *
 * @param department - Value for the `department` label.
 * @param status - Value for the `status` label.
 * @param count - Value the gauge is set to for this label combination.
 */
export function updatePendingWorkflows(department: string, status: string, count: number): void {
  pendingWorkflowsCount.set({ department, status }, count);
}

/**
 * Record a workflow operation.
 *
 * @param operation - Value for the `operation` label.
 * @param success - Recorded as status `'success'` when true, `'failure'` otherwise.
 */
export function recordWorkflowOperation(operation: string, success: boolean): void {
  workflowOperationsTotal.inc({ operation, status: success ? 'success' : 'failure' });
}

/**
 * Record an authentication event.
 *
 * @param eventType - Value for the `event_type` label.
 * @param success - Recorded as the string `'true'` or `'false'` in the `success` label.
 */
export function recordAuthEvent(eventType: string, success: boolean): void {
  authEventsTotal.inc({ event_type: eventType, success: success ? 'true' : 'false' });
}

/**
 * Record a database query duration.
 *
 * @param operation - Value for the `operation` label.
 * @param table - Value for the `table` label.
 * @param durationMs - Query duration in milliseconds; converted to seconds before observing.
 */
export function recordDBQuery(operation: string, table: string, durationMs: number): void {
  dbQueryDuration.observe({ operation, table }, durationMs / 1000);
}

/**
 * Record a Redis operation.
 *
 * @param operation - Value for the `operation` label.
 * @param success - Recorded as status `'success'` when true, `'failure'` otherwise.
 */
export function recordRedisOperation(operation: string, success: boolean): void {
  redisOperationsTotal.inc({ operation, status: success ? 'success' : 'failure' });
}

/**
 * Record an AI service call.
 *
 * Always increments the call counter; the duration histogram is only updated
 * when `durationMs` is provided.
 *
 * @param provider - Value for the `provider` label.
 * @param operation - Value for the `operation` label.
 * @param success - Recorded as status `'success'` when true, `'failure'` otherwise.
 * @param durationMs - Optional call duration in milliseconds; converted to seconds.
 */
export function recordAIServiceCall(
  provider: string,
  operation: string,
  success: boolean,
  durationMs?: number
): void {
  aiServiceCalls.inc({ provider, operation, status: success ? 'success' : 'failure' });
  if (durationMs !== undefined) {
    aiServiceDuration.observe({ provider, operation }, durationMs / 1000);
  }
}

// Export the registry for advanced use cases
export { register };