Re_Backend/src/middlewares/metrics.middleware.ts

387 lines
12 KiB
TypeScript

/**
 * Prometheus Metrics Middleware
 * Exposes application metrics for monitoring with Prometheus/Grafana.
 *
 * Metrics exposed:
 * - http_requests_total: Total number of HTTP requests
 * - http_request_duration_seconds: HTTP request latency histogram
 * - http_request_errors_total: Total number of HTTP errors (4xx and 5xx)
 * - http_active_connections: Number of active HTTP connections
 * - nodejs_*: Node.js runtime metrics (memory, event loop, etc.)
 * - Custom business metrics (TAT breaches, workflow counts, auth events,
 *   database query latency, Redis operations, AI service calls)
 * - queue_jobs_* / queue_job_processing_rate: Per-queue job metrics
 */
import { Request, Response, NextFunction, Router } from 'express';
import client from 'prom-client';
// ============================================================================
// REGISTRY SETUP
// ============================================================================
// Dedicated registry: every metric in this module registers here instead of
// prom-client's global default registry, which avoids name conflicts.
const register = new client.Registry();

// Collect prom-client's default Node.js runtime metrics (memory, CPU, event
// loop, GC, etc.) into that registry. No name prefix is configured, so the
// standard metric names are used; each one carries the app/service labels.
client.collectDefaultMetrics({
  labels: { app: 're-workflow', service: 'backend' },
  register,
});
// ============================================================================
// HTTP METRICS
// ============================================================================
// Counter: one increment per completed HTTP request, labelled by method,
// normalized route and response status code.
const httpRequestsTotal = new client.Counter({
  registers: [register],
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
});

// Histogram: request latency in seconds, with buckets from 10 ms up to 10 s.
const httpRequestDuration = new client.Histogram({
  registers: [register],
  name: 'http_request_duration_seconds',
  help: 'HTTP request latency in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
});

// Counter: requests that ended with a 4xx/5xx status; `error_type` carries
// the coarse error class in addition to the exact status code.
const httpRequestErrors = new client.Counter({
  registers: [register],
  name: 'http_request_errors_total',
  help: 'Total number of HTTP errors (4xx and 5xx)',
  labelNames: ['method', 'route', 'status_code', 'error_type'],
});

// Gauge (no labels): number of HTTP requests currently being handled.
const activeConnections = new client.Gauge({
  registers: [register],
  name: 'http_active_connections',
  help: 'Number of active HTTP connections',
});
// ============================================================================
// BUSINESS METRICS
// ============================================================================
// Counter: TAT breaches, by department, workflow type and breach level.
export const tatBreachesTotal = new client.Counter({
  registers: [register],
  name: 'tat_breaches_total',
  help: 'Total number of TAT breaches',
  labelNames: ['department', 'workflow_type', 'breach_level'],
});

// Gauge: current number of pending workflows, by department and status.
export const pendingWorkflowsCount = new client.Gauge({
  registers: [register],
  name: 'pending_workflows_count',
  help: 'Current number of pending workflows',
  labelNames: ['department', 'status'],
});

// Counter: workflow operations, by operation name and outcome status.
export const workflowOperationsTotal = new client.Counter({
  registers: [register],
  name: 'workflow_operations_total',
  help: 'Total number of workflow operations',
  labelNames: ['operation', 'status'],
});

// Counter: user authentication events, by event type and success flag.
export const authEventsTotal = new client.Counter({
  registers: [register],
  name: 'auth_events_total',
  help: 'Total number of authentication events',
  labelNames: ['event_type', 'success'],
});

// Histogram: database query latency in seconds (buckets from 1 ms to 2.5 s),
// by operation and table.
export const dbQueryDuration = new client.Histogram({
  registers: [register],
  name: 'db_query_duration_seconds',
  help: 'Database query latency in seconds',
  labelNames: ['operation', 'table'],
  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
});

// Counter: Redis operations, by operation name and outcome status.
export const redisOperationsTotal = new client.Counter({
  registers: [register],
  name: 'redis_operations_total',
  help: 'Total number of Redis operations',
  labelNames: ['operation', 'status'],
});

// Counter: AI service calls, by provider, operation and outcome status.
export const aiServiceCalls = new client.Counter({
  registers: [register],
  name: 'ai_service_calls_total',
  help: 'Total number of AI service calls',
  labelNames: ['provider', 'operation', 'status'],
});

// Histogram: AI service call latency in seconds (buckets from 0.5 s to 60 s),
// by provider and operation.
export const aiServiceDuration = new client.Histogram({
  registers: [register],
  name: 'ai_service_duration_seconds',
  help: 'AI service call latency in seconds',
  labelNames: ['provider', 'operation'],
  buckets: [0.5, 1, 2, 5, 10, 30, 60],
});
// ============================================================================
// QUEUE METRICS (BullMQ/Redis Queues)
// ============================================================================
// Per-queue job counts, one gauge per job state. Each gauge is labelled with
// the queue name.
export const queueJobsWaiting = new client.Gauge({
  registers: [register],
  name: 'queue_jobs_waiting',
  help: 'Number of jobs waiting in queue',
  labelNames: ['queue_name'],
});

export const queueJobsActive = new client.Gauge({
  registers: [register],
  name: 'queue_jobs_active',
  help: 'Number of jobs currently being processed',
  labelNames: ['queue_name'],
});

export const queueJobsCompleted = new client.Gauge({
  registers: [register],
  name: 'queue_jobs_completed',
  help: 'Number of completed jobs',
  labelNames: ['queue_name'],
});

export const queueJobsFailed = new client.Gauge({
  registers: [register],
  name: 'queue_jobs_failed',
  help: 'Number of failed jobs',
  labelNames: ['queue_name'],
});

export const queueJobsDelayed = new client.Gauge({
  registers: [register],
  name: 'queue_jobs_delayed',
  help: 'Number of delayed jobs',
  labelNames: ['queue_name'],
});

// Per-queue processing rate (jobs per minute). This gauge is only declared
// here; nothing in this module sets it.
export const queueJobProcessingRate = new client.Gauge({
  registers: [register],
  name: 'queue_job_processing_rate',
  help: 'Jobs processed per minute',
  labelNames: ['queue_name'],
});
// ============================================================================
// MIDDLEWARE
// ============================================================================
/**
 * Normalize a request path (or route pattern) into a label value for metrics.
 *
 * Dynamic parts are replaced with placeholders so that different concrete
 * URLs of the same endpoint share one `route` label:
 *  - UUIDs                          -> `:id`
 *  - whole numeric path segments    -> `/:id`
 *  - request IDs (`REQ-1234-567`)   -> `:requestId`
 * Finally, runs of consecutive slashes are collapsed into a single slash.
 *
 * Only segments that consist entirely of digits are treated as numeric IDs;
 * a segment that merely starts with a digit (e.g. `/2fa`) is left untouched.
 *
 * @param path - URL path or route pattern to normalize.
 * @returns The normalized path.
 */
function normalizeRoutePath(path: string): string {
  return path
    // Replace UUIDs
    .replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, ':id')
    // Replace numeric IDs. The lookahead requires the digits to run up to the
    // end of the segment, so "/2fa" is not mangled into "/:idfa".
    .replace(/\/\d+(?=[/?#]|$)/g, '/:id')
    // Replace request IDs (REQ-XXXX-XXX format)
    .replace(/REQ-\d+-\d+/gi, ':requestId')
    // Clean up multiple slashes
    .replace(/\/+/g, '/');
}
/**
 * HTTP metrics middleware.
 *
 * For every request except `/metrics` itself it:
 *  - increments `http_active_connections` on entry and decrements it exactly
 *    once when the response finishes, closes or errors;
 *  - on `finish`, records `http_requests_total` and
 *    `http_request_duration_seconds`, plus `http_request_errors_total` for
 *    any status code >= 400 (`server_error` for >= 500, else `client_error`).
 *
 * The `route` label is the matched Express route pattern when it is a
 * non-empty string, otherwise the raw request path, passed through
 * `normalizeRoutePath`.
 *
 * @param req - Incoming Express request.
 * @param res - Express response whose lifecycle events are observed.
 * @param next - Continues the middleware chain.
 */
export function metricsMiddleware(req: Request, res: Response, next: NextFunction): void {
  // Skip metrics endpoint itself
  if (req.path === '/metrics') {
    return next();
  }

  const startTime = Date.now();
  activeConnections.inc();

  // 'finish' is not emitted when the client aborts before the response is
  // fully sent, while 'close' may follow 'finish'. Guard the decrement so the
  // gauge neither leaks upward nor gets decremented twice.
  let released = false;
  const releaseConnection = (): void => {
    if (released) {
      return;
    }
    released = true;
    activeConnections.dec();
  };

  // Capture response on finish
  res.on('finish', () => {
    try {
      const duration = (Date.now() - startTime) / 1000; // Convert to seconds

      // An Express route path may also be a RegExp or an array; only a
      // non-empty string can be normalized, otherwise use the request path.
      // NOTE(review): req.route.path appears to be relative to the router the
      // route was registered on — confirm whether req.baseUrl should be
      // prepended to keep labels unique across routers.
      const routePattern: unknown = req.route?.path;
      const rawRoute =
        typeof routePattern === 'string' && routePattern.length > 0 ? routePattern : req.path;
      const route = normalizeRoutePath(rawRoute);

      const statusCode = res.statusCode.toString();
      const method = req.method;

      // Record request count
      httpRequestsTotal.inc({ method, route, status_code: statusCode });

      // Record request duration
      httpRequestDuration.observe({ method, route, status_code: statusCode }, duration);

      // Record errors (4xx and 5xx)
      if (res.statusCode >= 400) {
        const errorType = res.statusCode >= 500 ? 'server_error' : 'client_error';
        httpRequestErrors.inc({
          method,
          route,
          status_code: statusCode,
          error_type: errorType,
        });
      }
    } finally {
      releaseConnection();
    }
  });

  // Client aborts and connection errors: release the gauge without recording
  // a completed request.
  res.on('close', releaseConnection);
  res.on('error', releaseConnection);

  next();
}
/**
 * Metrics endpoint handler.
 *
 * Serializes every metric in the module registry and sends it with the
 * registry's content type. If collection fails, the error is logged and a
 * plain 500 response is returned instead.
 *
 * @param _req - Unused.
 * @param res - Response the metrics payload (or the error message) is written to.
 */
export async function metricsHandler(_req: Request, res: Response): Promise<void> {
  try {
    const metrics = await register.metrics();
    // Set the Prometheus content type only once collection has succeeded, so
    // the error response below is not labelled as a metrics payload.
    res.set('Content-Type', register.contentType);
    res.end(metrics);
  } catch (error) {
    // Log instead of swallowing, so failed scrapes can be diagnosed.
    console.error('[Metrics] Failed to collect metrics:', error);
    res.status(500).end('Error collecting metrics');
  }
}
/**
 * Build an Express router that serves the metrics endpoint.
 *
 * @returns A new router with `GET /metrics` bound to `metricsHandler`.
 */
export function createMetricsRouter(): Router {
  const metricsRouter = Router();

  // GET /metrics -> Prometheus exposition output
  metricsRouter.get('/metrics', metricsHandler);

  return metricsRouter;
}
// ============================================================================
// HELPER FUNCTIONS FOR RECORDING METRICS
// ============================================================================
/**
 * Count one TAT breach.
 *
 * @param department - Value for the `department` label.
 * @param workflowType - Value for the `workflow_type` label.
 * @param breachLevel - Value for the `breach_level` label; defaults to `'warning'`.
 */
export function recordTATBreach(department: string, workflowType: string, breachLevel: string = 'warning'): void {
  const labels = {
    department,
    workflow_type: workflowType,
    breach_level: breachLevel,
  };
  tatBreachesTotal.inc(labels);
}
/**
 * Set the pending-workflows gauge for one department/status pair.
 *
 * @param department - Value for the `department` label.
 * @param status - Value for the `status` label.
 * @param count - New gauge value.
 */
export function updatePendingWorkflows(department: string, status: string, count: number): void {
  const labels = { department, status };
  pendingWorkflowsCount.set(labels, count);
}
/**
 * Count one workflow operation.
 *
 * @param operation - Value for the `operation` label.
 * @param success - Recorded as `status` = `'success'` when true, `'failure'` otherwise.
 */
export function recordWorkflowOperation(operation: string, success: boolean): void {
  const status = success ? 'success' : 'failure';
  workflowOperationsTotal.inc({ operation, status });
}
/**
 * Count one authentication event.
 *
 * @param eventType - Value for the `event_type` label.
 * @param success - Recorded in the `success` label as the string `'true'` or `'false'`.
 */
export function recordAuthEvent(eventType: string, success: boolean): void {
  const successLabel = success ? 'true' : 'false';
  authEventsTotal.inc({ event_type: eventType, success: successLabel });
}
/**
 * Observe the duration of one database query.
 *
 * @param operation - Value for the `operation` label.
 * @param table - Value for the `table` label.
 * @param durationMs - Query duration in milliseconds; converted to seconds before observing.
 */
export function recordDBQuery(operation: string, table: string, durationMs: number): void {
  const durationSeconds = durationMs / 1000;
  dbQueryDuration.observe({ operation, table }, durationSeconds);
}
/**
 * Count one Redis operation.
 *
 * @param operation - Value for the `operation` label.
 * @param success - Recorded as `status` = `'success'` when true, `'failure'` otherwise.
 */
export function recordRedisOperation(operation: string, success: boolean): void {
  const status = success ? 'success' : 'failure';
  redisOperationsTotal.inc({ operation, status });
}
/**
 * Count one AI service call and, when a duration is given, observe its latency.
 *
 * @param provider - Value for the `provider` label.
 * @param operation - Value for the `operation` label.
 * @param success - Recorded as `status` = `'success'` when true, `'failure'` otherwise.
 * @param durationMs - Optional call duration in milliseconds; converted to
 *   seconds before observing. No latency sample is recorded when omitted.
 */
export function recordAIServiceCall(provider: string, operation: string, success: boolean, durationMs?: number): void {
  const status = success ? 'success' : 'failure';
  aiServiceCalls.inc({ provider, operation, status });

  if (durationMs === undefined) {
    return;
  }
  aiServiceDuration.observe({ provider, operation }, durationMs / 1000);
}
// ============================================================================
// QUEUE METRICS COLLECTION
// ============================================================================
/**
 * Refresh the job-count gauges for a single queue.
 * Intended to be called periodically or in response to queue events.
 *
 * The five counts (waiting, active, completed, failed, delayed) are fetched
 * concurrently and written to the matching gauges under the `queue_name`
 * label. Any failure is logged and not rethrown.
 *
 * @param queueName - Value for the `queue_name` label.
 * @param queue - Queue object providing `getWaitingCount`, `getActiveCount`,
 *   `getCompletedCount`, `getFailedCount` and `getDelayedCount`.
 */
export async function updateQueueMetrics(queueName: string, queue: any): Promise<void> {
  try {
    const counts = await Promise.all([
      queue.getWaitingCount(),
      queue.getActiveCount(),
      queue.getCompletedCount(),
      queue.getFailedCount(),
      queue.getDelayedCount(),
    ]);

    // Gauges listed in the same order as the counts requested above.
    const gauges = [
      queueJobsWaiting,
      queueJobsActive,
      queueJobsCompleted,
      queueJobsFailed,
      queueJobsDelayed,
    ];
    gauges.forEach((gauge, index) => {
      gauge.set({ queue_name: queueName }, counts[index]);
    });
  } catch (error) {
    // Log and carry on: a failing queue must not break metrics collection.
    console.error(`[Metrics] Failed to update queue metrics for ${queueName}:`, error);
  }
}
/**
 * Start periodic queue metrics collection.
 * Should be called after the queues are initialized.
 *
 * Runs one collection immediately, then one every `intervalMs` milliseconds.
 * Queues are processed one after another. If a collection is still running
 * when the next tick fires, that tick is skipped, so slow queue backends
 * cannot cause collections to pile up.
 *
 * @param queues - Queues to monitor, each with the name used for the `queue_name` label.
 * @param intervalMs - Collection interval in milliseconds; defaults to 15000.
 * @returns The interval timer; pass it to `clearInterval` to stop collection.
 */
export function startQueueMetricsCollection(queues: { name: string; queue: any }[], intervalMs: number = 15000): NodeJS.Timeout {
  let collecting = false;

  const collect = async (): Promise<void> => {
    // Skip this tick if the previous collection has not finished yet.
    if (collecting) {
      return;
    }
    collecting = true;
    try {
      for (const { name, queue } of queues) {
        await updateQueueMetrics(name, queue);
      }
    } catch (error) {
      // Never let a collection failure surface as an unhandled rejection.
      console.error('[Metrics] Queue metrics collection failed:', error);
    } finally {
      collecting = false;
    }
  };

  // Collect immediately; `void` marks the promise as intentionally not awaited.
  void collect();

  // Then collect periodically
  return setInterval(() => {
    void collect();
  }, intervalMs);
}
// Export the registry for advanced use cases. This is the same Registry
// instance that every metric defined in this module registers with.
export { register };