292 lines
8.6 KiB
TypeScript
292 lines
8.6 KiB
TypeScript
/**
 * Prometheus Metrics Middleware
 * Exposes application metrics for monitoring with Prometheus/Grafana
 *
 * Metrics exposed:
 * - http_requests_total: Total number of HTTP requests
 * - http_request_duration_seconds: HTTP request latency histogram
 * - http_request_errors_total: Total number of HTTP errors
 * - http_active_connections: Number of active HTTP connections
 * - nodejs_*: Node.js runtime metrics (memory, event loop, etc.)
 * - Custom business metrics (TAT breaches, workflow counts, etc.)
 */
|
|
|
|
import { Request, Response, NextFunction, Router } from 'express';
|
|
import client from 'prom-client';
|
|
|
|
// ============================================================================
|
|
// REGISTRY SETUP
|
|
// ============================================================================
|
|
|
|
// Dedicated registry: every metric in this module is registered here rather
// than on prom-client's global default registry.
const register = new client.Registry();

// Collect prom-client's built-in runtime metrics into the dedicated registry,
// tagging each with static app/service labels.
// NOTE(review): `prefix` is prepended to prom-client's built-in metric names,
// some of which may already begin with `nodejs_` — confirm dashboards expect
// the resulting names.
client.collectDefaultMetrics({
  labels: { app: 're-workflow', service: 'backend' },
  prefix: 'nodejs_',
  register,
});
|
|
|
|
// ============================================================================
|
|
// HTTP METRICS
|
|
// ============================================================================
|
|
|
|
/** Counter of handled HTTP requests, labelled by method, route and status code. */
const httpRequestsTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  registers: [register],
  labelNames: ['method', 'route', 'status_code'],
});

/** Histogram of HTTP request latency; bucket boundaries are in seconds. */
const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request latency in seconds',
  registers: [register],
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
});

/** Counter of HTTP responses with a 4xx or 5xx status, with an extra `error_type` label. */
const httpRequestErrors = new client.Counter({
  name: 'http_request_errors_total',
  help: 'Total number of HTTP errors (4xx and 5xx)',
  registers: [register],
  labelNames: ['method', 'route', 'status_code', 'error_type'],
});

/** Unlabelled gauge that the HTTP middleware increments and decrements per request. */
const activeConnections = new client.Gauge({
  name: 'http_active_connections',
  help: 'Number of active HTTP connections',
  registers: [register],
});
|
|
|
|
// ============================================================================
|
|
// BUSINESS METRICS
|
|
// ============================================================================
|
|
|
|
/** Counter of TAT breaches, labelled by department, workflow type and breach level. */
export const tatBreachesTotal = new client.Counter({
  name: 'tat_breaches_total',
  help: 'Total number of TAT breaches',
  registers: [register],
  labelNames: ['department', 'workflow_type', 'breach_level'],
});

/** Gauge holding the current number of pending workflows per department and status. */
export const pendingWorkflowsCount = new client.Gauge({
  name: 'pending_workflows_count',
  help: 'Current number of pending workflows',
  registers: [register],
  labelNames: ['department', 'status'],
});

/** Counter of workflow operations, labelled by operation and outcome status. */
export const workflowOperationsTotal = new client.Counter({
  name: 'workflow_operations_total',
  help: 'Total number of workflow operations',
  registers: [register],
  labelNames: ['operation', 'status'],
});

/** Counter of authentication events, labelled by event type and success flag. */
export const authEventsTotal = new client.Counter({
  name: 'auth_events_total',
  help: 'Total number of authentication events',
  registers: [register],
  labelNames: ['event_type', 'success'],
});

/** Histogram of database query latency; bucket boundaries are in seconds. */
export const dbQueryDuration = new client.Histogram({
  name: 'db_query_duration_seconds',
  help: 'Database query latency in seconds',
  registers: [register],
  labelNames: ['operation', 'table'],
  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
});

/** Counter of Redis operations, labelled by operation and outcome status. */
export const redisOperationsTotal = new client.Counter({
  name: 'redis_operations_total',
  help: 'Total number of Redis operations',
  registers: [register],
  labelNames: ['operation', 'status'],
});

/** Counter of AI service calls, labelled by provider, operation and outcome status. */
export const aiServiceCalls = new client.Counter({
  name: 'ai_service_calls_total',
  help: 'Total number of AI service calls',
  registers: [register],
  labelNames: ['provider', 'operation', 'status'],
});

/** Histogram of AI service call latency; bucket boundaries are in seconds. */
export const aiServiceDuration = new client.Histogram({
  name: 'ai_service_duration_seconds',
  help: 'AI service call latency in seconds',
  registers: [register],
  labelNames: ['provider', 'operation'],
  buckets: [0.5, 1, 2, 5, 10, 30, 60],
});
|
|
|
|
// ============================================================================
|
|
// MIDDLEWARE
|
|
// ============================================================================
|
|
|
|
/**
 * Normalize a request path for use as the `route` metrics label.
 *
 * Dynamic parts of the path are collapsed into placeholders so that each
 * distinct resource does not produce its own label value:
 * - UUIDs become `:id`
 * - a run of digits at the start of a path segment becomes `:id`, unless the
 *   digits are immediately followed by a letter, digit or underscore
 *   (so `/123` and `/123.json` are rewritten, but `/2fa` is left alone)
 * - request IDs in `REQ-<digits>-<digits>` form become `:requestId`
 *
 * Finally, runs of consecutive slashes are collapsed into a single slash.
 *
 * @param path - URL path or route pattern to normalize.
 * @returns The path with dynamic parts replaced by placeholders.
 */
function normalizeRoutePath(path: string): string {
  return path
    // Replace UUIDs first so their leading digits are not treated as numeric IDs
    .replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, ':id')
    // Replace numeric IDs; the lookahead keeps static segments such as
    // `/2fa` intact instead of turning them into `/:idfa`
    .replace(/\/\d+(?!\w)/g, '/:id')
    // Replace request IDs (REQ-XXXX-XXX format)
    .replace(/REQ-\d+-\d+/gi, ':requestId')
    // Clean up multiple slashes
    .replace(/\/+/g, '/');
}
|
|
|
|
/**
 * HTTP metrics middleware.
 *
 * For every request except `/metrics` itself:
 * - increments `http_active_connections` on entry and decrements it exactly
 *   once, on whichever of the response's `finish`, `close` or `error` events
 *   fires first;
 * - when the response finishes, records the request in `http_requests_total`
 *   and `http_request_duration_seconds`, and also in
 *   `http_request_errors_total` when the status code is 400 or higher.
 *
 * @param req - Incoming request; its method, path and matched route feed the labels.
 * @param res - Response whose lifecycle events drive the recording.
 * @param next - Called to continue the middleware chain.
 */
export function metricsMiddleware(req: Request, res: Response, next: NextFunction): void {
  // Skip metrics endpoint itself
  if (req.path === '/metrics') {
    return next();
  }

  // Monotonic clock: unlike Date.now(), it cannot go backwards when the
  // system time is adjusted, so durations are never negative.
  const startedAt = process.hrtime();
  activeConnections.inc();

  // Guard so the gauge is decremented once per request no matter how many
  // of the lifecycle events below fire.
  let released = false;
  const releaseConnection = (): void => {
    if (released) {
      return;
    }
    released = true;
    activeConnections.dec();
  };

  // Capture response on finish
  res.on('finish', () => {
    // Release first so the gauge stays balanced even if recording throws.
    releaseConnection();

    const [elapsedSeconds, elapsedNanoseconds] = process.hrtime(startedAt);
    const duration = elapsedSeconds + elapsedNanoseconds / 1e9;

    // NOTE(review): req.route.path appears to be the pattern as registered on
    // its own router, so routers mounted under different prefixes could share
    // a label; unmatched requests fall back to the raw path — confirm this is
    // the intended labelling.
    const route = normalizeRoutePath(req.route?.path || req.path);
    const statusCode = res.statusCode.toString();
    const method = req.method;

    // Record request count
    httpRequestsTotal.inc({ method, route, status_code: statusCode });

    // Record request duration
    httpRequestDuration.observe({ method, route, status_code: statusCode }, duration);

    // Record errors (4xx and 5xx)
    if (res.statusCode >= 400) {
      const errorType = res.statusCode >= 500 ? 'server_error' : 'client_error';
      httpRequestErrors.inc({
        method,
        route,
        status_code: statusCode,
        error_type: errorType,
      });
    }
  });

  // A connection that is torn down before the response completes emits
  // 'close' without 'finish'; release the gauge in that case too.
  res.on('close', releaseConnection);

  // Handle connection errors
  res.on('error', releaseConnection);

  next();
}
|
|
|
|
/**
 * Metrics endpoint handler.
 *
 * Serializes every metric in the module registry and sends it with the
 * registry's content type. If collection fails, the error is logged and a
 * plain-text 500 response is sent instead.
 *
 * @param _req - Unused.
 * @param res - Response that receives the metrics payload or the error body.
 */
export async function metricsHandler(_req: Request, res: Response): Promise<void> {
  try {
    // Collect before touching headers so a failure does not leave the
    // Prometheus content type on the error response.
    const metrics = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(metrics);
  } catch (error) {
    // Surface the cause instead of discarding it silently.
    console.error('Error collecting metrics:', error);
    res.status(500).type('text/plain').end('Error collecting metrics');
  }
}
|
|
|
|
/**
 * Build an Express router that serves the metrics endpoint.
 *
 * @returns A new router with `GET /metrics` bound to `metricsHandler`.
 */
export function createMetricsRouter(): Router {
  const metricsRouter = Router();
  metricsRouter.get('/metrics', metricsHandler);
  return metricsRouter;
}
|
|
|
|
// ============================================================================
|
|
// HELPER FUNCTIONS FOR RECORDING METRICS
|
|
// ============================================================================
|
|
|
|
/**
 * Increment the `tat_breaches_total` counter for one breach.
 *
 * @param department - Value for the `department` label.
 * @param workflowType - Value for the `workflow_type` label.
 * @param breachLevel - Value for the `breach_level` label; defaults to `'warning'`.
 */
export function recordTATBreach(department: string, workflowType: string, breachLevel: string = 'warning'): void {
  const labels = {
    department,
    workflow_type: workflowType,
    breach_level: breachLevel,
  };
  tatBreachesTotal.inc(labels);
}
|
|
|
|
/**
 * Set the `pending_workflows_count` gauge for a department/status pair.
 *
 * @param department - Value for the `department` label.
 * @param status - Value for the `status` label.
 * @param count - New gauge value for that label combination.
 */
export function updatePendingWorkflows(department: string, status: string, count: number): void {
  const labels = { department, status };
  pendingWorkflowsCount.set(labels, count);
}
|
|
|
|
/**
 * Increment the `workflow_operations_total` counter for one operation.
 *
 * @param operation - Value for the `operation` label.
 * @param success - Recorded as status `'success'` when true, `'failure'` otherwise.
 */
export function recordWorkflowOperation(operation: string, success: boolean): void {
  const status = success ? 'success' : 'failure';
  workflowOperationsTotal.inc({ operation, status });
}
|
|
|
|
/**
 * Increment the `auth_events_total` counter for one authentication event.
 *
 * @param eventType - Value for the `event_type` label.
 * @param success - Recorded in the `success` label as `'true'` or `'false'`.
 */
export function recordAuthEvent(eventType: string, success: boolean): void {
  const successLabel = success ? 'true' : 'false';
  authEventsTotal.inc({ event_type: eventType, success: successLabel });
}
|
|
|
|
/**
 * Observe one database query in the `db_query_duration_seconds` histogram.
 *
 * @param operation - Value for the `operation` label.
 * @param table - Value for the `table` label.
 * @param durationMs - Query duration in milliseconds; divided by 1000 before observing.
 */
export function recordDBQuery(operation: string, table: string, durationMs: number): void {
  const durationSeconds = durationMs / 1000;
  dbQueryDuration.observe({ operation, table }, durationSeconds);
}
|
|
|
|
/**
 * Increment the `redis_operations_total` counter for one Redis operation.
 *
 * @param operation - Value for the `operation` label.
 * @param success - Recorded as status `'success'` when true, `'failure'` otherwise.
 */
export function recordRedisOperation(operation: string, success: boolean): void {
  const status = success ? 'success' : 'failure';
  redisOperationsTotal.inc({ operation, status });
}
|
|
|
|
/**
 * Count one AI service call and, when a duration is given, observe its latency.
 *
 * @param provider - Value for the `provider` label.
 * @param operation - Value for the `operation` label.
 * @param success - Recorded as status `'success'` when true, `'failure'` otherwise.
 * @param durationMs - Optional call duration in milliseconds; when provided it is
 *   divided by 1000 and observed in `ai_service_duration_seconds`.
 */
export function recordAIServiceCall(provider: string, operation: string, success: boolean, durationMs?: number): void {
  const status = success ? 'success' : 'failure';
  aiServiceCalls.inc({ provider, operation, status });

  // No duration supplied: only the call counter is updated.
  if (durationMs === undefined) {
    return;
  }

  aiServiceDuration.observe({ provider, operation }, durationMs / 1000);
}
|
|
|
|
// Export the registry for advanced use cases
|
|
export { register };
|
|
|