# =============================================================================
# Prometheus Alert Rules for RE Workflow
# =============================================================================
groups:
  # ===========================================================================
  # Backend API Alerts
  # ===========================================================================
  - name: re-workflow-backend
    interval: 30s
    rules:
      # High Error Rate
      - alert: HighErrorRate
        expr: rate(http_request_errors_total{job="re-workflow-backend"}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
          service: backend
        annotations:
          summary: "High error rate detected in RE Workflow Backend"
          description: "Error rate is {{ $value | printf \"%.2f\" }} errors/sec for the last 5 minutes."

      # High Request Latency
      - alert: HighRequestLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="re-workflow-backend"}[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: backend
        annotations:
          summary: "High API latency detected"
          description: "95th percentile latency is {{ $value | printf \"%.2f\" }}s"

      # API Down
      - alert: BackendDown
        expr: up{job="re-workflow-backend"} == 0
        for: 1m
        labels:
          severity: critical
          service: backend
        annotations:
          summary: "RE Workflow Backend is DOWN"
          description: "Backend API has been unreachable for more than 1 minute."

      # High Memory Usage
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes{job="re-workflow-backend"} / 1024 / 1024 > 500
        for: 10m
        labels:
          severity: warning
          service: backend
        annotations:
          summary: "High memory usage in Backend"
          description: "Memory usage is {{ $value | printf \"%.0f\" }}MB"

      # Event Loop Lag
      - alert: HighEventLoopLag
        expr: nodejs_eventloop_lag_seconds{job="re-workflow-backend"} > 0.5
        for: 5m
        labels:
          severity: warning
          service: backend
        annotations:
          summary: "High Node.js event loop lag"
          description: "Event loop lag is {{ $value | printf \"%.3f\" }}s"

  # ===========================================================================
  # TAT/Workflow Alerts
  # ===========================================================================
  - name: re-workflow-tat
    interval: 1m
    rules:
      # TAT Breach Rate
      - alert: HighTATBreachRate
        expr: increase(tat_breaches_total[1h]) > 10
        for: 5m
        labels:
          severity: warning
          service: workflow
        annotations:
          summary: "High TAT breach rate detected"
          description: "{{ $value | printf \"%.0f\" }} TAT breaches in the last hour"

      # Pending Workflows Queue
      - alert: LargePendingQueue
        expr: pending_workflows_count > 100
        for: 30m
        labels:
          severity: warning
          service: workflow
        annotations:
          summary: "Large number of pending workflows"
          description: "{{ $value | printf \"%.0f\" }} workflows pending approval"

  # ===========================================================================
  # Infrastructure Alerts
  # ===========================================================================
  - name: infrastructure
    interval: 30s
    rules:
      # High CPU Usage (Node Exporter)
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
          service: infrastructure
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value | printf \"%.1f\" }}%"

      # High Disk Usage
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 85
        for: 10m
        labels:
          severity: warning
          service: infrastructure
        annotations:
          summary: "High disk usage on {{ $labels.instance }}"
          description: "Disk usage is {{ $value | printf \"%.1f\" }}%"

      # Low Memory
      - alert: LowMemory
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 15
        for: 5m
        labels:
          severity: critical
          service: infrastructure
        annotations:
          summary: "Low memory on {{ $labels.instance }}"
          description: "Available memory is {{ $value | printf \"%.1f\" }}%"

  # ===========================================================================
  # Loki/Logging Alerts
  # ===========================================================================
  - name: logging
    interval: 1m
    rules:
      # Loki Down
      - alert: LokiDown
        expr: up{job="loki"} == 0
        for: 2m
        labels:
          severity: critical
          service: loki
        annotations:
          summary: "Loki is DOWN"
          description: "Loki has been unreachable for more than 2 minutes."
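
# =============================================================================
# Usage note (comment only, not part of the rule definitions)
# =============================================================================
# These groups are evaluated only after Prometheus loads this file through
# `rule_files` in prometheus.yml and the resulting alerts are routed to an
# Alertmanager. A minimal sketch, assuming this file is saved as
# alert-rules.yml next to the main config and that Alertmanager listens on
# alertmanager:9093 (both are assumptions, adjust to the actual deployment):
#
#   rule_files:
#     - "alert-rules.yml"
#
#   alerting:
#     alertmanagers:
#       - static_configs:
#           - targets: ["alertmanager:9093"]
#
# After a reload, the groups should appear under Status -> Rules in the
# Prometheus UI, each re-evaluated at its configured `interval`.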