Re_Backend/monitoring/prometheus/alert.rules.yml

# =============================================================================
# Prometheus Alert Rules for RE Workflow
# =============================================================================
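# This file is only loaded if prometheus.yml references it; a minimal sketch
# (the path is an assumption and depends on how Prometheus is deployed):
#
#   rule_files:
#     - /etc/prometheus/alert.rules.yml
#
# The rules can be validated offline with:
#   promtool check rules alert.rules.yml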
groups:
  # ===========================================================================
  # Backend API Alerts
  # ===========================================================================
  - name: re-workflow-backend
    interval: 30s
    rules:
      # High Error Rate
      - alert: HighErrorRate
        expr: rate(http_request_errors_total{job="re-workflow-backend"}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
          service: backend
        annotations:
          summary: "High error rate detected in RE Workflow Backend"
          description: "Error rate is {{ $value | printf \"%.2f\" }} errors/sec for the last 5 minutes."
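      # Note on the expr above: rate(...[5m]) is the per-second increase of the
      # counter averaged over the last 5 minutes, so 0.1 corresponds to roughly
      # 6 errors/minute. http_request_errors_total is assumed to be a counter
      # incremented once per failed request by the backend's metrics middleware.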
      # High Request Latency
      - alert: HighRequestLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="re-workflow-backend"}[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: backend
        annotations:
          summary: "High API latency detected"
          description: "95th percentile latency is {{ $value | printf \"%.2f\" }}s"
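      # histogram_quantile() above estimates the p95 from the cumulative "le"
      # buckets of the duration histogram. It yields one quantile series per
      # label set; if several backend instances are scraped, aggregating the
      # buckets first (e.g. sum by (le) (rate(...))) gives a single fleet-wide
      # p95 instead of per-instance values. That is an optional refinement.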
      # API Down
      - alert: BackendDown
        expr: up{job="re-workflow-backend"} == 0
        for: 1m
        labels:
          severity: critical
          service: backend
        annotations:
          summary: "RE Workflow Backend is DOWN"
          description: "Backend API has been unreachable for more than 1 minute."
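      # "up" is synthesized by Prometheus for every scrape target: 1 when the
      # last scrape succeeded, 0 when it failed. This fires once the backend's
      # metrics endpoint (typically /metrics) has been unscrapeable for 1 minute.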
      # High Memory Usage
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes{job="re-workflow-backend"} / 1024 / 1024 > 500
        for: 10m
        labels:
          severity: warning
          service: backend
        annotations:
          summary: "High memory usage in Backend"
          description: "Memory usage is {{ $value | printf \"%.0f\" }} MiB"
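      # process_resident_memory_bytes is the RSS of the backend process; the two
      # divisions convert bytes to MiB, so this fires when resident memory stays
      # above 500 MiB for 10 minutes.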
      # Event Loop Lag
      - alert: HighEventLoopLag
        expr: nodejs_eventloop_lag_seconds{job="re-workflow-backend"} > 0.5
        for: 5m
        labels:
          severity: warning
          service: backend
        annotations:
          summary: "High Node.js event loop lag"
          description: "Event loop lag is {{ $value | printf \"%.3f\" }}s"
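      # nodejs_eventloop_lag_seconds matches the default Node.js metrics exported
      # by prom-client's collectDefaultMetrics(); this assumes the backend uses
      # prom-client. Sustained lag above 0.5s usually indicates the event loop is
      # blocked by synchronous work.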
  # ===========================================================================
  # TAT/Workflow Alerts
  # ===========================================================================
  - name: re-workflow-tat
    interval: 1m
    rules:
      # TAT Breach Rate
      - alert: HighTATBreachRate
        expr: increase(tat_breaches_total[1h]) > 10
        for: 5m
        labels:
          severity: warning
          service: workflow
        annotations:
          summary: "High TAT breach rate detected"
          description: "{{ $value | printf \"%.0f\" }} TAT breaches in the last hour"
      # Pending Workflows Queue
      - alert: LargePendingQueue
        expr: pending_workflows_count > 100
        for: 30m
        labels:
          severity: warning
          service: workflow
        annotations:
          summary: "Large number of pending workflows"
          description: "{{ $value | printf \"%.0f\" }} workflows pending approval"
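      # tat_breaches_total (counter) and pending_workflows_count (gauge) are
      # assumed to be custom application metrics exported by the workflow
      # service. increase(...[1h]) extrapolates counter growth over the window,
      # so the value can be fractional; printf "%.0f" rounds it for display.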
  # ===========================================================================
  # Infrastructure Alerts
  # ===========================================================================
  - name: infrastructure
    interval: 30s
    rules:
      # High CPU Usage (Node Exporter)
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
          service: infrastructure
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value | printf \"%.1f\" }}%"
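      # rate(node_cpu_seconds_total{mode="idle"}[5m]) is the fraction of time
      # each core spent idle; avg by(instance) averages that across cores, *100
      # converts it to percent idle, and 100 minus that is percent busy per host.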
      # High Disk Usage
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 85
        for: 10m
        labels:
          severity: warning
          service: infrastructure
        annotations:
          summary: "High disk usage on {{ $labels.instance }}"
          description: "Disk usage is {{ $value | printf \"%.1f\" }}%"
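      # Note: the expression above evaluates every filesystem series that
      # node_exporter reports, including tmpfs/overlay pseudo-filesystems. An
      # optional refinement is to restrict it, e.g. with a matcher such as
      # {fstype!~"tmpfs|overlay"}, so only real data volumes can trigger it.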
      # Low Memory
      - alert: LowMemory
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 15
        for: 5m
        labels:
          severity: critical
          service: infrastructure
        annotations:
          summary: "Low memory on {{ $labels.instance }}"
          description: "Available memory is {{ $value | printf \"%.1f\" }}%"
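      # MemAvailable (rather than MemFree) is used because it includes memory
      # the kernel can reclaim from caches, so the alert reflects memory that is
      # genuinely unavailable to applications.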
  # ===========================================================================
  # Loki/Logging Alerts
  # ===========================================================================
  - name: logging
    interval: 1m
    rules:
      # Loki Down
      - alert: LokiDown
        expr: up{job="loki"} == 0
        for: 2m
        labels:
          severity: critical
          service: loki
        annotations:
          summary: "Loki is DOWN"
          description: "Loki has been unreachable for more than 2 minutes."
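      # As with BackendDown, this relies on the scrape-level "up" metric, so it
      # assumes Prometheus has a scrape job named "loki" pointing at Loki's
      # /metrics endpoint.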