# =============================================================================
# Prometheus Alert Rules for RE Workflow
# =============================================================================
|
|
|
|
groups:
|
|
# ===========================================================================
# Backend API Alerts
# ===========================================================================
|
|
- name: re-workflow-backend
  # Every rule in this group is evaluated on a 30-second cycle.
  interval: 30s
  rules:
  # HighErrorRate: per-second rate of the backend error counter, averaged
  # over a 5m window, must stay above 0.1 for 5m before the alert fires.
  - alert: HighErrorRate
    expr: 'rate(http_request_errors_total{job="re-workflow-backend"}[5m]) > 0.1'
    for: 5m
    labels:
      severity: critical
      service: backend
    annotations:
      summary: 'High error rate detected in RE Workflow Backend'
      description: 'Error rate is {{ $value | printf "%.2f" }} errors/sec for the last 5 minutes.'

  # HighRequestLatency: 95th percentile of request duration, estimated from
  # the histogram buckets over 5m, above 2 (seconds, per the metric name).
  # NOTE(review): the bucket series are not aggregated (no `sum by (le)`), so
  # the quantile is computed per label combination - confirm that is intended.
  - alert: HighRequestLatency
    expr: >-
      histogram_quantile(0.95,
      rate(http_request_duration_seconds_bucket{job="re-workflow-backend"}[5m])) > 2
    for: 5m
    labels:
      severity: warning
      service: backend
    annotations:
      summary: 'High API latency detected'
      description: '95th percentile latency is {{ $value | printf "%.2f" }}s'

  # BackendDown: the scrape health series `up` reports 0 for the backend job
  # continuously for 1m.
  - alert: BackendDown
    expr: 'up{job="re-workflow-backend"} == 0'
    for: 1m
    labels:
      severity: critical
      service: backend
    annotations:
      summary: 'RE Workflow Backend is DOWN'
      description: 'Backend API has been unreachable for more than 1 minute.'

  # HighMemoryUsage: resident memory in bytes divided by 1024 twice, compared
  # against 500, sustained for 10m.
  - alert: HighMemoryUsage
    expr: 'process_resident_memory_bytes{job="re-workflow-backend"} / 1024 / 1024 > 500'
    for: 10m
    labels:
      severity: warning
      service: backend
    annotations:
      summary: 'High memory usage in Backend'
      description: 'Memory usage is {{ $value | printf "%.0f" }}MB'

  # HighEventLoopLag: Node.js event loop lag gauge above 0.5 (seconds, per the
  # metric name) for 5m.
  - alert: HighEventLoopLag
    expr: 'nodejs_eventloop_lag_seconds{job="re-workflow-backend"} > 0.5'
    for: 5m
    labels:
      severity: warning
      service: backend
    annotations:
      summary: 'High Node.js event loop lag'
      description: 'Event loop lag is {{ $value | printf "%.3f" }}s'
|
|
|
|
# ===========================================================================
# TAT/Workflow Alerts
# ===========================================================================
|
|
- name: re-workflow-tat
  # Every rule in this group is evaluated once per minute.
  interval: 1m
  rules:
  # HighTATBreachRate: the TAT breach counter grew by more than 10 within the
  # trailing hour, and that stayed true for 5m.
  - alert: HighTATBreachRate
    expr: 'increase(tat_breaches_total[1h]) > 10'
    for: 5m
    labels:
      severity: warning
      service: workflow
    annotations:
      summary: 'High TAT breach rate detected'
      description: '{{ $value | printf "%.0f" }} TAT breaches in the last hour'

  # LargePendingQueue: the pending-workflow gauge has been above 100 for a
  # full 30m.
  - alert: LargePendingQueue
    expr: 'pending_workflows_count > 100'
    for: 30m
    labels:
      severity: warning
      service: workflow
    annotations:
      summary: 'Large number of pending workflows'
      description: '{{ $value | printf "%.0f" }} workflows pending approval'
|
|
|
|
# ===========================================================================
# Infrastructure Alerts
# ===========================================================================
|
|
- name: infrastructure
  # Every rule in this group is evaluated on a 30-second cycle.
  interval: 30s
  rules:
  # HighCPUUsage: 100 minus the per-instance average idle-CPU percentage
  # (5m rate of the idle-mode counter), above 80 for 10m.
  - alert: HighCPUUsage
    expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80'
    for: 10m
    labels:
      severity: warning
      service: infrastructure
    annotations:
      summary: 'High CPU usage on {{ $labels.instance }}'
      description: 'CPU usage is {{ $value | printf "%.1f" }}%'

  # HighDiskUsage: used share of each filesystem, (size - free) / size * 100,
  # above 85 for 10m.
  # The expression yields one series - and therefore one alert - per
  # filesystem, so the annotations name the mountpoint and device; otherwise
  # alerts for different mounts on the same instance read identically.
  # Memory-backed filesystems (tmpfs, ramfs) are excluded because they do not
  # represent disk capacity.
  - alert: HighDiskUsage
    expr: >-
      (node_filesystem_size_bytes{fstype!~"tmpfs|ramfs"} -
      node_filesystem_free_bytes{fstype!~"tmpfs|ramfs"}) /
      node_filesystem_size_bytes{fstype!~"tmpfs|ramfs"} * 100 > 85
    for: 10m
    labels:
      severity: warning
      service: infrastructure
    annotations:
      summary: 'High disk usage on {{ $labels.instance }} ({{ $labels.mountpoint }})'
      description: 'Disk usage is {{ $value | printf "%.1f" }}% on {{ $labels.mountpoint }} ({{ $labels.device }})'

  # LowMemory: available memory as a percentage of total memory, below 15
  # for 5m.
  - alert: LowMemory
    expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 15'
    for: 5m
    labels:
      severity: critical
      service: infrastructure
    annotations:
      summary: 'Low memory on {{ $labels.instance }}'
      description: 'Available memory is {{ $value | printf "%.1f" }}%'
|
|
|
|
# ===========================================================================
# Loki/Logging Alerts
# ===========================================================================
|
|
- name: logging
  # Every rule in this group is evaluated once per minute.
  interval: 1m
  rules:
  # LokiDown: the scrape health series `up` reports 0 for the loki job
  # continuously for 2m.
  - alert: LokiDown
    expr: 'up{job="loki"} == 0'
    for: 2m
    labels:
      severity: critical
      service: loki
    annotations:
      summary: 'Loki is DOWN'
      description: 'Loki has been unreachable for more than 2 minutes.'
|
|
|