---
# Prometheus alerting rules for the Novalon website.
#
# Every alert carries `service: novalon-website` and a `severity` label
# (`warning` or `critical`). The annotation texts are user-facing
# notification strings and are intentionally left in their original language.
groups:
  - name: novalon_website_alerts
    # Evaluate every rule in this group every 30 seconds.
    interval: 30s
    rules:
      # The scrape target for the job has been unreachable for 1 minute.
      - alert: ServiceDown
        expr: up{job="novalon-website"} == 0
        for: 1m
        labels:
          severity: critical
          service: novalon-website
        annotations:
          summary: "服务不可用"
          description: "Novalon 网站服务已停止响应超过 1 分钟"

      # Share of 5xx responses among all requests is above 5%.
      # The expression is a ratio (5xx rate / total rate) so that the 0.05
      # threshold really means "5%", as the description states; a bare
      # rate() would be an absolute requests-per-second value instead.
      # NOTE(review): assumes the series carry the usual `job`/`instance`
      # target labels - confirm against the exporter.
      - alert: HighErrorRate
        expr: >-
          sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum by (job, instance) (rate(http_requests_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: critical
          service: novalon-website
        annotations:
          summary: "高错误率"
          description: "5xx 错误率在过去 5 分钟内超过 5%: {{ $value }}"

      # 95th-percentile request latency above 1 second for 5 minutes.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
          service: novalon-website
        annotations:
          summary: "高响应时间"
          description: "P95 响应时间超过 1 秒: {{ $value }}s"

      # 95th-percentile request latency above 2 seconds for 2 minutes.
      - alert: VeryHighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 2m
        labels:
          severity: critical
          service: novalon-website
        annotations:
          summary: "极高响应时间"
          description: "P95 响应时间超过 2 秒: {{ $value }}s"

      # rate() of a CPU-seconds counter is CPU-seconds used per second, so
      # 0.8 corresponds to 80% of a single core.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          service: novalon-website
        annotations:
          summary: "CPU 使用率过高"
          description: "CPU 使用率超过 80%: {{ $value }}"

      # Resident memory, converted from bytes to GiB (1024^3), above 1.
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes / 1024 / 1024 / 1024 > 1
        for: 5m
        labels:
          severity: warning
          service: novalon-website
        annotations:
          summary: "内存使用率过高"
          description: "内存使用超过 1GB: {{ $value }}GB"

      # Resident memory, converted from bytes to GiB (1024^3), above 2.
      - alert: VeryHighMemoryUsage
        expr: process_resident_memory_bytes / 1024 / 1024 / 1024 > 2
        for: 2m
        labels:
          severity: critical
          service: novalon-website
        annotations:
          summary: "内存使用率极高"
          description: "内存使用超过 2GB: {{ $value }}GB"

      # Overall request throughput below 0.1 req/s for 10 minutes.
      # The rate is summed per target first; comparing each individual
      # series would fire for every rarely hit status/label combination.
      - alert: LowRequestRate
        expr: sum by (job, instance) (rate(http_requests_total[5m])) < 0.1
        for: 10m
        labels:
          severity: warning
          service: novalon-website
        annotations:
          summary: "请求率过低"
          description: "请求率在过去 10 分钟内低于 0.1 req/s: {{ $value }}"

      # Share of 4xx responses among all requests is above 10%.
      # Written as a ratio for the same reason as HighErrorRate.
      - alert: High4xxRate
        expr: >-
          sum by (job, instance) (rate(http_requests_total{status=~"4.."}[5m]))
          /
          sum by (job, instance) (rate(http_requests_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
          service: novalon-website
        annotations:
          summary: "高 4xx 错误率"
          description: "4xx 错误率在过去 5 分钟内超过 10%: {{ $value }}"