---
# Prometheus alerting rules for the test environment.
#
# One rule group containing:
#   - availability alerts (critical): fire when a scrape target's `up`
#     series has been 0 for one minute;
#   - health alerts (warning): error ratio, latency, database connections
#     and cache memory, each required to hold for five minutes.
groups:
  - name: test_environment_alerts
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
      # Availability: `up` is 0 when the last scrape of the target failed.
      - alert: TestAPIDown
        expr: up{job="test-api-gateway"} == 0
        for: 1m
        labels:
          severity: critical
          service: api-gateway
        annotations:
          summary: "测试API网关服务宕机"
          description: "测试API网关服务已宕机超过1分钟"

      - alert: TestAdminBackendDown
        expr: up{job="test-admin-backend"} == 0
        for: 1m
        labels:
          severity: critical
          service: admin-backend
        annotations:
          summary: "测试Admin后端服务宕机"
          description: "测试Admin后端服务已宕机超过1分钟"

      - alert: TestPostgresDown
        expr: up{job="test-postgres"} == 0
        for: 1m
        labels:
          severity: critical
          service: postgres
        annotations:
          summary: "测试PostgreSQL数据库宕机"
          description: "测试PostgreSQL数据库已宕机超过1分钟"

      - alert: TestRedisDown
        expr: up{job="test-redis"} == 0
        for: 1m
        labels:
          severity: critical
          service: redis
        annotations:
          summary: "测试Redis缓存宕机"
          description: "测试Redis缓存已宕机超过1分钟"

      # Error ratio: share of 5xx responses among all responses over the
      # last 5 minutes. A ratio (not a raw requests-per-second rate) is used
      # so that the 0.1 threshold matches the "10%" in the description.
      # With no traffic the division yields NaN or no sample, so the alert
      # stays silent.
      - alert: TestAPIHighErrorRate
        expr: |-
          sum(rate(http_server_requests_seconds_count{job="test-api-gateway",status=~"5.."}[5m]))
          /
          sum(rate(http_server_requests_seconds_count{job="test-api-gateway"}[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
          service: api-gateway
        annotations:
          summary: "测试API网关错误率过高"
          description: "测试API网关5xx错误率超过10%"

      - alert: TestAdminBackendHighErrorRate
        expr: |-
          sum(rate(http_server_requests_seconds_count{job="test-admin-backend",status=~"5.."}[5m]))
          /
          sum(rate(http_server_requests_seconds_count{job="test-admin-backend"}[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
          service: admin-backend
        annotations:
          summary: "测试Admin后端错误率过高"
          description: "测试Admin后端5xx错误率超过10%"

      # Latency: 95th percentile estimated from the request-duration
      # histogram buckets; the threshold is in seconds. The quantile is
      # computed per label set of the bucket series (no aggregation), so
      # the alert can fire once per distinct series.
      - alert: TestAPILatencyHigh
        expr: >-
          histogram_quantile(0.95,
          rate(http_server_requests_seconds_bucket{job="test-api-gateway"}[5m]))
          > 2
        for: 5m
        labels:
          severity: warning
          service: api-gateway
        annotations:
          summary: "测试API网关延迟过高"
          description: "测试API网关P95延迟超过2秒"

      # Connections: number of backends connected to the test database.
      # NOTE(review): assumes the standard postgres_exporter metric name
      # `pg_stat_database_numbackends` - confirm against the deployed
      # exporter.
      - alert: TestPostgresConnectionsHigh
        expr: pg_stat_database_numbackends{datname="everything_test"} > 80
        for: 5m
        labels:
          severity: warning
          service: postgres
        annotations:
          summary: "测试PostgreSQL连接数过高"
          description: "测试PostgreSQL数据库连接数超过80"

      # Memory: used / configured maximum. The `> 0` filter drops instances
      # whose maximum is reported as 0 (no limit configured); dividing by 0
      # would give +Inf and make the alert fire permanently.
      - alert: TestRedisMemoryHigh
        expr: >-
          redis_memory_used_bytes{job="test-redis"}
          / (redis_memory_max_bytes{job="test-redis"} > 0)
          > 0.8
        for: 5m
        labels:
          severity: warning
          service: redis
        annotations:
          summary: "测试Redis内存使用率过高"
          description: "测试Redis内存使用率超过80%"