#!/bin/bash
#
# Production container monitoring script.
#
# Purpose: check a fixed list of containers, restart the ones that are not
# running / not healthy (rate-limited per container), and push alerts to a
# WeCom (enterprise WeChat) webhook.
#
# Usage: container-monitor.sh {monitor|status|logs}   (default: monitor)
#
# Every setting below can be overridden from the environment.

set -euo pipefail

# ----------------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------------
LOG_FILE="${LOG_FILE:-/var/log/container-monitor.log}"
ALERT_WEBHOOK="${ALERT_WEBHOOK:-}"              # WeCom webhook URL; empty disables alerts
MAX_RESTART_COUNT="${MAX_RESTART_COUNT:-3}"     # max restart attempts per window
RESTART_WINDOW="${RESTART_WINDOW:-3600}"        # restart counting window (seconds)
RESTART_WAIT="${RESTART_WAIT:-5}"               # seconds to wait after a restart before re-checking
STATE_DIR="${STATE_DIR:-/var/lib/container-monitor}"  # where restart counters are persisted

# Colour definitions. $'...' stores the real ESC byte, so no `echo -e` is
# needed to render them and they can be stripped reliably for the log file.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'

# ----------------------------------------------------------------------------
# Logging
# ----------------------------------------------------------------------------

#######################################
# Append a timestamped copy of the message (colour codes removed) to LOG_FILE
# and print the message, colours included, to stdout.
# A log file that cannot be written never aborts the script; a single warning
# is printed to stderr instead.
# Arguments: $1 - message (may contain the colour variables above)
#######################################
log() {
  local message="${1:-}"
  local plain="$message"
  local code timestamp

  for code in "$RED" "$GREEN" "$YELLOW" "$BLUE" "$NC"; do
    plain=${plain//"$code"/}
  done

  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  if ! { printf '[%s] %s\n' "$timestamp" "$plain" >> "$LOG_FILE"; } 2>/dev/null; then
    if [[ -z "${_LOG_WRITE_WARNED:-}" ]]; then
      printf 'warning: cannot write to log file %s\n' "$LOG_FILE" >&2
      _LOG_WRITE_WARNED=1
    fi
  fi

  printf '%s\n' "$message"
}

log_info() {
  log "${BLUE}[INFO]${NC} ${1:-}"
}

log_success() {
  log "${GREEN}[SUCCESS]${NC} ${1:-}"
}

log_warning() {
  log "${YELLOW}[WARNING]${NC} ${1:-}"
}

log_error() {
  log "${RED}[ERROR]${NC} ${1:-}"
}

# ----------------------------------------------------------------------------
# Alerting
# ----------------------------------------------------------------------------

# Escape a string so it can be embedded inside a JSON string literal.
_json_escape() {
  local s="${1:-}"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

#######################################
# Send a markdown alert to ALERT_WEBHOOK (no-op when it is empty).
# Alerting is best effort: a failed or slow webhook is logged as a warning
# and never aborts the monitoring run.
# Arguments: $1 - title, $2 - message body
# Returns:   always 0
#######################################
send_alert() {
  local title="${1:-}"
  local message="${2:-}"
  local timestamp payload

  if [[ -z "$ALERT_WEBHOOK" ]]; then
    return 0
  fi

  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # `\\n` in the format emits a literal backslash-n, i.e. a JSON newline.
  printf -v payload \
    '{"msgtype":"markdown","markdown":{"content":"## %s\\n\\n%s\\n\\n**时间**: %s"}}' \
    "$(_json_escape "$title")" "$(_json_escape "$message")" "$timestamp"

  if ! curl -s -f --max-time 10 -X POST "$ALERT_WEBHOOK" \
    -H 'Content-Type: application/json' \
    -d "$payload" > /dev/null 2>&1; then
    log_warning "告警发送失败: $title"
  fi
  return 0
}

# ----------------------------------------------------------------------------
# Container checks
# ----------------------------------------------------------------------------

#######################################
# Check that a container exists, is running and (when it defines a
# healthcheck) is not reported unhealthy.
# A health status of "starting" is accepted: the container is still inside
# its start period, and restarting it there would only cause a restart loop.
# Arguments: $1 - container name
# Returns:   0 when the container is fine, 1 otherwise
#######################################
check_container() {
  local container="${1:-}"
  local status health

  # An inspect failure (unknown container, docker unavailable) yields "".
  status=$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null) || status=""

  if [[ -z "$status" ]]; then
    log_error "容器 $container 不存在"
    return 1
  fi

  if [[ "$status" != "running" ]]; then
    log_warning "容器 $container 状态异常: $status"
    return 1
  fi

  # Containers without a healthcheck have no .State.Health; the `if` guard
  # makes the template print nothing instead of failing.
  health=$(docker inspect \
    --format='{{if .State.Health}}{{.State.Health.Status}}{{end}}' \
    "$container" 2>/dev/null) || health=""

  if [[ -n "$health" && "$health" != "healthy" && "$health" != "starting" ]]; then
    log_warning "容器 $container 健康状态异常: $health"
    return 1
  fi

  return 0
}

#######################################
# Restart a container, allowing at most MAX_RESTART_COUNT attempts per
# RESTART_WINDOW seconds. Attempts are persisted in STATE_DIR.
# Failed restart attempts are counted too, so a container that cannot be
# restarted does not trigger an endless stream of attempts and alerts.
# Arguments: $1 - container name
# Returns:   0 when `docker restart` succeeded; 1 when the limit is reached,
#            the state directory is unusable, or the restart failed
#######################################
restart_container() {
  local container="${1:-}"
  local count_file="${STATE_DIR}/${container}_restart_count"
  local time_file="${STATE_DIR}/${container}_restart_time"
  local now count=0 window_start attempt

  now=$(date +%s)
  window_start=$now

  # Without persisted counters the rate limit cannot be enforced, so refuse
  # to restart rather than risk an unbounded restart loop.
  if ! mkdir -p -- "$STATE_DIR" 2>/dev/null; then
    log_error "无法创建状态目录 $STATE_DIR,跳过容器 $container 的自动重启"
    return 1
  fi

  if [[ -f "$count_file" && -f "$time_file" ]]; then
    count=$(cat -- "$count_file" 2>/dev/null) || count=""
    window_start=$(cat -- "$time_file" 2>/dev/null) || window_start=""
  fi

  # Empty or corrupted state files start a fresh window.
  if ! [[ "$count" =~ ^[0-9]+$ && "$window_start" =~ ^[0-9]+$ ]]; then
    count=0
    window_start=$now
  fi
  # Force base 10 so a value with leading zeros is not read as octal.
  count=$((10#$count))
  window_start=$((10#$window_start))

  # Window expired: reset the counter.
  if (( now - window_start > RESTART_WINDOW )); then
    count=0
    window_start=$now
  fi

  if (( count >= MAX_RESTART_COUNT )); then
    log_error "容器 $container 已达到最大重启次数 ($MAX_RESTART_COUNT),停止自动重启"
    send_alert "⚠️ 容器重启次数超限" \
      "容器 **$container** 在过去 $((RESTART_WINDOW / 60)) 分钟内已重启 $count 次,已停止自动重启"
    return 1
  fi

  attempt=$((count + 1))
  log_info "正在重启容器 $container (第 $attempt 次)"

  # Record the attempt before restarting so failures are rate-limited too.
  if ! { printf '%s\n' "$attempt" > "$count_file" \
    && printf '%s\n' "$window_start" > "$time_file"; } 2>/dev/null; then
    log_warning "无法写入容器 $container 的重启计数文件"
  fi

  if ! docker restart "$container" > /dev/null 2>&1; then
    log_error "容器 $container 重启命令执行失败"
    send_alert "❌ 容器重启失败" "容器 **$container** 第 $attempt 次自动重启失败"
    return 1
  fi

  send_alert "🔄 容器自动重启" "容器 **$container** 已自动重启 (第 $attempt 次)"
  return 0
}

#######################################
# Warn and alert when a container's CPU or memory usage exceeds 80%.
# Missing or non-numeric stats (docker reports "--" for some containers)
# are skipped silently.
# Arguments: $1 - container name
# Returns:   always 0
#######################################
check_resources() {
  local container="${1:-}"
  local cpu_threshold=80   # CPU usage threshold (%)
  local mem_threshold=80   # memory usage threshold (%)
  local stats cpu="" mem=""

  stats=$(docker stats --no-stream --format '{{.CPUPerc}} {{.MemPerc}}' \
    "$container" 2>/dev/null) || return 0

  read -r cpu mem _ <<< "$stats" || true

  # "12.50%" -> "12": drop the percent sign, keep the integer part.
  cpu=${cpu%\%}
  mem=${mem%\%}
  cpu=${cpu%.*}
  mem=${mem%.*}

  if ! [[ "$cpu" =~ ^[0-9]+$ && "$mem" =~ ^[0-9]+$ ]]; then
    return 0
  fi

  if (( 10#$cpu > cpu_threshold )); then
    log_warning "容器 $container CPU使用率过高: ${cpu}%"
    send_alert "⚠️ CPU使用率过高" "容器 **$container** CPU使用率: ${cpu}%"
  fi

  if (( 10#$mem > mem_threshold )); then
    log_warning "容器 $container 内存使用率过高: ${mem}%"
    send_alert "⚠️ 内存使用率过高" "容器 **$container** 内存使用率: ${mem}%"
  fi

  return 0
}

# ----------------------------------------------------------------------------
# Main monitoring loop
# ----------------------------------------------------------------------------

#######################################
# Check every critical container once: restart the abnormal ones and check
# resource usage of the good ones, then log a summary and, if anything was
# abnormal, send a summary alert.
#######################################
monitor() {
  log_info "开始容器监控..."

  # Critical containers
  local containers=(
    "woodpecker-server"
    "woodpecker-agent"
    "novalon-nginx"
    "novalon-website"
    "forgejo"
    "postgresql"
    "registry"
  )

  local unhealthy_count=0
  local restarted_count=0
  local container

  for container in "${containers[@]}"; do
    if ! check_container "$container"; then
      unhealthy_count=$((unhealthy_count + 1))

      if restart_container "$container"; then
        restarted_count=$((restarted_count + 1))
        sleep "$RESTART_WAIT"   # give the container time to come up

        if check_container "$container"; then
          log_success "容器 $container 重启成功"
        else
          log_error "容器 $container 重启后仍然异常"
        fi
      fi
    else
      check_resources "$container"
    fi
  done

  log_info "监控摘要: 总容器 ${#containers[@]}, 异常 $unhealthy_count, 已重启 $restarted_count"

  if (( unhealthy_count > 0 )); then
    send_alert "⚠️ 容器监控告警" "发现 $unhealthy_count 个异常容器,已自动重启 $restarted_count 个"
  fi
}

# ----------------------------------------------------------------------------
# Entry point
# ----------------------------------------------------------------------------
main() {
  case "${1:-monitor}" in
    monitor)
      monitor
      ;;
    status)
      docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
      ;;
    logs)
      tail -f -- "$LOG_FILE"
      ;;
    *)
      printf '用法: %s {monitor|status|logs}\n' "$0" >&2
      exit 1
      ;;
  esac
}

main "$@"