Files
novalon-website/scripts/monitoring/container-monitor.sh
T
张翔 ec33aedd79
ci/woodpecker/push/woodpecker Pipeline failed
fix(docker): adapt Dockerfile.prod for Next.js 16 standalone output structure
- Fix standalone path: dist/standalone/novalon-website/
- Update static files path: dist/static
- Replace curl with wget in health checks (Alpine compatibility)
- Add monitoring and optimization scripts
- Configure external network for docker-compose

This resolves the deployment failure caused by Next.js 16's new standalone output structure.
2026-03-30 09:04:51 +08:00

225 lines
6.1 KiB
Bash
Executable File

#!/bin/bash
# Production container monitoring script.
# Purpose: watch container state, automatically restart abnormal containers,
# and send alert notifications.
#
# Every setting below may be overridden from the environment, for example:
#   ALERT_WEBHOOK="https://..." RESTART_WINDOW=1800 ./container-monitor.sh
# so the webhook URL (which carries a secret key) never has to be committed.
set -e

# Configuration (defaults are used when the variable is unset or empty)
LOG_FILE="${LOG_FILE:-/var/log/container-monitor.log}"
ALERT_WEBHOOK="${ALERT_WEBHOOK:-}"              # WeCom (Enterprise WeChat) webhook URL
MAX_RESTART_COUNT="${MAX_RESTART_COUNT:-3}"     # maximum number of restarts
RESTART_WINDOW="${RESTART_WINDOW:-3600}"        # restart counting window (seconds)

# Colour definitions (literal escape text, rendered when printed with echo -e / %b)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Logging helper.
# Appends a timestamped, colour-free copy of the message to $LOG_FILE and
# prints the message (with colour escapes rendered) to stdout.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message; may contain literal "\033[...m" colour sequences
# Returns:   0 unless writing to stdout fails
log() {
  local message="$1"
  local plain="$message"
  local ansi_re='\\033\[[0-9;]*m'

  # The colour variables hold the literal text "\033[..m", which only becomes
  # a real escape when printed with %b. Strip it so the log file stays plain.
  while [[ "$plain" =~ $ansi_re ]]; do
    plain=${plain//"${BASH_REMATCH[0]}"/}
  done

  # A missing or read-only log file must not abort the monitor under `set -e`;
  # report the problem on stderr and keep going.
  { printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$plain" >> "$LOG_FILE"; } 2>/dev/null \
    || printf 'container-monitor: cannot write to log file %s\n' "$LOG_FILE" >&2

  # %b renders the backslash escapes like `echo -e`, but is safe for messages
  # that look like echo options (e.g. "-n").
  printf '%b\n' "$message"
}
# Log an informational message, prefixed with a blue [INFO] tag.
log_info() {
  local tag="${BLUE}[INFO]${NC}"
  log "${tag} $1"
}
# Log a success message, prefixed with a green [SUCCESS] tag.
log_success() {
  local tag="${GREEN}[SUCCESS]${NC}"
  log "${tag} $1"
}
# Log a warning message, prefixed with a yellow [WARNING] tag.
log_warning() {
  local tag="${YELLOW}[WARNING]${NC}"
  log "${tag} $1"
}
# Log an error message, prefixed with a red [ERROR] tag.
log_error() {
  local tag="${RED}[ERROR]${NC}"
  log "${tag} $1"
}
# Send an alert notification to the configured webhook.
# Globals:   ALERT_WEBHOOK (read) - webhook URL; nothing is sent when empty
# Arguments: $1 - alert title
#            $2 - alert body (markdown)
# Outputs:   a one-line warning on stderr when delivery fails
# Returns:   always 0. Delivery is best-effort: a broken webhook must never
#            abort the monitor, which runs under `set -e`.
send_alert() {
  local title="$1"
  local message="$2"

  if [[ -z "$ALERT_WEBHOOK" ]]; then
    return 0
  fi

  # Escape double quotes so the values cannot terminate the JSON string.
  # Backslash sequences (e.g. \n) are passed through unchanged, as before.
  title=$(printf '%s' "$title" | sed 's/"/\\"/g')
  message=$(printf '%s' "$message" | sed 's/"/\\"/g')

  local timestamp payload
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # "\\n" in the format yields the two characters \n, i.e. a JSON newline.
  printf -v payload \
    '{"msgtype": "markdown", "markdown": {"content": "## %s\\n\\n%s\\n\\n**时间**: %s"}}' \
    "$title" "$message" "$timestamp"

  # -f turns HTTP errors into a non-zero exit; the timeouts keep a hung
  # endpoint from blocking the whole monitoring run.
  if ! curl -s -f --connect-timeout 5 --max-time 10 -X POST "$ALERT_WEBHOOK" \
      -H 'Content-Type: application/json' \
      -d "$payload" > /dev/null 2>&1; then
    printf 'container-monitor: failed to deliver alert: %s\n' "$1" >&2
  fi
  return 0
}
# Check the state of one container.
# Arguments: $1 - container name
# Returns:   0 when the container is running and not reported unhealthy,
#            1 when it does not exist, is not running, or is unhealthy
#            (the reason is logged).
check_container() {
  local container="$1"
  local status health

  # `docker inspect` fails for unknown containers; that is an expected outcome
  # here, so keep whatever it printed (normally nothing) and carry on.
  status=$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null) || true
  if [[ -z "$status" ]]; then
    log_error "容器 $container 不存在"
    return 1
  fi
  if [[ "$status" != "running" ]]; then
    log_warning "容器 $container 状态异常: $status"
    return 1
  fi

  # Health check. The {{if}} guard yields an empty string for containers that
  # define no healthcheck instead of relying on a template error.
  health=$(docker inspect \
    --format='{{if .State.Health}}{{.State.Health.Status}}{{end}}' \
    "$container" 2>/dev/null) || true

  case "$health" in
    "" | healthy | starting)
      # "starting" means the healthcheck has not produced a verdict yet.
      # Treating it as a failure would flag every container right after a
      # restart and restart it again during its start period.
      return 0
      ;;
    *)
      log_warning "容器 $container 健康状态异常: $health"
      return 1
      ;;
  esac
}
# Restart a container, rate-limited to MAX_RESTART_COUNT attempts per
# RESTART_WINDOW seconds.
# Globals:   MAX_RESTART_COUNT, RESTART_WINDOW (read)
#            MONITOR_STATE_DIR (read, optional) - directory for the counter
#            files; defaults to /tmp as before
# Arguments: $1 - container name
# Returns:   0 when `docker restart` succeeded,
#            1 when the restart limit was reached or the restart failed
restart_container() {
  local container="$1"
  local state_dir="${MONITOR_STATE_DIR:-/tmp}"
  local restart_count_file="${state_dir}/${container}_restart_count"
  local restart_time_file="${state_dir}/${container}_restart_time"

  local current_time
  current_time=$(date +%s)

  # Load the restart counter and the start of the current counting window.
  local count=0
  local first_restart_time="$current_time"
  if [[ -f "$restart_count_file" && -f "$restart_time_file" ]]; then
    count=$(cat "$restart_count_file")
    first_restart_time=$(cat "$restart_time_file")
  fi

  # Empty or corrupted state files would break the arithmetic below;
  # treat them as "no restarts recorded yet".
  local uint_re='^[0-9]+$'
  if ! [[ "$count" =~ $uint_re && "$first_restart_time" =~ $uint_re ]]; then
    count=0
    first_restart_time="$current_time"
  fi

  # Reset the counter once the time window has elapsed.
  if (( current_time - first_restart_time > RESTART_WINDOW )); then
    count=0
    first_restart_time="$current_time"
  fi

  # Refuse to restart once the limit is reached.
  if (( count >= MAX_RESTART_COUNT )); then
    log_error "容器 $container 已达到最大重启次数 ($MAX_RESTART_COUNT),停止自动重启"
    send_alert "⚠️ 容器重启次数超限" "容器 **$container** 在过去1小时内已重启 $count 次,已停止自动重启"
    return 1
  fi

  local attempt=$((count + 1))
  log_info "正在重启容器 $container (第 $attempt 次)"
  local restart_rc=0
  docker restart "$container" > /dev/null 2>&1 || restart_rc=$?

  # Record the attempt whether or not it worked, so a container that cannot
  # be restarted is rate-limited too.
  printf '%s\n' "$attempt" > "$restart_count_file"
  printf '%s\n' "$first_restart_time" > "$restart_time_file"

  # Do not report a failed restart as a success.
  if (( restart_rc != 0 )); then
    log_error "容器 $container 重启失败 (docker restart 退出码 $restart_rc)"
    send_alert "❌ 容器重启失败" "容器 **$container** 自动重启失败 (第 $attempt 次)"
    return 1
  fi

  send_alert "🔄 容器自动重启" "容器 **$container** 已自动重启 (第 $attempt 次)"
  return 0
}
# Check a container's CPU and memory usage and warn when a threshold is
# exceeded.
# Globals:   CPU_THRESHOLD, MEM_THRESHOLD (read, optional) - percentage
#            thresholds; both default to 80
# Arguments: $1 - container name
# Returns:   always 0; missing or non-numeric stats are silently skipped
check_resources() {
  local container="$1"
  local cpu_threshold="${CPU_THRESHOLD:-80}" # CPU usage threshold (%)
  local mem_threshold="${MEM_THRESHOLD:-80}" # memory usage threshold (%)

  local stats cpu mem
  stats=$(docker stats --no-stream --format "{{.CPUPerc}}\t{{.MemPerc}}" "$container" 2>/dev/null) || return 0

  # Split "12.34%<TAB>56.78%" into its two fields without forking awk/sed.
  IFS=$'\t ' read -r cpu mem _ <<< "$stats" || true
  cpu=${cpu%\%}
  mem=${mem%\%}
  cpu=${cpu%.*} # keep the integer part
  mem=${mem%.*}

  # Skip placeholder or empty output (e.g. "--") instead of feeding it to an
  # integer comparison, which would print "integer expression expected".
  local uint_re='^[0-9]+$'
  [[ "$cpu" =~ $uint_re && "$mem" =~ $uint_re ]] || return 0

  # 10# forces base 10 so a value with a leading zero is not read as octal.
  if (( 10#$cpu > cpu_threshold )); then
    log_warning "容器 $container CPU使用率过高: ${cpu}%"
    send_alert "⚠️ CPU使用率过高" "容器 **$container** CPU使用率: ${cpu}%"
  fi
  if (( 10#$mem > mem_threshold )); then
    log_warning "容器 $container 内存使用率过高: ${mem}%"
    send_alert "⚠️ 内存使用率过高" "容器 **$container** 内存使用率: ${mem}%"
  fi
  return 0
}
# Main monitoring routine: check every container, try to restart the
# abnormal ones, check resource usage of the normal ones, then log a summary.
# Arguments: [$@] - optional container names; when none are given the
#            built-in list of critical containers is used
# Returns:   0
monitor() {
  local -a containers=("$@")
  if (( ${#containers[@]} == 0 )); then
    # Critical container list
    containers=(
      "woodpecker-server"
      "woodpecker-agent"
      "novalon-nginx"
      "novalon-website"
      "forgejo"
      "postgresql"
      "registry"
    )
  fi

  local container # keep the loop variable out of the global scope
  local unhealthy_count=0
  local restarted_count=0

  log_info "开始容器监控..."

  for container in "${containers[@]}"; do
    if check_container "$container"; then
      # Container is fine: look at its resource usage. A failure here must not
      # abort the run (the script uses `set -e`) before the remaining
      # containers have been checked.
      check_resources "$container" || true
      continue
    fi

    unhealthy_count=$((unhealthy_count + 1))

    # Try to restart the container.
    if restart_container "$container"; then
      restarted_count=$((restarted_count + 1))
      sleep 5 # give the container time to start
      # Check again.
      if check_container "$container"; then
        log_success "容器 $container 重启成功"
      else
        log_error "容器 $container 重启后仍然异常"
      fi
    fi
  done

  # Print the monitoring summary.
  log_info "监控摘要: 总容器 ${#containers[@]}, 异常 $unhealthy_count, 已重启 $restarted_count"

  # Send one aggregated alert when anything was abnormal; a failed
  # notification is not a monitoring failure.
  if (( unhealthy_count > 0 )); then
    send_alert "⚠️ 容器监控告警" "发现 $unhealthy_count 个异常容器,已自动重启 $restarted_count" || true
  fi
  return 0
}
# Main program: dispatch on the sub-command given as the first argument.
#   monitor (default) - run one monitoring pass
#   status            - print a table of containers via `docker ps`
#   logs              - follow the monitor log file
# Any other value prints the usage line and exits with status 1.
main() {
  local action="${1:-monitor}"

  if [[ "$action" == "monitor" ]]; then
    monitor
  elif [[ "$action" == "status" ]]; then
    docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
  elif [[ "$action" == "logs" ]]; then
    tail -f "$LOG_FILE"
  else
    echo "用法: $0 {monitor|status|logs}"
    exit 1
  fi
}
# Script entry point: forward all command-line arguments to main.
main "$@"