ec33aedd79
ci/woodpecker/push/woodpecker Pipeline failed
- Fix standalone path: dist/standalone/novalon-website/ - Update static files path: dist/static - Replace curl with wget in health checks (Alpine compatibility) - Add monitoring and optimization scripts - Configure external network for docker-compose This resolves the deployment failure caused by Next.js 16's new standalone output structure.
225 lines
6.1 KiB
Bash
Executable File
225 lines
6.1 KiB
Bash
Executable File
#!/bin/bash
#
# Production container monitoring script.
# Purpose: watch container state, automatically restart abnormal containers,
# and send alert notifications.
#
# Every configuration value below can be overridden from the environment of
# the invoking process; the value after ':-' is the built-in default.

set -e

# Configuration
LOG_FILE="${LOG_FILE:-/var/log/container-monitor.log}"
# Enterprise WeChat (WeCom) webhook URL. Kept out of the source so the secret
# can be injected via the environment; an empty value means no alert is sent.
ALERT_WEBHOOK="${ALERT_WEBHOOK:-}"
MAX_RESTART_COUNT="${MAX_RESTART_COUNT:-3}" # maximum number of restarts
RESTART_WINDOW="${RESTART_WINDOW:-3600}"    # restart-count window (seconds)

# Colour definitions (backslash escapes, expanded by the logging helper)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'
|
|
|
|
# Logging helpers
#
# log MESSAGE
#   Print MESSAGE to stdout with its backslash escapes (colour codes) expanded,
#   and append a timestamped, colour-free copy to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message; may contain escapes such as \033[0;31m
# Returns:   0, even when the log file cannot be written
log() {
  local message=${1-}
  local rendered plain

  # Expand the escapes once so console and file output derive from one value.
  printf -v rendered '%b' "$message"

  # Strip ANSI colour sequences: the log file should stay plain, greppable text
  # instead of carrying raw escape codes.
  plain=$(printf '%s' "$rendered" | sed $'s/\033\\[[0-9;]*m//g')

  # Best effort: an unwritable log file must not abort monitoring under
  # 'set -e'. The braces make the redirection error itself suppressible.
  { printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$plain" >> "$LOG_FILE"; } 2>/dev/null ||
    printf 'warning: cannot write to log file %s\n' "$LOG_FILE" >&2

  printf '%s\n' "$rendered"
}
|
|
|
|
# log_info MESSAGE
#   Forward MESSAGE to log, prefixed with a blue [INFO] tag.
log_info() {
  local line
  printf -v line '%s[INFO]%s %s' "$BLUE" "$NC" "$1"
  log "$line"
}
|
|
|
|
# log_success MESSAGE
#   Forward MESSAGE to log, prefixed with a green [SUCCESS] tag.
log_success() {
  local line
  printf -v line '%s[SUCCESS]%s %s' "$GREEN" "$NC" "$1"
  log "$line"
}
|
|
|
|
# log_warning MESSAGE
#   Forward MESSAGE to log, prefixed with a yellow [WARNING] tag.
log_warning() {
  local line
  printf -v line '%s[WARNING]%s %s' "$YELLOW" "$NC" "$1"
  log "$line"
}
|
|
|
|
# log_error MESSAGE
#   Forward MESSAGE to log, prefixed with a red [ERROR] tag.
log_error() {
  local line
  printf -v line '%s[ERROR]%s %s' "$RED" "$NC" "$1"
  log "$line"
}
|
|
|
|
# Send an alert notification

# _json_escape STRING
#   Print STRING escaped for use inside a JSON string literal
#   (backslash, double quote, newline, carriage return, tab).
_json_escape() {
  local s=${1-}
  s=${s//\\/\\\\} # backslashes first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# send_alert TITLE MESSAGE
#   POST a markdown message ("## TITLE", MESSAGE and a timestamp line) to
#   $ALERT_WEBHOOK. Does nothing when ALERT_WEBHOOK is empty.
# Globals:   ALERT_WEBHOOK (read)
# Arguments: $1 - alert title, $2 - alert body (markdown)
# Returns:   always 0; a delivery failure is logged, never propagated, so a
#            broken webhook cannot stop the monitor under 'set -e'
send_alert() {
  local title=${1-}
  local message=${2-}
  local payload

  # No webhook configured: alerting is disabled.
  [ -n "${ALERT_WEBHOOK:-}" ] || return 0

  # Title and message are escaped so quotes, backslashes or line breaks in
  # them cannot corrupt the JSON document. '\\n' emits a literal \n escape.
  payload=$(printf '{"msgtype": "markdown", "markdown": {"content": "## %s\\n\\n%s\\n\\n**时间**: %s"}}' \
    "$(_json_escape "$title")" \
    "$(_json_escape "$message")" \
    "$(date '+%Y-%m-%d %H:%M:%S')")

  # --max-time bounds a hanging webhook; --fail turns HTTP errors into a
  # non-zero exit status so they are noticed.
  if ! curl -s --fail --max-time 10 -X POST "$ALERT_WEBHOOK" \
    -H 'Content-Type: application/json' \
    -d "$payload" > /dev/null 2>&1; then
    log_warning "告警发送失败: ${title}" || true
  fi

  return 0
}
|
|
|
|
# Check container state
#
# check_container NAME
#   Report whether container NAME is running and not failing its health check.
# Arguments: $1 - container name
# Returns:   0 when the state is "running" and the health status is empty,
#            "healthy" or "starting"; 1 otherwise (the reason is logged)
check_container() {
  local container=$1
  local status health

  # Declaration and assignment are split so 'local' does not mask the exit
  # status; a failed inspect just leaves the variable empty (handled below).
  status=$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null) || true

  if [ -z "$status" ]; then
    log_error "容器 $container 不存在"
    return 1
  fi

  if [ "$status" != "running" ]; then
    log_warning "容器 $container 状态异常: $status"
    return 1
  fi

  # Check health status. The {{if}} guard yields an empty string for
  # containers without a health check instead of depending on how the
  # installed Docker version reports a missing .State.Health field.
  health=$(docker inspect \
    --format='{{if .State.Health}}{{.State.Health.Status}}{{end}}' \
    "$container" 2>/dev/null) || true

  case "$health" in
    '' | healthy | starting)
      # "starting" means the health check has not produced a verdict yet;
      # failing here would restart a container that is still coming up.
      return 0
      ;;
    *)
      log_warning "容器 $container 健康状态异常: $health"
      return 1
      ;;
  esac
}
|
|
|
|
# Restart a container
#
# restart_container NAME
#   Restart container NAME unless it has already been restarted
#   MAX_RESTART_COUNT times within the current RESTART_WINDOW. The attempt
#   counter and the window start time are kept in two small state files.
# Globals:   MAX_RESTART_COUNT, RESTART_WINDOW (read)
#            RESTART_STATE_DIR (read, optional) - directory for the state
#              files; defaults to /tmp. Names there are predictable, so point
#              this at a private directory on shared hosts.
# Arguments: $1 - container name
# Returns:   0 when 'docker restart' succeeded; 1 when the restart limit was
#            reached or the restart command failed
restart_container() {
  local container=$1
  local state_dir=${RESTART_STATE_DIR:-/tmp}
  local restart_count_file="${state_dir}/${container}_restart_count"
  local restart_time_file="${state_dir}/${container}_restart_time"
  local count=0
  local current_time first_restart_time time_diff attempt

  current_time=$(date +%s)
  first_restart_time=$current_time

  # Load the restart history, if both state files exist
  if [ -f "$restart_count_file" ] && [ -f "$restart_time_file" ]; then
    count=$(cat "$restart_count_file" 2>/dev/null) || count=0
    first_restart_time=$(cat "$restart_time_file" 2>/dev/null) || first_restart_time=$current_time
  fi

  # Empty or corrupt state files fall back to a fresh window instead of
  # breaking the numeric comparisons below.
  [[ $count =~ ^[0-9]+$ ]] || count=0
  [[ $first_restart_time =~ ^[0-9]+$ ]] || first_restart_time=$current_time
  count=$((10#$count)) # force base 10 so a leading zero is not read as octal
  first_restart_time=$((10#$first_restart_time))

  time_diff=$((current_time - first_restart_time))

  # Past the time window: reset the counter
  if [ "$time_diff" -gt "$RESTART_WINDOW" ]; then
    count=0
    first_restart_time=$current_time
  fi

  # Give up once the maximum restart count is reached
  if [ "$count" -ge "$MAX_RESTART_COUNT" ]; then
    log_error "容器 $container 已达到最大重启次数 ($MAX_RESTART_COUNT),停止自动重启"
    send_alert "⚠️ 容器重启次数超限" "容器 **$container** 在过去1小时内已重启 $count 次,已停止自动重启"
    return 1
  fi

  # Restart the container
  attempt=$((count + 1))
  log_info "正在重启容器 $container (第 $attempt 次)"

  local restart_rc=0
  docker restart "$container" > /dev/null 2>&1 || restart_rc=$?

  # Record the attempt whether or not it worked, so a container that cannot
  # be restarted still runs into MAX_RESTART_COUNT.
  echo "$attempt" > "$restart_count_file"
  echo "$first_restart_time" > "$restart_time_file"

  # A failed 'docker restart' must not be reported as a successful restart.
  if [ "$restart_rc" -ne 0 ]; then
    log_error "容器 $container 重启失败 (第 $attempt 次)"
    send_alert "❌ 容器重启失败" "容器 **$container** 重启失败 (第 $attempt 次)"
    return 1
  fi

  # Send the alert
  send_alert "🔄 容器自动重启" "容器 **$container** 已自动重启 (第 $attempt 次)"

  return 0
}
|
|
|
|
# Check container resource usage
#
# check_resources NAME
#   Read one 'docker stats' sample for container NAME and warn/alert when the
#   integer part of its CPU or memory percentage exceeds the threshold.
# Globals:   CPU_THRESHOLD, MEM_THRESHOLD (read, optional; default 80 each)
# Arguments: $1 - container name
# Returns:   always 0; samples that are missing or not numeric are skipped
check_resources() {
  local container=$1
  local cpu_threshold=${CPU_THRESHOLD:-80} # CPU usage threshold (percent)
  local mem_threshold=${MEM_THRESHOLD:-80} # memory usage threshold (percent)
  local stats cpu mem _rest

  stats=$(docker stats --no-stream --format "{{.CPUPerc}}\t{{.MemPerc}}" "$container" 2>/dev/null) || true

  # Split the "<cpu>%<TAB><mem>%" sample with the shell instead of awk/sed.
  IFS=$' \t' read -r cpu mem _rest <<< "$stats" || true

  # Drop the trailing '%' and keep only the integer part.
  cpu=${cpu%\%}
  cpu=${cpu%%.*}
  mem=${mem%\%}
  mem=${mem%%.*}

  # Compare only real integers: placeholder values such as "--" would make a
  # numeric test fail with "integer expression expected". 10# forces base 10.
  if [[ $cpu =~ ^[0-9]+$ ]] && ((10#$cpu > cpu_threshold)); then
    log_warning "容器 $container CPU使用率过高: ${cpu}%"
    send_alert "⚠️ CPU使用率过高" "容器 **$container** CPU使用率: ${cpu}%"
  fi

  if [[ $mem =~ ^[0-9]+$ ]] && ((10#$mem > mem_threshold)); then
    log_warning "容器 $container 内存使用率过高: ${mem}%"
    send_alert "⚠️ 内存使用率过高" "容器 **$container** 内存使用率: ${mem}%"
  fi

  return 0
}
|
|
|
|
# Main monitoring routine
#
# monitor
#   Check every container in the built-in list once. Healthy containers get a
#   resource check; abnormal ones are restarted and re-checked after a short
#   wait. Ends with a summary log line and, if anything was abnormal, a
#   summary alert.
# Returns:   0 when nothing was abnormal, otherwise the status of the summary
#            alert call
monitor() {
  log_info "开始容器监控..."

  # Critical containers to watch
  local containers=(
    "woodpecker-server"
    "woodpecker-agent"
    "novalon-nginx"
    "novalon-website"
    "forgejo"
    "postgresql"
    "registry"
  )

  local container # keep the loop variable out of the global scope
  local unhealthy_count=0
  local restarted_count=0

  for container in "${containers[@]}"; do
    if check_container "$container"; then
      # Resource check. Running it on the left of '||' keeps a failure for
      # one container from terminating the whole run under 'set -e'.
      check_resources "$container" ||
        log_warning "容器 $container 资源检查失败" || true
      continue
    fi

    unhealthy_count=$((unhealthy_count + 1))

    # Try to restart the container
    if restart_container "$container"; then
      restarted_count=$((restarted_count + 1))
      sleep 5 # give the container time to start

      # Check again
      if check_container "$container"; then
        log_success "容器 $container 重启成功"
      else
        log_error "容器 $container 重启后仍然异常"
      fi
    fi
  done

  # Print the monitoring summary
  log_info "监控摘要: 总容器 ${#containers[@]}, 异常 $unhealthy_count, 已重启 $restarted_count"

  # Send one aggregated alert when any container was abnormal
  if [ "$unhealthy_count" -gt 0 ]; then
    send_alert "⚠️ 容器监控告警" "发现 $unhealthy_count 个异常容器,已自动重启 $restarted_count 个"
  fi
}
|
|
|
|
# Main program
#
# main [COMMAND]
#   Dispatch on COMMAND (default: monitor):
#     monitor - run one monitoring pass
#     status  - print a 'docker ps' table of names, status and ports
#     logs    - follow $LOG_FILE with 'tail -f'
# Globals:   LOG_FILE (read)
# Arguments: $1 - command name, optional
# Returns:   exits with status 1 after printing usage for an unknown command
main() {
  local command=${1:-monitor}

  case "$command" in
    monitor)
      monitor
      ;;
    status)
      docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
      ;;
    logs)
      tail -f "$LOG_FILE"
      ;;
    *)
      # This is an error path, so the usage text goes to stderr and does not
      # pollute captured or piped stdout.
      echo "用法: $0 {monitor|status|logs}" >&2
      exit 1
      ;;
  esac
}
|
|
|
|
# Script entry point: pass every command-line argument through to main.
main "$@"
|