#!/bin/bash
#
# Memory watchdog for the lessie_sourcing_agents gunicorn services.
#
# For each configured port, sums the resident memory (RSS) of every process
# listening on that port. When the total exceeds THRESHOLD_MB the service is
# killed via /data/sh/kill_lessie_sourcing_agents.sh, started again, and a
# Feishu alert is sent via /data/sh/feishu_notify.sh.
#
# Exit status: 0 when every check completed, 1 when at least one restart failed.

# Restart threshold in MB (12288 MB = 12 GiB).
# NOTE(review): this was previously described as "10G", which does not match
# the value; 10 GiB would be 10240. The value is left unchanged - confirm.
THRESHOLD_MB=12288

# Listening port -> application directory.
declare -A SERVICE_DIRS=(
  ["8000"]="/data/webapps/lessie_sourcing_agents_s5"
  ["8001"]="/data/webapps/lessie_sourcing_agents_s6"
)

# Listening port -> APP_ENV value passed to the service.
declare -A SERVICE_ENVS=(
  ["8000"]="s5"
  ["8001"]="s6"
)

#######################################
# Print the current local time as "YYYY-mm-dd HH:MM:SS" (used as log prefix).
# Outputs: timestamp on stdout
#######################################
now() {
  date +"%Y-%m-%d %H:%M:%S"
}

#######################################
# Sum the RSS of all processes that hold a listening TCP socket on a port.
# Arguments: $1 - TCP port number
# Outputs:   total RSS in whole MB on stdout (0 when nothing is listening)
#######################################
get_memory_usage_mb() {
  local port=$1
  local pids pid rss
  local total_kb=0

  # Column 9 of lsof output is the socket name (e.g. "*:8000"); anchoring on
  # ":<port>$" keeps ":18000" from matching port 8000. Column 2 is the PID.
  pids=$(lsof -iTCP -sTCP:LISTEN -nP \
    | awk -v port=":$port" '$9 ~ port"$" {print $2}' \
    | sort -u)

  if [[ -z "$pids" ]]; then
    echo 0
    return
  fi

  while IFS= read -r pid; do
    # ps reports RSS in KB; a process that exited meanwhile yields nothing.
    rss=$(ps -o rss= -p "$pid" 2>/dev/null)
    rss=${rss//[[:space:]]/}
    if [[ "$rss" =~ ^[0-9]+$ ]]; then
      total_kb=$((total_kb + rss))
    fi
  done <<<"$pids"

  echo $((total_kb / 1024))
}

#######################################
# Start the gunicorn service configured for a port, in the background.
# Globals:   SERVICE_DIRS (read), SERVICE_ENVS (read)
# Arguments: $1 - TCP port number (must be a key of both maps)
# Outputs:   progress on stdout, failures on stderr
# Returns:   0 once the launch was issued, 1 when it could not be attempted.
#            The background process itself is not monitored here.
#######################################
start_service() {
  local port=${1:-}
  local app_dir='' app_env=''

  # An empty subscript is an error for associative arrays, so check first.
  if [[ -n "$port" ]]; then
    app_dir=${SERVICE_DIRS[$port]:-}
    app_env=${SERVICE_ENVS[$port]:-}
  fi
  if [[ -z "$app_dir" || -z "$app_env" ]]; then
    echo "$(now) 未配置的端口: '${port}',无法启动服务" >&2
    return 1
  fi

  echo "$(now) 重启服务(port=$port, dir=$app_dir)..."

  local activate="$app_dir/.venv/bin/activate"
  local log_dir="$app_dir/logs"
  local timestamp logfile
  timestamp=$(date +"%Y%m%d_%H%M%S")
  logfile="$log_dir/lessie_sourcing_agents_${timestamp}.log"

  if [[ ! -d "$app_dir" ]]; then
    echo "$(now) 目录不存在: $app_dir" >&2
    return 1
  fi
  if [[ ! -f "$activate" ]]; then
    echo "$(now) 找不到虚拟环境: $activate" >&2
    return 1
  fi
  if ! mkdir -p "$log_dir"; then
    echo "$(now) 无法创建日志目录: $log_dir" >&2
    return 1
  fi

  local -a cmd=(
    gunicorn -w 4 -k uvicorn.workers.UvicornWorker
    -b "0.0.0.0:$port" --timeout 300 dialogue.app:app
    --max-requests 200 --max-requests-jitter 50
  )

  # Run in a subshell so the cd and the activated virtualenv do not leak into
  # the caller and affect the next service that gets checked.
  (
    cd "$app_dir" || exit 1
    # shellcheck disable=SC1090
    source "$activate"
    nohup env APP_ENV="$app_env" "${cmd[@]}" >"$logfile" 2>&1 &
    # The "latest" symlink is a convenience; failing to update it is not fatal.
    ln -sf "$logfile" "$log_dir/lessie_sourcing_agents_latest.log" \
      || echo "$(now) 无法更新 latest 日志链接" >&2
    exit 0
  ) || {
    echo "$(now) 服务 $port 启动失败" >&2
    return 1
  }

  echo "$(now) 服务 $port 已重新启动"
}

#######################################
# Check every configured service and restart those above the threshold.
# Globals: THRESHOLD_MB (read), SERVICE_ENVS (read)
# Returns: 0 normally, 1 when at least one restart failed
#######################################
main() {
  local port usage app_env detail
  local rc=0

  for port in 8000 8001; do
    echo "---------------------------"
    echo "$(now) 检查端口 $port 的服务"

    usage=$(get_memory_usage_mb "$port")
    echo "$(now) 当前内存占用: ${usage}MB"

    if ((usage > THRESHOLD_MB)); then
      echo "$(now) ⚠️ 占用超过阈值(${THRESHOLD_MB}MB),执行重启"

      # Kill the running processes, then give them a moment to release the port.
      sh /data/sh/kill_lessie_sourcing_agents.sh "$port"
      sleep 2

      # A failed restart must not abort the run: the alert below still has to
      # go out and the remaining service still has to be checked.
      if start_service "$port"; then
        detail="**内存占用**: ${usage}MB\n已自动 kill 并重启。"
      else
        rc=1
        detail="**内存占用**: ${usage}MB\n已自动 kill,但重启失败,请人工处理。"
      fi

      # Feishu alert.
      app_env=${SERVICE_ENVS[$port]}
      sh /data/sh/feishu_notify.sh \
        "Python 内存告警" \
        "$(hostname)" \
        "(${app_env})lessie_sourcing_agents(${port})" \
        "warning" \
        "$detail"
    else
      echo "$(now) 内存正常,无需处理。"
    fi
  done

  echo "$(now) 检查结束"
  return "$rc"
}

main "$@"