增加 sh 脚本:检查 Python 服务的内存占用,超过阈值时自动重启并发送飞书告警
This commit is contained in:
105
sh/weblessie-server-02/check_memory_and_restart.sh
Normal file
105
sh/weblessie-server-02/check_memory_and_restart.sh
Normal file
@@ -0,0 +1,105 @@
|
||||
#!/bin/bash

# Memory threshold in MB (10 GB). A service whose summed RSS exceeds this
# value is killed and restarted.
THRESHOLD_MB=10240

# Listening port -> application directory of each supervised service.
declare -A SERVICE_DIRS=(
  ["8000"]="/data/webapps/lessie_sourcing_agents_s5"
  ["8001"]="/data/webapps/lessie_sourcing_agents_s6"
)

# Listening port -> APP_ENV value the service is started with.
declare -A SERVICE_ENVS=(
  ["8000"]="s5"
  ["8001"]="s6"
)
|
||||
|
||||
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
# Outputs: one line on stdout.
now() {
  date +"%Y-%m-%d %H:%M:%S"
}
|
||||
|
||||
#######################################
# Print the summed resident memory (RSS) of every process that has a
# listening TCP socket on the given port.
# Arguments: $1 - TCP port number
# Outputs:   whole megabytes on stdout; 0 when no process listens on the port
#######################################
get_memory_usage_mb() {
  # Everything is local so the caller's PORT (the main-loop variable) and
  # other globals are never overwritten.
  local port=$1
  local pids pid rss_kb
  local total_kb=0

  # Column 9 of lsof output is NAME (e.g. "*:8000"); keep rows whose NAME ends
  # in ":<port>" and collect their PID column, de-duplicated.
  pids=$(lsof -iTCP -sTCP:LISTEN -nP \
    | awk -v port=":$port" '$9 ~ port"$" {print $2}' \
    | sort -u)

  if [ -z "$pids" ]; then
    echo 0
    return
  fi

  # Word splitting on $pids is intentional: one PID per word.
  for pid in $pids; do
    # ps reports RSS in KB; a process may have exited since lsof ran, in which
    # case ps prints nothing and the PID is skipped.
    rss_kb=$(ps -o rss= -p "$pid" 2>/dev/null)
    # Drop padding/whitespace so the arithmetic below only ever sees digits.
    rss_kb=${rss_kb//[!0-9]/}
    if [ -n "$rss_kb" ]; then
      total_kb=$((total_kb + rss_kb))
    fi
  done

  echo $((total_kb / 1024))
}
|
||||
|
||||
#######################################
# Start the gunicorn service that belongs to a port, in the background.
# Globals:   SERVICE_DIRS (read), SERVICE_ENVS (read)
# Arguments: $1 - listening port; must be a key of SERVICE_DIRS/SERVICE_ENVS
# Outputs:   progress lines on stdout, errors on stderr; the service's own
#            output goes to a timestamped file under <app_dir>/logs, and
#            lessie_sourcing_agents_latest.log is re-pointed at it
# Returns:   0 once the process was launched, 1 if it could not be launched
#            (the calling script is NOT terminated, so other services are
#            still handled)
#######################################
start_service() {
  local port=${1:-}
  if [ -z "$port" ]; then
    echo "$(now) start_service: missing port argument" >&2
    return 1
  fi

  local app_dir=${SERVICE_DIRS[$port]:-}
  local app_env=${SERVICE_ENVS[$port]:-}
  if [ -z "$app_dir" ] || [ -z "$app_env" ]; then
    echo "$(now) start_service: no directory/env configured for port $port" >&2
    return 1
  fi

  echo "$(now) 重启服务(port=$port, dir=$app_dir)..."

  # Run in a subshell so that the directory change and the virtualenv
  # activation do not leak into the caller or into the next service.
  (
    cd "$app_dir" || exit 1

    # shellcheck disable=SC1091
    source "$app_dir/.venv/bin/activate" || exit 1

    # The output redirection below fails if the directory is missing.
    mkdir -p "$app_dir/logs" || exit 1

    logfile="$app_dir/logs/lessie_sourcing_agents_$(date +"%Y%m%d_%H%M%S").log"

    nohup env APP_ENV="$app_env" gunicorn \
      -w 4 -k uvicorn.workers.UvicornWorker \
      -b "0.0.0.0:$port" --timeout 300 \
      --max-requests 200 --max-requests-jitter 50 \
      dialogue.app:app \
      > "$logfile" 2>&1 &

    ln -sf "$logfile" "$app_dir/logs/lessie_sourcing_agents_latest.log"
  ) || {
    echo "$(now) start_service: failed to start service on port $port" >&2
    return 1
  }

  # NOTE: this only confirms that the process was launched, not that it
  # stayed up or bound the port.
  echo "$(now) 服务 $port 已重新启动"
}
|
||||
|
||||
# Main loop: for every port configured in SERVICE_DIRS, measure the memory of
# the processes listening on it and, above THRESHOLD_MB, kill the service,
# start it again and send a Feishu alert.
# The ports come from SERVICE_DIRS (sorted numerically) so that adding a
# service only requires a new map entry.
for PORT in $(printf '%s\n' "${!SERVICE_DIRS[@]}" | sort -n); do
  echo "---------------------------"
  echo "$(now) 检查端口 $PORT 的服务"

  usage=$(get_memory_usage_mb "$PORT")
  echo "$(now) 当前内存占用: ${usage}MB"

  # A non-numeric value would make the comparison below error out.
  case "$usage" in
    '' | *[!0-9]*)
      echo "$(now) cannot read memory usage for port $PORT, skipping" >&2
      continue
      ;;
  esac

  # NOTE: usage is 0 when nothing listens on the port, so a service that is
  # down is reported as healthy here and is not started.
  if [ "$usage" -gt "$THRESHOLD_MB" ]; then
    echo "$(now) ⚠️ 占用超过阈值(${THRESHOLD_MB}MB),执行重启"

    # Kill the processes currently listening on the port.
    sh /data/sh/kill_lessie_sourcing_agents.sh "$PORT"

    sleep 2

    APP_ENV=${SERVICE_ENVS[$PORT]}

    # Restart, then report what actually happened.
    if start_service "$PORT"; then
      sh /data/sh/feishu_notify.sh \
        "Python 内存告警" \
        "$(hostname)" \
        "(${APP_ENV})lessie_sourcing_agents(${PORT})" \
        "warning" \
        "**内存占用**: ${usage}MB\n已自动 kill 并重启。"
    else
      echo "$(now) restart of port $PORT failed" >&2
      sh /data/sh/feishu_notify.sh \
        "Python 内存告警" \
        "$(hostname)" \
        "(${APP_ENV})lessie_sourcing_agents(${PORT})" \
        "danger" \
        "**内存占用**: ${usage}MB\n已 kill,但重启失败,请人工处理。"
    fi
  else
    echo "$(now) 内存正常,无需处理。"
  fi
done

echo "$(now) 检查结束"
|
||||
50
sh/weblessie-server-02/feishu_notify.sh
Normal file
50
sh/weblessie-server-02/feishu_notify.sh
Normal file
@@ -0,0 +1,50 @@
|
||||
#!/bin/bash
#
# Send an interactive card to a Feishu group through a bot webhook.
#
# Usage:
#   sh feishu_notify.sh "<title>" "<host>" "<program>" "<level>" "<detail>"
#
# Arguments:
#   $1 title    text of the card header
#   $2 host     host name shown in the card body
#   $3 program  program name shown in the card body
#   $4 level    info / warning / danger; shown in the body and used to pick
#               the header color
#   $5 detail   detail text (Markdown). Backslash sequences such as \n are
#               passed through to the JSON payload unchanged.
#
# Environment:
#   FEISHU_WEBHOOK  webhook URL; overrides the built-in default
#
# Exit status: the exit status of curl.
#
# NOTE: callers run this file with `sh`, so it must stay POSIX-sh compatible
# even though the shebang names bash.

# SECURITY: the webhook URL is a credential. Prefer supplying it through
# FEISHU_WEBHOOK; the literal default is kept only so existing callers keep
# working.
WEBHOOK=${FEISHU_WEBHOOK:-"https://open.feishu.cn/open-apis/bot/v2/hook/c14d9964-3b5e-402a-866e-42768aa45e5e"}

TITLE=${1:-}    # title
HOST=${2:-}     # host
PROGRAM=${3:-}  # program
LEVEL=${4:-}    # level (info / warning / danger)
DETAIL=${5:-}   # detail text (Markdown)

TIME=$(date +"%Y-%m-%d %H:%M:%S")

# Make a value safe to embed inside a JSON string literal: double quotes are
# escaped (an already-escaped \" is left as one escaped quote), and real
# newline / tab / carriage-return characters become \n, \t, \r.
# Other backslashes are deliberately left alone so callers can keep passing
# sequences like "\n".
json_escape() {
  printf '%s' "$1" | awk '
    BEGIN { ORS = "" }
    NR > 1 { print "\\n" }
    {
      gsub(/\\"/, "\"")
      gsub(/"/, "\\\\\"")
      gsub(/\t/, "\\\\t")
      gsub(/\r/, "\\\\r")
      print
    }'
}

# NOTE(review): Feishu's card documentation lists color names (blue, orange,
# red, ...) as the values of header.template rather than info/warning/danger;
# confirm this mapping against the current docs. Unknown values pass through.
case "$LEVEL" in
  info)    TEMPLATE=blue ;;
  warning) TEMPLATE=orange ;;
  danger)  TEMPLATE=red ;;
  *)       TEMPLATE=$LEVEL ;;
esac

TITLE_JSON=$(json_escape "$TITLE")
HOST_JSON=$(json_escape "$HOST")
PROGRAM_JSON=$(json_escape "$PROGRAM")
LEVEL_JSON=$(json_escape "$LEVEL")
DETAIL_JSON=$(json_escape "$DETAIL")
TEMPLATE_JSON=$(json_escape "$TEMPLATE")

# The "\n" sequences below are JSON escapes; an unquoted here-document keeps
# them as a literal backslash followed by "n".
PAYLOAD=$(cat <<EOF
{
  "msg_type": "interactive",
  "card": {
    "header": {
      "template": "${TEMPLATE_JSON}",
      "title": {
        "content": "${TITLE_JSON}",
        "tag": "plain_text"
      }
    },
    "elements": [
      {
        "tag": "div",
        "text": {
          "tag": "lark_md",
          "content": "**主机:** ${HOST_JSON}\n**程序:** ${PROGRAM_JSON}\n**级别:** ${LEVEL_JSON}\n**时间:** ${TIME}\n\n${DETAIL_JSON}"
        }
      }
    ]
  }
}
EOF
)

# Timeouts keep a dead network from hanging the calling job. The response
# body is discarded, but transport errors are still shown (-S) on stderr.
curl -sS --connect-timeout 5 --max-time 15 -X POST \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD" \
  "$WEBHOOK" >/dev/null
status=$?

if [ "$status" -ne 0 ]; then
  echo "feishu_notify: curl failed with exit status $status" >&2
fi
exit "$status"

# Example invocation:
# sh /data/sh/feishu_notify.sh \
#   "⚠️ Python 内存告警" \
#   "$(hostname)" \
#   "lessie_sourcing_agents_s5(8000)" \
#   "danger" \
#   "**内存占用**: ${usage}MB\n已自动 kill 并重启。"
|
||||
23
sh/weblessie-server-02/kill_lessie_sourcing_agents.sh
Normal file
23
sh/weblessie-server-02/kill_lessie_sourcing_agents.sh
Normal file
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
#
# Stop every process that has a listening TCP socket on the given port.
# SIGTERM is sent first; whatever is still alive (or still listening) after a
# grace period gets SIGKILL.
#
# Usage:
#   sh kill_lessie_sourcing_agents.sh <port>
#
# Environment:
#   KILL_GRACE_SECONDS  seconds to wait after SIGTERM before SIGKILL
#                       (default 5; 0 means SIGKILL right away)
#
# Exit status:
#   0  nothing was listening, or the port is free after the kill
#   1  missing/invalid port, lsof not available, or the port is still in use
#
# NOTE: callers run this file with `sh`, so it must stay POSIX-sh compatible
# even though the shebang names bash.

# Port number comes from the first argument.
PORT=${1:-}
GRACE_SECONDS=${KILL_GRACE_SECONDS:-5}

# Require the port argument.
if [ -z "$PORT" ]; then
  echo "❌ 错误:请在执行时指定端口号,例如: sh kill_lessie_sourcing_agents.sh 8000"
  exit 1
fi

# The port is used inside an awk regex below, so accept digits only.
case "$PORT" in
  *[!0-9]*)
    echo "❌ invalid port: $PORT" >&2
    exit 1
    ;;
esac

case "$GRACE_SECONDS" in
  '' | *[!0-9]*) GRACE_SECONDS=5 ;;
esac

# Without lsof the lookup below would silently report "no process".
if ! command -v lsof >/dev/null 2>&1; then
  echo "❌ lsof not found; cannot look up the process on port $PORT" >&2
  exit 1
fi

# Print the PIDs listening on $PORT, each followed by a space.
listening_pids() {
  lsof -iTCP -sTCP:LISTEN -nP \
    | awk -v port=":$PORT" '$9 ~ port"$" {print $2}' \
    | sort -u \
    | tr '\n' ' '
}

# Print those of the given PIDs that can still be signalled, each followed by
# a space.
alive_pids() {
  for _pid in "$@"; do
    if kill -0 "$_pid" 2>/dev/null; then
      printf '%s ' "$_pid"
    fi
  done
}

# Find the processes that occupy the port.
PIDS=$(listening_pids)
PIDS=${PIDS% }

if [ -z "$PIDS" ]; then
  echo "端口 $PORT 没有正在运行的进程。"
  exit 0
fi

echo "发现端口 $PORT 的进程,PID=$PIDS"
echo "正在关闭进程..."

# Word splitting on $PIDS / $LEFTOVER is intentional below: one PID per word.
# shellcheck disable=SC2086
kill $PIDS 2>/dev/null

waited=0
while [ "$waited" -lt "$GRACE_SECONDS" ]; do
  # shellcheck disable=SC2086
  [ -z "$(alive_pids $PIDS)" ] && break
  sleep 1
  waited=$((waited + 1))
done

# Force-kill survivors of the original set plus anything that is still (or
# newly) listening on the port.
# shellcheck disable=SC2086
LEFTOVER="$(alive_pids $PIDS)$(listening_pids)"
if [ -n "$LEFTOVER" ]; then
  # shellcheck disable=SC2086
  kill -9 $LEFTOVER 2>/dev/null
fi

# Only report success once nothing listens on the port any more.
tries=0
while [ -n "$(listening_pids)" ]; do
  if [ "$tries" -ge 3 ]; then
    echo "❌ port $PORT is still in use after SIGKILL" >&2
    exit 1
  fi
  sleep 1
  tries=$((tries + 1))
done

echo "进程 $PIDS 已经被杀掉,端口 $PORT 已释放。"
|
||||
Reference in New Issue
Block a user