增加sh内存检查python的脚本

This commit is contained in:
dxin
2025-11-26 14:11:03 +08:00
parent 38b93a1115
commit c0b6381910
10 changed files with 430 additions and 0 deletions

4
sh/crontab -e Normal file
View File

@@ -0,0 +1,4 @@
# Install with "crontab -e", then add the line below (runs the memory check every minute):
* * * * * /bin/bash /data/sh/check_memory_and_restart.sh >> /data/sh/logs/agents_memcheck.log 2>&1

View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Stop every process that is listening on a given TCP port.
#
# Usage:  kill_lessie_sourcing_agents.sh <port>
# Exit:   0 when the port is free afterwards (or was not in use),
#         1 on a missing/invalid port or when listeners survive SIGKILL.
#
# NOTE: callers run this file with `sh`, so it sticks to POSIX sh constructs
# (no `local`, no `[[ ]]`) even though the shebang names bash.

# Seconds to wait for a graceful exit after SIGTERM before sending SIGKILL.
GRACE_SECONDS=5

# Print the PIDs listening on TCP port $1, one per line, de-duplicated.
find_listener_pids() {
  lsof -iTCP -sTCP:LISTEN -nP \
    | awk -v port=":$1" '$9 ~ port"$" {print $2}' \
    | sort -u
}

# Poll once per second until nothing listens on port $1 or $2 seconds elapsed.
# Returns 0 when the port is free, 1 when listeners remain.
wait_until_port_free() {
  _waited=0
  while [ -n "$(find_listener_pids "$1")" ]; do
    [ "$_waited" -ge "$2" ] && return 1
    sleep 1
    _waited=$((_waited + 1))
  done
  return 0
}

# Entry point: $1 is the port whose listeners should be stopped.
main() {
  PORT=$1
  if [ -z "$PORT" ]; then
    echo "❌ 错误:请在执行时指定端口号,例如: sh kill_lessie_sourcing_agents.sh 8000"
    return 1
  fi
  # The port ends up inside an awk regex, so accept digits only.
  case "$PORT" in
    *[!0-9]*)
      echo "❌ 错误:端口号必须是数字: $PORT" >&2
      return 1
      ;;
  esac

  PID=$(find_listener_pids "$PORT")
  if [ -z "$PID" ]; then
    echo "端口 $PORT 没有正在运行的进程。"
    return 0
  fi

  echo "发现端口 $PORT 的进程PID=$PID"
  echo "正在关闭进程..."
  # Ask politely first (SIGTERM) so the server can shut down cleanly.
  # shellcheck disable=SC2086 # $PID may hold several PIDs; splitting is intended
  kill $PID
  if ! wait_until_port_free "$PORT" "$GRACE_SECONDS"; then
    # Still listening after the grace period: force-kill whatever is left.
    REMAINING=$(find_listener_pids "$PORT")
    if [ -n "$REMAINING" ]; then
      # shellcheck disable=SC2086
      kill -9 $REMAINING
    fi
    if ! wait_until_port_free "$PORT" 2; then
      echo "❌ 错误:端口 $PORT 的进程无法结束: $(find_listener_pids "$PORT")" >&2
      return 1
    fi
  fi
  # Only reached once the port has been verified free.
  echo "进程 $PID 已经被杀掉,端口 $PORT 已释放。"
}

# main's return value becomes the script's exit status.
main "$@"

View File

@@ -0,0 +1,16 @@
#!/bin/bash
# Start the s5 (port 8000) and s6 (port 8001) lessie_sourcing_agents
# instances under gunicorn, each from its own checkout and virtualenv.
# A failure in one instance is reported on stderr and does not stop the other
# from being started; the final status is non-zero if any instance failed.

WEBAPPS_ROOT=/data/webapps

#######################################
# Sync dependencies and launch one instance in the background.
# Runs in a subshell so the cd and the activated virtualenv do not leak into
# the next instance.
# Arguments: $1 - APP_ENV value, also the checkout directory suffix (s5, s6)
#            $2 - port to bind on 0.0.0.0
# Returns:   non-zero when the checkout, virtualenv or log dir is unusable
#######################################
start_instance() (
  app_env=$1
  port=$2
  app_dir="${WEBAPPS_ROOT}/lessie_sourcing_agents_${app_env}"

  # Without this check a failed cd would run uv/gunicorn in the wrong directory.
  cd "$app_dir" || {
    echo "❌ 错误:无法进入目录 $app_dir" >&2
    exit 1
  }
  # Still start on a failed sync (as before), but make the failure visible.
  uv sync || echo "⚠️ uv sync 失败($app_dir),使用现有虚拟环境继续启动" >&2
  # shellcheck disable=SC1091 # the virtualenv only exists on the target host
  source "$app_dir/.venv/bin/activate" || {
    echo "❌ 错误:无法激活虚拟环境 $app_dir/.venv" >&2
    exit 1
  }
  mkdir -p "$app_dir/logs" || exit 1

  timestamp=$(date +"%Y%m%d_%H%M%S")
  logfile="$app_dir/logs/lessie_sourcing_agents_${timestamp}.log"
  nohup env APP_ENV="$app_env" gunicorn -w 4 -k uvicorn.workers.UvicornWorker \
    -b "0.0.0.0:${port}" --timeout 300 dialogue.app:app \
    --max-requests 200 --max-requests-jitter 50 \
    > "$logfile" 2>&1 &
  # Keep a stable name pointing at the newest log file.
  ln -sf "$logfile" "$app_dir/logs/lessie_sourcing_agents_latest.log"
)

rc=0
start_instance s5 8000 || rc=1
start_instance s6 8001 || rc=1
# No explicit `exit`, so the file stays safe to source; this sets the status.
[ "$rc" -eq 0 ]

View File

@@ -0,0 +1,105 @@
#!/bin/bash
# Restart threshold for the total resident memory of one service, in MB (10 GB).
THRESHOLD_MB=$((10 * 1024))

# Settings for the two services, keyed by listening port.
# SERVICE_DIRS: application directory of the service on each port.
declare -A SERVICE_DIRS
SERVICE_DIRS[8000]="/data/webapps/lessie_sourcing_agents"
SERVICE_DIRS[8001]="/data/webapps/lessie_sourcing_agents_s4"

# SERVICE_ENVS: environment name of the service on each port.
declare -A SERVICE_ENVS
SERVICE_ENVS[8000]="s1"
SERVICE_ENVS[8001]="s4"
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
now() {
  local stamp_format='%Y-%m-%d %H:%M:%S'
  date "+${stamp_format}"
}
#######################################
# Print the combined resident memory (RSS) of every process listening on a
# TCP port, in whole MB.
# Arguments: $1 - TCP port
# Outputs:   integer MB on stdout; 0 when nothing listens on the port
#######################################
get_memory_usage_mb() {
  # All working variables are local so a direct call cannot clobber the
  # caller's PORT (or any other global).
  local port=$1
  local pids pid rss_kb total_kb=0

  # Column 9 of the lsof output is the socket name (e.g. "*:8000").
  pids=$(lsof -iTCP -sTCP:LISTEN -nP | awk -v port=":$port" '$9 ~ port"$" {print $2}' | sort -u)
  if [ -z "$pids" ]; then
    echo 0
    return
  fi

  for pid in $pids; do
    rss_kb=$(ps -o rss= -p "$pid" 2>/dev/null) # kilobytes
    # ps prints nothing once a PID has exited; only add clean integers so a
    # vanished process or odd output cannot break the arithmetic.
    if [[ "$rss_kb" =~ ^[[:space:]]*([0-9]+)[[:space:]]*$ ]]; then
      total_kb=$((total_kb + ${BASH_REMATCH[1]}))
    fi
  done
  echo $((total_kb / 1024))
}
#######################################
# Start the gunicorn service for one port in the background.
# Globals:   SERVICE_DIRS, SERVICE_ENVS (read)
# Arguments: $1 - port; must be a key of SERVICE_DIRS and SERVICE_ENVS
# Outputs:   progress lines on stdout. gunicorn output goes to a timestamped
#            file under <app dir>/logs, and lessie_sourcing_agents_latest.log
#            is re-pointed at that file.
# Exits the script with status 1 when the port is not configured or the
# application/log directory is unusable.
#######################################
start_service() {
  local port=$1
  local app_dir="" app_env="" timestamp logfile

  if [ -n "$port" ]; then
    app_dir=${SERVICE_DIRS[$port]:-}
    # Take APP_ENV from the shared table instead of a second hard-coded
    # mapping, so a port can never start with another service's environment.
    app_env=${SERVICE_ENVS[$port]:-}
  fi
  if [ -z "$app_dir" ] || [ -z "$app_env" ]; then
    echo "$(now) ❌ 端口 '$port' 未在 SERVICE_DIRS/SERVICE_ENVS 中配置" >&2
    exit 1
  fi

  echo "$(now) 重启服务(port=$port, dir=$app_dir)..."
  cd "$app_dir" || exit 1
  # shellcheck disable=SC1091 # the virtualenv only exists on the target host
  source "$app_dir/.venv/bin/activate"
  # The log redirect below fails if the directory is missing.
  mkdir -p "$app_dir/logs" || exit 1

  timestamp=$(date +"%Y%m%d_%H%M%S")
  logfile="$app_dir/logs/lessie_sourcing_agents_${timestamp}.log"
  nohup env APP_ENV="$app_env" gunicorn -w 4 -k uvicorn.workers.UvicornWorker \
    -b "0.0.0.0:$port" --timeout 300 dialogue.app:app \
    --max-requests 200 --max-requests-jitter 50 \
    > "$logfile" 2>&1 &
  ln -sf "$logfile" "$app_dir/logs/lessie_sourcing_agents_latest.log"
  echo "$(now) 服务 $port 已重新启动"
}
# Main pass: check each configured service once; when its listeners use more
# than THRESHOLD_MB, kill them, start the service again and send a Feishu alert.
# The port list comes from the SERVICE_DIRS keys (numerically sorted for a
# stable log order) so it cannot drift from the configured services.
mapfile -t CHECK_PORTS < <(printf '%s\n' "${!SERVICE_DIRS[@]}" | sort -n)
for PORT in "${CHECK_PORTS[@]}"; do
  # printf emits a single empty line when no service is configured.
  [ -n "$PORT" ] || continue
  echo "---------------------------"
  echo "$(now) 检查端口 $PORT 的服务"
  usage=$(get_memory_usage_mb "$PORT")
  echo "$(now) 当前内存占用: ${usage}MB"
  # A non-numeric reading must not be reported as "memory normal".
  if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
    echo "$(now) ❌ 无法获取端口 $PORT 的内存占用: '${usage}'" >&2
    continue
  fi
  if [ "$usage" -gt "$THRESHOLD_MB" ]; then
    echo "$(now) ⚠️ 占用超过阈值(${THRESHOLD_MB}MB),执行重启"
    # Stop the current listeners; the helpers declare #!/bin/bash, so run
    # them with bash rather than whatever `sh` happens to be.
    bash /data/sh/kill_lessie_sourcing_agents.sh "$PORT"
    sleep 2
    # Bring the service back up.
    start_service "$PORT"
    # Feishu alert.
    APP_ENV=${SERVICE_ENVS[$PORT]}
    bash /data/sh/feishu_notify.sh \
      "Python 内存告警" \
      "$(hostname)" \
      "(${APP_ENV})lessie_sourcing_agents(${PORT})" \
      "warning" \
      "**内存占用**: ${usage}MB\n已自动 kill 并重启。"
  else
    echo "$(now) 内存正常,无需处理。"
  fi
done
echo "$(now) 检查结束"

View File

@@ -0,0 +1,50 @@
#!/bin/bash
# Send an interactive-card alert to a Feishu group bot webhook.
#
# Usage:
#   sh feishu_notify.sh "<title>" "<host>" "<program>" "<level>" "<detail>"
#
#   title    card header text
#   host     host name shown in the card body
#   program  program/service label shown in the card body
#   level    info / warning / danger; shown in the body and used to pick the
#            header colour
#   detail   Markdown body. Backslash sequences such as \n are passed through
#            untouched so callers can embed JSON escapes; double quotes and
#            real line breaks are escaped here.
#
# Environment:
#   FEISHU_WEBHOOK  overrides the built-in webhook URL
#
# Exit: 0 on success, 2 when called without arguments, 1 when curl fails.
#
# NOTE: callers run this file with `sh`, so it sticks to POSIX sh constructs.

# SECURITY(review): the default URL embeds the bot token. Anyone who can read
# this file can post to the group; prefer FEISHU_WEBHOOK and rotate the token.
WEBHOOK="${FEISHU_WEBHOOK:-https://open.feishu.cn/open-apis/bot/v2/hook/c14d9964-3b5e-402a-866e-42768aa45e5e}"

# Escape $1 for use inside a JSON string: " becomes \" and a real line break
# becomes \n. Existing backslashes are left alone on purpose (see `detail`).
json_escape() {
  printf '%s' "$1" \
    | sed -e 's/"/\\"/g' \
    | awk 'NR > 1 { printf "%s", "\\n" } { printf "%s", $0 }'
}

# Map this script's level names to a card header colour; anything else is
# passed through unchanged.
# NOTE(review): the Feishu card docs appear to list colour names (blue,
# orange, red, ...) for the header "template" field - confirm.
header_template() {
  case "$1" in
    info) printf '%s' blue ;;
    warning) printf '%s' orange ;;
    danger) printf '%s' red ;;
    *) printf '%s' "$1" ;;
  esac
}

# Print the JSON payload.
# Arguments: $1 title, $2 host, $3 program, $4 level, $5 detail, $6 time
build_payload() {
  printf '{"msg_type":"interactive","card":{"header":{"template":"%s","title":{"content":"%s","tag":"plain_text"}},"elements":[{"tag":"div","text":{"tag":"lark_md","content":"**主机:** %s\\n**程序:** %s\\n**级别:** %s\\n**时间:** %s\\n\\n%s"}}]}}' \
    "$(json_escape "$(header_template "$4")")" \
    "$(json_escape "$1")" \
    "$(json_escape "$2")" \
    "$(json_escape "$3")" \
    "$(json_escape "$4")" \
    "$(json_escape "$6")" \
    "$(json_escape "$5")"
}

main() {
  if [ "$#" -eq 0 ]; then
    echo '用法: sh feishu_notify.sh "<title>" "<host>" "<program>" "<level>" "<detail>"' >&2
    return 2
  fi
  TITLE="$1"   # title
  HOST="$2"    # host
  PROGRAM="$3" # program
  LEVEL="$4"   # level: info / warning / danger
  DETAIL="$5"  # detail text (Markdown)
  TIME=$(date +"%Y-%m-%d %H:%M:%S")
  PAYLOAD=$(build_payload "$TITLE" "$HOST" "$PROGRAM" "$LEVEL" "$DETAIL" "$TIME")

  # --max-time keeps a hung request from blocking the caller; -f and -S make
  # HTTP and transport errors visible instead of being silently dropped.
  curl -sS -f --max-time 10 -X POST \
    -H "Content-Type: application/json" \
    -d "$PAYLOAD" \
    "$WEBHOOK" >/dev/null
  RC=$?
  if [ "$RC" -ne 0 ]; then
    echo "❌ 飞书通知发送失败 (curl 退出码 $RC)" >&2
    return 1
  fi
}

# main's return value becomes the script's exit status.
main "$@"
# Example call:
# sh /data/sh/feishu_notify.sh \
# "⚠️ Python 内存告警" \
# "$(hostname)" \
# "lessie_sourcing_agents_s5(8000)" \
# "danger" \
# "**内存占用**: ${usage}MB\n已自动 kill 并重启。"

View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Stop every process that is listening on a given TCP port.
#
# Usage:  kill_lessie_sourcing_agents.sh <port>
# Exit:   0 when the port is free afterwards (or was not in use),
#         1 on a missing/invalid port or when listeners survive SIGKILL.
#
# NOTE: callers run this file with `sh`, so it sticks to POSIX sh constructs
# (no `local`, no `[[ ]]`) even though the shebang names bash.

# Seconds to wait for a graceful exit after SIGTERM before sending SIGKILL.
GRACE_SECONDS=5

# Print the PIDs listening on TCP port $1, one per line, de-duplicated.
find_listener_pids() {
  lsof -iTCP -sTCP:LISTEN -nP \
    | awk -v port=":$1" '$9 ~ port"$" {print $2}' \
    | sort -u
}

# Poll once per second until nothing listens on port $1 or $2 seconds elapsed.
# Returns 0 when the port is free, 1 when listeners remain.
wait_until_port_free() {
  _waited=0
  while [ -n "$(find_listener_pids "$1")" ]; do
    [ "$_waited" -ge "$2" ] && return 1
    sleep 1
    _waited=$((_waited + 1))
  done
  return 0
}

# Entry point: $1 is the port whose listeners should be stopped.
main() {
  PORT=$1
  if [ -z "$PORT" ]; then
    echo "❌ 错误:请在执行时指定端口号,例如: sh kill_lessie_sourcing_agents.sh 8000"
    return 1
  fi
  # The port ends up inside an awk regex, so accept digits only.
  case "$PORT" in
    *[!0-9]*)
      echo "❌ 错误:端口号必须是数字: $PORT" >&2
      return 1
      ;;
  esac

  PID=$(find_listener_pids "$PORT")
  if [ -z "$PID" ]; then
    echo "端口 $PORT 没有正在运行的进程。"
    return 0
  fi

  echo "发现端口 $PORT 的进程PID=$PID"
  echo "正在关闭进程..."
  # Ask politely first (SIGTERM) so the server can shut down cleanly.
  # shellcheck disable=SC2086 # $PID may hold several PIDs; splitting is intended
  kill $PID
  if ! wait_until_port_free "$PORT" "$GRACE_SECONDS"; then
    # Still listening after the grace period: force-kill whatever is left.
    REMAINING=$(find_listener_pids "$PORT")
    if [ -n "$REMAINING" ]; then
      # shellcheck disable=SC2086
      kill -9 $REMAINING
    fi
    if ! wait_until_port_free "$PORT" 2; then
      echo "❌ 错误:端口 $PORT 的进程无法结束: $(find_listener_pids "$PORT")" >&2
      return 1
    fi
  fi
  # Only reached once the port has been verified free.
  echo "进程 $PID 已经被杀掉,端口 $PORT 已释放。"
}

# main's return value becomes the script's exit status.
main "$@"

View File

@@ -0,0 +1,105 @@
#!/bin/bash
# Restart threshold for the total resident memory of one service, in MB (10 GB).
THRESHOLD_MB=$((10 * 1024))

# Settings for the two services, keyed by listening port.
# SERVICE_DIRS: application directory of the service on each port.
declare -A SERVICE_DIRS
SERVICE_DIRS[8000]="/data/webapps/lessie_sourcing_agents_s5"
SERVICE_DIRS[8001]="/data/webapps/lessie_sourcing_agents_s6"

# SERVICE_ENVS: environment name of the service on each port.
declare -A SERVICE_ENVS
SERVICE_ENVS[8000]="s5"
SERVICE_ENVS[8001]="s6"
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
now() {
  local stamp_format='%Y-%m-%d %H:%M:%S'
  date "+${stamp_format}"
}
#######################################
# Print the combined resident memory (RSS) of every process listening on a
# TCP port, in whole MB.
# Arguments: $1 - TCP port
# Outputs:   integer MB on stdout; 0 when nothing listens on the port
#######################################
get_memory_usage_mb() {
  # All working variables are local so a direct call cannot clobber the
  # caller's PORT (or any other global).
  local port=$1
  local pids pid rss_kb total_kb=0

  # Column 9 of the lsof output is the socket name (e.g. "*:8000").
  pids=$(lsof -iTCP -sTCP:LISTEN -nP | awk -v port=":$port" '$9 ~ port"$" {print $2}' | sort -u)
  if [ -z "$pids" ]; then
    echo 0
    return
  fi

  for pid in $pids; do
    rss_kb=$(ps -o rss= -p "$pid" 2>/dev/null) # kilobytes
    # ps prints nothing once a PID has exited; only add clean integers so a
    # vanished process or odd output cannot break the arithmetic.
    if [[ "$rss_kb" =~ ^[[:space:]]*([0-9]+)[[:space:]]*$ ]]; then
      total_kb=$((total_kb + ${BASH_REMATCH[1]}))
    fi
  done
  echo $((total_kb / 1024))
}
#######################################
# Start the gunicorn service for one port in the background.
# Globals:   SERVICE_DIRS, SERVICE_ENVS (read)
# Arguments: $1 - port; must be a key of SERVICE_DIRS and SERVICE_ENVS
# Outputs:   progress lines on stdout. gunicorn output goes to a timestamped
#            file under <app dir>/logs, and lessie_sourcing_agents_latest.log
#            is re-pointed at that file.
# Exits the script with status 1 when the port is not configured or the
# application/log directory is unusable.
#######################################
start_service() {
  local port=$1
  local app_dir="" app_env="" timestamp logfile

  if [ -n "$port" ]; then
    app_dir=${SERVICE_DIRS[$port]:-}
    # Take APP_ENV from the shared table instead of a second hard-coded
    # mapping, so a port can never start with another service's environment.
    app_env=${SERVICE_ENVS[$port]:-}
  fi
  if [ -z "$app_dir" ] || [ -z "$app_env" ]; then
    echo "$(now) ❌ 端口 '$port' 未在 SERVICE_DIRS/SERVICE_ENVS 中配置" >&2
    exit 1
  fi

  echo "$(now) 重启服务(port=$port, dir=$app_dir)..."
  cd "$app_dir" || exit 1
  # shellcheck disable=SC1091 # the virtualenv only exists on the target host
  source "$app_dir/.venv/bin/activate"
  # The log redirect below fails if the directory is missing.
  mkdir -p "$app_dir/logs" || exit 1

  timestamp=$(date +"%Y%m%d_%H%M%S")
  logfile="$app_dir/logs/lessie_sourcing_agents_${timestamp}.log"
  nohup env APP_ENV="$app_env" gunicorn -w 4 -k uvicorn.workers.UvicornWorker \
    -b "0.0.0.0:$port" --timeout 300 dialogue.app:app \
    --max-requests 200 --max-requests-jitter 50 \
    > "$logfile" 2>&1 &
  ln -sf "$logfile" "$app_dir/logs/lessie_sourcing_agents_latest.log"
  echo "$(now) 服务 $port 已重新启动"
}
# Main pass: check each configured service once; when its listeners use more
# than THRESHOLD_MB, kill them, start the service again and send a Feishu alert.
# The port list comes from the SERVICE_DIRS keys (numerically sorted for a
# stable log order) so it cannot drift from the configured services.
mapfile -t CHECK_PORTS < <(printf '%s\n' "${!SERVICE_DIRS[@]}" | sort -n)
for PORT in "${CHECK_PORTS[@]}"; do
  # printf emits a single empty line when no service is configured.
  [ -n "$PORT" ] || continue
  echo "---------------------------"
  echo "$(now) 检查端口 $PORT 的服务"
  usage=$(get_memory_usage_mb "$PORT")
  echo "$(now) 当前内存占用: ${usage}MB"
  # A non-numeric reading must not be reported as "memory normal".
  if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
    echo "$(now) ❌ 无法获取端口 $PORT 的内存占用: '${usage}'" >&2
    continue
  fi
  if [ "$usage" -gt "$THRESHOLD_MB" ]; then
    echo "$(now) ⚠️ 占用超过阈值(${THRESHOLD_MB}MB),执行重启"
    # Stop the current listeners; the helpers declare #!/bin/bash, so run
    # them with bash rather than whatever `sh` happens to be.
    bash /data/sh/kill_lessie_sourcing_agents.sh "$PORT"
    sleep 2
    # Bring the service back up.
    start_service "$PORT"
    # Feishu alert.
    APP_ENV=${SERVICE_ENVS[$PORT]}
    bash /data/sh/feishu_notify.sh \
      "Python 内存告警" \
      "$(hostname)" \
      "(${APP_ENV})lessie_sourcing_agents(${PORT})" \
      "warning" \
      "**内存占用**: ${usage}MB\n已自动 kill 并重启。"
  else
    echo "$(now) 内存正常,无需处理。"
  fi
done
echo "$(now) 检查结束"

View File

@@ -0,0 +1,50 @@
#!/bin/bash
# Send an interactive-card alert to a Feishu group bot webhook.
#
# Usage:
#   sh feishu_notify.sh "<title>" "<host>" "<program>" "<level>" "<detail>"
#
#   title    card header text
#   host     host name shown in the card body
#   program  program/service label shown in the card body
#   level    info / warning / danger; shown in the body and used to pick the
#            header colour
#   detail   Markdown body. Backslash sequences such as \n are passed through
#            untouched so callers can embed JSON escapes; double quotes and
#            real line breaks are escaped here.
#
# Environment:
#   FEISHU_WEBHOOK  overrides the built-in webhook URL
#
# Exit: 0 on success, 2 when called without arguments, 1 when curl fails.
#
# NOTE: callers run this file with `sh`, so it sticks to POSIX sh constructs.

# SECURITY(review): the default URL embeds the bot token. Anyone who can read
# this file can post to the group; prefer FEISHU_WEBHOOK and rotate the token.
WEBHOOK="${FEISHU_WEBHOOK:-https://open.feishu.cn/open-apis/bot/v2/hook/c14d9964-3b5e-402a-866e-42768aa45e5e}"

# Escape $1 for use inside a JSON string: " becomes \" and a real line break
# becomes \n. Existing backslashes are left alone on purpose (see `detail`).
json_escape() {
  printf '%s' "$1" \
    | sed -e 's/"/\\"/g' \
    | awk 'NR > 1 { printf "%s", "\\n" } { printf "%s", $0 }'
}

# Map this script's level names to a card header colour; anything else is
# passed through unchanged.
# NOTE(review): the Feishu card docs appear to list colour names (blue,
# orange, red, ...) for the header "template" field - confirm.
header_template() {
  case "$1" in
    info) printf '%s' blue ;;
    warning) printf '%s' orange ;;
    danger) printf '%s' red ;;
    *) printf '%s' "$1" ;;
  esac
}

# Print the JSON payload.
# Arguments: $1 title, $2 host, $3 program, $4 level, $5 detail, $6 time
build_payload() {
  printf '{"msg_type":"interactive","card":{"header":{"template":"%s","title":{"content":"%s","tag":"plain_text"}},"elements":[{"tag":"div","text":{"tag":"lark_md","content":"**主机:** %s\\n**程序:** %s\\n**级别:** %s\\n**时间:** %s\\n\\n%s"}}]}}' \
    "$(json_escape "$(header_template "$4")")" \
    "$(json_escape "$1")" \
    "$(json_escape "$2")" \
    "$(json_escape "$3")" \
    "$(json_escape "$4")" \
    "$(json_escape "$6")" \
    "$(json_escape "$5")"
}

main() {
  if [ "$#" -eq 0 ]; then
    echo '用法: sh feishu_notify.sh "<title>" "<host>" "<program>" "<level>" "<detail>"' >&2
    return 2
  fi
  TITLE="$1"   # title
  HOST="$2"    # host
  PROGRAM="$3" # program
  LEVEL="$4"   # level: info / warning / danger
  DETAIL="$5"  # detail text (Markdown)
  TIME=$(date +"%Y-%m-%d %H:%M:%S")
  PAYLOAD=$(build_payload "$TITLE" "$HOST" "$PROGRAM" "$LEVEL" "$DETAIL" "$TIME")

  # --max-time keeps a hung request from blocking the caller; -f and -S make
  # HTTP and transport errors visible instead of being silently dropped.
  curl -sS -f --max-time 10 -X POST \
    -H "Content-Type: application/json" \
    -d "$PAYLOAD" \
    "$WEBHOOK" >/dev/null
  RC=$?
  if [ "$RC" -ne 0 ]; then
    echo "❌ 飞书通知发送失败 (curl 退出码 $RC)" >&2
    return 1
  fi
}

# main's return value becomes the script's exit status.
main "$@"
# Example call:
# sh /data/sh/feishu_notify.sh \
# "⚠️ Python 内存告警" \
# "$(hostname)" \
# "lessie_sourcing_agents_s5(8000)" \
# "danger" \
# "**内存占用**: ${usage}MB\n已自动 kill 并重启。"

View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Stop every process that is listening on a given TCP port.
#
# Usage:  kill_lessie_sourcing_agents.sh <port>
# Exit:   0 when the port is free afterwards (or was not in use),
#         1 on a missing/invalid port or when listeners survive SIGKILL.
#
# NOTE: callers run this file with `sh`, so it sticks to POSIX sh constructs
# (no `local`, no `[[ ]]`) even though the shebang names bash.

# Seconds to wait for a graceful exit after SIGTERM before sending SIGKILL.
GRACE_SECONDS=5

# Print the PIDs listening on TCP port $1, one per line, de-duplicated.
find_listener_pids() {
  lsof -iTCP -sTCP:LISTEN -nP \
    | awk -v port=":$1" '$9 ~ port"$" {print $2}' \
    | sort -u
}

# Poll once per second until nothing listens on port $1 or $2 seconds elapsed.
# Returns 0 when the port is free, 1 when listeners remain.
wait_until_port_free() {
  _waited=0
  while [ -n "$(find_listener_pids "$1")" ]; do
    [ "$_waited" -ge "$2" ] && return 1
    sleep 1
    _waited=$((_waited + 1))
  done
  return 0
}

# Entry point: $1 is the port whose listeners should be stopped.
main() {
  PORT=$1
  if [ -z "$PORT" ]; then
    echo "❌ 错误:请在执行时指定端口号,例如: sh kill_lessie_sourcing_agents.sh 8000"
    return 1
  fi
  # The port ends up inside an awk regex, so accept digits only.
  case "$PORT" in
    *[!0-9]*)
      echo "❌ 错误:端口号必须是数字: $PORT" >&2
      return 1
      ;;
  esac

  PID=$(find_listener_pids "$PORT")
  if [ -z "$PID" ]; then
    echo "端口 $PORT 没有正在运行的进程。"
    return 0
  fi

  echo "发现端口 $PORT 的进程PID=$PID"
  echo "正在关闭进程..."
  # Ask politely first (SIGTERM) so the server can shut down cleanly.
  # shellcheck disable=SC2086 # $PID may hold several PIDs; splitting is intended
  kill $PID
  if ! wait_until_port_free "$PORT" "$GRACE_SECONDS"; then
    # Still listening after the grace period: force-kill whatever is left.
    REMAINING=$(find_listener_pids "$PORT")
    if [ -n "$REMAINING" ]; then
      # shellcheck disable=SC2086
      kill -9 $REMAINING
    fi
    if ! wait_until_port_free "$PORT" 2; then
      echo "❌ 错误:端口 $PORT 的进程无法结束: $(find_listener_pids "$PORT")" >&2
      return 1
    fi
  fi
  # Only reached once the port has been verified free.
  echo "进程 $PID 已经被杀掉,端口 $PORT 已释放。"
}

# main's return value becomes the script's exit status.
main "$@"