初始化提交

This commit is contained in:
2025-10-07 15:58:15 +08:00
commit 0e593caf99
378 changed files with 77890 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
# Alertmanager configuration: every alert is grouped per instance and pushed
# to PrometheusAlert via webhook, which relays to Feishu group bots.
global:
  resolve_timeout: 5m

route:
  group_by: ['instance']
  group_wait: 30s
  group_interval: 60s
  repeat_interval: 5m
  receiver: 'web.hook.prometheusalert'

receivers:
  - name: 'web.hook.prometheusalert'
    webhook_configs:
      - url: 'http://172.24.16.20:9094/prometheusalert?type=fs&tpl=prometheus-fs&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/8bd6a15d-90f0-4f4f-a1b1-bd105f31ea06'
      # test bot
      - url: 'http://172.24.16.20:9094/prometheusalert?type=fs&tpl=prometheus-fs&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/c14d9964-3b5e-402a-866e-42768aa45e5e'

View File

@@ -0,0 +1,28 @@
{{ $var := .externalURL}}{{ range $k,$v:=.alerts }}
{{if eq $v.status "resolved"}}
Prometheus恢复信息
【恢复名称】{{$v.labels.alertname}}✅{{if $v.labels.level}}
【恢复级别】{{if eq $v.labels.level "0"}}提示{{else if eq $v.labels.level "1"}}警告{{else if eq $v.labels.level "2"}}一般严重{{else if eq $v.labels.level "3"}}严重{{else if eq $v.labels.level "4"}}灾难{{else}}{{$v.labels.level}}{{end}}{{end}}
【开始时间】{{GetCSTtime $v.startsAt}}
【结束时间】{{GetCSTtime $v.endsAt}}
【恢复实例】{{$v.labels.instance}}
{{$v.annotations.description}}
{{else}}
Prometheus告警信息
【告警名称】{{$v.labels.alertname}}🔥{{if $v.labels.level}}
【告警级别】{{if eq $v.labels.level "0"}}提示{{else if eq $v.labels.level "1"}}警告🟡{{else if eq $v.labels.level "2"}}一般严重🔥{{else if eq $v.labels.level "3"}}严重🔥🔥{{else if eq $v.labels.level "4"}}灾难🔥🔥❌{{else}}{{$v.labels.level}}{{end}}{{end}}
【开始时间】{{GetCSTtime $v.startsAt}}
【告警实例】{{$v.labels.instance}}
{{$v.annotations.description}}
{{end}}
{{end}}
https://open.feishu.cn/open-apis/bot/v2/hook/8bd6a15d-90f0-4f4f-a1b1-bd105f31ea06

View File

@@ -0,0 +1,85 @@
# Monitoring stack: Prometheus + Grafana (+ image renderer) + Alertmanager
# + PrometheusAlert webhook relay.
version: '3'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: always
    ports:
      - "9090:9090"
    volumes:
      - /root/prometheus/prometheus/config:/etc/prometheus
      - /root/prometheus/prometheus/data:/prometheus
      - /etc/localtime:/etc/localtime:ro
    environment:
      - TZ=Asia/Shanghai
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--storage.tsdb.retention.size=10GB'
      - '--web.enable-lifecycle'

  renderer:
    image: grafana/grafana-image-renderer:latest
    container_name: grafana-renderer
    restart: always
    ports:
      - "8081:8081"
    environment:
      # Security token of the render service; must match GF_RENDERING_TOKEN
      # in the grafana service below.
      - AUTH_TOKEN=4cd108857bdd30fbd4991bb146622f9d

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: always
    ports:
      - "3000:3000"
    volumes:
      - /root/prometheus/grafana/data:/var/lib/grafana
    environment:
      # Grafana root URL
      - GF_SERVER_ROOT_URL=http://172.24.16.20:3000/
      # Anonymous access (read-only viewer)
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
      # Allow embedding and relax cross-site cookie rules
      - GF_SECURITY_ALLOW_EMBEDDING=true
      - GF_SECURITY_COOKIE_SECURE=false
      - GF_SECURITY_COOKIE_SAMESITE=lax
      # Remote image-renderer endpoint
      - GF_RENDERING_SERVER_URL=http://renderer:8081/render
      # Callback URL; keep identical to the root URL
      - GF_RENDERING_CALLBACK_URL=http://172.24.16.20:3000/
      # Must match AUTH_TOKEN of the renderer service
      - GF_RENDERING_TOKEN=4cd108857bdd30fbd4991bb146622f9d
      # Verbose rendering logs for troubleshooting
      - GF_LOG_FILTERS=rendering:debug

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: always
    ports:
      - "9093:9093"
    volumes:
      - /root/prometheus/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /etc/localtime:/etc/localtime:ro
    environment:
      - TZ=Asia/Shanghai

  prometheus-alert:
    image: feiyu563/prometheus-alert:latest
    container_name: prometheus-alert
    restart: always
    ports:
      - "9094:8080"
    volumes:
      - /root/prometheus/prometheus-alert/db:/app/db
    environment:
      # NOTE(review): default admin/admin credentials — change before exposing
      # this port beyond the internal network.
      - PA_LOGIN_USER=admin
      - PA_LOGIN_PASSWORD=admin
      - PA_TITLE=prometheusAlert
      - PA_OPEN_FEISHU=1
      - PA_OPEN_DINGDING=1
      - PA_OPEN_WEIXIN=1

View File

@@ -0,0 +1,45 @@
# process-exporter groups for the crawler server.
# NOTE: first matching group wins, so more specific patterns (e.g.
# update_yt_week) must stay listed before broader ones (update_yt).
process_names:
  # Jenny's python search service
  - name: "crawler-server_dialogue.influencer_search"
    cmdline:
      - "/root/miniconda3/envs/search/bin/python"
      - ".*dialogue.influencer_search.*"
  - name: "crawler-server_yt_search_crawler"
    cmdline:
      - ".*async_yt.*"
  - name: "crawler-server_check_tiktok_account"
    cmdline:
      - ".*check_account.*"
  # yt_data_update weekly job (update_yt_week.py)
  - name: "crawler-server_yt_data_update_week"
    cmdline:
      - ".*update_yt_week.*"
  # yt_data_update daily job (update_yt.py)
  - name: "crawler-server_yt_data_update_day"
    cmdline:
      - ".*update_yt.*"
  # (duplicate "crawler-server_yt_search_crawler" entry removed — it was
  # declared twice with the identical cmdline pattern)
  - name: "crawler-server_tk_search.py"
    cmdline:
      - ".*tk_search.*"
  - name: "crawler-server_tiktok_sign_server.jar"
    cmdline:
      - ".*tiktok_sign_server.*"
  - name: "crawler-server_tt_shop.py"
    cmdline:
      - ".*tt_shop.*"
  # nginx master/worker processes
  - name: "crawler-server_nginx"
    cmdline:
      - "/data/tengine/sbin/nginx"
      - ".*"

View File

@@ -0,0 +1,6 @@
# process-exporter group: gunicorn app bound to 0.0.0.0:7001 on node 01.
process_names:
  - name: "prod_lessie_sourcing_01_7001"
    cmdline:
      - "gunicorn"
      - ".*0.0.0.0:7001.*"

View File

@@ -0,0 +1,6 @@
# process-exporter group: gunicorn app bound to 0.0.0.0:7001 on node 02.
process_names:
  - name: "prod_lessie_sourcing_02_7001"
    cmdline:
      - "gunicorn"
      - ".*0.0.0.0:7001.*"

View File

@@ -0,0 +1,6 @@
# process-exporter group: gunicorn app bound to 0.0.0.0:7001 on node 03.
process_names:
  - name: "prod_lessie_sourcing_03_7001"
    cmdline:
      - "gunicorn"
      - ".*0.0.0.0:7001.*"

View File

@@ -0,0 +1,6 @@
# process-exporter group: gunicorn app bound to 0.0.0.0:7001 on node 04.
process_names:
  - name: "prod_lessie_sourcing_04_7001"
    cmdline:
      - "gunicorn"
      - ".*0.0.0.0:7001.*"

View File

@@ -0,0 +1,6 @@
# process-exporter group: gunicorn app bound to 0.0.0.0:7001 on node 05.
process_names:
  - name: "prod_lessie_sourcing_05_7001"
    cmdline:
      - "gunicorn"
      - ".*0.0.0.0:7001.*"

View File

@@ -0,0 +1,66 @@
# process-exporter groups for the prod flymoon server (Java services,
# redis and nginx). Each group matches on the interpreter path plus a
# jar-name regex.
process_names:
  - name: "prod-flymoon_task"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*flymoon-task.jar.*"
  - name: "prod-flymoon_sse"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*flymoon_sse.jar.*"
  - name: "prod-flymoon_monitor"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*monitor-0.0.1-SNAPSHOT.jar.*"
  - name: "prod-flymoon_partner"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*flymoon-partner.jar.*"
  - name: "prod-flymoon_email_prod"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*fly-moon-email.jar.*"
  - name: "prod-flymoon_admin"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*flymoon-admin.jar.*"
  # NOTE(review): the two "/data/data/..." paths below look like a doubled
  # path segment — confirm against the actual JDK install location.
  - name: "prod-flymoon_agent"
    cmdline:
      - "/data/data/jdk-21.0.7/bin/java"
      - ".*flymoon-agent.jar.*"
  - name: "prod-flymoon-payment"
    cmdline:
      - "/data/data/jdk-21.0.7/bin/java"
      - ".*flymoon-payment.jar.*"
  - name: "prod-flymoon_crawlSpider-0.0.1-SNAPSHOT"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*fly_moon_crawlSpider-0.0.1-SNAPSHOT.jar.*"
  - name: "prod-flymoon_crawlSpider_shop"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*fly_moon_crawlSpider_shop.jar.*"
  - name: "prod-nacos"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*nacos-server.jar.*"
  # redis-server process
  - name: "prod-redis_server"
    cmdline:
      - "/data/redis/src/redis-server"
      - ".*"
  # nginx master/worker processes
  - name: "prod-nginx"
    cmdline:
      - "/data/tengine/sbin/nginx"
      - ".*"

View File

@@ -0,0 +1,54 @@
# process-exporter groups for the prod02 server.
process_names:
  - name: "prod02-flymoon-partner"
    cmdline:
      - ".*flymoon-partner.jar.*"
  - name: "prod02-flymoon-admin"
    cmdline:
      - ".*flymoon-admin.jar.*"
  - name: "prod02-flymoon_agent"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*flymoon-agent.jar.*"
  - name: "prod02-flymoon-payment"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*flymoon-payment.jar.*"
  - name: "prod02-flymoon_task"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*flymoon-task.jar.*"
  - name: "prod02-ycloud-0.0.1-SNAPSHOT.jar"
    cmdline:
      - ".*fly_moon_ycloud-0.0.1-SNAPSHOT.jar.*"
  - name: "prod02-ycloud-task"
    cmdline:
      - ".*fly_moon_ycloud-task.jar.*"
  - name: "prod02-nacos"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*nacos-server.jar.*"
  # redis-server process
  - name: "prod02-redis_server"
    cmdline:
      - "/data/redis/src/redis-server"
      - ".*"
  # nginx master/worker processes
  - name: "prod02-nginx"
    cmdline:
      - "/data/tengine/sbin/nginx"
      - ".*"
  # (duplicate "prod02-nacos" entry removed — it was declared twice with the
  # identical cmdline pattern)

View File

@@ -0,0 +1,52 @@
# process-exporter groups for the test server (Java services, redis, nginx).
process_names:
  - name: "test-flymoon_task"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*flymoon-task.jar.*"
  - name: "test-flymoon_sse"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*flymoon_sse.jar.*"
  - name: "test-flymoon_monitor"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*monitor-0.0.1-SNAPSHOT.jar.*"
  - name: "test-flymoon_partner"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*flymoon-partner.jar.*"
  - name: "test-flymoon_email_test"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*fly-moon-email-test.jar.*"
  - name: "test-flymoon_admin"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*flymoon-admin.jar.*"
  - name: "test-flymoon_jenniefy"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*flymoon-jenniefy.jar.*"
  - name: "test-nacos"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*nacos-server.jar.*"
  # redis-server process
  - name: "test-redis_server"
    cmdline:
      - "/data/redis/src/redis-server"
      - ".*"
  # nginx master/worker processes
  - name: "test-nginx"
    cmdline:
      - "/usr/local/nginx/sbin/nginx"
      - ".*"

View File

@@ -0,0 +1,29 @@
# process-exporter groups for the us-prod-01 server.
process_names:
  - name: "us-prod-01-GO-lessie-sourcing-api"
    cmdline:
      - ".*lessie-sourcing-api.*"
  - name: "us-prod-01-nacos"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*nacos-server.jar.*"
  - name: "us-prod-01-flymoon_admin"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*flymoon-admin.jar.*"
  - name: "us-prod-01-xxl-job-admin"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*xxl-job-admin.jar.*"
  - name: "us-prod-01_lessie_official_web"
    cmdline:
      - ".*index.mjs.*"
  # nginx master/worker processes
  # NOTE(review): group name lacks the "us-prod-01" prefix used by every
  # other entry; kept as-is since dashboards may already key on "nginx".
  - name: "nginx"
    cmdline:
      - "/data/tengine/sbin/nginx"
      - ".*"

View File

@@ -0,0 +1,20 @@
# process-exporter groups for the us-prod-02 server.
process_names:
  - name: "us-prod-02-GO-lessie-sourcing-api"
    cmdline:
      - ".*lessie-sourcing-api.*"
  - name: "us-prod-02-nacos"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*nacos-server.jar.*"
  - name: "us-prod-02-flymoon-agent"
    cmdline:
      - ".*/java"
      - "-jar"
      - ".*/flymoon-agent.jar"
  - name: "us-prod-02_email_prod"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*fly-moon-email.jar.*"

View File

@@ -0,0 +1,21 @@
# process-exporter groups for the us-prod-03 server.
process_names:
  - name: "us-prod-03-GO-lessie-sourcing-api"
    cmdline:
      - ".*lessie-sourcing-api.*"
  - name: "us-prod-03-nacos"
    cmdline:
      - "/data/jdk-21.0.7/bin/java"
      - ".*nacos-server.jar.*"
  - name: "us-prod-03-flymoon-agent"
    cmdline:
      - ".*/java"
      - "-jar"
      - ".*/flymoon-agent.jar"
  - name: "us-prod-03-flymoon-payment"
    cmdline:
      - ".*/java"
      - "-jar"
      - ".*/flymoon-payment.jar"

View File

@@ -0,0 +1,40 @@
# process-exporter groups for the webdrive server.
process_names:
  - name: "webdrive-server_prod-flymoon-email-v2"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*fly-moon-email-prod.jar.*"
  - name: "webdrive-server_test-flymoon-email-v2"
    cmdline:
      - "/data/jdk1.8.0_181/bin/java"
      - ".*fly-moon-email-test.jar.*"
  - name: "webdrive-server_s2_py_lessie_sourcing"
    cmdline:
      - "/data/webapps/lessie_sourcing_agents"
      - ".*server.py.*"
  - name: "webdrive-server_s3_lessie_sourcing_6001"
    cmdline:
      - "/data/webapps/qmm_sourcing_agents"
      - ".*serverqmm.*"
  - name: "webdrive-server_s2_GO-lessie-sourcing-api"
    cmdline:
      - ".*lessie-sourcing-api.*"
  - name: "webdrive-server_s3_GO-lessie-sourcing-api"
    cmdline:
      - ".*s3-lessie-sourcing-api.*"
  # redis-server process
  - name: "webdrive-server_redis-server"
    cmdline:
      - "/data/redis/bin/redis-server"
      - ".*"
  # nginx master/worker processes
  - name: "webdrive-server_nginx"
    cmdline:
      - ".*nginx.*"

View File

@@ -0,0 +1,29 @@
# process-exporter groups for the weblessie server.
process_names:
  - name: "weblessie-server_lessie_official_web"
    cmdline:
      - ".*index.mjs.*"
  # NOTE(review): ".*server.py*" — the trailing "py*" makes the final "y"
  # optional; probably ".*server.py.*" was intended. Kept as-is since it
  # still matches "server.py" as a substring.
  - name: "weblessie-server_lessie_sourcing_8000"
    cmdline:
      - "/data/webapps/lessie_sourcing_agents"
      - ".*server.py*"
  - name: "weblessie-server_lessie_sourcing_8002"
    cmdline:
      - "/data/webapps/lessie_sourcing_agents_02"
      - ".*server8002.*"
  - name: "weblessie-server_lessie_sourcing_7001"
    cmdline:
      - "/data/webapps/prod_lessie_sourcing_agents"
      - ".*server7001.*"
  - name: "weblessie-server_GO-lessie-sourcing-api"
    cmdline:
      - ".*lessie-sourcing-api.*"

View File

@@ -0,0 +1,13 @@
# process-exporter group for the weblessie2 server (single python service).
process_names:
  - name: "weblessie-server2_s1_py_lessie_sourcing"
    cmdline:
      - "/data/webapps/lessie_sourcing_agents"
      - ".*server.py*"

View File

@@ -0,0 +1,52 @@
# process-exporter groups for the website server.
# NOTE: first matching group wins, so more specific patterns (e.g.
# update_yt_week) must stay listed before broader ones (update_yt).
process_names:
  - name: "website-server_dialogue.influencer_im"
    cmdline:
      - "/data/webapps/test_influencer_search_agent/venv/bin/python"
      - ".*dialogue.influencer_search.*"
  - name: "website-server_dialogue.inf_5002_lessie.ai"
    cmdline:
      - "/data/webapps/influencer_search_agent/venv/bin/python"
      - ".*dialogue.influencer_5002.*"
  - name: "website-server_tt_shop.py"
    cmdline:
      - ".*tt_shop.py.*"
  - name: "website-server_yt_search_crawler"
    cmdline:
      - ".*async_yt.*"
  - name: "website-server_check_tiktok_account"
    cmdline:
      - ".*check_account.*"
  # yt_data_update weekly job (update_yt_week.py)
  - name: "website-server_yt_data_update_week"
    cmdline:
      - ".*update_yt_week.*"
  # yt_data_update daily job (update_yt.py)
  - name: "website-server_yt_data_update_day"
    cmdline:
      - ".*update_yt.*"
  # (duplicate "website-server_yt_search_crawler" entry removed — it was
  # declared twice with the identical cmdline pattern)
  # tk shop crawler (tt_shop)
  - name: "website-server_tk_shop_crawler"
    cmdline:
      - ".*tt_shop.*"
  # influencer_search_api
  - name: "website-server_api_server.py"
    cmdline:
      - ".*api_server.py.*"
  # nginx master/worker processes
  - name: "website-server_nginx"
    cmdline:
      - "/data/tengine/sbin/nginx"
      - ".*"

View File

@@ -0,0 +1,18 @@
解压:
tar -zxvf
创建配置文件:
process-exporter.yml
后台启动:
nohup /opt/exporter/process-exporter/process-exporter -config.path=/opt/exporter/process-exporter/process-exporter.yml > /opt/exporter/process-exporter/process-exporter.log 2>&1 &
查看进程:
ps aux | grep process-exporter
查看日志:
tail -f /opt/exporter/process-exporter/process-exporter.log

View File

@@ -0,0 +1,18 @@
1、namedprocess_namegroup_num_procs{groupname=~"$processes",instance=~"$instance"}
2、sum(rate(namedprocess_namegroup_cpu_seconds_total{groupname=~"$processes",instance=~"$instance"}[$interval])) by (instance, groupname)
3、sum(rate(namedprocess_namegroup_read_bytes_total{groupname=~"$processes", instance=~"$instance"}[$interval])) by (instance, groupname)
4、sum(rate(namedprocess_namegroup_write_bytes_total{groupname=~"$processes", instance=~"$instance"}[$interval])) by (instance, groupname)
5、namedprocess_namegroup_memory_bytes{groupname=~"$processes", instance=~"$instance",memtype="resident"}
6、namedprocess_namegroup_memory_bytes{groupname=~"$processes", instance=~"$instance",memtype="virtual"}

View File

@@ -0,0 +1,302 @@
#---------------------↓全局配置-----------------------
appname = PrometheusAlert
#登录用户名
login_user=prometheusalert
#登录密码
login_password=prometheusalert
#监听地址
httpaddr = "0.0.0.0"
#监听端口
httpport = 8080
runmode = dev
#设置代理 proxy = http://123.123.123.123:8080
proxy =
#开启JSON请求
copyrequestbody = true
#告警消息标题
title=PrometheusAlert
#链接到告警平台地址
GraylogAlerturl=http://graylog.org
#钉钉告警 告警logo图标地址
logourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/images/alert-center.png
#钉钉告警 恢复logo图标地址
rlogourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/images/alert-center.png
#短信告警级别(等于3就进行短信告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
messagelevel=3
#电话告警级别(等于4就进行语音告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
phonecalllevel=4
#默认拨打号码(页面测试短信和电话功能需要配置此项)
defaultphone=xxxxxxxx
#故障恢复是否启用电话通知0为关闭,1为开启
phonecallresolved=0
#是否前台输出file or console
logtype=file
#日志文件路径
logpath=logs/prometheusalertcenter.log
#转换Prometheus,graylog告警消息的时区为CST时区(如默认已经是CST时区请勿开启)
prometheus_cst_time=0
#数据库驱动,支持sqlite3,mysql,postgres;如使用mysql或postgres请开启db_host,db_port,db_user,db_password,db_name的注释
db_driver=sqlite3
#db_host=127.0.0.1
#db_port=3306
#db_user=root
#db_password=root
#db_name=prometheusalert
#是否开启告警记录 0为关闭,1为开启
AlertRecord=0
#是否开启告警记录定时删除 0为关闭,1为开启
RecordLive=0
#告警记录定时删除周期,单位天
RecordLiveDay=7
# 是否将告警记录写入es7,0为关闭,1为开启
alert_to_es=0
# es地址是[]string
# beego.Appconfig.Strings读取配置为[]string使用";"而不是","
to_es_url=http://localhost:9200
# to_es_url=http://es1:9200;http://es2:9200;http://es3:9200
# es用户和密码
# to_es_user=username
# to_es_pwd=password
# 长连接最大空闲数
maxIdleConns=100
# 热更新配置文件
open-hotreload=0
#---------------------↓webhook-----------------------
#是否开启钉钉告警通道,可同时开始多个通道0为关闭,1为开启
open-dingding=1
#默认钉钉机器人地址
ddurl=https://oapi.dingtalk.com/robot/send?access_token=xxxxx
#是否开启 @所有人(0为关闭,1为开启)
dd_isatall=1
#是否开启钉钉机器人加签0为关闭,1为开启
# 使用方法https://oapi.dingtalk.com/robot/send?access_token=XXXXXX&secret=mysecret
open-dingding-secret=0
#是否开启微信告警通道,可同时开始多个通道0为关闭,1为开启
open-weixin=1
#默认企业微信机器人地址
wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxx
#是否开启飞书告警通道,可同时开始多个通道0为关闭,1为开启
open-feishu=1
#默认飞书机器人地址
fsurl=https://open.feishu.cn/open-apis/bot/hook/xxxxxxxxx
# webhook 发送 http 请求的 contentType, 如 application/json, application/x-www-form-urlencoded不配置默认 application/json
wh_contenttype=application/json
#---------------------↓腾讯云接口-----------------------
#是否开启腾讯云短信告警通道,可同时开始多个通道0为关闭,1为开启
open-txdx=0
#腾讯云短信接口key
TXY_DX_appkey=xxxxx
#腾讯云短信模版ID 腾讯云短信模版配置可参考 prometheus告警:{1}
TXY_DX_tpl_id=xxxxx
#腾讯云短信sdk app id
TXY_DX_sdkappid=xxxxx
#腾讯云短信签名 根据自己审核通过的签名来填写
TXY_DX_sign=腾讯云
#是否开启腾讯云电话告警通道,可同时开始多个通道0为关闭,1为开启
open-txdh=0
#腾讯云电话接口key
TXY_DH_phonecallappkey=xxxxx
#腾讯云电话模版ID
TXY_DH_phonecalltpl_id=xxxxx
#腾讯云电话sdk app id
TXY_DH_phonecallsdkappid=xxxxx
#---------------------↓华为云接口-----------------------
#是否开启华为云短信告警通道,可同时开始多个通道0为关闭,1为开启
open-hwdx=0
#华为云短信接口key
HWY_DX_APP_Key=xxxxxxxxxxxxxxxxxxxxxx
#华为云短信接口Secret
HWY_DX_APP_Secret=xxxxxxxxxxxxxxxxxxxxxx
#华为云APP接入地址(端口接口地址)
HWY_DX_APP_Url=https://rtcsms.cn-north-1.myhuaweicloud.com:10743
#华为云短信模板ID
HWY_DX_Templateid=xxxxxxxxxxxxxxxxxxxxxx
#华为云签名名称,必须是已审核通过的,与模板类型一致的签名名称,按照自己的实际签名填写
HWY_DX_Signature=华为云
#华为云签名通道号
HWY_DX_Sender=xxxxxxxxxx
#---------------------↓阿里云接口-----------------------
#是否开启阿里云短信告警通道,可同时开始多个通道0为关闭,1为开启
open-alydx=0
#阿里云短信主账号AccessKey的ID
ALY_DX_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
#阿里云短信接口密钥
ALY_DX_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
#阿里云短信签名名称
ALY_DX_SignName=阿里云
#阿里云短信模板ID
ALY_DX_Template=xxxxxxxxxxxxxxxxxxxxxx
#是否开启阿里云电话告警通道,可同时开始多个通道0为关闭,1为开启
open-alydh=0
#阿里云电话主账号AccessKey的ID
ALY_DH_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
#阿里云电话接口密钥
ALY_DH_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
#阿里云电话被叫显号,必须是已购买的号码
ALY_DX_CalledShowNumber=xxxxxxxxx
#阿里云电话文本转语音TTS模板ID
ALY_DH_TtsCode=xxxxxxxx
#---------------------↓容联云接口-----------------------
#是否开启容联云电话告警通道,可同时开始多个通道0为关闭,1为开启
open-rlydh=0
#容联云基础接口地址
RLY_URL=https://app.cloopen.com:8883/2013-12-26/Accounts/
#容联云后台SID
RLY_ACCOUNT_SID=xxxxxxxxxxx
#容联云api-token
RLY_ACCOUNT_TOKEN=xxxxxxxxxx
#容联云app_id
RLY_APP_ID=xxxxxxxxxxxxx
#---------------------↓邮件配置-----------------------
#是否开启邮件
open-email=0
#邮件发件服务器地址
Email_host=smtp.qq.com
#邮件发件服务器端口
Email_port=465
#邮件帐号
Email_user=xxxxxxx@qq.com
#邮件密码
Email_password=xxxxxx
#邮件标题
Email_title=运维告警
#默认发送邮箱
Default_emails=xxxxx@qq.com,xxxxx@qq.com
#---------------------↓七陌云接口-----------------------
#是否开启七陌短信告警通道,可同时开始多个通道0为关闭,1为开启
open-7moordx=0
#七陌账户ID
7MOOR_ACCOUNT_ID=Nxxx
#七陌账户APISecret
7MOOR_ACCOUNT_APISECRET=xxx
#七陌账户短信模板编号
7MOOR_DX_TEMPLATENUM=n
#注意七陌短信变量这里只用一个var1在代码里写死了。
#-----------
#是否开启七陌webcall语音通知告警通道,可同时开始多个通道0为关闭,1为开启
open-7moordh=0
#请在七陌平台添加虚拟服务号、文本节点
#七陌账户webcall的虚拟服务号
7MOOR_WEBCALL_SERVICENO=xxx
# 文本节点里被替换的变量我配置的是text。如果被替换的变量不是text请修改此配置
7MOOR_WEBCALL_VOICE_VAR=text
#---------------------↓telegram接口-----------------------
#是否开启telegram告警通道,可同时开始多个通道0为关闭,1为开启
open-tg=0
#tg机器人token
TG_TOKEN=xxxxx
#tg消息模式 个人消息或者频道消息 0为关闭(推送给个人)1为开启(推送给频道)
TG_MODE_CHAN=0
#tg用户ID
TG_USERID=xxxxx
#tg频道name或者id, 频道name需要以@开始
TG_CHANNAME=xxxxx
#tg api地址, 可以配置为代理地址
#TG_API_PROXY="https://api.telegram.org/bot%s/%s"
#TG_PARSE_MODE设置为 "1" 启用Markdown
TG_PARSE_MODE = "0"
#---------------------↓workwechat接口-----------------------
#是否开启workwechat告警通道,可同时开始多个通道0为关闭,1为开启
open-workwechat=0
# 企业ID
WorkWechat_CropID=xxxxx
# 应用ID
WorkWechat_AgentID=xxxx
# 应用secret
WorkWechat_AgentSecret=xxxx
# 接受用户
WorkWechat_ToUser="zhangsan|lisi"
# 接受部门
WorkWechat_ToParty="ops|dev"
# 接受标签
WorkWechat_ToTag=""
# 消息类型, 暂时只支持markdown
# WorkWechat_Msgtype = "markdown"
#---------------------↓百度云接口-----------------------
#是否开启百度云短信告警通道,可同时开始多个通道0为关闭,1为开启
open-baidudx=0
#百度云短信接口AK(ACCESS_KEY_ID)
BDY_DX_AK=xxxxx
#百度云短信接口SK(SECRET_ACCESS_KEY)
BDY_DX_SK=xxxxx
#百度云短信ENDPOINTENDPOINT参数需要用指定区域的域名来进行定义如服务所在区域为北京则为
BDY_DX_ENDPOINT=http://smsv3.bj.baidubce.com
#百度云短信模版ID,根据自己审核通过的模版来填写(模版支持一个参数code如prometheus告警:{code})
BDY_DX_TEMPLATE_ID=xxxxx
#百度云短信签名ID,根据自己审核通过的签名来填写(注意:下一行键名使用 TXY_DX_ 前缀,疑为 BDY_DX_ 之误,请与所用 PrometheusAlert 版本的配置键核对)
TXY_DX_SIGNATURE_ID=xxxxx
#---------------------↓百度Hi(如流)-----------------------
#是否开启百度Hi(如流)告警通道,可同时开始多个通道0为关闭,1为开启
open-ruliu=0
#默认百度Hi(如流)机器人地址
BDRL_URL=https://api.im.baidu.com/api/msg/groupmsgsend?access_token=xxxxxxxxxxxxxx
#百度Hi(如流)群ID
BDRL_ID=123456
#---------------------↓bark接口-----------------------
#是否开启telegram告警通道,可同时开始多个通道0为关闭,1为开启
open-bark=0
#bark默认地址, 建议自行部署bark-server
BARK_URL=https://api.day.app
#bark key, 多个key使用分割
BARK_KEYS=xxxxx
# 复制, 推荐开启
BARK_COPY=1
# 历史记录保存,推荐开启
BARK_ARCHIVE=1
# 消息分组
BARK_GROUP=PrometheusAlert
#---------------------↓语音播报-----------------------
#语音播报需要配合语音播报插件才能使用
#是否开启语音播报通道,0为关闭,1为开启
open-voice=1
VOICE_IP=127.0.0.1
VOICE_PORT=9999
#---------------------↓飞书机器人应用-----------------------
#是否开启feishuapp告警通道,可同时开始多个通道0为关闭,1为开启
open-feishuapp=1
# APPID
FEISHU_APPID=cli_xxxxxxxxxxxxx
# APPSECRET
FEISHU_APPSECRET=xxxxxxxxxxxxxxxxxxxxxx
# 可填飞书 用户open_id、user_id、union_ids、部门open_department_id
AT_USER_ID="xxxxxxxx"
#---------------------↓告警组-----------------------
# 有其他新增的配置段,请放在告警组的上面
# 暂时仅针对 PrometheusContronller 中的 /prometheus/alert 路由
# 告警组如果放在了 wx, dd... 那部分的上方,beego section 取 url 值不太对。
# 所以这里使用 include 来包含另告警组配置
# 是否启用告警组功能
open-alertgroup=0
# 自定义的告警组既可以写在这里,也可以写在单独的文件里。
# 写在单独的告警组配置里更便于修改。
# include "alertgroup.conf"
#---------------------↓kafka地址-----------------------
# kafka服务器的地址
open-kafka=1
kafka_server = 127.0.0.1:9092
# 写入消息的kafka topic
kafka_topic = devops
# 用户标记该消息是来自PrometheusAlert,一般无需修改
kafka_key = PrometheusAlert

View File

@@ -0,0 +1,283 @@
# Prometheus server configuration: scrape interval 30s, alerts routed to
# the local Alertmanager, one node_exporter (+ optional process-exporter)
# job pair per host.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['172.24.16.20:9093']

rule_files:
  # NOTE(review): "reles" looks like a typo for "rules" — confirm the actual
  # directory name on disk before renaming, or no rules will be loaded.
  - "./reles/*.yml"

scrape_configs:
  # ---- ops server -----------------------------------------------------
  - job_name: 'ops-server_node'
    static_configs:
      - targets: ['172.24.16.20:9100']
        labels:
          instance: ops-server
  - job_name: 'ops-server_prometheus'
    static_configs:
      - targets: ['172.24.16.20:9090']
        labels:
          instance: ops-server
  # ---- CN test server -------------------------------------------------
  - job_name: 'app-test-server_node'
    static_configs:
      - targets: ['172.24.16.13:9100']
        labels:
          instance: app-test-server
  - job_name: 'test-process-exporter'
    static_configs:
      - targets: ['172.24.16.13:9256']
        labels:
          instance: app-test-server
  # ---- CN prod server 01 ----------------------------------------------
  - job_name: 'app-prod-server_node01'
    static_configs:
      - targets: ['172.24.16.10:9100']
        labels:
          instance: app-prod-server_01
  - job_name: 'prod01-server-process-exporter'
    static_configs:
      - targets: ['172.24.16.10:9256']
        labels:
          instance: app-prod-server_01
  # ---- CN prod server 02 ----------------------------------------------
  - job_name: 'app-prod-server_node02'
    static_configs:
      - targets: ['172.24.16.7:9100']
        labels:
          instance: app-prod-server_02
  - job_name: 'prod02-server-process-exporter'
    static_configs:
      - targets: ['172.24.16.7:9256']
        labels:
          instance: app-prod-server_02
  # ---- overseas server 01 (webdrive) ----------------------------------
  - job_name: 'webdrive-server_node'
    static_configs:
      - targets: ['43.159.145.241:9100']
        labels:
          instance: webdrive-server
  - job_name: 'webdrive-server-process-exporter'
    static_configs:
      - targets: ['43.159.145.241:9256']
        labels:
          instance: webdrive-server
  # ---- overseas server 02 (website) -----------------------------------
  - job_name: 'website-server_node'
    static_configs:
      - targets: ['49.51.46.148:9100']
        labels:
          instance: website-server
  - job_name: 'website-server-process-exporter'
    static_configs:
      - targets: ['49.51.46.148:9256']
        labels:
          instance: website-server
  # ---- overseas server 03 (weblessie1) --------------------------------
  - job_name: 'weblessie-server1_node'
    static_configs:
      - targets: ['43.130.56.138:9100']
        labels:
          instance: weblessie-server1
  - job_name: 'weblessie-server1-process-exporter'
    static_configs:
      - targets: ['43.130.56.138:9256']
        labels:
          instance: weblessie-server1
  # ---- overseas server 04 (prod-lessie 5) -----------------------------
  - job_name: 'prod-lessie-server5_node'
    static_configs:
      - targets: ['43.130.53.202:9100']
        labels:
          instance: prod-lessie-server5
  - job_name: 'prod-lessie-server5-process-exporter'
    static_configs:
      - targets: ['43.130.53.202:9256']
        labels:
          instance: prod-lessie-server5
  # ---- overseas server 05 (prod-lessie 1) -----------------------------
  - job_name: 'prod-lessie-server1_node'
    static_configs:
      - targets: ['43.130.59.68:9100']
        labels:
          instance: prod-lessie-server1
  - job_name: 'prod-lessie-server1-process-exporter'
    static_configs:
      - targets: ['43.130.59.68:9256']
        labels:
          instance: prod-lessie-server1
  # ---- overseas server 06 (prod-lessie 2) -----------------------------
  - job_name: 'prod-lessie-server2_node'
    static_configs:
      - targets: ['43.173.126.43:9100']
        labels:
          instance: prod-lessie-server2
  - job_name: 'prod-lessie-server2-process-exporter'
    static_configs:
      - targets: ['43.173.126.43:9256']
        labels:
          instance: prod-lessie-server2
  # ---- overseas server 07 (prod-lessie 3) -----------------------------
  - job_name: 'prod-lessie-server3_node'
    static_configs:
      - targets: ['49.51.189.136:9100']
        labels:
          instance: prod-lessie-server3
  - job_name: 'prod-lessie-server3-process-exporter'
    static_configs:
      - targets: ['49.51.189.136:9256']
        labels:
          instance: prod-lessie-server3
  # ---- overseas server 08 (prod-lessie 4) -----------------------------
  - job_name: 'prod-lessie-server4_node'
    static_configs:
      - targets: ['170.106.187.156:9100']
        labels:
          instance: prod-lessie-server4
  - job_name: 'prod-lessie-server4-process-exporter'
    static_configs:
      - targets: ['170.106.187.156:9256']
        labels:
          instance: prod-lessie-server4
  # ---- overseas server us-prod-01 -------------------------------------
  - job_name: 'us-prod-01_node'
    static_configs:
      - targets: ['43.153.21.64:9100']
        labels:
          instance: us-prod-01
  - job_name: 'us-prod-01-process-exporter'
    static_configs:
      - targets: ['43.153.21.64:9256']
        labels:
          instance: us-prod-01
  # ---- overseas server us-prod-02 -------------------------------------
  - job_name: 'us-prod-02_node'
    static_configs:
      - targets: ['43.153.98.191:9100']
        labels:
          instance: us-prod-02
  - job_name: 'us-prod-02-process-exporter'
    static_configs:
      - targets: ['43.153.98.191:9256']
        labels:
          instance: us-prod-02
  # ---- overseas server us-prod-03 -------------------------------------
  - job_name: 'us-prod-03_node'
    static_configs:
      - targets: ['49.51.41.243:9100']
        labels:
          instance: us-prod-03
  - job_name: 'us-prod-03-process-exporter'
    static_configs:
      - targets: ['49.51.41.243:9256']
        labels:
          instance: us-prod-03
  # ---- self-hosted mongodb host ---------------------------------------
  - job_name: 'mongodb_node'
    static_configs:
      - targets: ['170.106.187.49:9100']
        labels:
          instance: mongodb-server
  - job_name: 'mongodb_process'
    static_configs:
      - targets: ['170.106.187.49:9256']
        labels:
          instance: mongodb-server
  - job_name: 'mongodb_exporter'
    static_configs:
      - targets: ['170.106.187.49:9216']
        labels:
          instance: mongodb-server
  # ---- CN nacos cluster -----------------------------------------------
  - job_name: 'nacos-cluster'
    metrics_path: '/actuator/prometheus'
    static_configs:
      - targets:
          - '172.24.16.10:8081'
          - '172.24.16.7:8081'
          - '172.24.16.13:8081'
        labels:
          instance: nacos-cluster
  # ---- local log-storage elasticsearch host ---------------------------
  - job_name: 'elasticsearch'
    static_configs:
      - targets: ['192.168.70.16:9100']
        labels:
          instance: es-server
  # ---- local SIT host --------------------------------------------------
  - job_name: 'sit-server_node'
    static_configs:
      - targets: ['192.168.70.18:9100']
        labels:
          instance: sit-server

View File

@@ -0,0 +1,78 @@
# 服务器资源告警策略 — node_exporter resource alerting rules.
groups:
  - name: 服务器资源监控
    rules:
      - alert: 内存使用率过高
        expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
        for: 1m  # must stay pending this long before firing to Alertmanager
        labels:
          severity: 严重告警
        annotations:
          summary: "{{ $labels.instance }} 内存使用率过高,请尽快处理!"
          # message now matches the > 90 threshold in expr (said 95% before)
          description: "{{ $labels.instance }}内存使用率超过90%,当前使用率{{ $value }}%."
      - alert: 服务器宕机
        expr: up{job=~".*_node.*"} == 0
        for: 10s
        labels:
          severity: 严重告警
        annotations:
          # message now matches for: 10s (said 1 minute before)
          summary: "{{$labels.instance}} 服务器宕机超过10秒"
          description: "{{$labels.instance}} 服务器已宕机。"
      - alert: CPU高负荷
        expr: 100 - (avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高请尽快处理"
          description: "{{$labels.instance}} CPU使用大于90%,当前使用率{{ $value }}%. "
      - alert: 磁盘IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance,job)* 100 > 90
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高请尽快处理"
          description: "{{$labels.instance}} 流入磁盘IO大于90%,当前使用率{{ $value }}%."
      - alert: 网络流入
        # regex fixed: "virbr*|lo*" did not match virbr0 (fully-anchored regex)
        # NOTE(review): rate()/100 > 102400 is ~10 MB/s, not the 100M the
        # message claims — confirm the intended threshold/scaling.
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[5m])) by (instance,job)) / 100) > 102400
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流入网络带宽持续1分钟高于100M. RX带宽使用量{{$value}}."
      - alert: 网络流出
        expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[5m])) by (instance,job)) / 100) > 102400
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
          # fixed broken "{$value}}" template, "持15分钟" typo, and RX→TX
          description: "{{$labels.instance}} 流出网络带宽持续1分钟高于100M. TX带宽使用量{{$value}}."
      - alert: TCP连接数
        expr: node_netstat_Tcp_CurrEstab > 10000
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          summary: " TCP_ESTABLISHED过高"
          # message now matches the > 10000 threshold (said "大于100%" before)
          description: "{{$labels.instance}} TCP_ESTABLISHED大于10000,当前连接数{{ $value }}."
      - alert: 磁盘容量
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100) > 90
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.mountpoint}} 磁盘分区使用率过高,请尽快处理!"
          description: "{{$labels.instance}} 磁盘分区使用大于90%,当前使用率{{ $value }}%."

View File

@@ -0,0 +1,78 @@
# 服务器资源告警策略
groups:
- name: 服务器资源监控
rules:
- alert: 内存使用率过高
expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
for: 1m # 告警持续时间超过这个时间才会发送给alertmanager
labels:
severity: 严重告警
annotations:
summary: "{{ $labels.instance }} 内存使用率过高,请尽快处理!"
description: "{{ $labels.instance }}内存使用率超过95%,当前使用率{{ $value }}%."
- alert: 服务器宕机
expr: up == 0
for: 10s
labels:
severity: 严重告警
annotations:
summary: "{{$labels.instance}} 服务器宕机超过1分钟"
description: "{{$labels.instance}} 服务器已宕机。"
- alert: CPU高负荷
expr: 100 - (avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
for: 1m
labels:
severity: 严重告警
annotations:
summary: "{{$labels.instance}} CPU使用率过高请尽快处理"
description: "{{$labels.instance}} CPU使用大于90%,当前使用率{{ $value }}%. "
- alert: 磁盘IO性能
expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance,job)* 100 > 90
for: 1m
labels:
severity: 严重告警
annotations:
summary: "{{$labels.instance}} 流入磁盘IO使用率过高请尽快处理"
description: "{{$labels.instance}} 流入磁盘IO大于90%,当前使用率{{ $value }}%."
- alert: 网络流入
expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance,job)) / 100) > 102400
for: 1m
labels:
severity: 严重告警
annotations:
summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
description: "{{$labels.instance}} 流入网络带宽持续1分钟高于100M. RX带宽使用量{{$value}}."
- alert: 网络流出
expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance,job)) / 100) > 102400
for: 1m
labels:
severity: 严重告警
annotations:
summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
description: "{{$labels.instance}} 流出网络带宽持15分钟高于100M. RX带宽使用量{$value}}."
- alert: TCP连接数
expr: node_netstat_Tcp_CurrEstab > 10000
for: 1m
labels:
severity: 严重告警
annotations:
summary: " TCP_ESTABLISHED过高"
description: "{{$labels.instance}} TCP_ESTABLISHED大于100%,当前使用率{{ $value }}%."
- alert: 磁盘容量
expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 90
for: 1m
labels:
severity: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 磁盘分区使用率过高,请尽快处理!"
description: "{{$labels.instance}} 磁盘分区使用大于90%,当前使用率{{ $value }}%."

View File

@@ -0,0 +1,78 @@
# 服务器资源告警策略
# NOTE(review): 原始缩进在页面导出时丢失,此处按标准 Prometheus 规则文件缩进整理
groups:
- name: 服务器资源监控
  rules:
  - alert: 内存使用率过高
    expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
    for: 1m  # 告警持续时间,超过这个时间才会发送给alertmanager
    labels:
      severity: 严重告警
    annotations:
      summary: "{{ $labels.instance }} 内存使用率过高,请尽快处理!"
      description: "{{ $labels.instance }}内存使用率超过90%,当前使用率{{ $value }}%."
  - alert: 服务器宕机
    expr: up == 0
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 服务器宕机,请尽快处理!"
      # 修正: up==0 表示实例抓取失败(宕机),原文"延时"表述有误
      description: "{{$labels.instance}} 服务器宕机超过1分钟,当前状态{{ $value }}. "
  - alert: CPU高负荷
    expr: 100 - (avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} CPU使用率过高,请尽快处理!"
      description: "{{$labels.instance}} CPU使用大于90%,当前使用率{{ $value }}%. "
  - alert: 磁盘IO性能
    expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance,job)* 100 > 90
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      # 修正: 原文案"流入磁盘IO"为网络规则复制残留
      summary: "{{$labels.instance}} 磁盘IO使用率过高,请尽快处理!"
      description: "{{$labels.instance}} 磁盘IO大于90%,当前使用率{{ $value }}%."
  - alert: 网络流入
    # 正则修正: PromQL 标签匹配为全锚定 RE2,virbr*|lo* 无法排除 virbr0/lo 等接口,应为 virbr.*|lo.*
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[5m])) by (instance,job)) / 100) > 102400
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
      description: "{{$labels.instance}} 流入网络带宽持续1分钟高于100M. RX带宽使用量{{$value}}."
  - alert: 网络流出
    expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[5m])) by (instance,job)) / 100) > 102400
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
      # 修正: "持15分钟"应为"持续1分钟"(与 for: 1m 一致); 流出方向为 TX; {$value}} 补全为 {{$value}}
      description: "{{$labels.instance}} 流出网络带宽持续1分钟高于100M. TX带宽使用量{{$value}}."
  - alert: TCP连接数
    expr: node_netstat_Tcp_CurrEstab > 10000
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} TCP_ESTABLISHED过高"
      # 修正: 表达式阈值是连接数 10000,并非百分比
      description: "{{$labels.instance}} TCP_ESTABLISHED大于10000,当前连接数{{ $value }}."
  - alert: 磁盘容量
    expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 90
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} 磁盘分区使用率过高,请尽快处理!"
      description: "{{$labels.instance}} 磁盘分区使用大于90%,当前使用率{{ $value }}%."

196
prometheus/安装exporter Normal file
View File

@@ -0,0 +1,196 @@
tar -zxvf node_exporter-1.8.2.linux-amd64.tar.gz
vim /etc/systemd/system/node_exporter.service
[Unit]
Description=node_exporter Monitoring System
Documentation=node_exporter Monitoring System
[Service]
ExecStart=/opt/exporter/node_exporter/node_exporter --web.listen-address=:9100
[Install]
WantedBy=multi-user.target
systemctl daemon-reexec
systemctl daemon-reload
systemctl start node_exporter
systemctl status node_exporter
systemctl enable node_exporter
ss -ntl | grep 9100
curl http://localhost:9100/metrics
--------------------------------------------------------------------------------------------------------
# 下载 node_exporter
cd /opt/
wget https://github.com/prometheus/node_exporter/releases/download/v1.8.0/node_exporter-1.8.0.linux-amd64.tar.gz
tar -xzf node_exporter-1.8.0.linux-amd64.tar.gz
mv node_exporter-1.8.0.linux-amd64 node_exporter
# systemd 管理----------
sudo tee /etc/systemd/system/node_exporter.service > /dev/null <<EOF
[Unit]
Description=Prometheus Node Exporter
After=network.target
[Service]
User=root
ExecStart=/opt/exporter/node_exporter/node_exporter
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# systemd 管理----------
sudo systemctl daemon-reexec
sudo systemctl daemon-reload
sudo systemctl start node_exporter
sudo systemctl status node_exporter
sudo systemctl enable node_exporter
# 启动服务
nohup /opt/node_exporter/node_exporter > /opt/node_exporter/node_exporter.log 2>&1 &
默认监听 9100确认防火墙放行或仅 Prometheus 网络能访问。
调试技巧
cat /proc/1784152/cmdline | tr '\0' '\n'
安装 process_exporter二进制
cd /opt/
wget https://github.com/ncabatoff/process-exporter/releases/download/v0.7.10/process-exporter-0.7.10.linux-amd64.tar.gz
tar -xzf process-exporter-0.7.10.linux-amd64.tar.gz
mv process-exporter-0.7.10.linux-amd64 process-exporter
sudo tee /etc/systemd/system/process_exporter.service > /dev/null <<EOF
[Unit]
Description=Prometheus Process Exporter
After=network.target
[Service]
ExecStart=/opt/exporter/process-exporter/process-exporter --config.path=/opt/exporter/process-exporter/process-exporter.yml
Restart=always
User=root
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl start process_exporter
sudo systemctl status process_exporter
sudo systemctl enable process_exporter
安装 mongodb_exporter
cd /opt
wget https://github.com/percona/mongodb_exporter/releases/download/v0.40.0/mongodb_exporter-0.40.0.linux-amd64.tar.gz
tar -xzf mongodb_exporter-0.40.0.linux-amd64.tar.gz
mv mongodb_exporter-0.40.0.linux-amd64 /opt/exporter/mongodb_exporter
use admin
db.createUser({
user: "prometheus",
pwd: "StrongPassword",
roles: [ { role: "clusterMonitor", db: "admin" } ]
})
[Unit]
Description=Prometheus MongoDB Exporter
After=network.target
[Service]
ExecStart=/opt/exporter/mongodb_exporter/mongodb_exporter \
--mongodb.uri=mongodb://prometheus:StrongPassword@localhost:27017/admin
Restart=on-failure
User=root
[Install]
WantedBy=multi-user.target
sudo tee /etc/systemd/system/mongodb_exporter.service > /dev/null <<EOF
[Unit]
Description=Prometheus MongoDB Exporter
After=network.target
[Service]
ExecStart=/opt/exporter/mongodb_exporter/mongodb_exporter \
--mongodb.uri=mongodb://admin:Ud4G8sty6BK@localhost:27017/admin
Restart=on-failure
User=root
[Install]
WantedBy=multi-user.target
EOF
# 请将 mongodb://myuser:mypassword@localhost:27017/admin 替换为你的实际账号密码和数据库地址。
sudo systemctl daemon-reload
sudo systemctl restart mongodb_exporter
sudo systemctl start mongodb_exporter
sudo systemctl status mongodb_exporter
sudo systemctl enable mongodb_exporter
[Unit]
Description=Prometheus MongoDB Exporter
After=network.target
[Service]
ExecStart=/opt/exporter/mongodb_exporter/mongodb_exporter --mongodb.uri=mongodb://admin:Ud4G8sty6BK@localhost:27017/admin --compatible-mode
Restart=on-failure
User=root
[Install]
WantedBy=multi-user.target
# 启动
/opt/mongodb_exporter/mongodb_exporter \
--mongodb.uri="mongodb://<user>:<password>@localhost:27017/admin" &
use admin
// 获取总连接数信息
var connStatus = db.serverStatus().connections;
print("总连接数 current:", connStatus.current, "可用 available:", connStatus.available);
// 统计每个客户端 IP 当前连接数
db.adminCommand({
aggregate: 1,
pipeline: [
{ $currentOp: { allUsers: true, localOps: true } },
{ $match: { client: { $exists: true } } },
{ $project: { ip: { $arrayElemAt: [ { $split: ["$client", ":"] }, 0 ] } } },
{ $group: { _id: "$ip", count: { $sum: 1 } } },
{ $sort: { count: -1 } }
],
cursor: {}
}).cursor.firstBatch.forEach(doc => printjson(doc));