diff --git a/OpenTelemetry/1.txt b/OpenTelemetry/1.txt
new file mode 100644
index 0000000..34dc357
--- /dev/null
+++ b/OpenTelemetry/1.txt
@@ -0,0 +1,29 @@
+Phase 1: metrics collection only (current goal)
+  What gets deployed:
+    the monitoring namespace;
+    RBAC for the Collector;
+    the DaemonSet Collector (hostmetrics/kubeletstats receivers only, for node/container metrics);
+    the Deployment Collector (otlp receiver + prometheusremotewrite exporter only, forwarding metrics to Prometheus).
+  Core config modules:
+    receivers: hostmetrics, kubeletstats, otlp
+    processors: batch, resource
+    exporters: prometheusremotewrite
+    pipelines: metrics (wires the receivers/processors/exporters above together)
+
+Phase 2: add log collection (extends Phase 1)
+  Conflict-free changes:
+    update only the DaemonSet Collector ConfigMap: add a filelog receiver (log paths) plus a logs pipeline;
+    update only the Deployment Collector ConfigMap: add an elasticsearch exporter plus a logs pipeline;
+    restart the DaemonSet/Deployment Collector Pods (to pick up the new config).
+  Key points:
+    log collection needs the DaemonSet to mount the host log directories (only a new volume mount in the DaemonSet Pod spec; existing metrics collection is untouched);
+    the filelog receiver and elasticsearch exporter are fully independent of the metrics modules.
+
+Phase 3: add trace collection (extends Phases 1+2)
+  Conflict-free changes:
+    no DaemonSet changes (traces need no node-level collection);
+    update only the Deployment Collector ConfigMap: add an otlp/tempo exporter plus a traces pipeline;
+    restart the Deployment Collector Pod.
+  Key points:
+    traces only need ports 4317/4318 on the Deployment Collector (already covered by the Phase 1 otlp receiver);
+    the otlp/tempo exporter is fully independent of the metrics/log modules - only the traces pipeline is new.
\ No newline at end of file
diff --git a/OpenTelemetry/Collector /01-otel-rbac.yaml b/OpenTelemetry/Collector /01-otel-rbac.yaml
new file mode 100644
index 0000000..1c0239c
--- /dev/null
+++ b/OpenTelemetry/Collector /01-otel-rbac.yaml
@@ -0,0 +1,56 @@
+# 1. Create the monitoring namespace
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring
+  labels:
+    name: monitoring
+
+---
+# 2. Create the ServiceAccount
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: otel-collector
+  namespace: monitoring
+
+---
+# 3. Create the ClusterRole (least privilege)
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: otel-collector-role
+rules:
+  # Read node/Pod/Service metadata (baseline permissions)
+  - apiGroups: [""]
+    resources: ["nodes", "pods", "services", "endpoints", "nodes/metrics", "nodes/stats"]
+    verbs: ["get", "list", "watch"]
+
+  # To be added later
+  # # Collect Deployment/DaemonSet/StatefulSet (apps API group)
+  # - apiGroups: ["apps"]
+  #   resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
+  #   verbs: ["get", "list", "watch"]
+  # # Collect HPA (autoscaling API group)
+  # - apiGroups: ["autoscaling"]
+  #   resources: ["horizontalpodautoscalers"]
+  #   verbs: ["get", "list", "watch"]
+  # # Collect k8s events (optional, for troubleshooting)
+  # - apiGroups: [""]
+  #   resources: ["events"]
+  #   verbs: ["get", "list", "watch"]
+
+---
+# 4. Bind the ClusterRole to the ServiceAccount
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: otel-collector-binding
+subjects:
+  - kind: ServiceAccount
+    name: otel-collector
+    namespace: monitoring
+roleRef:
+  kind: ClusterRole
+  name: otel-collector-role
+  apiGroup: rbac.authorization.k8s.io
\ No newline at end of file
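
A quick sanity check of the RBAC above before rolling out the collectors (assumes kubectl is pointed at the target cluster; each command should print "yes"):

kubectl auth can-i list nodes --as=system:serviceaccount:monitoring:otel-collector
kubectl auth can-i get nodes/stats --as=system:serviceaccount:monitoring:otel-collector
kubectl auth can-i watch pods --as=system:serviceaccount:monitoring:otel-collector
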
diff --git a/OpenTelemetry/Collector /02-otel-collector-ConfigMap.yaml b/OpenTelemetry/Collector /02-otel-collector-ConfigMap.yaml
new file mode 100644
index 0000000..e000f29
--- /dev/null
+++ b/OpenTelemetry/Collector /02-otel-collector-ConfigMap.yaml
@@ -0,0 +1,64 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: otel-collector-config
+  namespace: monitoring
+data:
+  config.yaml: |
+    # Global configuration
+    receivers:
+      # 1. Node-level metrics (effective in the DaemonSet only)
+      hostmetrics:
+        collection_interval: 30s
+        root_path: /rootfs   # read host metrics via the host filesystem mounted at /rootfs
+        scrapers:
+          cpu: {}
+          memory: {}
+          disk: {}
+          filesystem: {}
+          network: {}
+          load: {}
+          processes: {}
+      # 2. Container-level metrics (DaemonSet only; fixed kubeletstats config)
+      kubeletstats:
+        collection_interval: 30s
+        auth_type: "serviceAccount"
+        endpoint: "https://${env:K8S_NODE_NAME}:10250"   # K8S_NODE_NAME is injected by the DaemonSet
+        insecure_skip_verify: true
+      # 3. OTLP receiver (both DaemonSet and Deployment)
+      otlp:
+        protocols:
+          grpc:
+            endpoint: 0.0.0.0:4317
+          http:
+            endpoint: 0.0.0.0:4318
+
+    processors:
+      batch: {}
+      resource:
+        attributes:
+          - key: k8s.cluster.name
+            value: test-k8s
+            action: insert
+          - key: k8s.node.name
+            from_attribute: host.name
+            action: insert
+
+    exporters:
+      prometheusremotewrite:
+        endpoint: "http://10.0.0.38:9090/api/v1/write"   # Prometheus must run with --web.enable-remote-write-receiver
+        external_labels:
+          k8s_cluster: test-k8s
+
+    # Key fix: the service section
+    service:
+      pipelines:
+        metrics:
+          receivers: [hostmetrics, kubeletstats, otlp]
+          processors: [batch, resource]
+          exporters: [prometheusremotewrite]
+      telemetry:
+        logs:
+          level: info
+        metrics:
+          address: 0.0.0.0:8888   # self-monitoring metrics endpoint
\ No newline at end of file
diff --git a/OpenTelemetry/Collector /03-otel-collector-daemonset.yaml b/OpenTelemetry/Collector /03-otel-collector-daemonset.yaml
new file mode 100644
index 0000000..06dc540
--- /dev/null
+++ b/OpenTelemetry/Collector /03-otel-collector-daemonset.yaml
@@ -0,0 +1,51 @@
+# Deploy the DaemonSet (node-level metrics collection)
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: otel-collector-daemonset
+  namespace: monitoring
+  labels:
+    app: otel-collector-daemonset
+spec:
+  selector:
+    matchLabels:
+      app: otel-collector-daemonset
+  template:
+    metadata:
+      labels:
+        app: otel-collector-daemonset
+    spec:
+      serviceAccountName: otel-collector
+      hostNetwork: false   # host networking not required
+      containers:
+        - name: otel-collector
+          image: otel/opentelemetry-collector-contrib:latest
+          args: ["--config=/etc/otel-collector/config.yaml"]
+          env:
+            - name: K8S_NODE_NAME   # consumed by the kubeletstats endpoint in the config
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          # Mount the host filesystem (for node metrics); HostToContainer propagation exposes host submounts such as /proc and /sys under /rootfs
+          volumeMounts:
+            - name: otel-config
+              mountPath: /etc/otel-collector
+            - name: rootfs
+              mountPath: /rootfs
+              readOnly: true
+              mountPropagation: HostToContainer
+          # Resource limits (tune as needed)
+          resources:
+            limits:
+              cpu: 500m
+              memory: 512Mi
+            requests:
+              cpu: 100m
+              memory: 256Mi
+      volumes:
+        - name: otel-config
+          configMap:
+            name: otel-collector-config
+        - name: rootfs
+          hostPath:
+            path: /
\ No newline at end of file
diff --git a/OpenTelemetry/Collector /04-otel-collector-deployment.yaml b/OpenTelemetry/Collector /04-otel-collector-deployment.yaml
new file mode 100644
index 0000000..26d9593
--- /dev/null
+++ b/OpenTelemetry/Collector /04-otel-collector-deployment.yaml
@@ -0,0 +1,48 @@
+# Deploy the Deployment (cluster-level aggregation and forwarding)
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: otel-collector-deployment
+  namespace: monitoring
+  labels:
+    app: otel-collector-deployment
+spec:
+  replicas: 1   # single replica for testing; scale to 2 in production
+  selector:
+    matchLabels:
+      app: otel-collector-deployment
+  template:
+    metadata:
+      labels:
+        app: otel-collector-deployment
+    spec:
+      serviceAccountName: otel-collector
+      containers:
+        - name: otel-collector
+          image: otel/opentelemetry-collector-contrib:latest
+          args: ["--config=/etc/otel-collector/config.yaml"]
+          env:
+            - name: K8S_NODE_NAME   # the shared config references this; ideally the Deployment would get its own ConfigMap without the host-level receivers
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          volumeMounts:
+            - name: otel-config
+              mountPath: /etc/otel-collector
+          # Exposed ports
+          ports:
+            - containerPort: 4317   # OTLP gRPC
+            - containerPort: 4318   # OTLP HTTP
+            - containerPort: 8888   # self-monitoring
+          # Resource limits
+          resources:
+            limits:
+              cpu: 500m
+              memory: 512Mi
+            requests:
+              cpu: 100m
+              memory: 256Mi
+      volumes:
+        - name: otel-config
+          configMap:
+            name: otel-collector-config
\ No newline at end of file
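
Nothing above gives in-cluster applications a stable address for the Deployment Collector's OTLP ports. A minimal Service sketch (the Service name otel-collector is an assumption, not part of the manifests above):

kubectl apply -n monitoring -f - <<'EOF'
apiVersion: v1
kind: Service
metadata:
  name: otel-collector
spec:
  selector:
    app: otel-collector-deployment
  ports:
    - name: otlp-grpc
      port: 4317
      targetPort: 4317
    - name: otlp-http
      port: 4318
      targetPort: 4318
EOF

Applications and the DaemonSet Collector can then export OTLP to otel-collector.monitoring.svc:4317.
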
diff --git a/OpenTelemetry/Collector /config.yaml b/OpenTelemetry/Collector /config.yaml
new file mode 100644
index 0000000..3bdca3d
--- /dev/null
+++ b/OpenTelemetry/Collector /config.yaml
@@ -0,0 +1,57 @@
+# Global configuration (scratch copy of the ConfigMap's config.yaml; keep the two in sync)
+receivers:
+  # 1. Node-level metrics (effective in the DaemonSet only)
+  hostmetrics:
+    collection_interval: 30s
+    root_path: /rootfs   # read host metrics via the host filesystem mounted at /rootfs
+    scrapers:
+      cpu: {}
+      memory: {}
+      disk: {}
+      filesystem: {}
+      network: {}
+      load: {}
+      processes: {}
+  # 2. Container-level metrics (DaemonSet only; fixed kubeletstats config)
+  kubeletstats:
+    collection_interval: 30s
+    auth_type: "serviceAccount"
+    endpoint: "https://${env:K8S_NODE_NAME}:10250"
+    insecure_skip_verify: true
+  # 3. OTLP receiver (both DaemonSet and Deployment)
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+processors:
+  batch: {}
+  resource:
+    attributes:
+      - key: k8s.cluster.name
+        value: test-k8s
+        action: insert
+      - key: k8s.node.name
+        from_attribute: host.name
+        action: insert
+
+exporters:
+  prometheusremotewrite:
+    endpoint: "http://10.0.0.38:9090/api/v1/write"
+    external_labels:
+      k8s_cluster: test-k8s
+
+# Key fix: the service section
+service:
+  pipelines:
+    metrics:
+      receivers: [hostmetrics, kubeletstats, otlp]
+      processors: [batch, resource]
+      exporters: [prometheusremotewrite]
+  telemetry:
+    logs:
+      level: info
+    metrics:
+      address: 0.0.0.0:8888
\ No newline at end of file
diff --git a/OpenTelemetry/ES/filebast/01-filebeat-serviceaccount.yaml b/OpenTelemetry/ES/filebast/01-filebeat-serviceaccount.yaml
new file mode 100644
index 0000000..0eee6d5
--- /dev/null
+++ b/OpenTelemetry/ES/filebast/01-filebeat-serviceaccount.yaml
@@ -0,0 +1,73 @@
+# ServiceAccount for Filebeat
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: filebeat              # ServiceAccount name
+  namespace: kube-system      # namespace it lives in
+  labels:
+    k8s-app: filebeat         # label identifying the Filebeat app
+---
+# ClusterRole for Filebeat, granting cluster-wide permissions
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: filebeat              # ClusterRole name
+  labels:
+    k8s-app: filebeat
+rules:
+  # get/list/watch on namespaces, pods and nodes
+  - apiGroups: [""]
+    resources: ["namespaces", "pods", "nodes"]
+    verbs: ["get", "list", "watch"]
+  # get/list/watch on ReplicaSets
+  - apiGroups: ["apps"]
+    resources: ["replicasets"]
+    verbs: ["get", "list", "watch"]
+  # get/list/watch on Jobs
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["get", "list", "watch"]
+---
+# Role for Filebeat, granting namespace-scoped permissions
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: filebeat              # Role name
+  namespace: kube-system      # namespace it applies to
+  labels:
+    k8s-app: filebeat
+rules:
+  # get/create/update on leases
+  # leases are used for coordination and leader election
+  - apiGroups: ["coordination.k8s.io"]
+    resources: ["leases"]
+    verbs: ["get", "create", "update"]
+---
+# Bind the Filebeat ServiceAccount to the ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: filebeat              # binding name
+subjects:
+  - kind: ServiceAccount      # subject is the ServiceAccount
+    name: filebeat
+    namespace: kube-system
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole           # kind of role being referenced
+  name: filebeat              # name of the role being referenced
+---
+# Bind the Filebeat ServiceAccount to the Role
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: filebeat              # binding name
+  namespace: kube-system      # namespace it applies to
+subjects:
+  - kind: ServiceAccount
+    name: filebeat
+    namespace: kube-system
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role                  # kind of role being referenced
+  name: filebeat              # name of the role being referenced
\ No newline at end of file
diff --git 
a/OpenTelemetry/ES/filebast/02-filebeat-configmap.yaml b/OpenTelemetry/ES/filebast/02-filebeat-configmap.yaml new file mode 100644 index 0000000..6ea96f4 --- /dev/null +++ b/OpenTelemetry/ES/filebast/02-filebeat-configmap.yaml @@ -0,0 +1,233 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: filebeat-config + namespace: kube-system +data: + filebeat.yml: | + setup.ilm.enabled: false + setup.template.enabled: false + + filebeat.autodiscover: + providers: + - type: kubernetes + templates: + # ---------- ↓ json格式日志 ↓ ---------- + - condition: + and: + - regexp: + kubernetes.namespace: "^(sit|apex-evaluation)$" + - regexp: + kubernetes.labels.app: "^(lessie-go-api|apex)$" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + - decode_json_fields: + fields: ["message"] + target: "mylog" + overwrite_keys: true + add_error_key: true + - drop_fields: + fields: + - "kubernetes.node.labels" + - "kubernetes.namespace_labels.kubernetes_io/metadata_name" + ignore_missing: true + # ---------- ↑ json格式日志 ↑ ---------- + + + # ---------- ↓ java语言的服务的Pod, agnet\admin\payment 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: sit + - or: + - equals: + kubernetes.labels.app: "flymoon-admin" + - equals: + kubernetes.labels.app: "flymoon-agent" + - equals: + kubernetes.labels.app: "flymoon-payment" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + - multiline: + type: pattern + pattern: '^\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d{3}' + negate: true + match: after + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + - dissect: + tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : [%{app_name->}] %{message}' + field: "message" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + - drop_fields: + fields: ["kubernetes.node.labels", "kubernetes.annotations"] + ignore_missing: true + + # ---------- ↑ java语言的服务的Pod, agnet\admin\payment 项目自由文本格式日志 ↑ ---------- + + + # ---------- ↓ java语言的服务的Pod, email 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: sit + - equals: + kubernetes.labels.app: "flymoon-email" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + - multiline: + type: pattern + pattern: '^\d{4}-\d{2}-\d{2}' + negate: true + match: after + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + - dissect: + tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : %{message}' + field: "message" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + - drop_fields: + fields: ["kubernetes.node.labels", "kubernetes.annotations"] + ignore_missing: true + # ---------- ↑ java语言的服务的Pod, email 项目自由文本格式日志 ↑ ---------- + + + # ---------- ↓ python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: sit + - equals: + kubernetes.labels.app: 
"lessie-agents" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + # 第一层:仅解析符合时间戳开头的日志行(for业务告警的日志格式) + - dissect: + when: + regexp: + message: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}.*' + tokenizer: '%{timestamp} - %{level} - %{module} - %{function} - %{msg_body}' + field: "message" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + # 第二层:针对带有 [level: | event: | msg: | context:] 的日志,再做一次 dissect + - dissect: + when: + contains: + mylog.msg_body: "[level:" + tokenizer: '[level: %{event_level} | event: %{event} | msg: %{msg} | context: %{ctx_raw}]' + field: "mylog.msg_body" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + # 第三层:把 ctx_raw 再拆成独立字段 + - script: + lang: javascript + id: parse_context + source: > + function process(event) { + var ctx = event.Get("mylog.ctx_raw"); + if (!ctx) return; + var parts = ctx.trim().split(","); + for (var i = 0; i < parts.length; i++) { + var pair = parts[i].split(":"); + if (pair.length === 2) { + event.Put("mylog." + pair[0].trim(), pair[1].trim()); + } + } + } + # 第四层: 去除大量不需要的k8s元数据字段 + - drop_fields: + fields: + - "kubernetes.node.labels" + - "kubernetes.annotations" + ignore_missing: true + # ---------- ↑ python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↑ ---------- + + + # ---------- ↓ apex 动态创建的 python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: apex-evaluation + - equals: + kubernetes.labels.apex: "lessie-agents" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - drop_fields: + fields: + - "kubernetes.node.labels" + - "kubernetes.annotations" + ignore_missing: true + # ---------- ↑ apex 动态创建的 python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↑ ---------- + + + + # ---- 输出到 Elasticsearch ---- + output.elasticsearch: + hosts: ["http://10.0.0.38:9200"] + username: "admin" + password: "G7ZSKFM4AQwHQpwA" + + indices: + - index: "k8s-%{[kubernetes.labels.environment]}-%{[kubernetes.labels.app]}-%{+yyyy.MM}" + when: + regexp: + kubernetes.labels.app: "(lessie-go-api|flymoon-admin|flymoon-agent|flymoon-payment|flymoon-email|lessie-agents|apex)" + + - index: "apex-python-%{+yyyy.MM}" + when: + equals: + kubernetes.labels.apex: "lessie-agents" + + logging.level: info + logging.selectors: ["*"] \ No newline at end of file diff --git a/OpenTelemetry/ES/filebast/022-filebeat-configmap.yaml b/OpenTelemetry/ES/filebast/022-filebeat-configmap.yaml new file mode 100644 index 0000000..6ea96f4 --- /dev/null +++ b/OpenTelemetry/ES/filebast/022-filebeat-configmap.yaml @@ -0,0 +1,233 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: filebeat-config + namespace: kube-system +data: + filebeat.yml: | + setup.ilm.enabled: false + setup.template.enabled: false + + filebeat.autodiscover: + providers: + - type: kubernetes + templates: + # ---------- ↓ json格式日志 ↓ ---------- + - condition: + and: + - regexp: + kubernetes.namespace: "^(sit|apex-evaluation)$" + - regexp: + kubernetes.labels.app: "^(lessie-go-api|apex)$" + config: + - type: filestream + id: 
"container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + - decode_json_fields: + fields: ["message"] + target: "mylog" + overwrite_keys: true + add_error_key: true + - drop_fields: + fields: + - "kubernetes.node.labels" + - "kubernetes.namespace_labels.kubernetes_io/metadata_name" + ignore_missing: true + # ---------- ↑ json格式日志 ↑ ---------- + + + # ---------- ↓ java语言的服务的Pod, agnet\admin\payment 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: sit + - or: + - equals: + kubernetes.labels.app: "flymoon-admin" + - equals: + kubernetes.labels.app: "flymoon-agent" + - equals: + kubernetes.labels.app: "flymoon-payment" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + - multiline: + type: pattern + pattern: '^\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d{3}' + negate: true + match: after + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + - dissect: + tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : [%{app_name->}] %{message}' + field: "message" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + - drop_fields: + fields: ["kubernetes.node.labels", "kubernetes.annotations"] + ignore_missing: true + + # ---------- ↑ java语言的服务的Pod, agnet\admin\payment 项目自由文本格式日志 ↑ ---------- + + + # ---------- ↓ java语言的服务的Pod, email 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: sit + - equals: + kubernetes.labels.app: "flymoon-email" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + - multiline: + type: pattern + pattern: '^\d{4}-\d{2}-\d{2}' + negate: true + match: after + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + - dissect: + tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : %{message}' + field: "message" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + - drop_fields: + fields: ["kubernetes.node.labels", "kubernetes.annotations"] + ignore_missing: true + # ---------- ↑ java语言的服务的Pod, email 项目自由文本格式日志 ↑ ---------- + + + # ---------- ↓ python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: sit + - equals: + kubernetes.labels.app: "lessie-agents" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + # 第一层:仅解析符合时间戳开头的日志行(for业务告警的日志格式) + - dissect: + when: + regexp: + message: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}.*' + tokenizer: '%{timestamp} - %{level} - %{module} - %{function} - %{msg_body}' + field: "message" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + # 第二层:针对带有 [level: | event: | msg: | context:] 的日志,再做一次 dissect + - dissect: + when: + contains: 
+ mylog.msg_body: "[level:" + tokenizer: '[level: %{event_level} | event: %{event} | msg: %{msg} | context: %{ctx_raw}]' + field: "mylog.msg_body" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + # 第三层:把 ctx_raw 再拆成独立字段 + - script: + lang: javascript + id: parse_context + source: > + function process(event) { + var ctx = event.Get("mylog.ctx_raw"); + if (!ctx) return; + var parts = ctx.trim().split(","); + for (var i = 0; i < parts.length; i++) { + var pair = parts[i].split(":"); + if (pair.length === 2) { + event.Put("mylog." + pair[0].trim(), pair[1].trim()); + } + } + } + # 第四层: 去除大量不需要的k8s元数据字段 + - drop_fields: + fields: + - "kubernetes.node.labels" + - "kubernetes.annotations" + ignore_missing: true + # ---------- ↑ python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↑ ---------- + + + # ---------- ↓ apex 动态创建的 python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: apex-evaluation + - equals: + kubernetes.labels.apex: "lessie-agents" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - drop_fields: + fields: + - "kubernetes.node.labels" + - "kubernetes.annotations" + ignore_missing: true + # ---------- ↑ apex 动态创建的 python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↑ ---------- + + + + # ---- 输出到 Elasticsearch ---- + output.elasticsearch: + hosts: ["http://10.0.0.38:9200"] + username: "admin" + password: "G7ZSKFM4AQwHQpwA" + + indices: + - index: "k8s-%{[kubernetes.labels.environment]}-%{[kubernetes.labels.app]}-%{+yyyy.MM}" + when: + regexp: + kubernetes.labels.app: "(lessie-go-api|flymoon-admin|flymoon-agent|flymoon-payment|flymoon-email|lessie-agents|apex)" + + - index: "apex-python-%{+yyyy.MM}" + when: + equals: + kubernetes.labels.apex: "lessie-agents" + + logging.level: info + logging.selectors: ["*"] \ No newline at end of file diff --git a/OpenTelemetry/ES/filebast/03-filebeat-daemonset.yaml b/OpenTelemetry/ES/filebast/03-filebeat-daemonset.yaml new file mode 100644 index 0000000..c385c21 --- /dev/null +++ b/OpenTelemetry/ES/filebast/03-filebeat-daemonset.yaml @@ -0,0 +1,65 @@ +# 滚动更新 +# kubectl rollout restart daemonset filebeat -n kube-system + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: filebeat + namespace: kube-system + labels: + k8s-app: filebeat +spec: + selector: + matchLabels: + k8s-app: filebeat + template: + metadata: + labels: + k8s-app: filebeat + spec: + serviceAccountName: filebeat + terminationGracePeriodSeconds: 30 + containers: + - name: filebeat + image: docker.elastic.co/beats/filebeat:9.2.2 + args: + - "-e" + env: + - name: TZ + value: Asia/Shanghai + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + resources: + limits: + memory: 300Mi + requests: + cpu: 100m + memory: 200Mi + volumeMounts: + - name: config + mountPath: /usr/share/filebeat/filebeat.yml + subPath: filebeat.yml + - name: data + mountPath: /var/lib/filebeat-data + - name: containers + mountPath: /var/log/containers + readOnly: true + - name: pods + mountPath: /var/log/pods + readOnly: true + volumes: + - name: config + configMap: + name: filebeat-config + - name: data + hostPath: + path: /var/lib/filebeat-data + type: DirectoryOrCreate + - name: containers + hostPath: + path: /var/log/containers + - name: pods + hostPath: + path: /var/log/pods diff --git 
a/OpenTelemetry/ES/filebast/filebeat.yaml b/OpenTelemetry/ES/filebast/filebeat.yaml new file mode 100644 index 0000000..77260ee --- /dev/null +++ b/OpenTelemetry/ES/filebast/filebeat.yaml @@ -0,0 +1,226 @@ +setup.ilm.enabled: false +setup.template.enabled: false + +filebeat.autodiscover: + providers: + - type: kubernetes + templates: + # ---------- ↓ json格式日志 ↓ ---------- + - condition: + and: + - regexp: + kubernetes.namespace: "^(sit|apex-evaluation)$" + - regexp: + kubernetes.labels.app: "^(lessie-go-api|apex)$" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + - decode_json_fields: + fields: ["message"] + target: "mylog" + overwrite_keys: true + add_error_key: true + - drop_fields: + fields: + - "kubernetes.node.labels" + - "kubernetes.namespace_labels.kubernetes_io/metadata_name" + ignore_missing: true + # ---------- ↑ json格式日志 ↑ ---------- + + + # ---------- ↓ java语言的服务的Pod, agnet\admin\payment 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: sit + - or: + - equals: + kubernetes.labels.app: "flymoon-admin" + - equals: + kubernetes.labels.app: "flymoon-agent" + - equals: + kubernetes.labels.app: "flymoon-payment" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + - multiline: + type: pattern + pattern: '^\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d{3}' + negate: true + match: after + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + - dissect: + tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : [%{app_name->}] %{message}' + field: "message" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + - drop_fields: + fields: ["kubernetes.node.labels", "kubernetes.annotations"] + ignore_missing: true + + # ---------- ↑ java语言的服务的Pod, agnet\admin\payment 项目自由文本格式日志 ↑ ---------- + + + # ---------- ↓ java语言的服务的Pod, email 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: sit + - equals: + kubernetes.labels.app: "flymoon-email" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + - multiline: + type: pattern + pattern: '^\d{4}-\d{2}-\d{2}' + negate: true + match: after + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + - dissect: + tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : %{message}' + field: "message" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + - drop_fields: + fields: ["kubernetes.node.labels", "kubernetes.annotations"] + ignore_missing: true + # ---------- ↑ java语言的服务的Pod, email 项目自由文本格式日志 ↑ ---------- + + + # ---------- ↓ python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: sit + - equals: + kubernetes.labels.app: "lessie-agents" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false 
+ parsers: + - container: ~ + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + # 第一层:仅解析符合时间戳开头的日志行(for业务告警的日志格式) + - dissect: + when: + regexp: + message: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}.*' + tokenizer: '%{timestamp} - %{level} - %{module} - %{function} - %{msg_body}' + field: "message" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + # 第二层:针对带有 [level: | event: | msg: | context:] 的日志,再做一次 dissect + - dissect: + when: + contains: + mylog.msg_body: "[level:" + tokenizer: '[level: %{event_level} | event: %{event} | msg: %{msg} | context: %{ctx_raw}]' + field: "mylog.msg_body" + target_prefix: "mylog" + ignore_missing: true + overwrite_keys: true + # 第三层:把 ctx_raw 再拆成独立字段 + - script: + lang: javascript + id: parse_context + source: > + function process(event) { + var ctx = event.Get("mylog.ctx_raw"); + if (!ctx) return; + var parts = ctx.trim().split(","); + for (var i = 0; i < parts.length; i++) { + var pair = parts[i].split(":"); + if (pair.length === 2) { + event.Put("mylog." + pair[0].trim(), pair[1].trim()); + } + } + } + # 第四层: 去除大量不需要的k8s元数据字段 + - drop_fields: + fields: + - "kubernetes.node.labels" + - "kubernetes.annotations" + ignore_missing: true + # ---------- ↑ python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↑ ---------- + + + # ---------- ↓ apex 动态创建的 python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↓ ---------- + - condition: + and: + - equals: + kubernetes.namespace: apex-evaluation + - equals: + kubernetes.labels.apex: "lessie-agents" + config: + - type: filestream + id: "container-${data.kubernetes.container.id}" + prospector.scanner.symlinks: true + close.on_state_change.removed: false + parsers: + - container: ~ + paths: + - /var/log/containers/*-${data.kubernetes.container.id}.log + processors: + - drop_fields: + fields: + - "kubernetes.node.labels" + - "kubernetes.annotations" + ignore_missing: true + # ---------- ↑ apex 动态创建的 python语言的agents服务的Pod, lessie-agents 项目自由文本格式日志 ↑ ---------- + + + +# ---- 输出到 Elasticsearch ---- +output.elasticsearch: + hosts: ["http://10.0.0.38:9200"] + username: "admin" + password: "G7ZSKFM4AQwHQpwA" + + indices: + - index: "k8s-%{[kubernetes.labels.environment]}-%{[kubernetes.labels.app]}-%{+yyyy.MM.dd}" + when: + regexp: + kubernetes.labels.app: "(lessie-go-api|flymoon-admin|flymoon-agent|flymoon-payment|flymoon-email|lessie-agents|apex)" + + - index: "apex-python-%{+yyyy.MM.dd}" + when: + equals: + kubernetes.labels.apex: "lessie-agents" + +logging.level: info +logging.selectors: ["*"] \ No newline at end of file diff --git a/OpenTelemetry/ES/单节点/安装es.conf b/OpenTelemetry/ES/单节点/安装es.conf new file mode 100644 index 0000000..c350329 --- /dev/null +++ b/OpenTelemetry/ES/单节点/安装es.conf @@ -0,0 +1,143 @@ +# 前置 & 准备工作 +sudo dnf update -y +sudo dnf install -y nano wget curl unzip + +# 安全组防火墙开放9200端口、5601端口 + +# 安装 Elasticsearch 9.2.2 +# 导入官方 GPG key +sudo rpm --import https://artifacts.elastic.co/GPG-KEY-elasticsearch + +# 新建 yum repo 文件 +sudo tee /etc/yum.repos.d/elasticsearch.repo <<-'EOF' +[elasticsearch] +name=Elasticsearch repository for 9.x packages +baseurl=https://artifacts.elastic.co/packages/9.x/yum +gpgcheck=1 +gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch +enabled=1 +autorefresh=1 +type=rpm-md +EOF + +# 安装 Elasticsearch: +sudo dnf install elasticsearch --enablerepo=elasticsearch + +# 先不管直接启动、报错再查看日志,有可能是权限问题 +sudo systemctl daemon-reload +sudo systemctl enable elasticsearch +sudo systemctl start 
elasticsearch
+sudo systemctl status elasticsearch
+sudo journalctl -u elasticsearch -f
+
+# Manually create the log directory and set permissions
+sudo mkdir -p /usr/share/elasticsearch/logs
+sudo chown -R elasticsearch:elasticsearch /usr/share/elasticsearch/logs
+sudo chmod 750 /usr/share/elasticsearch/logs
+
+# Set the elastic superuser password (recommended right away):
+sudo /usr/share/elasticsearch/bin/elasticsearch-reset-password -u elastic
+
+# Check the auto-generated certificates; if present, all is well
+ll /etc/elasticsearch/certs/
+
+# Print the HTTP CA certificate fingerprint (used when configuring other clients)
+sudo openssl x509 -fingerprint -sha256 -in /etc/elasticsearch/certs/http_ca.crt -noout
+
+# Set an env var (replace with your real password)
+export ELASTIC_PASSWORD='MyElastic123!'
+# Test an HTTPS request (--cacert is required because TLS is enabled)
+curl --cacert /etc/elasticsearch/certs/http_ca.crt \
+  -u elastic:$ELASTIC_PASSWORD \
+  https://localhost:9200
+
+
+# Show the effective (non-comment) configuration
+grep -v '^\s*#\|^\s*$' /etc/elasticsearch/elasticsearch.yml
+# Adjust to your environment: cluster name, non-local access, etc.
+cluster.name: my-test-es
+path.data: /var/lib/elasticsearch
+path.logs: /var/log/elasticsearch
+network.host: 0.0.0.0
+xpack.security.enabled: true
+xpack.security.enrollment.enabled: true
+xpack.security.http.ssl:
+  enabled: true
+  keystore.path: certs/http.p12
+xpack.security.transport.ssl:
+  enabled: true
+  verification_mode: certificate
+  keystore.path: certs/transport.p12
+  truststore.path: certs/transport.p12
+cluster.initial_master_nodes: ["weblessie-server-02"]
+http.host: 0.0.0.0
+
+
+# Change the ES JVM heap size
+vim /etc/elasticsearch/jvm.options
+-Xms4g
+-Xmx4g
+
+# Restart
+sudo systemctl restart elasticsearch
+
+# Generate an enrollment token (used later in Kibana)
+sudo /usr/share/elasticsearch/bin/elasticsearch-create-enrollment-token -s kibana
+
+
+# Prepare to install Kibana 9.2.2
+# Create the repo /etc/yum.repos.d/kibana.repo
+sudo tee /etc/yum.repos.d/kibana.repo <<-'EOF'
+[kibana]
+name=Kibana repository for 9.x packages
+baseurl=https://artifacts.elastic.co/packages/9.x/yum
+gpgcheck=1
+gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch
+enabled=1
+autorefresh=1
+type=rpm-md
+EOF
+
+# Install Kibana:
+sudo dnf install kibana --enablerepo=kibana
+# Start it
+sudo systemctl daemon-reload
+sudo systemctl enable --now kibana
+
+# Open Kibana and paste the enrollment token generated above
+http://ip:5601
+
+# Get the "verification code"
+/usr/share/kibana/bin/kibana-verification-code
+
+# Generate encryption keys with the official tool (the clean way)
+sudo /usr/share/kibana/bin/kibana-encryption-keys generate --force
+# The output looks like:
+# ✔ Encryption keys generated and written to /etc/kibana/kibana.yml:
+# xpack.encryptedSavedObjects.encryptionKey
+# xpack.reporting.encryptionKey
+# xpack.security.encryptionKey
+
+# Edit the configuration file
+grep -v '^\s*#\|^\s*$' /etc/kibana/kibana.yml
+server.host: "0.0.0.0"
+logging:
+  appenders:
+    file:
+      type: file
+      fileName: /var/log/kibana/kibana.log
+      layout:
+        type: json
+  root:
+    appenders:
+      - default
+      - file
+pid.file: /run/kibana/kibana.pid
+i18n.locale: "zh-CN"
+elasticsearch.hosts: [https://10.0.0.38:9200]
+elasticsearch.serviceAccountToken: AAEAAWVsYXN0aWMva2liYW5hL2Vucm9sbC1wcm9jZXNzLXRva2VuLTE3NjUzNDE4OTI3MjY6Um9KdUo2N1hSZVNPeGNzOXFDaUh2dw
+elasticsearch.ssl.certificateAuthorities: [/var/lib/kibana/ca_1765341893683.crt]
+xpack.fleet.outputs: [{id: fleet-default-output, name: default, is_default: true, is_default_monitoring: true, type: elasticsearch, hosts: [https://10.0.0.38:9200], ca_trusted_fingerprint: 80af64db043e12ebda11c10f70042af91306a705fdcb6285814a84b420c734a5}]
+xpack.encryptedSavedObjects.encryptionKey: f10166c761265d5ca61e7fa2c1acac73
+xpack.reporting.encryptionKey: 1772a5152522675d5a38470e905b2817
+xpack.security.encryptionKey: d4b30e82e47f530a998e29cb0b8e5295
\ No newline at end of file
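
A quick smoke test of the single-node stack before pointing clients at it (assumes ELASTIC_PASSWORD from the reset step above is still exported):

curl --cacert /etc/elasticsearch/certs/http_ca.crt -u elastic:$ELASTIC_PASSWORD 'https://localhost:9200/_cluster/health?pretty'
# "status" should be green (or yellow, which is normal on a single node)
curl -s -o /dev/null -w '%{http_code}\n' http://localhost:5601/api/status   # expect 200 once Kibana is enrolled
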
diff --git a/OpenTelemetry/ES/单节点/证书使用示例.conf b/OpenTelemetry/ES/单节点/证书使用示例.conf
new file mode 100644
index 0000000..0c36100
--- /dev/null
+++ b/OpenTelemetry/ES/单节点/证书使用示例.conf
@@ -0,0 +1,41 @@
+
+# Get the ES certificate fingerprint
+sudo openssl x509 -fingerprint -sha256 -in /etc/elasticsearch/certs/http_ca.crt -noout
+sha256 Fingerprint=80:AF:64:DB:04:3E:12:EB:DA:11:C1:0F:70:04:2A:F9:13:06:A7:05:FD:CB:62:85:81:4A:84:B4:20:C7:34:A5
+
+# User created in the Kibana web UI
+admin
+G7ZSKFM4AQwHQpwA
+
+
+
+# Filebeat
+output.elasticsearch:
+  hosts: ["https://49.51.33.153:9200"]
+  username: "elastic"
+  password: "-0NiIBOJGn2CATuPWzNc"
+
+  # Verify by fingerprint (instead of a certificate file)
+  ssl.verification_mode: "certificate"
+  ssl.certificate_authorities: []   # leave empty (skip full-chain validation)
+  ssl.supported_protocols: [TLSv1.2, TLSv1.3]
+
+  # Key point: the CA fingerprint is the SHA-256 hex from the openssl output above, with the colons removed
+  ssl.ca_trusted_fingerprint: "80AF64DB043E12EBDA11C10F70042AF91306A705FDCB6285814A84B420C734A5"
+
+
+
+
+
+# python
+from elasticsearch import Elasticsearch
+
+es = Elasticsearch(
+    hosts=["https://49.51.33.153:9200"],
+    basic_auth=("elastic", "-0NiIBOJGn2CATuPWzNc"),
+    # same fingerprint, colons removed
+    ssl_assert_fingerprint="80AF64DB043E12EBDA11C10F70042AF91306A705FDCB6285814A84B420C734A5",
+    verify_certs=True  # must be True
+)
+
+print(es.info())
diff --git a/OpenTelemetry/prometheus/二进制部署/grafana/dm.sh b/OpenTelemetry/prometheus/二进制部署/grafana/dm.sh
new file mode 100644
index 0000000..27c6211
--- /dev/null
+++ b/OpenTelemetry/prometheus/二进制部署/grafana/dm.sh
@@ -0,0 +1,46 @@
+# Download the tarball
+wget https://dl.grafana.com/grafana-enterprise/release/12.3.1/grafana-enterprise_12.3.1_20271043721_linux_amd64.tar.gz
+
+# Create the Grafana directories (data + config)
+mkdir -p /data/grafana/
+
+# Create a user account for Grafana
+useradd -r -s /bin/false grafana
+
+# Unpack straight into /data/grafana/ (--strip-components drops the versioned top-level directory, assuming the tarball has one)
+tar -xzf grafana-enterprise_12.3.1_20271043721_linux_amd64.tar.gz -C /data/grafana/ --strip-components=1
+
+# Hand /data/grafana/ over to the grafana user
+chown -R grafana:grafana /data/grafana/
+
+# Copy the default configuration file
+cp /data/grafana/conf/defaults.ini /data/grafana/conf/grafana.ini
+
+# Create the Grafana systemd unit file with the content below
+sudo touch /etc/systemd/system/grafana-server.service
+
+[Unit]
+Description=Grafana Server
+After=network.target
+
+[Service]
+Type=simple
+User=grafana
+Group=grafana
+ExecStart=/data/grafana/bin/grafana server --config=/data/grafana/conf/grafana.ini --homepath=/data/grafana
+Restart=on-failure
+
+[Install]
+WantedBy=multi-user.target
+
+
+# Enable the Grafana systemd service
+sudo systemctl daemon-reload
+sudo systemctl start grafana-server
+sudo systemctl enable grafana-server
+
+
+
+
+
+
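
Once the unit is running, a quick health check (Grafana listens on port 3000 by default):

systemctl is-active grafana-server
curl -s http://localhost:3000/api/health   # should report "database": "ok"
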
diff --git a/OpenTelemetry/prometheus/二进制部署/prometheus/dm.bash b/OpenTelemetry/prometheus/二进制部署/prometheus/dm.bash
new file mode 100644
index 0000000..fcdc0e7
--- /dev/null
+++ b/OpenTelemetry/prometheus/二进制部署/prometheus/dm.bash
@@ -0,0 +1,82 @@
+# Create the directories
+mkdir -p /data/prometheus/
+mkdir -p /data/alertmanager/
+
+# Download the tarballs
+wget https://github.com/prometheus/prometheus/releases/download/v3.8.1/prometheus-3.8.1.linux-amd64.tar.gz
+wget https://github.com/prometheus/alertmanager/releases/download/v0.30.0/alertmanager-0.30.0.linux-amd64.tar.gz
+
+# Create the system user (if it does not exist yet)
+sudo useradd --no-create-home --shell /bin/false prometheus || true
+
+# Grant directory ownership
+sudo chown -R prometheus:prometheus /data/prometheus
+sudo chown -R prometheus:prometheus /data/alertmanager
+
+# Create the file /etc/systemd/system/prometheus.service
+[Unit]
+Description=Prometheus
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+User=prometheus
+Group=prometheus
+Type=simple
+# Note: --storage.tsdb.path sets the data location; keep it under /data
+# --web.enable-remote-write-receiver is required so the OTel Collector can push to /api/v1/write
+ExecStart=/data/prometheus/prometheus \
+  --config.file=/data/prometheus/prometheus.yml \
+  --storage.tsdb.path=/data/prometheus/data \
+  --web.enable-remote-write-receiver \
+  --web.console.templates=/data/prometheus/consoles \
+  --web.console.libraries=/data/prometheus/console_libraries
+
+Restart=always
+
+[Install]
+WantedBy=multi-user.target
+
+# Create the file /etc/systemd/system/alertmanager.service
+[Unit]
+Description=Alertmanager
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+User=prometheus
+Group=prometheus
+Type=simple
+ExecStart=/data/alertmanager/alertmanager \
+  --config.file=/data/alertmanager/alertmanager.yml \
+  --storage.path=/data/alertmanager/data
+
+Restart=always
+
+[Install]
+WantedBy=multi-user.target
+
+# Point Prometheus at Alertmanager
+# Alerting configuration
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - localhost:9093   # Alertmanager default port
+
+
+# Reload systemd
+sudo systemctl daemon-reload
+
+# Start and enable on boot
+sudo systemctl enable --now prometheus
+sudo systemctl enable --now alertmanager
+
+# Check status
+sudo systemctl status prometheus
+sudo systemctl status alertmanager
+
+
+Config checks
+Before restarting the services, the bundled tools can validate the syntax:
+Prometheus:   /data/prometheus/promtool check config /data/prometheus/prometheus.yml
+Alertmanager: /data/alertmanager/amtool check-config /data/alertmanager/alertmanager.yml
\ No newline at end of file
diff --git a/OpenTelemetry/readme.txt b/OpenTelemetry/readme.txt
new file mode 100644
index 0000000..630a007
--- /dev/null
+++ b/OpenTelemetry/readme.txt
@@ -0,0 +1,56 @@
+I. Storage
+ES, Prometheus and Tempo are all binary deployments outside the k8s cluster
+
+II. Collection, processing, forwarding
+The OpenTelemetry Collectors run inside the k8s cluster
+  1. The DaemonSet Collector collects node/container-level metrics + logs
+  2. The Deployment Collector receives the DaemonSet Collector's data, processes it, and forwards it to storage (plus trace data)
+
+III. The three data types
+1. Metrics (end up in Prometheus)
+  Collected by: OTel Collector (DaemonSet mode, inside k8s)
+  What: node CPU/memory/disk (replaces node-exporter), container utilization (replaces kubelet metrics), custom business Pod metrics (the app must integrate an OTel SDK).
+  Processed by: OTel Collector (Deployment mode, cluster-level aggregation)
+  Processing: normalize the metric format, add k8s labels (cluster name, Pod name), batch.
+  Sent by: OTel Collector
+  Protocol: Prometheus Remote Write
+  Received by: Prometheus outside the cluster.
+
+graph TD
+  A[k8s node] -->|DaemonSet Collector| B[node CPU/memory/disk metrics]
+  C[k8s container] -->|DaemonSet Collector| D[container utilization metrics]
+  B -->|report| E[Deployment Collector]
+  D -->|report| E[Deployment Collector]
+  E -->|normalize + batch| F[Remote Write protocol]
+  F -->|forward| G[Prometheus outside the cluster]
+
+
+
+2. Logs (end up in ES)
+  Collected by: OTel Collector (DaemonSet mode)
+  What: container logs under /var/log/containers on each k8s node (replaces Filebeat).
+  Processed by: OTel Collector (Deployment mode)
+  Processing: parse the log format (JSON/regex), drop noisy lines, add k8s resource labels.
+  Sent by: OTel Collector
+  Protocol: Elasticsearch API
+  Received by: ES outside the cluster.
+
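+A quick way to confirm logs are landing once Phase 2 is live (host and admin credentials borrowed from the Filebeat config; adjust to yours):
+  curl -s -u admin:G7ZSKFM4AQwHQpwA 'http://10.0.0.38:9200/_cat/indices/k8s-*?v'
+  # expect one index per app per month, e.g. k8s-sit-lessie-go-api-2025.01
+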
+3. Request traces (trace data, end up in Tempo)
+  Collected by: two scenarios
+    infrastructure traces: the OTel Collector (Deployment mode) collects trace data from k8s components (e.g. Ingress/Nginx)
+    business traces: applications integrate an OTel SDK (Java agent, Go SDK, ...) and record request traces in-process.
+  Processed by: OTel Collector (Deployment mode)
+  Processing: normalize the trace format, attach k8s resource info, batch.
+  Sent by: OTel Collector
+  Protocol: OTLP (the OpenTelemetry protocol, gRPC/HTTP)
+  Received by: Tempo outside the cluster.
+
+
+
+graph LR
+  A[metrics receivers] -->|metrics pipeline| B[metrics processors] --> C[Prometheus exporter]
+  D[log receivers] -->|logs pipeline| E[log processors] --> F[ES exporter]
+  G[trace receivers] -->|traces pipeline| H[trace processors] --> I[Tempo exporter]
\ No newline at end of file
diff --git a/OpenTelemetry/tempo/cos-tempo.yaml b/OpenTelemetry/tempo/cos-tempo.yaml
new file mode 100644
index 0000000..7d091e0
--- /dev/null
+++ b/OpenTelemetry/tempo/cos-tempo.yaml
@@ -0,0 +1,58 @@
+server:
+  http_listen_port: 3200          # HTTP listen port
+  grpc_listen_port: 9095          # gRPC listen port
+
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317  # OTLP gRPC listen address
+        http:
+          endpoint: 0.0.0.0:4318  # OTLP HTTP listen address
+
+ingester:
+  lifecycler:
+    ring:
+      replication_factor: 1       # number of data replicas
+  max_block_duration: 5m          # maximum block duration
+  trace_idle_period: 10s          # flush a trace automatically once it has been idle this long
+
+compactor:
+  compaction:
+    block_retention: 720h             # block retention, 720 hours (30 days)
+    compacted_block_retention: 168h   # retention of compacted blocks, 168 hours (7 days)
+    max_compaction_objects: 1000000   # max objects per compaction pass
+
+metrics_generator:
+  registry:
+    external_labels:
+      source: tempo
+      cluster: linux-microservices
+  storage:
+    path: /data/tempo/data/wal
+    remote_write:
+      - url: http://127.0.0.1:9090/api/v1/write
+        send_exemplars: true
+
+storage:
+  trace:
+    backend: s3
+    s3:
+      endpoint: outscalelink-1324597558.cos.na-siliconvalley.myqcloud.com
+      bucket: outscalelink-1324597558
+      prefix: tempo-data/
+      forcepathstyle: true
+      enable_dual_stack: false
+      insecure: true
+      access_key: AKIDkgR4lHvU1QfieR7cxBLLTaUCh0S0dDev
+      secret_key: fAWjldKuPhz4wb6RedPzPccOwGOet9Ug
+    wal:
+      path: /data/tempo/data/wal
+    local:
+      path: /data/tempo/blocks
+
+overrides:
+  metrics_generator_processors: [service-graphs, span-metrics]
+
+
diff --git a/OpenTelemetry/tempo/dm.sh b/OpenTelemetry/tempo/dm.sh
new file mode 100644
index 0000000..699b96c
--- /dev/null
+++ b/OpenTelemetry/tempo/dm.sh
@@ -0,0 +1,94 @@
+# Create a dedicated user first, so Tempo runs as a stable unprivileged background service
+sudo useradd --no-create-home --shell /bin/false tempo || true
+
+mkdir -p /data/tempo/{conf,data,metrics-generator}
+mkdir -p /data/tempo/data/wal
+mkdir -p /data/tempo/metrics-generator/wal
+
+chown -R tempo:tempo /data/tempo
+
+# Download the tarball
+wget https://github.com/grafana/tempo/releases/download/v2.9.0/tempo_2.9.0_linux_amd64.tar.gz
+# Unpack
+tar -xzf tempo_2.9.0_linux_amd64.tar.gz -C /data/tempo/
+# Move the executables to /usr/local/bin/
+mv /data/tempo/tempo /data/tempo/tempo-cli /data/tempo/tempo-query /usr/local/bin/
+# Check the version
+tempo --version
+
+# Create the config file (local disk as the storage backend)
+vim local-tempo.yaml
+server:
+  http_listen_port: 3200   # Tempo web port (used by Grafana)
+  grpc_listen_port: 9095   # gRPC port (optional)
+
+distributor:
+  receivers:               # protocols for receiving OTel trace data (the core bit)
+    otlp:
+      protocols:
+        grpc:              # listens on 4317 (matches the OTel Collector, OTLP gRPC)
+          endpoint: 0.0.0.0:4317
+        http:              # listens on 4318 (OTLP HTTP)
+          endpoint: 0.0.0.0:4318
+
+ingester:
+  max_block_duration: 5m   # block duration (small is fine for testing)
+  trace_idle_period: 10s   # idle timeout for a trace session
+
+compactor:
+  compaction:
+    block_retention: 720h  # keep traces for 30 days (720h; adjust as needed)
+
+storage:
+  trace:
+    backend: local                # single-node local storage (swap for S3/MinIO in production)
+    local:
+      path: /data/tempo/data     # trace data directory
+    wal:
+      path: /data/tempo/data/wal # write-ahead log (protects against data loss)
+
+# For an object-storage bucket as the backend, see:
+vim cos-tempo.yaml
+
+
+# Start in the foreground
+/usr/local/bin/tempo \ + -config.file=/data/tempo/conf/tempo.yaml \ + -config.expand-env=true + +# 检查服务状态 +systemctl is-active tempo + +# Systemd 服务守护进程 +vim /etc/systemd/system/tempo.service + +[Unit] +Description=Grafana Tempo +Wants=network-online.target +After=network-online.target + +[Service] +User=tempo +Group=tempo +Type=simple +# config.file 指定配置文件路径,这里的配置文件注意文件名 +ExecStart=/usr/local/bin/tempo \ + -config.file=/data/tempo/conf/tempo.yaml \ + -config.expand-env=true + +Restart=always +LimitNOFILE=65536 + +[Install] +WantedBy=multi-user.target + + +# 加载配置 +sudo systemctl daemon-reload +# 启动并设置自启 +sudo systemctl enable --now tempo +# 检查状态 +sudo systemctl status tempo + +# 查看日志 +journalctl -u tempo --no-pager -n 50 \ No newline at end of file diff --git a/jenkins/流水线配置/lessie-email.conf b/jenkins/流水线配置/lessie-email.conf index 519f749..efa92b4 100644 --- a/jenkins/流水线配置/lessie-email.conf +++ b/jenkins/流水线配置/lessie-email.conf @@ -56,7 +56,8 @@ pipeline { source .venv/bin/activate TIMESTAMP=\$(date +"%Y%m%d_%H%M%S") LOGFILE="${REMOTE_PROJECT_PATH}/logs/lessie_email_\${TIMESTAMP}.log" - nohup env ENV=s4 uv run uvicorn app.main:app --host 0.0.0.0 --port 8031 > "\$LOGFILE" 2>&1 & + nohup env ENV=s4 uv run uvicorn app.main:app --host 0.0.0.0 --port 8031 > "\$LOGFILE" 2>&1 & + // nohup env ENV=s4 uv run uvicorn app.main:app --host 0.0.0.0 --port 8031 --log-config logging_config.json > "\$LOGFILE" 2>&1 & ln -sf "\$LOGFILE" ${REMOTE_PROJECT_PATH}/logs/lessie_email_latest.log ' """ diff --git a/nginx/s4.jennie.im.conf b/nginx/s4.jennie.im.conf index b1c19ea..b0677fd 100644 --- a/nginx/s4.jennie.im.conf +++ b/nginx/s4.jennie.im.conf @@ -78,6 +78,8 @@ server { proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; + client_max_body_size 50m; + proxy_buffering off; proxy_cache off; proxy_http_version 1.1; diff --git a/prometheus/二进制部署/grafana/dm.sh b/prometheus/二进制部署/grafana/dm.sh new file mode 100644 index 0000000..27c6211 --- /dev/null +++ b/prometheus/二进制部署/grafana/dm.sh @@ -0,0 +1,46 @@ +# 下载tar +wget https://dl.grafana.com/grafana-enterprise/release/12.3.1/grafana-enterprise_12.3.1_20271043721_linux_amd64.tar.gz + +# 创建Grafana相关目录(数据+配置) +mkdir -p /data/grafana/ + +# 为 Grafana 创建用户帐户 +useradd -r -s /bin/false grafana + +# 将解压后的二进制文件移动到/data/grafana/ +tar -xzf grafana-enterprise_12.3.1_20271043721_linux_amd64.tar.gz -C /data/grafana/ + +# 所有者更改/data/grafana/为 Grafana 用户 +chown -R grafana:grafana /data/grafana/ + +# 复制默认的配置文件 +cp /data/grafana/conf/defaults.ini /data/grafana/conf/grafana.ini + +# 创建 Grafana 服务器 systemd 单元文件 +sudo touch /etc/systemd/system/grafana-server.service + +[Unit] +Description=Grafana Server +After=network.target + +[Service] +Type=simple +User=grafana +Group=grafana +ExecStart=/data/grafana/bin/grafana server --config=/data/grafana/conf/grafana.ini --homepath=/data/grafana +Restart=on-failure + +[Install] +WantedBy=multi-user.target + + +# 启用 Grafana 服务器 systemd 服务 +sudo systemctl daemon-reload +sudo systemctl start grafana-server +sudo systemctl enable grafana-server + + + + + + diff --git a/prometheus/二进制部署/prometheus/dm.bash b/prometheus/二进制部署/prometheus/dm.bash new file mode 100644 index 0000000..fcdc0e7 --- /dev/null +++ b/prometheus/二进制部署/prometheus/dm.bash @@ -0,0 +1,81 @@ +# 创建目录 +mkdir -p /data/prometheus/ +mkdir -p /data/alertmanager/ + +# 下载tar包 +wget https://github.com/prometheus/prometheus/releases/download/v3.8.1/prometheus-3.8.1.linux-amd64.tar.gz +wget 
https://github.com/prometheus/alertmanager/releases/download/v0.30.0/alertmanager-0.30.0.linux-amd64.tar.gz + +# 创建系统用户(如果尚未创建) +sudo useradd --no-create-home --shell /bin/false prometheus || true + +# 授权目录权限 +sudo chown -R prometheus:prometheus /data/prometheus +sudo chown -R prometheus:prometheus /data/alertmanager + +# 创建文件 /etc/systemd/system/prometheus.service +[Unit] +Description=Prometheus +Wants=network-online.target +After=network-online.target + +[Service] +User=prometheus +Group=prometheus +Type=simple +# 注意:--storage.tsdb.path 指定数据存储位置,建议设在 /data 目录下 +ExecStart=/data/prometheus/prometheus \ + --config.file=/data/prometheus/prometheus.yml \ + --storage.tsdb.path=/data/prometheus/data \ + --web.console.templates=/data/prometheus/consoles \ + --web.console.libraries=/data/prometheus/console_libraries + +Restart=always + +[Install] +WantedBy=multi-user.target + +# 创建文件 /etc/systemd/system/alertmanager.service +[Unit] +Description=Alertmanager +Wants=network-online.target +After=network-online.target + +[Service] +User=prometheus +Group=prometheus +Type=simple +ExecStart=/data/alertmanager/alertmanager \ + --config.file=/data/alertmanager/alertmanager.yml \ + --storage.path=/data/alertmanager/data + +Restart=always + +[Install] +WantedBy=multi-user.target + +# 修改 Prometheus 关联 Alertmanager +# Alerting configuration +alerting: + alertmanagers: + - static_configs: + - targets: + - localhost:9093 # Alertmanager 默认端口 + + +# 重载 systemd +sudo systemctl daemon-reload + +# 启动并设置开机自启 +sudo systemctl enable --now prometheus +sudo systemctl enable --now alertmanager + +# 检查状态 +sudo systemctl status prometheus +sudo systemctl status alertmanager + + +配置文件检查 +在重启服务前,可以使用自带的工具检查语法是否正确: +Prometheus 检查: /data/prometheus/promtool check config /data/prometheus/prometheus.yml +Alertmanager 检查: /data/alertmanager/amtool check-config /data/alertmanager/alertmanager.yml \ No newline at end of file diff --git a/tempo/cos-tempo.yaml b/tempo/cos-tempo.yaml new file mode 100644 index 0000000..7d091e0 --- /dev/null +++ b/tempo/cos-tempo.yaml @@ -0,0 +1,58 @@ +server: + http_listen_port: 3200 # HTTP 接口监听端口 + grpc_listen_port: 9095 # gRPC 接口监听端口 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 # OTLP gRPC 接口监听地址 + http: + endpoint: 0.0.0.0:4318 # OTLP HTTP 接口监听地址 + +ingester: + lifecycler: + ring: + replication_factor: 1 # 数据的副本数 + max_block_duration: 5m # 最大数据块时长 + trace_idle_period: 10s # 如果某个 Trace 长时间未活动,自动清理 + +compactor: + compaction: + block_retention: 720h # 数据块保留时间,720小时(30天) + compacted_block_retention: 168h # 压缩后的数据块保留时间,168小时(7天) + max_compaction_objects: 1000000 # 每次压缩的最大对象数 + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: linux-microservices + storage: + path: /data/tempo/data/wal + remote_write: + - url: http://127.0.0.1:9090/api/v1/write + send_exemplars: true + +storage: + trace: + backend: s3 + s3: + endpoint: outscalelink-1324597558.cos.na-siliconvalley.myqcloud.com + bucket: outscalelink-1324597558 + prefix: tempo-data/ + forcepathstyle: true + enable_dual_stack: false + insecure: true + access_key: AKIDkgR4lHvU1QfieR7cxBLLTaUCh0S0dDev + secret_key: fAWjldKuPhz4wb6RedPzPccOwGOet9Ug + wal: + path: /data/tempo/data/wal + local: + path: /data/tempo/blocks + +overrides: + metrics_generator_processors: [service-graphs, span-metrics] + + diff --git a/tempo/dm.sh b/tempo/dm.sh new file mode 100644 index 0000000..699b96c --- /dev/null +++ b/tempo/dm.sh @@ -0,0 +1,96 @@ +mkdir -p /data/tempo/{conf,data,metrics-generator} +mkdir -p 
/data/tempo/data/wal +mkdir -p /data/tempo/metrics-generator/wal + +chown -R tempo:tempo /data/tempo + +chown -R tempo:tempo /data/tempo/data/traces + +# 创建一个专用用户并配置服务,确保 Tempo 在后台稳定运行 +sudo useradd --no-create-home --shell /bin/false tempo || true + +# 下载tar包 +wget https://github.com/grafana/tempo/releases/download/v2.9.0/tempo_2.9.0_linux_amd64.tar.gz +# 解压 +tar -xzf tempo_2.9.0_linux_amd64.tar.gz -C /data/tempo/ +# 移动可执行文件到 /usr/local/bin/ +mv /data/tempo/tempo /data/tempo/tempo-cli /data/tempo/tempo-query /usr/local/bin/ +# 检查版本 +tempo --version + +# 创建配置文件(本地为存储介质) +vim local-tempo.yaml +server: + http_listen_port: 3200 # Tempo Web 端口(Grafana 对接用) + grpc_listen_port: 9095 # gRPC 端口(可选) + +distributor: + receivers: # 接收 OTel 追踪数据的协议(核心) + otlp: + protocols: + grpc: # 监听 4317 端口(和 OTel Collector 一致,接收 OTLP gRPC 数据) + endpoint: 0.0.0.0:4317 + http: # 监听 4318 端口(接收 OTLP HTTP 数据) + endpoint: 0.0.0.0:4318 + +ingester: + max_block_duration: 5m # 数据块存储时长(测试环境可设小) + trace_idle_period: 10s # 追踪会话空闲超时 + +compactor: + compaction: + block_retention: 30d # 追踪数据保留 30 天(可按需调整) + +storage: + trace: + backend: local # 单节点本地存储(生产可换 S3/MinIO) + local: + path: /data/tempo/data # 追踪数据存储目录(指定到 /data/tempo/data) + wal: + path: /data/tempo/data/wal # 预写日志目录(保证数据不丢) + +# 存储桶为存储对象 +vim cos-tempo.yaml + + +# 前台启动 +/usr/local/bin/tempo \ + -config.file=/data/tempo/conf/tempo.yaml \ + -config.expand-env=true + +# 检查服务状态 +systemctl is-active tempo + +# Systemd 服务守护进程 +vim /etc/systemd/system/tempo.service + +[Unit] +Description=Grafana Tempo +Wants=network-online.target +After=network-online.target + +[Service] +User=tempo +Group=tempo +Type=simple +# config.file 指定配置文件路径,这里的配置文件注意文件名 +ExecStart=/usr/local/bin/tempo \ + -config.file=/data/tempo/conf/tempo.yaml \ + -config.expand-env=true + +Restart=always +LimitNOFILE=65536 + +[Install] +WantedBy=multi-user.target + + +# 加载配置 +sudo systemctl daemon-reload +# 启动并设置自启 +sudo systemctl enable --now tempo +# 检查状态 +sudo systemctl status tempo + +# 查看日志 +journalctl -u tempo --no-pager -n 50 \ No newline at end of file diff --git a/1.conf b/其它/1.conf similarity index 100% rename from 1.conf rename to 其它/1.conf diff --git a/1.yml b/其它/1.yml similarity index 100% rename from 1.yml rename to 其它/1.yml diff --git a/12.yml b/其它/12.yml similarity index 100% rename from 12.yml rename to 其它/12.yml diff --git a/email_error条目.md b/其它/email_error条目.md similarity index 100% rename from email_error条目.md rename to 其它/email_error条目.md diff --git a/java_error条目.md b/其它/java_error条目.md similarity index 100% rename from java_error条目.md rename to 其它/java_error条目.md diff --git a/lessie.ai_error条目.md b/其它/lessie.ai_error条目.md similarity index 100% rename from lessie.ai_error条目.md rename to 其它/lessie.ai_error条目.md diff --git a/proc_monitor.sh b/其它/proc_monitor.sh similarity index 100% rename from proc_monitor.sh rename to 其它/proc_monitor.sh diff --git a/新建 文本文档.conf b/其它/新建 文本文档.conf similarity index 100% rename from 新建 文本文档.conf rename to 其它/新建 文本文档.conf diff --git a/日志格式.conf b/其它/日志格式.conf similarity index 100% rename from 日志格式.conf rename to 其它/日志格式.conf diff --git a/服务器降配置.sh b/其它/服务器降配置.sh similarity index 100% rename from 服务器降配置.sh rename to 其它/服务器降配置.sh diff --git a/系统硬盘扩容.md b/其它/系统硬盘扩容.md similarity index 100% rename from 系统硬盘扩容.md rename to 其它/系统硬盘扩容.md diff --git a/问IA.md b/其它/问IA.md similarity index 100% rename from 问IA.md rename to 其它/问IA.md diff --git a/需求 b/其它/需求 similarity index 99% rename from 需求 rename to 其它/需求 index 3f21d91..1266dfd 100644 --- a/需求 +++ b/其它/需求 @@ -72,4 +72,8 
@@ log_format s1_jennie_im_log '客户端IP: $remote_addr | 用户: $remote_user | '请求方法和路径: "$request" | 状态码: $status | 响应大小: $body_bytes_sent | ' '来源页面: "$http_referer" | 客户端UA: "$http_user_agent" | ' '上游服务器: $upstream_addr | 上游响应耗时: $upstream_response_time | ' - '请求总耗时: $request_time | Host: $host'; \ No newline at end of file + '请求总耗时: $request_time | Host: $host'; + + + +
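
After the nginx changes above (the 50m client_max_body_size in s4.jennie.im.conf and the log_format in 需求), a safe rollout is:

nginx -t && sudo systemctl reload nginx
# upload.bin is a hypothetical test payload larger than 50m; anything over the new limit should come back as HTTP 413
curl -s -o /dev/null -w '%{http_code}\n' -X POST --data-binary @upload.bin https://s4.jennie.im/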