0119同步
This commit is contained in:
39
OpenTelemetry/Collector_v2/01-otel-gateway-rbac.yaml
Normal file
39
OpenTelemetry/Collector_v2/01-otel-gateway-rbac.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
# Namespace used by the otel-gateway resources defined in this manifest.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
|
||||
---
|
||||
# 1. Permissions (RBAC)
# ServiceAccount that the otel-gateway ClusterRoleBinding in this file binds to.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: otel-gateway
  namespace: monitoring
|
||||
---
|
||||
# Cluster-wide, read-only role for the OTel gateway collector.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-gateway-role
rules:
  # Read Pods, Nodes and Namespaces for labelling and metadata lookup.
  - apiGroups:
      - ""
    resources:
      - pods
      - nodes
      - namespaces
      - services
      - endpoints
    verbs:
      - get
      - list
      - watch
  # Read Events (used to collect Kubernetes events).
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - get
      - list
      - watch
|
||||
---
|
||||
# Binds otel-gateway-role to the otel-gateway ServiceAccount in "monitoring".
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-gateway-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: otel-gateway-role
subjects:
  - kind: ServiceAccount
    name: otel-gateway
    namespace: monitoring
|
||||
|
||||
82
OpenTelemetry/Collector_v2/02-otel-gateway-config.yaml
Normal file
82
OpenTelemetry/Collector_v2/02-otel-gateway-config.yaml
Normal file
@@ -0,0 +1,82 @@
|
||||
# Collector configuration for the otel-gateway Deployment (key: config.yaml).
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-gateway-config
  namespace: monitoring
data:
  config.yaml: |
    receivers:
      # Receive data from the agents (gRPC 4317, HTTP 4318).
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318

      # 1. Cluster level: Kubernetes events.
      k8s_events:
        auth_type: serviceAccount

      # 2. Cluster level: scrape the TKE-provided tke-kube-state-metrics (kube-state-metrics).
      prometheus:
        config:
          scrape_configs:
            - job_name: 'tke-kube-state-metrics'
              scrape_interval: 30s
              static_configs:
                - targets: ['tke-kube-state-metrics.kube-system.svc.cluster.local:8180']

    processors:
      batch:
        send_batch_size: 2000
        timeout: 10s

      resourcedetection:
        detectors: [env, system]
        # Keep attributes that already exist on incoming data. With override
        # enabled, the gateway's own host attributes would replace the values
        # set by the agents.
        override: false

      # 3. Inject the cluster ID (added to resolve Prometheus duplicate-sample errors).
      resource:
        attributes:
          - key: cluster.name
            value: "test-k8s"
            action: upsert

      # Copy OTLP resource attributes onto each datapoint so that Prometheus
      # can tell different Pods/Nodes apart.
      transform:
        metric_statements:
          - context: datapoint
            statements:
              - set(attributes["k8s_pod_name"], resource.attributes["k8s.pod.name"])
              - set(attributes["k8s_node_name"], resource.attributes["k8s.node.name"])
              - set(attributes["k8s_namespace_name"], resource.attributes["k8s.namespace.name"])
              - set(attributes["k8s_container_name"], resource.attributes["k8s.container.name"])
              - set(attributes["cluster_name"], resource.attributes["cluster.name"])

      memory_limiter:
        check_interval: 1s
        limit_mib: 1500
        spike_limit_mib: 512

    exporters:
      # Ship metrics to Prometheus over remote write. /api/v1/write is the
      # remote-write endpoint, so the remote-write exporter is used here; the
      # otlphttp exporter would append /v1/metrics to the URL and send OTLP.
      # Prometheus must run with --web.enable-remote-write-receiver.
      prometheusremotewrite:
        endpoint: "http://10.0.0.38:9090/api/v1/write"
        tls:
          insecure: true

      # Print telemetry to the collector log (for troubleshooting).
      debug:
        verbosity: detailed

    service:
      pipelines:
        metrics:
          receivers: [otlp, prometheus]  # merge agent (otlp) and cluster-level (prometheus) metrics
          processors: [memory_limiter, resourcedetection, resource, transform, batch]
          exporters: [prometheusremotewrite]
        logs:
          receivers: [k8s_events]
          processors: [memory_limiter, resourcedetection, resource, batch]
          exporters: [debug]
|
||||
60
OpenTelemetry/Collector_v2/03-otel-gateway-deployment.yaml
Normal file
60
OpenTelemetry/Collector_v2/03-otel-gateway-deployment.yaml
Normal file
@@ -0,0 +1,60 @@
|
||||
# Workload (Deployment) that runs the OTel gateway collector.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: otel-gateway
  namespace: monitoring
  labels:
    app: otel-gateway
spec:
  # A single replica is suggested when collecting Events and KSM, to avoid duplicate data.
  replicas: 1
  selector:
    matchLabels:
      app: otel-gateway
  template:
    metadata:
      labels:
        app: otel-gateway
    spec:
      serviceAccountName: otel-gateway
      volumes:
        - name: config-vol
          configMap:
            name: otel-gateway-config
      containers:
        - name: otel-collector
          # NOTE(review): ":latest" is a floating tag; consider pinning a
          # release so the collector version cannot change between restarts.
          image: otel/opentelemetry-collector-contrib:latest
          command:
            - "/otelcol-contrib"
          args:
            - "--config=/conf/config.yaml"
          resources:
            requests:
              cpu: 200m
              memory: 400Mi
            limits:
              cpu: "1"
              memory: 2Gi
          volumeMounts:
            - name: config-vol
              mountPath: /conf
|
||||
---
|
||||
# Service exposure: headless Service (clusterIP: None) for the gateway's OTLP ports.
apiVersion: v1
kind: Service
metadata:
  name: otel-gateway
  namespace: monitoring
spec:
  clusterIP: None
  selector:
    app: otel-gateway
  ports:
    # OTLP over gRPC
    - name: grpc
      protocol: TCP
      port: 4317
      targetPort: 4317
    # OTLP over HTTP
    - name: http
      protocol: TCP
      port: 4318
      targetPort: 4318
|
||||
39
OpenTelemetry/Collector_v2/11-otel-agent-rbac.yaml
Normal file
39
OpenTelemetry/Collector_v2/11-otel-agent-rbac.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
# 1. Permissions
# ServiceAccount that the otel-agent ClusterRoleBinding in this file binds to.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: otel-agent
  namespace: monitoring
|
||||
---
|
||||
# Cluster-wide, read-only role for the per-node OTel agent.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-agent-role
rules:
  # Read Pod and Node information.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/stats
      - nodes/proxy
      - pods
      - services
      - endpoints
    verbs:
      - get
      - watch
      - list

  # Read ReplicaSets so the k8sattributes processor can resolve Deployment names.
  - apiGroups:
      - apps
    resources:
      - replicasets
    verbs:
      - get
      - watch
      - list

  # Non-resource URL permissions (kubelet metrics endpoints).
  - nonResourceURLs:
      - /metrics
      - /metrics/cadvisor
    verbs:
      - get
|
||||
---
|
||||
# Binds otel-agent-role to the otel-agent ServiceAccount in "monitoring".
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-agent-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: otel-agent-role
subjects:
  - kind: ServiceAccount
    name: otel-agent
    namespace: monitoring
|
||||
|
||||
75
OpenTelemetry/Collector_v2/12-otel-agent-config.yaml
Normal file
75
OpenTelemetry/Collector_v2/12-otel-agent-config.yaml
Normal file
@@ -0,0 +1,75 @@
|
||||
# Agent configuration file (key: config.yaml), used by the otel-agent DaemonSet.
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-agent-config
  namespace: monitoring
data:
  config.yaml: |
    receivers:
      # 1. Host-level metrics.
      hostmetrics:
        collection_interval: 30s
        root_path: /hostfs
        scrapers:
          cpu: {}
          memory: {}
          load: {}
          disk: {}
          filesystem: {}
          network: {}
          paging: {}
          processes: {}

      # 2. Pod / container / volume metrics.
      kubeletstats:
        collection_interval: 30s
        auth_type: "serviceAccount"
        endpoint: "https://${env:K8S_NODE_NAME}:10250"  # locate the local kubelet through the env var
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container
          - volume

    processors:
      # Must run first in the pipeline so it can refuse data before other
      # processors allocate memory. Limits stay below the DaemonSet's 500Mi
      # container memory limit.
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100

      batch:
        send_batch_size: 1000
        timeout: 10s

      resourcedetection:
        detectors: [env, system]

      # 3. Extract detailed Kubernetes labels so that metrics stay unique.
      k8sattributes:
        auth_type: "serviceAccount"
        passthrough: false
        # Only watch pods scheduled on this agent's node instead of the whole cluster.
        filter:
          node_from_env_var: K8S_NODE_NAME
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.namespace.name
            - k8s.node.name
            - k8s.deployment.name
            - k8s.container.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: connection

    exporters:
      # Send to the in-cluster gateway Service.
      otlp:
        endpoint: "otel-gateway.monitoring.svc.cluster.local:4317"
        tls:
          insecure: true

    service:
      pipelines:
        metrics:
          receivers: [hostmetrics, kubeletstats]
          processors: [memory_limiter, resourcedetection, k8sattributes, batch]
          exporters: [otlp]
|
||||
55
OpenTelemetry/Collector_v2/13-otel-agent-daemonset.yaml
Normal file
55
OpenTelemetry/Collector_v2/13-otel-agent-daemonset.yaml
Normal file
@@ -0,0 +1,55 @@
|
||||
# Workload (DaemonSet) that runs one OTel agent collector per node.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: otel-agent
  namespace: monitoring
  labels:
    app: otel-agent
spec:
  selector:
    matchLabels:
      app: otel-agent
  template:
    metadata:
      labels:
        app: otel-agent
    spec:
      serviceAccountName: otel-agent
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      volumes:
        - name: config-vol
          configMap:
            name: otel-agent-config
        - name: hostfs
          hostPath:
            path: /
      containers:
        - name: otel-collector
          # NOTE(review): ":latest" is a floating tag; consider pinning a
          # release so the collector version cannot change between restarts.
          image: otel/opentelemetry-collector-contrib:latest
          command:
            - "/otelcol-contrib"
          args:
            - "--config=/conf/config.yaml"
          env:
            # Name of the node this pod runs on, consumed by the kubeletstats receiver.
            - name: K8S_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          resources:
            requests:
              cpu: 100m
              memory: 200Mi
            limits:
              cpu: 500m
              memory: 500Mi
          volumeMounts:
            - name: config-vol
              mountPath: /conf
            # Host root filesystem, mounted read-only so host metrics can be collected.
            - name: hostfs
              mountPath: /hostfs
              readOnly: true
              mountPropagation: HostToContainer
|
||||
13553
OpenTelemetry/Collector_v3/01-cert-manager.yaml
Normal file
13553
OpenTelemetry/Collector_v3/01-cert-manager.yaml
Normal file
File diff suppressed because it is too large
Load Diff
17874
OpenTelemetry/Collector_v3/02-opentelemetry-operator.yaml
Normal file
17874
OpenTelemetry/Collector_v3/02-opentelemetry-operator.yaml
Normal file
File diff suppressed because it is too large
Load Diff
96
OpenTelemetry/Collector_v3/03-otel-gateway.yaml
Normal file
96
OpenTelemetry/Collector_v3/03-otel-gateway.yaml
Normal file
@@ -0,0 +1,96 @@
|
||||
# Operator-managed gateway collector (Deployment mode).
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-gateway
  namespace: opentelemetry-operator-system
spec:
  mode: deployment
  replicas: 1
  # Original note: the Operator creates this account and binds permissions
  # automatically.
  # NOTE(review): confirm - an explicitly named serviceAccount may have to
  # exist before the pods can start.
  serviceAccount: otel-gateway-collector
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318

      # --- Core: Kubernetes cluster state metrics ---
      # Reports the state of Deployments, DaemonSets, StatefulSets, HPAs, Nodes, etc.
      k8s_cluster:
        collection_interval: 30s
        node_conditions_to_report: [Ready, MemoryPressure, DiskPressure, PIDPressure]
        allocatable_types_to_report: [cpu, memory]

    processors:
      batch:
        send_batch_size: 1000
        timeout: 10s
      memory_limiter:
        check_interval: 1s
        limit_percentage: 70
        spike_limit_percentage: 30

      # Adds Kubernetes metadata labels. Defined here but not referenced by
      # any pipeline below.
      k8sattributes:
        extract:
          metadata:
            - k8s.namespace.name
            - k8s.pod.name
            - k8s.deployment.name
            - k8s.statefulset.name
            - k8s.daemonset.name
            - k8s.cronjob.name
            - k8s.job.name
            - k8s.node.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: connection

    exporters:
      # 1. Export metrics to the external Prometheus (remote write).
      prometheusremotewrite:
        endpoint: "http://10.0.0.38:9090/api/v1/write"
        # Configure Basic Auth here if required.
        # external_labels:
        #   cluster: "test-k8s-cluster"

      # 2. Export traces to an external Tempo (OTLP gRPC).
      # otlp/tempo:
      #   endpoint: "<你的TEMPO_IP>:4317"
      #   tls:
      #     insecure: true

      # 3. Export logs to an external Elasticsearch (optional).
      # elasticsearch:
      #   endpoints: ["http://<你的ES_IP>:9200"]
      #   logs_index: "k8s-logs"

      debug:
        verbosity: basic

    service:
      pipelines:
        metrics:
          receivers: [otlp, k8s_cluster]
          processors: [memory_limiter, batch]
          # The gateway mainly forwards: k8s_cluster data already carries its
          # labels, and otlp data is expected to be labelled on the agent side.
          exporters: [prometheusremotewrite]

        # Disabled: this pipeline references the otlp/tempo exporter, which is
        # commented out above. A pipeline that names an undefined exporter
        # makes the collector reject the whole configuration. Enable both
        # together.
        # traces:
        #   receivers: [otlp]
        #   processors: [memory_limiter, batch]
        #   exporters: [otlp/tempo]

        # logs:
        #   receivers: [otlp]
        #   processors: [memory_limiter, batch]
        #   exporters: [elasticsearch]
|
||||
88
OpenTelemetry/Collector_v3/04-otel-agent.yaml
Normal file
88
OpenTelemetry/Collector_v3/04-otel-agent.yaml
Normal file
@@ -0,0 +1,88 @@
|
||||
# Operator-managed per-node agent collector (DaemonSet mode).
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-agent
  namespace: opentelemetry-operator-system
spec:
  mode: daemonset
  hostNetwork: true  # recommended by the author, to get more accurate host metrics
  env:
    # Node name for ${env:K8S_NODE_NAME} in the config below. Without it the
    # kubeletstats endpoint expands to ":10250".
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
  config:
    receivers:
      # 1. Resource usage (CPU/memory) of pods and containers.
      kubeletstats:
        collection_interval: 20s
        auth_type: "serviceAccount"
        endpoint: "${env:K8S_NODE_NAME}:10250"
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container

      # 2. Host-level metrics.
      hostmetrics:
        collection_interval: 20s
        scrapers:
          # Explicit empty mappings instead of bare (null) values.
          cpu: {}
          memory: {}
          load: {}
          filesystem: {}
          network: {}

      # 3. (Optional) log collection.
      # filelog:
      #   include: [/var/log/pods/*/*/*.log]
      #   ...

    processors:
      batch:
        send_batch_size: 500
        timeout: 5s
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100

      # Resource detection: host name and similar attributes.
      resourcedetection:
        # Author's note: on Tencent Cloud CVM a 'tencentcloud' detector could
        # be tried, but system is usually enough.
        # NOTE(review): verify that such a detector exists before adding it.
        detectors: [system]
        timeout: 2s
        override: false

      # Key step: tag metrics with Kubernetes labels (pod name, namespace, node name).
      k8sattributes:
        passthrough: false
        # Only watch pods scheduled on this agent's node instead of the whole cluster.
        filter:
          node_from_env_var: K8S_NODE_NAME
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.deployment.name
            - k8s.namespace.name
            - k8s.node.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: connection

    exporters:
      # Send to the in-cluster gateway Service.
      otlp:
        endpoint: "otel-gateway-collector.opentelemetry-operator-system.svc.cluster.local:4317"
        tls:
          insecure: true

    service:
      pipelines:
        metrics:
          receivers: [kubeletstats, hostmetrics]
          # memory_limiter goes first so it can refuse data before other
          # processors allocate memory.
          processors: [memory_limiter, resourcedetection, k8sattributes, batch]
          exporters: [otlp]

        # traces:  # if applications send traces to the local agent (sidecar or otherwise)
        #   receivers: [otlp]
        #   exporters: [otlp]
|
||||
4
OpenTelemetry/Collector_v3/monitoring-namespace.yaml
Normal file
4
OpenTelemetry/Collector_v3/monitoring-namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
# Creates the "monitoring" Namespace.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
|
||||
16
OpenTelemetry/Collector_v3/opentelemetry-ns-terminating.yaml
Normal file
16
OpenTelemetry/Collector_v3/opentelemetry-ns-terminating.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
# Dump of the opentelemetry-operator-system Namespace. It carries
# server-populated fields (uid, resourceVersion, timestamps) and has
# deletionTimestamp set.
# NOTE(review): presumably kept for clearing a namespace stuck in Terminating
# (see the file name) - confirm before applying it to a cluster.
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: |
      {"apiVersion":"v1","kind":"Namespace","metadata":{"annotations":{},"labels":{"app.kubernetes.io/name":"opentelemetry-operator","control-plane":"controller-manager"},"name":"opentelemetry-operator-system"}}
  creationTimestamp: "2025-12-11T08:28:51Z"
  deletionTimestamp: "2025-12-12T08:33:24Z"
  labels:
    app.kubernetes.io/name: opentelemetry-operator
    control-plane: controller-manager
    kubernetes.io/metadata.name: opentelemetry-operator-system
  name: opentelemetry-operator-system
  resourceVersion: "4706820514"
  uid: 6ebd60fa-2155-4f6e-8c3c-6d83447713c1
# A bare "spec:" parses as null, i.e. no finalizers are listed.
spec:
|
||||
72
OpenTelemetry/Collector_v3/readme.txt
Normal file
72
OpenTelemetry/Collector_v3/readme.txt
Normal file
@@ -0,0 +1,72 @@
|
||||
一、安装 cert-manager
|
||||
OpenTelemetry Operator 提供了 OpenTelemetryCollector CRD(自定义资源定义),能自动处理服务发现和 RBAC(基于角色的访问控制)
|
||||
该 Operator 需要 cert-manager 来支持准入 Webhook,部署步骤如下:
|
||||
|
||||
安装 OpenTelemetry Operator 自定义资源前,需先部署 cert-manager。可使用 yaml 直接部署,文件:01-cert-manager.yaml(取自官方文档)
|
||||
官方文档链接:https://cert-manager.io/docs/installation/kubectl/
|
||||
|
||||
cert-manager 将安装在cert-manager命名空间中,安装了cert-manager后,通过以下方式验证其部署是否正确 检查cert-manager命名空间:
|
||||
kubectl get pods --namespace cert-manager
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
cert-manager-7b8b89f89d-tpchr 1/1 Running 0 24s
|
||||
cert-manager-cainjector-7f9fdd5dd5-px66h 1/1 Running 0 25s
|
||||
cert-manager-webhook-769f6b94cb-zmjmv 1/1 Running 0 24s
|
||||
|
||||
|
||||
|
||||
二、安装 OpenTelemetry Operator
|
||||
使用 helm 安装OpenTelemetry ,添加 Operator helm 仓库、更新、安装:
|
||||
helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
|
||||
helm repo update
|
||||
helm install opentelemetry-operator open-telemetry/opentelemetry-operator \
|
||||
--namespace opentelemetry-operator \
|
||||
--create-namespace
|
||||
|
||||
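或者直接使用 yaml 部署:文件 02-opentelemetry-operator.yaml(该方式安装到 opentelemetry-operator-system 命名空间,下文的检查命令均以此命名空间为例;若使用上面的 helm 方式,请改用 -n opentelemetry-operator)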
|
||||
kubectl apply -f https://github.com/open-telemetry/opentelemetry-operator/releases/latest/download/opentelemetry-operator.yaml
|
||||
|
||||
# 卸载(仅在需要移除 Operator 时执行,不要紧接着上面的 apply 一起运行)
kubectl delete -f https://github.com/open-telemetry/opentelemetry-operator/releases/latest/download/opentelemetry-operator.yaml
|
||||
|
||||
# 查看安装的CRD
|
||||
kubectl get crd | grep opentelemetry.io
|
||||
输出:
|
||||
instrumentations.opentelemetry.io 2025-12-11T09:02:11Z
|
||||
opampbridges.opentelemetry.io 2025-12-11T09:02:13Z
|
||||
opentelemetrycollectors.opentelemetry.io 2025-12-11T09:02:14Z
|
||||
targetallocators.opentelemetry.io 2025-12-11T09:02:17Z
|
||||
|
||||
# 查看安装的operator控制器 Pod 状态
|
||||
kubectl get pods -n opentelemetry-operator-system
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
opentelemetry-operator-controller-manager-9c4b5467d-dhhp7 1/1 Running 0 3m10s
|
||||
|
||||
# 查看 Operator Deployment 状态
|
||||
kubectl get deploy opentelemetry-operator-controller-manager -n opentelemetry-operator-system
|
||||
NAME READY UP-TO-DATE AVAILABLE AGE
|
||||
opentelemetry-operator-controller-manager 1/1 1 1 4m43s
|
||||
|
||||
# 查看证书是否签发成功(验证与 cert-manager 协同正常)
kubectl get certificate,issuer -n opentelemetry-operator-system
|
||||
NAME READY SECRET AGE
|
||||
certificate.cert-manager.io/opentelemetry-operator-serving-cert True opentelemetry-operator-controller-manager-service-cert 6m13s
|
||||
NAME READY AGE
|
||||
issuer.cert-manager.io/opentelemetry-operator-selfsigned-issuer True 6m12s
|
||||
|
||||
# 查看 Operator 相关 CRD
|
||||
kubectl get crd | grep opentelemetry.io
|
||||
输出:
|
||||
instrumentations.opentelemetry.io 2026-01-14T07:28:56Z
|
||||
opampbridges.opentelemetry.io 2026-01-14T07:28:57Z
|
||||
opentelemetrycollectors.opentelemetry.io 2025-12-11T09:02:14Z
|
||||
targetallocators.opentelemetry.io 2026-01-14T07:29:03Z
|
||||
|
||||
|
||||
# 卸载命令
|
||||
# 格式:helm uninstall <Release 名称> -n <命名空间>
|
||||
helm uninstall opentelemetry-operator -n opentelemetry-operator
|
||||
|
||||
# 删除 CRD(仅当你确认不再需要任何 OTel 相关自定义资源实例时执行)
|
||||
# 批量删除 Operator 相关 CRD
|
||||
kubectl delete crd \
|
||||
instrumentations.opentelemetry.io \
|
||||
opampbridges.opentelemetry.io \
opentelemetrycollectors.opentelemetry.io \
|
||||
targetallocators.opentelemetry.io
|
||||
@@ -27,14 +27,21 @@ Type=simple
|
||||
ExecStart=/data/prometheus/prometheus \
|
||||
--config.file=/data/prometheus/prometheus.yml \
|
||||
--storage.tsdb.path=/data/prometheus/data \
|
||||
--web.enable-remote-write-receiver \
|
||||
--web.console.templates=/data/prometheus/consoles \
|
||||
--web.console.libraries=/data/prometheus/console_libraries
|
||||
--web.console.libraries=/data/prometheus/console_libraries \
|
||||
--storage.tsdb.retention.time=60d \
|
||||
--storage.tsdb.retention.size=60GB
|
||||
|
||||
Restart=always
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
|
||||
# ----------------------------------------
|
||||
|
||||
|
||||
# 创建文件 /etc/systemd/system/alertmanager.service
|
||||
[Unit]
|
||||
Description=Alertmanager
|
||||
|
||||
@@ -25,7 +25,6 @@ graph TD
|
||||
F -->|转发| G[集群外Prometheus]
|
||||
|
||||
|
||||
|
||||
2. 日志数据(最终到 ES)
|
||||
采集方:OTel Collector(DaemonSet 模式)
|
||||
采集内容:k8s 节点/var/log/containers目录下的容器日志(替代 Filebeat)。
|
||||
@@ -49,5 +48,5 @@ graph TD
|
||||
|
||||
graph LR
|
||||
A[指标接收器] -->|metrics流水线| B[指标处理器] --> C[Prometheus导出器]
|
||||
D[日志接收器] -->|logs流水线| E[日志处理器] --> F[ES导出器]
|
||||
G[追踪接收器] -->|traces流水线| H[追踪处理器] --> I[Tempo导出器]
|
||||
D[日志接收器] -->|logs流水线| E[日志处理器] --> F[ES导出器]
|
||||
G[追踪接收器] -->|traces流水线| H[追踪处理器] --> I[Tempo导出器]
|
||||
94
filebast/s5-lessie-server01/filebeat.yml
Normal file
94
filebast/s5-lessie-server01/filebeat.yml
Normal file
@@ -0,0 +1,94 @@
|
||||
# Index template name and pattern.
# NOTE(review): the output index below renders as
# "<environment>-<application>-<date>"; confirm that the "lessie-sit*"
# pattern is really meant to match those indices.
setup.template.name: "lessie-sit"
setup.template.pattern: "lessie-sit*"
setup.template.enabled: true
# ILM has to be off for the custom daily `index` in output.elasticsearch (and
# the template name/pattern above) to take effect; with ILM enabled Filebeat
# ignores them.
setup.ilm.enabled: false

# Load input definitions from separate files.
filebeat.config.inputs:
  enabled: true
  path: /etc/filebeat/inputs.d/*.yml
  reload.enabled: true
  reload.period: 10s

# Processors
processors:
  # lessie -------------------------

  # Split lessie_search log lines into mylog.* fields.
  - dissect:
      when:
        equals:
          log_type: lessie_search.log
      tokenizer: '%{timestamp} - %{level} - %{module} - %{function} - %{message}'
      field: "message"
      target_prefix: "mylog"
      ignore_missing: true
      overwrite_keys: true

  # Second dissect pass for messages shaped like
  # [level: | event: | msg: | context:].
  - dissect:
      when:
        regexp:
          mylog.message: '^\[level:.*\]'
      tokenizer: '[level: %{event_level} | event: %{event} | msg: %{msg} | context: %{context}]'
      field: "mylog.context_source_placeholder"
      target_prefix: "mylog"
      ignore_missing: true
      overwrite_keys: true

  # Split mylog.context ("k: v, k: v") into individual mylog.<k> fields.
  # Pairs that do not split into exactly two parts on ":" are skipped.
  - script:
      lang: javascript
      id: parse_context
      source: >
        function process(event) {
          var ctx = event.Get("mylog.context");
          if (ctx) {
            var parts = ctx.split(",");
            parts.forEach(function(p) {
              var kv = p.split(":");
              if (kv.length == 2) {
                event.Put("mylog." + kv[0].trim(), kv[1].trim());
              }
            });
          }
        }
  # lessie ------------------------

  # Decode the JSON payload of go.log events into the event root.
  - decode_json_fields:
      when:
        equals:
          log_type: go.log
      fields: ["message"]
      target: ""
      overwrite_keys: true
      add_error_key: true

# Output
# NOTE(review): credentials are stored in plain text and sent over http to a
# public address; consider the Filebeat keystore and https.
output.elasticsearch:
  hosts: ["http://106.53.194.199:9200"]
  username: "admin"
  password: "123456"
  index: "%{[environment]}-%{[application]}-%{+yyyy.MM.dd}"  # one index per day
  bulk_max_size: 50  # maximum number of documents per bulk request
  worker: 1  # number of parallel workers
  timeout: 15s

# Logging
logging.level: info
logging.to_files: true
logging.files:
  path: /var/log/filebeat
  name: filebeat.log
  keepfiles: 7
  permissions: 0644

# Queue and memory usage
queue.mem:
  events: 1024
  flush.min_events: 512
  flush.timeout: 10s
|
||||
|
||||
|
||||
25
filebast/s5-lessie-server01/s5_lessie_search.yml
Normal file
25
filebast/s5-lessie-server01/s5_lessie_search.yml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Log input for the s5 lessie_search application.
- type: log
  id: s5_lessie_search
  enabled: true
  paths:
    - /data/webapps/lessie_sourcing_agents_s5/logs/lessie_sourcing_agents*.log
  # Keep only lines that match this regular expression.
  include_lines:
    - '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}'
  # Custom fields, placed at the event root.
  fields_under_root: true
  fields:
    application: lessie_search
    log_type: lessie_search.log
    environment: s5
    instance: webdrive-server
    ip: 49.51.33.153
  # multiline.pattern: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}'
  # multiline.negate: true
  # multiline.match: after
  # Ignore old log files (avoids processing archived logs).
  ignore_older: 24h
  # How often to scan for new files.
  scan_frequency: 10s
  # Drop state of files not updated for more than a day.
  clean_inactive: 25h
  # Close a file after 5 minutes without updates.
  close_inactive: 5m
  # Handle files that have been renamed.
  close_renamed: true
  # Read from the beginning of the file.
  # NOTE(review): verify that the log input recognises this option.
  start_position: beginning
|
||||
|
||||
|
||||
|
||||
@@ -207,3 +207,215 @@ server {
|
||||
|
||||
|
||||
|
||||
# ---------------修改,官网项目合成一个项目,使用新的框架----------------------------------
|
||||
|
||||
|
||||
# upstream official_backend {
|
||||
# server 10.0.0.5:3000;
|
||||
# server 10.0.0.15:3000;
|
||||
# }
|
||||
|
||||
# Backend pool for the merged official site; used by the lessie.ai server
# block for "/" and /favicon.svg.
upstream new_official_backend {
    server 10.0.0.5:3003;
    server 10.0.0.15:3003;
}
|
||||
|
||||
# Access-log format: the usual combined-style fields plus upstream address,
# upstream status, upstream response time and total request time.
log_format official_log '$remote_addr - $remote_user [$time_local] '
                        '"$request" $status $body_bytes_sent '
                        '"$http_referer" "$http_user_agent" '
                        'upstream_addr=$upstream_addr '
                        'upstream_status=$upstream_status '
                        'upstream_response_time=$upstream_response_time '
                        'request_time=$request_time';
|
||||
|
||||
# 1. Force HTTP to HTTPS. Both host names are redirected to the apex domain
#    https://lessie.ai (not to www.lessie.ai).
server {
    listen 80;
    server_name lessie.ai www.lessie.ai;
    return 301 https://lessie.ai$request_uri;
}
|
||||
|
||||
# 2. Redirect https://www.lessie.ai to https://lessie.ai (stays on HTTPS).
server {
    listen 443 ssl;
    server_name www.lessie.ai;

    ssl_certificate     /data/tengine/certificate/lessie.ai.pem;
    ssl_certificate_key /data/tengine/certificate/lessie.ai.key;

    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;

    # Clear HSTS: max-age=0 tells browsers to drop a stored HSTS policy.
    add_header Strict-Transport-Security "max-age=0; includeSubDomains" always;

    return 301 https://lessie.ai$request_uri;
}
|
||||
|
||||
# 3. Production site (https://lessie.ai)
server {
    listen 443 ssl;
    server_name lessie.ai;

    ssl_certificate     /data/tengine/certificate/lessie.ai.pem;
    ssl_certificate_key /data/tengine/certificate/lessie.ai.key;

    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;

    access_log /data/tengine/logs/lessie.ai.access.log official_log;
    error_log  /data/tengine/logs/lessie.ai.error.log;

    # Longer timeouts for SSR.
    proxy_connect_timeout 300s;
    proxy_send_timeout 300s;
    proxy_read_timeout 300s;

    # Block PHP / WordPress scans (444 closes the connection without a response).
    location ~* \.php$ {
        return 444;
    }

    # Page routing for the new framework (disabled).
    # location ~ "^/([a-z]{2}(-[a-z]{2})?/)?(influencer-marketing|b2b-lead-generation|investor-scouting|recruiting|partnerships)" {
    # proxy_pass http://new_official_backend;
    # proxy_http_version 1.1;
    # proxy_set_header Host $host;
    # proxy_set_header X-Real-IP $remote_addr;
    # proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    # proxy_set_header X-Forwarded-Proto $scheme;
    # }

    # Next.js static & data (disabled).
    # location ^~ /_next/ {
    # proxy_pass http://new_official_backend;
    # proxy_http_version 1.1;
    # proxy_set_header Host $host;
    # proxy_set_header X-Real-IP $remote_addr;
    # }

    # Nuxt static assets (disabled).
    # location ^~ /_nuxt/ {
    # proxy_pass http://official_backend;
    # proxy_set_header Host $host;
    # proxy_set_header X-Real-IP $remote_addr;
    # }

    # Nuxt data / other json (disabled).
    # location ~ \.json$ {
    # proxy_pass http://official_backend;
    # proxy_set_header Host $host;
    # proxy_set_header X-Real-IP $remote_addr;
    # }

    # New site (Next SSR): default route for everything not matched below.
    location / {
        proxy_pass http://new_official_backend;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # Videos are served from local disk with a 30-day cache lifetime.
    location /video/ {
        root /data/tengine/html/lessie_official;
        expires 30d;
        add_header Cache-Control "public";
        add_header Accept-Ranges bytes;
    }

    # Disable caching of the logo (regular users get the square icon by default).
    location = /favicon.svg {
        # Check the user agent; for Googlebot/Bingbot rewrite to the crawler icon.
        if ($http_user_agent ~* "(Googlebot|Bingbot)") {
            rewrite ^/favicon.svg$ /favicon-google.svg last;
        }

        proxy_pass http://new_official_backend;
        proxy_set_header Host $host;

        add_header Cache-Control "no-cache, no-store, must-revalidate" always;
        add_header Pragma "no-cache" always;
        add_header Expires 0 always;
    }

    # Crawler-specific favicon file (round icon), served from local disk.
    location = /favicon-google.svg {
        root /data/tengine/html/lessie_official;
        add_header Cache-Control "no-cache, no-store, must-revalidate" always;
        add_header Pragma "no-cache" always;
        add_header Expires 0 always;
    }

    # Third-party mail platform calling the domestic (China) email service (CRM).
    # NOTE(review): the CORS headers below echo any Origin while allowing
    # credentials; confirm that this is intended for a webhook endpoint.
    location /prod-api/webhook/ {
        proxy_pass http://129.204.158.54:4997/webhook/;
        proxy_set_header Host 129.204.158.54;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        proxy_intercept_errors off;
        proxy_buffering off;
        proxy_cache off;
        proxy_set_header Connection keep-alive;

        add_header 'Access-Control-Allow-Origin' "$http_origin" always;
        add_header 'Access-Control-Allow-Credentials' 'true' always;
        add_header 'Access-Control-Allow-Methods' 'GET, POST, OPTIONS, PUT, DELETE' always;
        add_header 'Access-Control-Allow-Headers' 'Authorization,Content-Type,X-Requested-With,Accept,Origin' always;

        # Answer CORS preflight requests directly.
        if ($request_method = OPTIONS ) {
            return 204;
        }
    }

    # Third-party SendGrid mail platform callbacks. This longer prefix takes
    # precedence over /prod-api/webhook/ for matching URIs.
    location /prod-api/webhook/us {
        proxy_pass http://10.0.0.10:4997/webhook/us;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        proxy_intercept_errors off;
        proxy_buffering off;
        proxy_cache off;
        proxy_set_header Connection keep-alive;

        add_header 'Access-Control-Allow-Origin' "$http_origin" always;
        add_header 'Access-Control-Allow-Credentials' 'true' always;
        add_header 'Access-Control-Allow-Methods' 'GET, POST, OPTIONS, PUT, DELETE' always;
        add_header 'Access-Control-Allow-Headers' 'Authorization,Content-Type,X-Requested-With,Accept,Origin' always;

        # Answer CORS preflight requests directly.
        if ($request_method = OPTIONS ) {
            return 204;
        }
    }

    # Third-party payment platform callbacks.
    location /payment/ {
        proxy_pass http://10.0.0.8:8090;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        proxy_intercept_errors off;
        proxy_buffering off;
        proxy_cache off;
        proxy_set_header Connection keep-alive;

        add_header 'Access-Control-Allow-Origin' "$http_origin" always;
        add_header 'Access-Control-Allow-Credentials' 'true' always;
        add_header 'Access-Control-Allow-Methods' 'GET, POST, OPTIONS, PUT, DELETE' always;
        add_header 'Access-Control-Allow-Headers' 'Authorization,Content-Type,X-Requested-With,Accept,Origin' always;

        # Answer CORS preflight requests directly.
        if ($request_method = OPTIONS ) {
            return 204;
        }
    }
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -4,26 +4,49 @@ process_names:
|
||||
cmdline:
|
||||
- ".*index.mjs.*"
|
||||
|
||||
- name: "weblessie-server_lessie_sourcing_8000"
|
||||
|
||||
- name: "weblessie-server_s1_lessie_sourcing_8000"
|
||||
cmdline:
|
||||
- "/data/webapps/lessie_sourcing_agents"
|
||||
- ".*server.py*"
|
||||
- "gunicorn"
|
||||
- ".*0.0.0.0:8000.*"
|
||||
|
||||
- name: "weblessie-server_lessie_sourcing_8002"
|
||||
- name: "weblessie-server_s4_lessie_sourcing_8001"
|
||||
cmdline:
|
||||
- "/data/webapps/lessie_sourcing_agents_02"
|
||||
- ".*server8002.*"
|
||||
- "gunicorn"
|
||||
- ".*0.0.0.0:8001.*"
|
||||
|
||||
- name: "weblessie-server_lessie_sourcing_7001"
|
||||
- name: "weblessie-server_s4_GO-lessie-sourcing-api"
|
||||
cmdline:
|
||||
- "/data/webapps/prod_lessie_sourcing_agents"
|
||||
- ".*server7001.*"
|
||||
- ".*lessie-sourcing-api-s4.*"
|
||||
|
||||
- name: "weblessie-server_GO-lessie-sourcing-api"
|
||||
- name: "weblessie-server_s1_GO-lessie-sourcing-api"
|
||||
cmdline:
|
||||
- ".*lessie-sourcing-api.*"
|
||||
- ".*lessie-sourcing-api$"
|
||||
|
||||
- name: "nexus"
|
||||
cmdline:
|
||||
- "sonatype-nexus-repository-3.86.2-01.jar"
|
||||
|
||||
- name: "jenkins"
|
||||
cmdline:
|
||||
- "jenkins.war"
|
||||
|
||||
- name: "nginx_master"
|
||||
cmdline:
|
||||
- "nginx: master process"
|
||||
|
||||
- name: "nginx_worker"
|
||||
cmdline:
|
||||
- "nginx: worker process"
|
||||
|
||||
- name: "weblessie-server_s4-lessie-email"
|
||||
cmdline:
|
||||
- '--port\s+8031$'
|
||||
|
||||
- name: "webapp_lessie_react"
|
||||
cmdline:
|
||||
- 'cwd=/data/webapps/lessie-react'
|
||||
|
||||
- name: "webapp_lessie_next"
|
||||
cmdline:
|
||||
- 'cwd=/data/webapps/lessie-next'
|
||||
Reference in New Issue
Block a user