2026-01-27同步

This commit is contained in:
2026-01-27 18:21:17 +08:00
parent cf5b9c9d2b
commit aab08068c3
17 changed files with 588 additions and 421 deletions

View File

@@ -1,96 +0,0 @@
# Gateway collector: a single-replica Deployment that receives OTLP from the
# agents, collects cluster-state metrics itself, and forwards metrics to an
# external Prometheus via remote write.
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-gateway
  namespace: opentelemetry-operator-system
spec:
  mode: deployment
  replicas: 1
  # Original author's note: the Operator creates this account and binds its
  # permissions automatically. NOTE(review): confirm for your Operator version.
  serviceAccount: otel-gateway-collector
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
      # Core: collect Kubernetes cluster state metrics, i.e. the state of
      # Deployments, DaemonSets, StatefulSets, HPAs, Nodes and similar resources.
      k8s_cluster:
        collection_interval: 30s
        node_conditions_to_report: [Ready, MemoryPressure, DiskPressure, PIDPressure]
        allocatable_types_to_report: [cpu, memory]
    processors:
      batch:
        send_batch_size: 1000
        timeout: 10s
      memory_limiter:
        check_interval: 1s
        limit_percentage: 70
        spike_limit_percentage: 30
      # Adds Kubernetes metadata labels (also an important job of a gateway).
      # Defined here but not referenced by any pipeline below; see the note in
      # the metrics pipeline.
      k8sattributes:
        extract:
          metadata:
            - k8s.namespace.name
            - k8s.pod.name
            - k8s.deployment.name
            - k8s.statefulset.name
            - k8s.daemonset.name
            - k8s.cronjob.name
            - k8s.job.name
            - k8s.node.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: connection
    exporters:
      # 1. Export metrics to the external Prometheus (Remote Write).
      prometheusremotewrite:
        endpoint: "http://10.0.0.38:9090/api/v1/write"
        # Configure Basic Auth here if it is required.
        # external_labels:
        #   cluster: "test-k8s-cluster"
      # 2. Export traces to an external Tempo (OTLP gRPC).
      # otlp/tempo:
      #   endpoint: "<TEMPO_IP>:4317"
      #   tls:
      #     insecure: true
      # 3. Export logs to an external Elasticsearch (optional).
      # elasticsearch:
      #   endpoints: ["http://<ES_IP>:9200"]
      #   logs_index: "k8s-logs"
      debug:
        verbosity: basic
    service:
      pipelines:
        metrics:
          receivers: [otlp, k8s_cluster]
          processors: [memory_limiter, batch]
          # Whether k8sattributes belongs before or after batch depends on the
          # architecture; a gateway usually just forwards. Data produced by
          # k8s_cluster already carries its labels, and data arriving over otlp
          # is expected to be labelled on the agent side.
          exporters: [prometheusremotewrite]
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          # The otlp/tempo exporter above is still commented out, and a pipeline
          # may only reference configured exporters (otherwise the collector
          # refuses to start). Traces go to the debug exporter until Tempo is
          # configured; then switch this back to [otlp/tempo].
          exporters: [debug]
        # logs:
        #   receivers: [otlp]
        #   processors: [memory_limiter, batch]
        #   exporters: [elasticsearch]

View File

@@ -0,0 +1,38 @@
# Service account referenced by the collector manifests in the monitoring
# namespace (spec.serviceAccount: otel-collector-sa).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: monitoring
  name: otel-collector-sa
---
# Cluster-wide, read-only (get/list/watch) access to the resources listed below.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-collector-role
rules:
  # Core API group.
  - apiGroups:
      - ""
    resources:
      - events
      - nodes
      - nodes/proxy
      - nodes/stats
      - services
      - endpoints
      - pods
      - namespaces
      - replicationcontrollers
      - resourcequotas
    verbs:
      - get
      - list
      - watch
  # Workload controllers.
  - apiGroups:
      - apps
    resources:
      - statefulsets
      - daemonsets
      - deployments
      - replicasets
    verbs:
      - get
      - list
      - watch
  # Jobs and CronJobs.
  - apiGroups:
      - batch
    resources:
      - jobs
      - cronjobs
    verbs:
      - get
      - list
      - watch
  # Horizontal pod autoscalers.
  - apiGroups:
      - autoscaling
    resources:
      - horizontalpodautoscalers
    verbs:
      - get
      - list
      - watch
---
# Grants the otel-collector-role ClusterRole to the otel-collector-sa service
# account in the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-collector-binding
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: otel-collector-sa
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: otel-collector-role

View File

@@ -1,88 +0,0 @@
# Agent collector: a DaemonSet that scrapes kubelet and host metrics and
# forwards them over OTLP gRPC to the gateway collector service.
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-agent
  namespace: opentelemetry-operator-system
spec:
  mode: daemonset
  hostNetwork: true  # recommended by the author, to obtain host metrics more accurately
  env:
    # Declared explicitly because the kubeletstats endpoint below expands
    # ${env:K8S_NODE_NAME}; without it the endpoint would have no host part.
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
  config:
    receivers:
      # 1. Resource usage (CPU / memory) of pods and containers.
      kubeletstats:
        collection_interval: 20s
        auth_type: "serviceAccount"
        endpoint: "${env:K8S_NODE_NAME}:10250"
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container
      # 2. Metrics of the host machine itself.
      hostmetrics:
        collection_interval: 20s
        scrapers:
          # Explicit empty mappings (default scraper settings) instead of bare
          # null values.
          cpu: {}
          memory: {}
          load: {}
          filesystem: {}
          network: {}
      # 3. (Optional) log collection.
      # filelog:
      #   include: [/var/log/pods/*/*/*.log]
      #   ...
    processors:
      batch:
        send_batch_size: 500
        timeout: 5s
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100
      # Resource detection: host name and similar attributes.
      resourcedetection:
        # Original note: on a Tencent Cloud CVM a 'tencentcloud' detector could
        # be tried, but 'system' is usually enough.
        # NOTE(review): confirm such a detector exists in your collector build.
        detectors: [system]
        timeout: 2s
        override: false
      # Key step: tag metrics with Kubernetes labels (pod name, namespace, node name).
      k8sattributes:
        passthrough: false
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.deployment.name
            - k8s.namespace.name
            - k8s.node.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: connection
    exporters:
      # Send everything to the in-cluster gateway service.
      otlp:
        endpoint: "otel-gateway-collector.opentelemetry-operator-system.svc.cluster.local:4317"
        tls:
          insecure: true
    service:
      pipelines:
        metrics:
          receivers: [kubeletstats, hostmetrics]
          # memory_limiter goes first so back-pressure is applied before any
          # other processor spends memory on the data.
          processors: [memory_limiter, resourcedetection, k8sattributes, batch]
          exporters: [otlp]
        # traces:  # if applications send traces to the local agent (sidecar or otherwise)
        #   receivers: [otlp]
        #   exporters: [otlp]

View File

@@ -0,0 +1,57 @@
# Gateway collector (monitoring namespace): receives OTLP metrics from agents,
# collects cluster-state metrics and Kubernetes events, and ships metrics to
# Prometheus (OTLP over HTTP) and events to Elasticsearch.
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-gateway
  namespace: monitoring
spec:
  mode: deployment
  image: otel/opentelemetry-collector-contrib:0.144.0
  replicas: 1
  serviceAccount: otel-collector-sa
  env:
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    # The Elasticsearch password is read from a Secret instead of being stored
    # in this manifest. Create it before deploying, for example:
    #   kubectl -n monitoring create secret generic otel-elasticsearch-credentials \
    #     --from-literal=password='<password>'
    # The value that was previously committed in plaintext should be rotated.
    - name: ELASTICSEARCH_PASSWORD
      valueFrom:
        secretKeyRef:
          name: otel-elasticsearch-credentials
          key: password
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
      k8s_cluster:
        collection_interval: 30s
      k8s_events: {}
    processors:
      batch:
        send_batch_size: 1000
        timeout: 10s
      resourcedetection:
        detectors: [env, system, k8snode]
    exporters:
      debug:
        verbosity: detailed
      otlp_http/prometheus:
        endpoint: "http://10.0.0.38:9090/api/v1/otlp"
      elasticsearch:
        endpoints: ["http://10.0.0.38:9200"]
        logs_index: "k8s-test-cluster-events"
        user: "elastic"
        # Expanded by the collector from the environment variable defined in
        # spec.env above.
        password: "${env:ELASTICSEARCH_PASSWORD}"
    service:
      pipelines:
        metrics:
          receivers: [otlp, k8s_cluster]
          processors: [resourcedetection, batch]
          exporters: [otlp_http/prometheus]
        logs:
          receivers: [k8s_events]
          processors: [batch]
          exporters: [elasticsearch, debug]

View File

@@ -0,0 +1,55 @@
# Agent collector (monitoring namespace): a DaemonSet that scrapes host and
# kubelet metrics, stamps them with the cluster name, and forwards them over
# OTLP gRPC to the gateway collector service.
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  namespace: monitoring
  name: otel-agent
spec:
  image: otel/opentelemetry-collector-contrib:0.144.0
  mode: daemonset
  serviceAccount: otel-collector-sa
  env:
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    # Cluster name is defined here; for the prod environment only this value
    # needs to change.
    - name: CLUSTER_NAME
      value: "test-k8s-cluster"
  config:
    receivers:
      hostmetrics:
        collection_interval: 30s
        scrapers:
          cpu: {}
          memory: {}
      kubeletstats:
        collection_interval: 30s
        auth_type: serviceAccount
        endpoint: "https://${env:K8S_NODE_NAME}:10250"
        insecure_skip_verify: true
    processors:
      batch: {}
      resourcedetection:
        detectors:
          - env
          - system
          - k8snode
      # Inserts the cluster name (taken from the CLUSTER_NAME variable) as the
      # k8s.cluster.name resource attribute.
      resource:
        attributes:
          - key: k8s.cluster.name
            value: "${env:CLUSTER_NAME}"
            action: insert
    exporters:
      otlp:
        endpoint: "otel-gateway-collector.monitoring.svc.cluster.local:4317"
        tls:
          insecure: true
    service:
      pipelines:
        metrics:
          receivers:
            - hostmetrics
            - kubeletstats
          # Note: the resource processor must be part of this chain.
          processors:
            - resourcedetection
            - resource
            - batch
          exporters:
            - otlp

View File

@@ -49,4 +49,12 @@ graph TD
graph LR
A[指标接收器] -->|metrics流水线| B[指标处理器] --> C[Prometheus导出器]
D[日志接收器] -->|logs流水线| E[日志处理器] --> F[ES导出器]
G[追踪接收器] -->|traces流水线| H[追踪处理器] --> I[Tempo导出器]
G[追踪接收器] -->|traces流水线| H[追踪处理器] --> I[Tempo导出器]
10.0.0.38:9090
10.0.0.38:9200
elastic
-0NiIBOJGn2CATuPWzNc