0119同步

This commit is contained in:
2026-01-19 22:08:33 +08:00
parent 0384834345
commit cf5b9c9d2b
24 changed files with 32428 additions and 15 deletions

View File

@@ -0,0 +1,39 @@
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# 1. RBAC for the gateway collector
apiVersion: v1
kind: ServiceAccount
metadata:
  name: otel-gateway
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-gateway-role
rules:
  # Read pods/nodes/namespaces/services/endpoints for labelling and metadata enrichment
  - apiGroups: [""]
    resources: ["pods", "nodes", "namespaces", "services", "endpoints"]
    verbs: ["get", "list", "watch"]
  # Read events (consumed by the k8s_events receiver)
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-gateway-binding
subjects:
  - kind: ServiceAccount
    name: otel-gateway
    namespace: monitoring
roleRef:
  kind: ClusterRole
  name: otel-gateway-role
  apiGroup: rbac.authorization.k8s.io

View File

@@ -0,0 +1,82 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-gateway-config
  namespace: monitoring
data:
  config.yaml: |
    receivers:
      # OTLP from the node agents (gRPC 4317, HTTP 4318)
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
      # 1. Cluster-level: Kubernetes events
      k8s_events:
        auth_type: serviceAccount
      # 2. Cluster-level: scrape TKE's bundled kube-state-metrics
      prometheus:
        config:
          scrape_configs:
            - job_name: 'tke-kube-state-metrics'
              scrape_interval: 30s
              static_configs:
                - targets: ['tke-kube-state-metrics.kube-system.svc.cluster.local:8180']
    processors:
      batch:
        send_batch_size: 2000
        timeout: 10s
      resourcedetection:
        detectors: [env, system]
        override: true
      # 3. Inject a cluster id so Prometheus can distinguish clusters and avoid
      #    duplicate-sample errors
      resource:
        attributes:
          - key: cluster.name
            value: "test-k8s"
            action: upsert
      # Copy OTLP resource attributes onto datapoint labels so Prometheus can
      # tell series from different Pods/Nodes apart
      transform:
        metric_statements:
          - context: datapoint
            statements:
              - set(attributes["k8s_pod_name"], resource.attributes["k8s.pod.name"])
              - set(attributes["k8s_node_name"], resource.attributes["k8s.node.name"])
              - set(attributes["k8s_namespace_name"], resource.attributes["k8s.namespace.name"])
              - set(attributes["k8s_container_name"], resource.attributes["k8s.container.name"])
              - set(attributes["cluster_name"], resource.attributes["cluster.name"])
      memory_limiter:
        check_interval: 1s
        limit_mib: 1500
        spike_limit_mib: 512
    exporters:
      # FIX: /api/v1/write is Prometheus's remote-write endpoint. The otlphttp
      # exporter speaks OTLP (and appends /v1/metrics to the endpoint), so it
      # cannot deliver to a remote-write receiver; prometheusremotewrite is the
      # correct exporter here (requires --web.enable-remote-write-receiver on
      # the Prometheus server).
      prometheusremotewrite:
        endpoint: "http://10.0.0.38:9090/api/v1/write"
      # Stdout exporter for troubleshooting
      debug:
        verbosity: detailed
    service:
      pipelines:
        metrics:
          # Merge workload-level (otlp) and cluster-level (prometheus) metrics
          receivers: [otlp, prometheus]
          processors: [memory_limiter, resourcedetection, resource, transform, batch]
          exporters: [prometheusremotewrite]
        logs:
          receivers: [k8s_events]
          processors: [memory_limiter, resourcedetection, resource, batch]
          exporters: [debug]

View File

@@ -0,0 +1,60 @@
# Gateway workload (Deployment)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: otel-gateway
  namespace: monitoring
  labels:
    app: otel-gateway
spec:
  replicas: 1  # single replica recommended when collecting Events/KSM, to avoid duplicate data
  selector:
    matchLabels:
      app: otel-gateway
  template:
    metadata:
      labels:
        app: otel-gateway
    spec:
      serviceAccountName: otel-gateway
      containers:
        - name: otel-collector
          # NOTE(review): consider pinning a specific collector version instead of :latest
          image: otel/opentelemetry-collector-contrib:latest
          command:
            - "/otelcol-contrib"
          args:
            - "--config=/conf/config.yaml"
          volumeMounts:
            - name: config-vol
              mountPath: /conf
          resources:
            limits:
              cpu: 1
              memory: 2Gi
            requests:
              cpu: 200m
              memory: 400Mi
      volumes:
        - name: config-vol
          configMap:
            name: otel-gateway-config
---
# Headless Service exposing the gateway's OTLP ports
apiVersion: v1
kind: Service
metadata:
  name: otel-gateway
  namespace: monitoring
spec:
  clusterIP: None
  selector:
    app: otel-gateway
  ports:
    - name: grpc
      port: 4317
      targetPort: 4317
      protocol: TCP
    - name: http
      port: 4318
      targetPort: 4318
      protocol: TCP

View File

@@ -0,0 +1,39 @@
# 1. RBAC for the node agent
apiVersion: v1
kind: ServiceAccount
metadata:
  name: otel-agent
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-agent-role
rules:
  # Read Pod and Node information (incl. kubelet stats subresources)
  - apiGroups: [""]
    resources: ["nodes", "nodes/stats", "nodes/proxy", "pods", "services", "endpoints"]
    verbs: ["get", "watch", "list"]
  # Read ReplicaSets so the k8sattributes processor can resolve Deployment names
  - apiGroups: ["apps"]
    resources: ["replicasets"]
    verbs: ["get", "watch", "list"]
  # Non-resource URLs (kubelet stats endpoints)
  - nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
    verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-agent-binding
subjects:
  - kind: ServiceAccount
    name: otel-agent
    namespace: monitoring
roleRef:
  kind: ClusterRole
  name: otel-agent-role
  apiGroup: rbac.authorization.k8s.io

View File

@@ -0,0 +1,75 @@
# Agent configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-agent-config
  namespace: monitoring
data:
  config.yaml: |
    receivers:
      # 1. Host-level (machine) metrics
      hostmetrics:
        collection_interval: 30s
        root_path: /hostfs
        scrapers:
          cpu: {}
          memory: {}
          load: {}
          disk: {}
          filesystem: {}
          network: {}
          paging: {}
          processes: {}
      # 2. Pod / container / volume metrics from the local kubelet
      kubeletstats:
        collection_interval: 30s
        auth_type: "serviceAccount"
        endpoint: "https://${env:K8S_NODE_NAME}:10250"  # locate the node-local kubelet via env var
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container
          - volume
    processors:
      # Guard against OOM before any other processing (consistent with the
      # gateway config; 400 MiB fits under the DaemonSet's 500Mi limit)
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100
      batch:
        send_batch_size: 1000
        timeout: 10s
      resourcedetection:
        detectors: [env, system]
      # 3. Attach detailed K8s labels so each metric series is unique
      k8sattributes:
        auth_type: "serviceAccount"
        passthrough: false
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.namespace.name
            - k8s.node.name
            - k8s.deployment.name
            - k8s.container.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: connection
    exporters:
      # Ship to the in-cluster gateway Service
      otlp:
        endpoint: "otel-gateway.monitoring.svc.cluster.local:4317"
        tls:
          insecure: true
    service:
      pipelines:
        metrics:
          receivers: [hostmetrics, kubeletstats]
          processors: [memory_limiter, resourcedetection, k8sattributes, batch]
          exporters: [otlp]

View File

@@ -0,0 +1,55 @@
# Agent workload (DaemonSet: one collector per node)
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: otel-agent
  namespace: monitoring
  labels:
    app: otel-agent
spec:
  selector:
    matchLabels:
      app: otel-agent
  template:
    metadata:
      labels:
        app: otel-agent
    spec:
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      serviceAccountName: otel-agent
      containers:
        - name: otel-collector
          # NOTE(review): consider pinning a specific collector version instead of :latest
          image: otel/opentelemetry-collector-contrib:latest
          command:
            - "/otelcol-contrib"
          args:
            - "--config=/conf/config.yaml"
          env:
            # Current node name, consumed by the kubeletstats receiver endpoint
            - name: K8S_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          volumeMounts:
            - name: config-vol
              mountPath: /conf
            # Host root filesystem, mounted read-only for host metric collection
            - name: hostfs
              mountPath: /hostfs
              readOnly: true
              mountPropagation: HostToContainer
          resources:
            limits:
              cpu: 500m
              memory: 500Mi
            requests:
              cpu: 100m
              memory: 200Mi
      volumes:
        - name: config-vol
          configMap:
            name: otel-agent-config
        - name: hostfs
          hostPath:
            path: /

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,96 @@
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-gateway
  namespace: opentelemetry-operator-system
spec:
  mode: deployment
  replicas: 1
  serviceAccount: otel-gateway-collector  # created and RBAC-bound automatically by the Operator
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
      # --- Core: cluster-state metrics ---
      # Status of Deployments, DaemonSets, StatefulSets, HPAs, Nodes, etc.
      k8s_cluster:
        collection_interval: 30s
        node_conditions_to_report: [Ready, MemoryPressure, DiskPressure, PIDPressure]
        allocatable_types_to_report: [cpu, memory]
    processors:
      batch:
        send_batch_size: 1000
        timeout: 10s
      memory_limiter:
        check_interval: 1s
        limit_percentage: 70
        spike_limit_percentage: 30
      # Enrich with K8s metadata labels (an important gateway responsibility)
      k8sattributes:
        extract:
          metadata:
            - k8s.namespace.name
            - k8s.pod.name
            - k8s.deployment.name
            - k8s.statefulset.name
            - k8s.daemonset.name
            - k8s.cronjob.name
            - k8s.job.name
            - k8s.node.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: connection
    exporters:
      # 1. Metrics to the external Prometheus (remote write)
      prometheusremotewrite:
        endpoint: "http://10.0.0.38:9090/api/v1/write"
        # Configure basic auth here if required
        # external_labels:
        #   cluster: "test-k8s-cluster"
      # 2. Traces to an external Tempo (OTLP gRPC) — enable together with the
      #    traces pipeline below
      # otlp/tempo:
      #   endpoint: "<TEMPO_IP>:4317"
      #   tls:
      #     insecure: true
      # 3. Logs to an external Elasticsearch (optional)
      # elasticsearch:
      #   endpoints: ["http://<ES_IP>:9200"]
      #   logs_index: "k8s-logs"
      debug:
        verbosity: basic
    service:
      pipelines:
        metrics:
          receivers: [otlp, k8s_cluster]
          processors: [memory_limiter, batch]
          # k8s_cluster data carries its own labels; OTLP data should already be
          # labelled by the agents, so k8sattributes is not in this pipeline.
          exporters: [prometheusremotewrite]
        # FIX: the original traces pipeline referenced exporter `otlp/tempo`,
        # which is commented out above — the collector refuses to start when a
        # pipeline names an undefined component. Keep this pipeline disabled
        # until the Tempo exporter is configured.
        # traces:
        #   receivers: [otlp]
        #   processors: [memory_limiter, batch]
        #   exporters: [otlp/tempo]
        # logs:
        #   receivers: [otlp]
        #   processors: [memory_limiter, batch]
        #   exporters: [elasticsearch]

View File

@@ -0,0 +1,88 @@
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-agent
  namespace: opentelemetry-operator-system
spec:
  mode: daemonset
  hostNetwork: true  # recommended so host metrics are attributed accurately
  config:
    receivers:
      # 1. Pod / container resource usage (CPU & memory) from the local kubelet
      kubeletstats:
        collection_interval: 20s
        auth_type: "serviceAccount"
        endpoint: "${env:K8S_NODE_NAME}:10250"
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container
      # 2. Host (machine) level metrics
      hostmetrics:
        collection_interval: 20s
        scrapers:
          cpu: {}
          memory: {}
          load: {}
          filesystem: {}
          network: {}
      # 3. (Optional) log collection
      # filelog:
      #   include: [/var/log/pods/*/*/*.log]
      #   ...
    processors:
      batch:
        send_batch_size: 500
        timeout: 5s
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100
      # Resource detection: host name and cloud provider metadata
      resourcedetection:
        detectors: [system]  # on Tencent Cloud CVM, 'system' is usually sufficient
        timeout: 2s
        override: false
      # Key step: tag metrics with K8s labels (pod name, namespace, node name)
      k8sattributes:
        passthrough: false
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.deployment.name
            - k8s.namespace.name
            - k8s.node.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: connection
    exporters:
      # Ship to the in-cluster gateway Service
      otlp:
        endpoint: "otel-gateway-collector.opentelemetry-operator-system.svc.cluster.local:4317"
        tls:
          insecure: true
    service:
      pipelines:
        metrics:
          receivers: [kubeletstats, hostmetrics]
          # FIX: memory_limiter must be the first processor so backpressure is
          # applied before any other work (it was listed third originally).
          processors: [memory_limiter, resourcedetection, k8sattributes, batch]
          exporters: [otlp]
      # traces:  # if apps send traces to the local agent via sidecar etc.
      #   receivers: [otlp]
      #   exporters: [otlp]

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

View File

@@ -0,0 +1,16 @@
# NOTE(review): this appears to be a raw `kubectl get ns -o yaml` dump. The
# server-managed fields (creationTimestamp, deletionTimestamp, resourceVersion,
# uid) should be stripped before re-applying; the presence of deletionTimestamp
# suggests the namespace was terminating when captured — confirm before reuse.
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: |
      {"apiVersion":"v1","kind":"Namespace","metadata":{"annotations":{},"labels":{"app.kubernetes.io/name":"opentelemetry-operator","control-plane":"controller-manager"},"name":"opentelemetry-operator-system"}}
  creationTimestamp: "2025-12-11T08:28:51Z"
  deletionTimestamp: "2025-12-12T08:33:24Z"
  labels:
    app.kubernetes.io/name: opentelemetry-operator
    control-plane: controller-manager
    kubernetes.io/metadata.name: opentelemetry-operator-system
  name: opentelemetry-operator-system
  resourceVersion: "4706820514"
  uid: 6ebd60fa-2155-4f6e-8c3c-6d83447713c1
spec: {}

View File

@@ -0,0 +1,72 @@
一、安装 cert-manager
OpenTelemetry Operator 提供了 OpenTelemetryCollector CRD(自定义资源定义),能自动处理服务发现和 RBAC(基于角色的访问控制)配置。
该 Operator 需要 cert-manager 来支持准入 Webhook部署步骤如下
安装OpenTelemetry Operator 自定义资源,需先部署 cert-manager 使用yaml 直接部署文件02-cert-manager.yaml (官方文档里)
官方文档链接https://cert-manager.io/docs/installation/kubectl/
cert-manager 将安装在cert-manager命名空间中安装了cert-manager后通过以下方式验证其部署是否正确 检查cert-manager命名空间:
kubectl get pods --namespace cert-manager
NAME READY STATUS RESTARTS AGE
cert-manager-7b8b89f89d-tpchr 1/1 Running 0 24s
cert-manager-cainjector-7f9fdd5dd5-px66h 1/1 Running 0 25s
cert-manager-webhook-769f6b94cb-zmjmv 1/1 Running 0 24s
二、安装 OpenTelemetry Operator
使用 helm 安装OpenTelemetry ,添加 Operator helm 仓库、更新、安装:
helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
helm repo update
helm install opentelemetry-operator open-telemetry/opentelemetry-operator \
--namespace opentelemetry-operator \
--create-namespace
或者直接使用yaml部署文件 03-opentelemetry-operator.yaml
kubectl apply -f https://github.com/open-telemetry/opentelemetry-operator/releases/latest/download/opentelemetry-operator.yaml
kubectl delete -f https://github.com/open-telemetry/opentelemetry-operator/releases/latest/download/opentelemetry-operator.yaml
# 查看安装的CRD
kubectl get crd | grep opentelemetry.io
输出:
instrumentations.opentelemetry.io 2025-12-11T09:02:11Z
opampbridges.opentelemetry.io 2025-12-11T09:02:13Z
opentelemetrycollectors.opentelemetry.io 2025-12-11T09:02:14Z
targetallocators.opentelemetry.io 2025-12-11T09:02:17Z
# 查看安装的operator控制器 Pod 状态
kubectl get pods -n opentelemetry-operator-system
NAME READY STATUS RESTARTS AGE
opentelemetry-operator-controller-manager-9c4b5467d-dhhp7 1/1 Running 0 3m10s
# 查看 Operator Deployment 状态
kubectl get deploy opentelemetry-operator-controller-manager -n opentelemetry-operator-system
NAME READY UP-TO-DATE AVAILABLE AGE
opentelemetry-operator-controller-manager 1/1 1 1 4m43s
# 查看证书是否签发成功(验证与 cert-manager 协同正常)
kubectl get certificate,issuer -n opentelemetry-operator-system
NAME READY SECRET AGE
certificate.cert-manager.io/opentelemetry-operator-serving-cert True opentelemetry-operator-controller-manager-service-cert 6m13s
NAME READY AGE
issuer.cert-manager.io/opentelemetry-operator-selfsigned-issuer True 6m12s
# 查看 Operator 相关 CRD
kubectl get crd | grep opentelemetry.io
输出:
instrumentations.opentelemetry.io 2026-01-14T07:28:56Z
opampbridges.opentelemetry.io 2026-01-14T07:28:57Z
opentelemetrycollectors.opentelemetry.io 2025-12-11T09:02:14Z
targetallocators.opentelemetry.io 2026-01-14T07:29:03Z
# 卸载命令
# 格式helm uninstall <Release 名称> -n <命名空间>
helm uninstall opentelemetry-operator -n opentelemetry-operator
# 删除 CRD仅当你确认不再需要任何 OTel 相关自定义资源实例时执行)
# 批量删除 Operator 相关 CRD
kubectl delete crd \
instrumentations.opentelemetry.io \
opentelemetrycollectors.opentelemetry.io \
targetallocators.opentelemetry.io

View File

@@ -27,14 +27,21 @@ Type=simple
ExecStart=/data/prometheus/prometheus \
--config.file=/data/prometheus/prometheus.yml \
--storage.tsdb.path=/data/prometheus/data \
--web.enable-remote-write-receiver \
--web.console.templates=/data/prometheus/consoles \
--web.console.libraries=/data/prometheus/console_libraries
--web.console.libraries=/data/prometheus/console_libraries \
--storage.tsdb.retention.time=60d \
--storage.tsdb.retention.size=60GB
Restart=always
[Install]
WantedBy=multi-user.target
# ----------------------------------------
# 创建文件 /etc/systemd/system/alertmanager.service
[Unit]
Description=Alertmanager

View File

@@ -25,7 +25,6 @@ graph TD
F -->|转发| G[集群外Prometheus]
2. 日志数据(最终到 ES
采集方OTel CollectorDaemonSet 模式)
采集内容k8s 节点/var/log/containers目录下的容器日志替代 Filebeat
@@ -49,5 +48,5 @@ graph TD
graph LR
A[指标接收器] -->|metrics流水线| B[指标处理器] --> C[Prometheus导出器]
D[日志接收器] -->|logs流水线| E[日志处理器] --> F[ES导出器]
G[追踪接收器] -->|traces流水线| H[追踪处理器] --> I[Tempo导出器]
D[日志接收器] -->|logs流水线| E[日志处理器] --> F[ES导出器]
G[追踪接收器] -->|traces流水线| H[追踪处理器] --> I[Tempo导出器]