0119同步

This commit is contained in:
2026-01-19 22:08:33 +08:00
parent 0384834345
commit cf5b9c9d2b
24 changed files with 32428 additions and 15 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,96 @@
---
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-gateway
  namespace: opentelemetry-operator-system
spec:
  mode: deployment
  replicas: 1
  serviceAccount: otel-gateway-collector  # the Operator creates this ServiceAccount and binds the required RBAC
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: "0.0.0.0:4317"
          http:
            endpoint: "0.0.0.0:4318"
      # Core: collect K8s cluster-state metrics
      # (status of Deployments, DaemonSets, StatefulSets, HPAs, Nodes, ...)
      k8s_cluster:
        collection_interval: 30s
        node_conditions_to_report: [Ready, MemoryPressure, DiskPressure, PIDPressure]
        allocatable_types_to_report: [cpu, memory]
    processors:
      batch:
        send_batch_size: 1000
        timeout: 10s
      memory_limiter:
        check_interval: 1s
        limit_percentage: 70
        spike_limit_percentage: 30
      # Enrich telemetry with K8s metadata labels (an important Gateway duty).
      # NOTE(review): this processor is defined but not referenced by any
      # pipeline below — add it to a pipeline's `processors` list to activate it.
      k8sattributes:
        extract:
          metadata:
            - k8s.namespace.name
            - k8s.pod.name
            - k8s.deployment.name
            - k8s.statefulset.name
            - k8s.daemonset.name
            - k8s.cronjob.name
            - k8s.job.name
            - k8s.node.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: connection
    exporters:
      # 1. Export metrics to an external Prometheus (remote write)
      prometheusremotewrite:
        endpoint: "http://10.0.0.38:9090/api/v1/write"
        # Configure Basic Auth here if required
        # external_labels:
        #   cluster: "test-k8s-cluster"
      # 2. Export traces to an external Tempo (OTLP gRPC)
      # otlp/tempo:
      #   endpoint: "<YOUR_TEMPO_IP>:4317"
      #   tls:
      #     insecure: true
      # 3. Export logs to an external Elasticsearch (optional)
      # elasticsearch:
      #   endpoints: ["http://<YOUR_ES_IP>:9200"]
      #   logs_index: "k8s-logs"
      debug:
        verbosity: basic
    service:
      pipelines:
        metrics:
          receivers: [otlp, k8s_cluster]
          processors: [memory_limiter, batch]
          # Data from k8s_cluster carries its own labels; data arriving via
          # OTLP should already be labelled on the Agent side (the Gateway
          # mostly forwards).
          exporters: [prometheusremotewrite]
        # FIX: this pipeline referenced the `otlp/tempo` exporter while that
        # exporter was commented out above — the Collector fails to start when
        # a pipeline references an unconfigured component. Uncomment this
        # pipeline together with the `otlp/tempo` exporter definition.
        # traces:
        #   receivers: [otlp]
        #   processors: [memory_limiter, batch]
        #   exporters: [otlp/tempo]
        # logs:
        #   receivers: [otlp]
        #   processors: [memory_limiter, batch]
        #   exporters: [elasticsearch]

View File

@@ -0,0 +1,88 @@
---
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-agent
  namespace: opentelemetry-operator-system
spec:
  mode: daemonset
  hostNetwork: true  # recommended, so host metrics are attributed accurately
  # FIX: the kubeletstats endpoint below reads ${env:K8S_NODE_NAME}, but the
  # variable was never defined in this CR. Define it from the downward API so
  # it is guaranteed to exist on every node — TODO confirm your Operator
  # version does not already inject it (explicit definition is safe either way).
  env:
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
  config:
    receivers:
      # 1. Pod / container resource usage (CPU, memory) from the local kubelet
      kubeletstats:
        collection_interval: 20s
        auth_type: "serviceAccount"
        endpoint: "${env:K8S_NODE_NAME}:10250"
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container
      # 2. Host (node) level physical metrics
      hostmetrics:
        collection_interval: 20s
        scrapers:
          cpu:
          memory:
          load:
          filesystem:
          network:
      # 3. (optional) log collection
      # filelog:
      #   include: [/var/log/pods/*/*/*.log]
      #   ...
    processors:
      batch:
        send_batch_size: 500
        timeout: 5s
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100
      # Resource detection: hostname, cloud provider (Tencent Cloud) info, etc.
      resourcedetection:
        detectors: [system]  # on Tencent Cloud CVM 'system' is usually sufficient
        timeout: 2s
        override: false
      # Key step: tag metrics with K8s metadata (pod name, namespace, node name)
      k8sattributes:
        passthrough: false
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.deployment.name
            - k8s.namespace.name
            - k8s.node.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: connection
    exporters:
      # Forward everything to the in-cluster Gateway Service
      otlp:
        endpoint: "otel-gateway-collector.opentelemetry-operator-system.svc.cluster.local:4317"
        tls:
          insecure: true
    service:
      pipelines:
        metrics:
          receivers: [kubeletstats, hostmetrics]
          # FIX: memory_limiter is recommended to run FIRST in the pipeline so
          # it can refuse data before other processors buffer it; the original
          # order placed it after resourcedetection and k8sattributes.
          processors: [memory_limiter, resourcedetection, k8sattributes, batch]
          exporters: [otlp]
        # traces:  # if apps send traces to the local agent (sidecar or similar)
        #   receivers: [otlp]
        #   exporters: [otlp]

View File

@@ -0,0 +1,4 @@
---
# Dedicated namespace for monitoring workloads.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

View File

@@ -0,0 +1,16 @@
---
# FIX: this manifest was captured via `kubectl get ns -o yaml` and still
# carried server-managed, read-only fields (creationTimestamp,
# deletionTimestamp, resourceVersion, uid, kubernetes.io/metadata.name, the
# last-applied annotation, and an empty `spec:`). Those must not be re-applied
# — a non-empty deletionTimestamp in particular belongs to a namespace that is
# already terminating. Only the declarative fields are kept below.
apiVersion: v1
kind: Namespace
metadata:
  name: opentelemetry-operator-system
  labels:
    app.kubernetes.io/name: opentelemetry-operator
    control-plane: controller-manager

View File

@@ -0,0 +1,72 @@
一、安装 cert-manager
OpenTelemetry Operator 提供了 OpenTelemetryCollector CRD自定义资源定义能自动处理服务发现和 RBAC基于角色的访问控制
该 Operator 需要 cert-manager 来支持准入 Webhook部署步骤如下
安装OpenTelemetry Operator 自定义资源,需先部署 cert-manager 使用yaml 直接部署文件02-cert-manager.yaml (官方文档里)
官方文档链接https://cert-manager.io/docs/installation/kubectl/
cert-manager 将安装在cert-manager命名空间中安装了cert-manager后通过以下方式验证其部署是否正确 检查cert-manager命名空间:
kubectl get pods --namespace cert-manager
NAME READY STATUS RESTARTS AGE
cert-manager-7b8b89f89d-tpchr 1/1 Running 0 24s
cert-manager-cainjector-7f9fdd5dd5-px66h 1/1 Running 0 25s
cert-manager-webhook-769f6b94cb-zmjmv 1/1 Running 0 24s
二、安装 OpenTelemetry Operator
使用 helm 安装OpenTelemetry ,添加 Operator helm 仓库、更新、安装:
helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
helm repo update
helm install opentelemetry-operator open-telemetry/opentelemetry-operator \
--namespace opentelemetry-operator \
--create-namespace
或者直接使用yaml部署文件 03-opentelemetry-operator.yaml
kubectl apply -f https://github.com/open-telemetry/opentelemetry-operator/releases/latest/download/opentelemetry-operator.yaml
kubectl delete -f https://github.com/open-telemetry/opentelemetry-operator/releases/latest/download/opentelemetry-operator.yaml
# 查看安装的CRD
kubectl get crd | grep opentelemetry.io
输出:
instrumentations.opentelemetry.io 2025-12-11T09:02:11Z
opampbridges.opentelemetry.io 2025-12-11T09:02:13Z
opentelemetrycollectors.opentelemetry.io 2025-12-11T09:02:14Z
targetallocators.opentelemetry.io 2025-12-11T09:02:17Z
# 查看安装的 operator 控制器 Pod 状态注意命名空间与部署方式有关yaml 方式部署在 opentelemetry-operator-systemhelm 方式部署在上文 --namespace 指定的 opentelemetry-operator
kubectl get pods -n opentelemetry-operator-system
NAME READY STATUS RESTARTS AGE
opentelemetry-operator-controller-manager-9c4b5467d-dhhp7 1/1 Running 0 3m10s
# 查看 Operator Deployment 状态
kubectl get deploy opentelemetry-operator-controller-manager -n opentelemetry-operator-system
NAME READY UP-TO-DATE AVAILABLE AGE
opentelemetry-operator-controller-manager 1/1 1 1 4m43s
# 查看证书是否签发成功(验证与 cert-manager 协同正常)
kubectl get certificate,issuer -n opentelemetry-operator-system
NAME READY SECRET AGE
certificate.cert-manager.io/opentelemetry-operator-serving-cert True opentelemetry-operator-controller-manager-service-cert 6m13s
NAME READY AGE
issuer.cert-manager.io/opentelemetry-operator-selfsigned-issuer True 6m12s
# 查看 Operator 相关 CRD
kubectl get crd | grep opentelemetry.io
输出:
instrumentations.opentelemetry.io 2026-01-14T07:28:56Z
opampbridges.opentelemetry.io 2026-01-14T07:28:57Z
opentelemetrycollectors.opentelemetry.io 2025-12-11T09:02:14Z
targetallocators.opentelemetry.io 2026-01-14T07:29:03Z
# 卸载命令
# 格式helm uninstall <Release 名称> -n <命名空间>
helm uninstall opentelemetry-operator -n opentelemetry-operator
# 删除 CRD仅当你确认不再需要任何 OTel 相关自定义资源实例时执行)
# 批量删除 Operator 相关 CRD
kubectl delete crd \
instrumentations.opentelemetry.io \
opampbridges.opentelemetry.io \
opentelemetrycollectors.opentelemetry.io \
targetallocators.opentelemetry.io