---
# ConfigMap holding the OpenTelemetry Collector configuration for the gateway
# tier. The gateway receives OTLP data from the agents, scrapes
# kube-state-metrics, watches Kubernetes events, and forwards metrics to
# Prometheus.
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-gateway-config
  namespace: monitoring
data:
  config.yaml: |
    receivers:
      # Data forwarded by the agents (gRPC on 4317, HTTP on 4318).
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318

      # 1. Cluster-level view: Kubernetes events.
      k8s_events:
        auth_type: serviceAccount

      # 2. Cluster-level view: scrape the kube-state-metrics instance that
      #    ships with TKE (tke-kube-state-metrics).
      prometheus:
        config:
          scrape_configs:
            - job_name: 'tke-kube-state-metrics'
              scrape_interval: 30s
              static_configs:
                - targets:
                    - 'tke-kube-state-metrics.kube-system.svc.cluster.local:8180'

    processors:
      batch:
        send_batch_size: 2000
        timeout: 10s

      resourcedetection:
        detectors: [env, system]
        # Keep attributes that the agents already set. With override enabled
        # the gateway's own values (e.g. host.name from the system detector)
        # would replace the ones describing the originating node.
        override: false

      # 3. Inject the cluster identifier so that series coming from different
      #    clusters do not collide in Prometheus (duplicate-sample errors).
      resource:
        attributes:
          - key: cluster.name
            value: "test-k8s"
            action: upsert

      # Copy OTLP resource attributes onto every data point so that Prometheus
      # can tell individual Pods / Nodes apart by label.
      transform:
        metric_statements:
          - context: datapoint
            statements:
              - set(attributes["k8s_pod_name"], resource.attributes["k8s.pod.name"])
              - set(attributes["k8s_node_name"], resource.attributes["k8s.node.name"])
              - set(attributes["k8s_namespace_name"], resource.attributes["k8s.namespace.name"])
              - set(attributes["k8s_container_name"], resource.attributes["k8s.container.name"])
              - set(attributes["cluster_name"], resource.attributes["cluster.name"])

      memory_limiter:
        check_interval: 1s
        limit_mib: 1500
        spike_limit_mib: 512

    exporters:
      # Ship metrics to Prometheus through its remote-write endpoint.
      # /api/v1/write speaks the remote-write protocol, not OTLP: the otlphttp
      # exporter would append /v1/metrics to the URL and send an OTLP payload,
      # which Prometheus rejects on this path.
      # NOTE(review): Prometheus must accept remote-write requests
      # (--web.enable-remote-write-receiver) - confirm on the target server.
      prometheusremotewrite:
        endpoint: "http://10.0.0.38:9090/api/v1/write"
        tls:
          insecure: true

      # Print telemetry to the collector log (troubleshooting aid).
      debug:
        verbosity: detailed

    service:
      pipelines:
        metrics:
          # Merge workload-level (otlp) and cluster-level (prometheus) metrics.
          receivers: [otlp, prometheus]
          processors: [memory_limiter, resourcedetection, resource, transform, batch]
          exporters: [prometheusremotewrite]
        logs:
          receivers: [k8s_events]
          processors: [memory_limiter, resourcedetection, resource, batch]
          exporters: [debug]