Tweaks

2026-01-09 17:50:32 +08:00
parent cf14d8a6db
commit 0384834345
37 changed files with 1944 additions and 2 deletions

OpenTelemetry/1.txt Normal file

@@ -0,0 +1,29 @@
Phase 1: deploy metrics collection only (current goal)
What gets deployed:
Create the monitoring namespace + RBAC for the Collector;
Deploy the DaemonSet Collector (only the hostmetrics/kubeletstats receivers, collecting node/container metrics);
Deploy the Deployment Collector (only the otlp receiver + prometheusremotewrite exporter, forwarding metrics to Prometheus).
Core configuration modules:
receivers: hostmetrics, kubeletstats, otlp
processors: batch, resource
exporters: prometheusremotewrite
pipelines: metrics (wiring the receivers/processors/exporters above)
Phase 2: add log collection (extends Phase 1)
Non-conflicting changes:
Only update the DaemonSet Collector's ConfigMap (add a filelog receiver with the log paths, and add a logs pipeline under pipelines);
Only update the Deployment Collector's ConfigMap (add an elasticsearch exporter, and add a logs pipeline under pipelines);
Restart the DaemonSet/Deployment Collector Pods (to reload the config).
Key points:
Log collection relies on the DaemonSet mounting the host log directories (only a new volume mount in the DaemonSet Pod spec; the existing metrics collection is untouched);
The filelog receiver and elasticsearch exporter for logs are fully independent of the metrics modules and do not interfere with them.
Phase 3: add trace collection (extends Phases 1+2)
Non-conflicting changes:
No changes to the DaemonSet Collector (traces need no node-level collection);
Only update the Deployment Collector's ConfigMap (add an otlp/tempo exporter, and add a traces pipeline under pipelines);
Restart the Deployment Collector Pods.
Key points:
Traces only need the Deployment Collector to expose ports 4317/4318 (the otlp receiver is already configured in Phase 1, nothing new needed);
The otlp/tempo exporter for traces is fully independent of the metrics/logs modules; only the new traces pipeline is required (a sketch of these Phase 2/3 deltas follows).
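A minimal sketch of those ConfigMap deltas, not part of this commit: the log path, the in-cluster Service name for the Deployment Collector, and the Tempo address are assumptions to adjust (Elasticsearch reuses the 10.0.0.38:9200 endpoint that appears later in these notes):

# DaemonSet Collector — Phase 2 additions (filelog receiver + logs pipeline)
receivers:
  filelog:
    include: [/var/log/containers/*.log]    # needs a hostPath mount for the log directory
exporters:
  otlp:
    endpoint: "otel-collector-deployment.monitoring.svc:4317"   # hypothetical Service name for the Deployment Collector
    tls:
      insecure: true
service:
  pipelines:
    logs:
      receivers: [filelog]
      processors: [batch, resource]
      exporters: [otlp]

# Deployment Collector — Phase 2 + 3 additions (elasticsearch and otlp/tempo exporters, logs/traces pipelines)
exporters:
  elasticsearch:
    endpoints: ["http://10.0.0.38:9200"]
  otlp/tempo:
    endpoint: "10.0.0.38:4317"              # assumption: wherever Tempo listens for OTLP gRPC
    tls:
      insecure: true
service:
  pipelines:
    logs:
      receivers: [otlp]
      processors: [batch, resource]
      exporters: [elasticsearch]
    traces:
      receivers: [otlp]
      processors: [batch, resource]
      exporters: [otlp/tempo]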


@@ -0,0 +1,56 @@
# 1. Create the monitoring namespace
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
labels:
name: monitoring
---
# 2. Create the ServiceAccount
apiVersion: v1
kind: ServiceAccount
metadata:
name: otel-collector
namespace: monitoring
---
# 3. Create the ClusterRole (minimal permissions)
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: otel-collector-role
rules:
# Read node/Pod/Service metadata (baseline permissions)
- apiGroups: [""]
resources: ["nodes", "pods", "services", "endpoints", "nodes/metrics", "nodes/stats"]
verbs: ["get", "list", "watch"]
# To be added later
# # Collect Deployments/DaemonSets/StatefulSets (apps API group)
# - apiGroups: ["apps"]
# resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
# verbs: ["get", "list", "watch"]
# # Collect HPAs (autoscaling API group)
# - apiGroups: ["autoscaling"]
# resources: ["horizontalpodautoscalers"]
# verbs: ["get", "list", "watch"]
# # Collect k8s events (optional, useful for troubleshooting)
# - apiGroups: [""]
# resources: ["events"]
# verbs: ["get", "list", "watch"]
---
# 4. Bind the ClusterRole to the ServiceAccount
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: otel-collector-binding
subjects:
- kind: ServiceAccount
name: otel-collector
namespace: monitoring
roleRef:
kind: ClusterRole
name: otel-collector-role
apiGroup: rbac.authorization.k8s.io
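A quick sanity check after applying this manifest (the file name is hypothetical; --subresource needs a reasonably recent kubectl):

kubectl apply -f otel-collector-rbac.yaml
kubectl -n monitoring get serviceaccount otel-collector
# the kubeletstats receiver reads the kubelet's stats subresource; confirm the binding grants it
kubectl auth can-i get nodes --subresource=stats --as=system:serviceaccount:monitoring:otel-collector
kubectl auth can-i list pods --as=system:serviceaccount:monitoring:otel-collector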


@@ -0,0 +1,64 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: otel-collector-config
namespace: monitoring
data:
config.yaml: |
    # Global configuration
    receivers:
      # 1. Node-level metrics collection (only used by the DaemonSet)
hostmetrics:
collection_interval: 30s
scrapers:
cpu: {}
memory: {}
disk: {}
filesystem: {}
network: {}
load: {}
processes: {}
      # 2. Container-level metrics collection (only used by the DaemonSet; kubeletstats config fixed)
kubeletstats:
collection_interval: 30s
auth_type: "serviceAccount"
endpoint: "https://${K8S_NODE_NAME}:10250"
insecure_skip_verify: true
      # 3. OTLP receiver (used by both the DaemonSet and the Deployment)
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch: {}
resource:
attributes:
- key: k8s.cluster.name
value: test-k8s
action: insert
- key: k8s.node.name
from_attribute: host.name
action: insert
exporters:
prometheusremotewrite:
endpoint: "http://10.0.0.38:9090/api/v1/write"
external_labels:
k8s_cluster: test-k8s
    # Key fix: the service section
service:
pipelines:
metrics:
receivers: [hostmetrics, kubeletstats, otlp]
processors: [batch, resource]
exporters: [prometheusremotewrite]
telemetry:
logs:
level: info
metrics:
endpoint: 0.0.0.0:8888
collection_interval: 60s


@@ -0,0 +1,57 @@
# Deploy the DaemonSet (node-level metrics collection)
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: otel-collector-daemonset
namespace: monitoring
labels:
app: otel-collector-daemonset
spec:
selector:
matchLabels:
app: otel-collector-daemonset
template:
metadata:
labels:
app: otel-collector-daemonset
spec:
serviceAccountName: otel-collector
      hostNetwork: false  # host networking is not needed
containers:
- name: otel-collector
image: otel/opentelemetry-collector-contrib:latest
args: ["--config=/etc/otel-collector/config.yaml"]
        # Mount host directories (to collect node metrics)
volumeMounts:
- name: otel-config
mountPath: /etc/otel-collector
- name: proc
mountPath: /proc
readOnly: true
- name: sys
mountPath: /sys
readOnly: true
- name: rootfs
mountPath: /rootfs
readOnly: true
        # Resource limits (adjust as needed)
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 100m
memory: 256Mi
volumes:
- name: otel-config
configMap:
name: otel-collector-config
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys
- name: rootfs
hostPath:
path: /
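One gap worth flagging: the ConfigMap's kubeletstats endpoint references ${K8S_NODE_NAME}, but this DaemonSet never sets that variable. A sketch of the missing env entry (Downward API), to add under the otel-collector container:

        env:
        - name: K8S_NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName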


@@ -0,0 +1,43 @@
# Deploy the Deployment (cluster-level aggregation and forwarding)
apiVersion: apps/v1
kind: Deployment
metadata:
name: otel-collector-deployment
namespace: monitoring
labels:
app: otel-collector-deployment
spec:
  replicas: 1  # single replica for the test environment; scale to 2 in production
selector:
matchLabels:
app: otel-collector-deployment
template:
metadata:
labels:
app: otel-collector-deployment
spec:
serviceAccountName: otel-collector
containers:
- name: otel-collector
image: otel/opentelemetry-collector-contrib:latest
args: ["--config=/etc/otel-collector/config.yaml"]
volumeMounts:
- name: otel-config
mountPath: /etc/otel-collector
        # Exposed ports
ports:
- containerPort: 4317 # OTLP gRPC
- containerPort: 4318 # OTLP HTTP
- containerPort: 8888 # 自身监控
        # Resource limits
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 100m
memory: 256Mi
volumes:
- name: otel-config
configMap:
name: otel-collector-config
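The Deployment only declares containerPorts; for workloads (and later the DaemonSet's logs pipeline) to reach the otlp receiver, a ClusterIP Service is still needed. A minimal sketch — the Service name is my own choice, not part of this commit:

apiVersion: v1
kind: Service
metadata:
  name: otel-collector-deployment
  namespace: monitoring
spec:
  selector:
    app: otel-collector-deployment
  ports:
  - name: otlp-grpc
    port: 4317
    targetPort: 4317
  - name: otlp-http
    port: 4318
    targetPort: 4318
  - name: self-metrics
    port: 8888
    targetPort: 8888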


@@ -0,0 +1,57 @@
# Global configuration
receivers:
  # 1. Node-level metrics collection (only used by the DaemonSet)
hostmetrics:
collection_interval: 30s
scrapers:
cpu: {}
memory: {}
disk: {}
filesystem: {}
network: {}
load: {}
processes: {}
  # 2. Container-level metrics collection (only used by the DaemonSet; kubeletstats config fixed)
kubeletstats:
collection_interval: 30s
auth_type: "serviceAccount"
endpoint: "https://${K8S_NODE_NAME}:10250"
insecure_skip_verify: true
  # 3. OTLP receiver (used by both the DaemonSet and the Deployment)
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch: {}
resource:
attributes:
- key: k8s.cluster.name
value: test-k8s
action: insert
- key: k8s.node.name
from_attribute: host.name
action: insert
exporters:
prometheusremotewrite:
endpoint: "http://10.0.0.38:9090/api/v1/write"
external_labels:
k8s_cluster: test-k8s
# Key fix: the service section
service:
pipelines:
metrics:
receivers: [hostmetrics, kubeletstats, otlp]
processors: [batch, resource]
exporters: [prometheusremotewrite]
telemetry:
logs:
level: info
metrics:
endpoint: 0.0.0.0:8888
collection_interval: 60s


@@ -0,0 +1,73 @@
# Define the Filebeat ServiceAccount
apiVersion: v1
kind: ServiceAccount
metadata:
  name: filebeat  # ServiceAccount name
  namespace: kube-system  # namespace it lives in
  labels:
    k8s-app: filebeat  # label identifying the Filebeat app
---
# Define the Filebeat ClusterRole, granting cluster-wide permissions
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: filebeat  # ClusterRole name
  labels:
    k8s-app: filebeat  # label
rules:
# Grant get, list, watch on namespaces, pods, and nodes
- apiGroups: [""]
resources: ["namespaces", "pods", "nodes"]
verbs: ["get", "list", "watch"]
# Grant get, list, watch on ReplicaSets
- apiGroups: ["apps"]
resources: ["replicasets"]
verbs: ["get", "list", "watch"]
# Grant get, list, watch on Jobs
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["get", "list", "watch"]
---
# Define the Filebeat Role, granting namespace-scoped permissions
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: filebeat  # Role name
  namespace: kube-system  # namespace it applies to
  labels:
    k8s-app: filebeat  # label
rules:
  # Grant get, create, update on leases
  # Leases are used for coordination and leader election
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "create", "update"]
---
# Bind the Filebeat ServiceAccount to the ClusterRole (ClusterRoleBinding)
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: filebeat  # binding name
subjects:
- kind: ServiceAccount  # subject is a ServiceAccount
  name: filebeat  # ServiceAccount name
  namespace: kube-system  # namespace the ServiceAccount lives in
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole  # kind of role being referenced
  name: filebeat  # name of the role being referenced
---
# Bind the Filebeat ServiceAccount to the Role (RoleBinding)
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: filebeat  # binding name
  namespace: kube-system  # namespace it applies to
subjects:
- kind: ServiceAccount  # subject is a ServiceAccount
  name: filebeat  # ServiceAccount name
  namespace: kube-system  # namespace the ServiceAccount lives in
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role  # kind of role being referenced
  name: filebeat  # name of the role being referenced


@@ -0,0 +1,233 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: filebeat-config
namespace: kube-system
data:
filebeat.yml: |
setup.ilm.enabled: false
setup.template.enabled: false
filebeat.autodiscover:
providers:
- type: kubernetes
templates:
            # ---------- ↓ JSON-formatted logs ↓ ----------
- condition:
and:
- regexp:
kubernetes.namespace: "^(sit|apex-evaluation)$"
- regexp:
kubernetes.labels.app: "^(lessie-go-api|apex)$"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- add_kubernetes_metadata:
host: ${NODE_NAME}
- decode_json_fields:
fields: ["message"]
target: "mylog"
overwrite_keys: true
add_error_key: true
- drop_fields:
fields:
- "kubernetes.node.labels"
- "kubernetes.namespace_labels.kubernetes_io/metadata_name"
ignore_missing: true
            # ---------- ↑ JSON-formatted logs ↑ ----------
            # ---------- ↓ Java service Pods (agent/admin/payment projects), free-text logs ↓ ----------
- condition:
and:
- equals:
kubernetes.namespace: sit
- or:
- equals:
kubernetes.labels.app: "flymoon-admin"
- equals:
kubernetes.labels.app: "flymoon-agent"
- equals:
kubernetes.labels.app: "flymoon-payment"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
- multiline:
type: pattern
pattern: '^\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d{3}'
negate: true
match: after
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- add_kubernetes_metadata:
host: ${NODE_NAME}
- dissect:
tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : [%{app_name->}] %{message}'
field: "message"
target_prefix: "mylog"
ignore_missing: true
overwrite_keys: true
- drop_fields:
fields: ["kubernetes.node.labels", "kubernetes.annotations"]
ignore_missing: true
            # ---------- ↑ Java service Pods (agent/admin/payment projects), free-text logs ↑ ----------
            # ---------- ↓ Java service Pods (email project), free-text logs ↓ ----------
- condition:
and:
- equals:
kubernetes.namespace: sit
- equals:
kubernetes.labels.app: "flymoon-email"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
- multiline:
type: pattern
pattern: '^\d{4}-\d{2}-\d{2}'
negate: true
match: after
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- add_kubernetes_metadata:
host: ${NODE_NAME}
- dissect:
tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : %{message}'
field: "message"
target_prefix: "mylog"
ignore_missing: true
overwrite_keys: true
- drop_fields:
fields: ["kubernetes.node.labels", "kubernetes.annotations"]
ignore_missing: true
            # ---------- ↑ Java service Pods (email project), free-text logs ↑ ----------
            # ---------- ↓ Python agents service Pods (lessie-agents project), free-text logs ↓ ----------
- condition:
and:
- equals:
kubernetes.namespace: sit
- equals:
kubernetes.labels.app: "lessie-agents"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- add_kubernetes_metadata:
host: ${NODE_NAME}
                  # Layer 1: only parse lines starting with a timestamp (the business-alert log format)
- dissect:
when:
regexp:
message: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}.*'
tokenizer: '%{timestamp} - %{level} - %{module} - %{function} - %{msg_body}'
field: "message"
target_prefix: "mylog"
ignore_missing: true
overwrite_keys: true
                  # Layer 2: for entries carrying [level: | event: | msg: | context:], run a second dissect
- dissect:
when:
contains:
mylog.msg_body: "[level:"
tokenizer: '[level: %{event_level} | event: %{event} | msg: %{msg} | context: %{ctx_raw}]'
field: "mylog.msg_body"
target_prefix: "mylog"
ignore_missing: true
overwrite_keys: true
                  # Layer 3: split ctx_raw into individual fields
- script:
lang: javascript
id: parse_context
source: >
function process(event) {
var ctx = event.Get("mylog.ctx_raw");
if (!ctx) return;
var parts = ctx.trim().split(",");
for (var i = 0; i < parts.length; i++) {
var pair = parts[i].split(":");
if (pair.length === 2) {
event.Put("mylog." + pair[0].trim(), pair[1].trim());
}
}
}
                  # Layer 4: drop the bulky, unneeded k8s metadata fields
- drop_fields:
fields:
- "kubernetes.node.labels"
- "kubernetes.annotations"
ignore_missing: true
            # ---------- ↑ Python agents service Pods (lessie-agents project), free-text logs ↑ ----------
            # ---------- ↓ Pods dynamically created by apex (Python lessie-agents project), free-text logs ↓ ----------
- condition:
and:
- equals:
kubernetes.namespace: apex-evaluation
- equals:
kubernetes.labels.apex: "lessie-agents"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- drop_fields:
fields:
- "kubernetes.node.labels"
- "kubernetes.annotations"
ignore_missing: true
            # ---------- ↑ Pods dynamically created by apex (Python lessie-agents project), free-text logs ↑ ----------
    # ---- Output to Elasticsearch ----
output.elasticsearch:
hosts: ["http://10.0.0.38:9200"]
username: "admin"
password: "G7ZSKFM4AQwHQpwA"
indices:
- index: "k8s-%{[kubernetes.labels.environment]}-%{[kubernetes.labels.app]}-%{+yyyy.MM}"
when:
regexp:
kubernetes.labels.app: "(lessie-go-api|flymoon-admin|flymoon-agent|flymoon-payment|flymoon-email|lessie-agents|apex)"
- index: "apex-python-%{+yyyy.MM}"
when:
equals:
kubernetes.labels.apex: "lessie-agents"
logging.level: info
logging.selectors: ["*"]
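For reference, a hypothetical flymoon-admin line and how the multiline pattern plus the dissect tokenizer above would split it (values are illustrative only):

# input line (matches the multiline start pattern ^\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d{3}):
2025-01-09-17:50:32.123 INFO 1 --- [http-nio-8080-exec-1] c.f.admin.OrderController : [flymoon-admin] order created
# resulting fields under the "mylog" prefix, roughly:
mylog.timestamp: 2025-01-09-17:50:32.123
mylog.level: INFO
mylog.pid: 1
mylog.thread: http-nio-8080-exec-1
mylog.class: c.f.admin.OrderController
mylog.app_name: flymoon-admin
mylog.message: order created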


@@ -0,0 +1,65 @@
# Rolling restart:
# kubectl rollout restart daemonset filebeat -n kube-system
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: filebeat
namespace: kube-system
labels:
k8s-app: filebeat
spec:
selector:
matchLabels:
k8s-app: filebeat
template:
metadata:
labels:
k8s-app: filebeat
spec:
serviceAccountName: filebeat
terminationGracePeriodSeconds: 30
containers:
- name: filebeat
image: docker.elastic.co/beats/filebeat:9.2.2
args:
- "-e"
env:
- name: TZ
value: Asia/Shanghai
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
resources:
limits:
memory: 300Mi
requests:
cpu: 100m
memory: 200Mi
volumeMounts:
- name: config
mountPath: /usr/share/filebeat/filebeat.yml
subPath: filebeat.yml
- name: data
mountPath: /var/lib/filebeat-data
- name: containers
mountPath: /var/log/containers
readOnly: true
- name: pods
mountPath: /var/log/pods
readOnly: true
volumes:
- name: config
configMap:
name: filebeat-config
- name: data
hostPath:
path: /var/lib/filebeat-data
type: DirectoryOrCreate
- name: containers
hostPath:
path: /var/log/containers
- name: pods
hostPath:
path: /var/log/pods
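A minimal smoke test once the DaemonSet is running; the filebeat test subcommands ship in the image, and the paths assume the mounts above:

kubectl -n kube-system rollout status daemonset/filebeat
# validate the mounted config and the Elasticsearch output from inside one of the pods
kubectl -n kube-system exec ds/filebeat -- filebeat test config -c /usr/share/filebeat/filebeat.yml
kubectl -n kube-system exec ds/filebeat -- filebeat test output -c /usr/share/filebeat/filebeat.yml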


@@ -0,0 +1,226 @@
setup.ilm.enabled: false
setup.template.enabled: false
filebeat.autodiscover:
providers:
- type: kubernetes
templates:
        # ---------- ↓ JSON-formatted logs ↓ ----------
- condition:
and:
- regexp:
kubernetes.namespace: "^(sit|apex-evaluation)$"
- regexp:
kubernetes.labels.app: "^(lessie-go-api|apex)$"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- add_kubernetes_metadata:
host: ${NODE_NAME}
- decode_json_fields:
fields: ["message"]
target: "mylog"
overwrite_keys: true
add_error_key: true
- drop_fields:
fields:
- "kubernetes.node.labels"
- "kubernetes.namespace_labels.kubernetes_io/metadata_name"
ignore_missing: true
        # ---------- ↑ JSON-formatted logs ↑ ----------
        # ---------- ↓ Java service Pods (agent/admin/payment projects), free-text logs ↓ ----------
- condition:
and:
- equals:
kubernetes.namespace: sit
- or:
- equals:
kubernetes.labels.app: "flymoon-admin"
- equals:
kubernetes.labels.app: "flymoon-agent"
- equals:
kubernetes.labels.app: "flymoon-payment"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
- multiline:
type: pattern
pattern: '^\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d{3}'
negate: true
match: after
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- add_kubernetes_metadata:
host: ${NODE_NAME}
- dissect:
tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : [%{app_name->}] %{message}'
field: "message"
target_prefix: "mylog"
ignore_missing: true
overwrite_keys: true
- drop_fields:
fields: ["kubernetes.node.labels", "kubernetes.annotations"]
ignore_missing: true
        # ---------- ↑ Java service Pods (agent/admin/payment projects), free-text logs ↑ ----------
        # ---------- ↓ Java service Pods (email project), free-text logs ↓ ----------
- condition:
and:
- equals:
kubernetes.namespace: sit
- equals:
kubernetes.labels.app: "flymoon-email"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
- multiline:
type: pattern
pattern: '^\d{4}-\d{2}-\d{2}'
negate: true
match: after
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- add_kubernetes_metadata:
host: ${NODE_NAME}
- dissect:
tokenizer: '%{timestamp} %{level} %{pid} --- [%{thread}] %{class} : %{message}'
field: "message"
target_prefix: "mylog"
ignore_missing: true
overwrite_keys: true
- drop_fields:
fields: ["kubernetes.node.labels", "kubernetes.annotations"]
ignore_missing: true
        # ---------- ↑ Java service Pods (email project), free-text logs ↑ ----------
        # ---------- ↓ Python agents service Pods (lessie-agents project), free-text logs ↓ ----------
- condition:
and:
- equals:
kubernetes.namespace: sit
- equals:
kubernetes.labels.app: "lessie-agents"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- add_kubernetes_metadata:
host: ${NODE_NAME}
              # Layer 1: only parse lines starting with a timestamp (the business-alert log format)
- dissect:
when:
regexp:
message: '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}.*'
tokenizer: '%{timestamp} - %{level} - %{module} - %{function} - %{msg_body}'
field: "message"
target_prefix: "mylog"
ignore_missing: true
overwrite_keys: true
              # Layer 2: for entries carrying [level: | event: | msg: | context:], run a second dissect
- dissect:
when:
contains:
mylog.msg_body: "[level:"
tokenizer: '[level: %{event_level} | event: %{event} | msg: %{msg} | context: %{ctx_raw}]'
field: "mylog.msg_body"
target_prefix: "mylog"
ignore_missing: true
overwrite_keys: true
              # Layer 3: split ctx_raw into individual fields
- script:
lang: javascript
id: parse_context
source: >
function process(event) {
var ctx = event.Get("mylog.ctx_raw");
if (!ctx) return;
var parts = ctx.trim().split(",");
for (var i = 0; i < parts.length; i++) {
var pair = parts[i].split(":");
if (pair.length === 2) {
event.Put("mylog." + pair[0].trim(), pair[1].trim());
}
}
}
              # Layer 4: drop the bulky, unneeded k8s metadata fields
- drop_fields:
fields:
- "kubernetes.node.labels"
- "kubernetes.annotations"
ignore_missing: true
        # ---------- ↑ Python agents service Pods (lessie-agents project), free-text logs ↑ ----------
        # ---------- ↓ Pods dynamically created by apex (Python lessie-agents project), free-text logs ↓ ----------
- condition:
and:
- equals:
kubernetes.namespace: apex-evaluation
- equals:
kubernetes.labels.apex: "lessie-agents"
config:
- type: filestream
id: "container-${data.kubernetes.container.id}"
prospector.scanner.symlinks: true
close.on_state_change.removed: false
parsers:
- container: ~
paths:
- /var/log/containers/*-${data.kubernetes.container.id}.log
processors:
- drop_fields:
fields:
- "kubernetes.node.labels"
- "kubernetes.annotations"
ignore_missing: true
        # ---------- ↑ Pods dynamically created by apex (Python lessie-agents project), free-text logs ↑ ----------
# ---- Output to Elasticsearch ----
output.elasticsearch:
hosts: ["http://10.0.0.38:9200"]
username: "admin"
password: "G7ZSKFM4AQwHQpwA"
indices:
- index: "k8s-%{[kubernetes.labels.environment]}-%{[kubernetes.labels.app]}-%{+yyyy.MM.dd}"
when:
regexp:
kubernetes.labels.app: "(lessie-go-api|flymoon-admin|flymoon-agent|flymoon-payment|flymoon-email|lessie-agents|apex)"
- index: "apex-python-%{+yyyy.MM.dd}"
when:
equals:
kubernetes.labels.apex: "lessie-agents"
logging.level: info
logging.selectors: ["*"]


@@ -0,0 +1,143 @@
# Prerequisites & preparation
sudo dnf update -y
sudo dnf install -y nano wget curl unzip
# Open ports 9200 and 5601 in the security group / firewall
# Install Elasticsearch 9.2.2
# Import the official GPG key
sudo rpm --import https://artifacts.elastic.co/GPG-KEY-elasticsearch
# Create the yum repo file
sudo tee /etc/yum.repos.d/elasticsearch.repo <<-'EOF'
[elasticsearch]
name=Elasticsearch repository for 9.x packages
baseurl=https://artifacts.elastic.co/packages/9.x/yum
gpgcheck=1
gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch
enabled=1
autorefresh=1
type=rpm-md
EOF
# Install Elasticsearch
sudo dnf install elasticsearch --enablerepo=elasticsearch
# Start it first and check the logs if it errors; failures are often permission problems
sudo systemctl daemon-reload
sudo systemctl enable elasticsearch
sudo systemctl start elasticsearch
sudo systemctl status elasticsearch
sudo journalctl -u elasticsearch -f
# Manually create the log directory and set permissions
sudo mkdir -p /usr/share/elasticsearch/logs
sudo chown -R elasticsearch:elasticsearch /usr/share/elasticsearch/logs
sudo chmod 750 /usr/share/elasticsearch/logs
# Set the elastic superuser password (recommended to do right away)
sudo /usr/share/elasticsearch/bin/elasticsearch-reset-password -u elastic
# Check the self-signed certificates; if they exist, all is well
ll /etc/elasticsearch/certs/
# View the HTTP CA certificate fingerprint (used when configuring other clients)
sudo openssl x509 -fingerprint -sha256 -in /etc/elasticsearch/certs/http_ca.crt -noout
# Set an environment variable (replace with your actual password)
export ELASTIC_PASSWORD='MyElastic123!'
# Test an HTTPS request (must use --cacert because TLS is enabled)
curl --cacert /etc/elasticsearch/certs/http_ca.crt \
-u elastic:$ELASTIC_PASSWORD \
https://localhost:9200
# Inspect the effective (non-comment) settings in the default config
grep -v '^\s*#\|^\s*$' /etc/elasticsearch/elasticsearch.yml
# Adjust the config as needed (cluster name, non-local access, etc.)
cluster.name: my-test-es
path.data: /var/lib/elasticsearch
path.logs: /var/log/elasticsearch
network.host: 0.0.0.0
xpack.security.enabled: true
xpack.security.enrollment.enabled: true
xpack.security.http.ssl:
enabled: true
keystore.path: certs/http.p12
xpack.security.transport.ssl:
enabled: true
verification_mode: certificate
keystore.path: certs/transport.p12
truststore.path: certs/transport.p12
cluster.initial_master_nodes: ["weblessie-server-02"]
http.host: 0.0.0.0
# Change the ES JVM heap size
vim /etc/elasticsearch/jvm.options
-Xms4g
-Xmx4g
# Restart
sudo systemctl restart elasticsearch
# Generate an enrollment token (used by Kibana later)
sudo /usr/share/elasticsearch/bin/elasticsearch-create-enrollment-token -s kibana
# Prepare to install Kibana 9.2.2
# Create the repo /etc/yum.repos.d/kibana.repo
sudo tee /etc/yum.repos.d/kibana.repo <<-'EOF'
[kibana]
name=Kibana repository for 9.x packages
baseurl=https://artifacts.elastic.co/packages/9.x/yum
gpgcheck=1
gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch
enabled=1
autorefresh=1
type=rpm-md
EOF
# Install Kibana
sudo dnf install kibana --enablerepo=kibana
# Start it
sudo systemctl daemon-reload
sudo systemctl enable --now kibana
# Open Kibana in a browser and paste the enrollment token generated above
http://ip:5601
# Get the "verification code"
/usr/share/kibana/bin/kibana-verification-code
# Generate encryption keys with the official tool (the cleanest approach)
sudo /usr/share/kibana/bin/kibana-encryption-keys generate --force
# The output should look like:
# ✔ Encryption keys generated and written to /etc/kibana/kibana.yml:
# xpack.encryptedSavedObjects.encryptionKey
# xpack.reporting.encryptionKey
# xpack.security.encryptionKey
# Edit the Kibana config file
grep -v '^\s*#\|^\s*$' /etc/kibana/kibana.yml
server.host: "0.0.0.0"
logging:
appenders:
file:
type: file
fileName: /var/log/kibana/kibana.log
layout:
type: json
root:
appenders:
- default
- file
pid.file: /run/kibana/kibana.pid
i18n.locale: "zh-CN"
elasticsearch.hosts: [https://10.0.0.38:9200]
elasticsearch.serviceAccountToken: AAEAAWVsYXN0aWMva2liYW5hL2Vucm9sbC1wcm9jZXNzLXRva2VuLTE3NjUzNDE4OTI3MjY6Um9KdUo2N1hSZVNPeGNzOXFDaUh2dw
elasticsearch.ssl.certificateAuthorities: [/var/lib/kibana/ca_1765341893683.crt]
xpack.fleet.outputs: [{id: fleet-default-output, name: default, is_default: true, is_default_monitoring: true, type: elasticsearch, hosts: [https://10.0.0.38:9200], ca_trusted_fingerprint: 80af64db043e12ebda11c10f70042af91306a705fdcb6285814a84b420c734a5}]
xpack.encryptedSavedObjects.encryptionKey: f10166c761265d5ca61e7fa2c1acac73
xpack.reporting.encryptionKey: 1772a5152522675d5a38470e905b2817
xpack.security.encryptionKey: d4b30e82e47f530a998e29cb0b8e5295
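Two hedged sanity checks once both services are up (reusing the $ELASTIC_PASSWORD exported earlier; adjust the hosts if Kibana/ES are not local):

# Elasticsearch over HTTPS with the bundled CA
curl --cacert /etc/elasticsearch/certs/http_ca.crt -u elastic:$ELASTIC_PASSWORD https://localhost:9200/_cluster/health?pretty
# Kibana status API; the overall level should report "available"
curl -s http://localhost:5601/api/status | head -c 300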


@@ -0,0 +1,41 @@
# Get the ES certificate fingerprint
sudo openssl x509 -fingerprint -sha256 -in /etc/elasticsearch/certs/http_ca.crt -noout
sha256 Fingerprint=80:AF:64:DB:04:3E:12:EB:DA:11:C1:0F:70:04:2A:F9:13:06:A7:05:FD:CB:62:85:81:4A:84:B4:20:C7:34:A5
# User created in the Kibana web UI
admin
G7ZSKFM4AQwHQpwA
# Filebeat
output.elasticsearch:
hosts: ["https://49.51.33.153:9200"]
username: "elastic"
password: "-0NiIBOJGn2CATuPWzNc"
# Verify via fingerprint (instead of a certificate file)
ssl.verification_mode: "certificate"
  ssl.certificate_authorities: []  # left empty (skip full-chain validation)
ssl.supported_protocols: [TLSv1.2, TLSv1.3]
  # Key: specify the CA fingerprint (all uppercase, no 0x prefix, no colons)
  ssl.ca_trusted_fingerprint: "80AF64DB043E12EBDA11C10F70042AF91306A705FDCB6285814A84B420C734A5"
# python
from elasticsearch import Elasticsearch
es = Elasticsearch(
hosts=["https://49.51.33.153:9200"],
basic_auth=("elastic", "-0NiIBOJGn2CATuPWzNc"),
    # the fingerprint must be uppercase with the colons removed
    ssl_assert_fingerprint="80AF64DB043E12EBDA11C10F70042AF91306A705FDCB6285814A84B420C734A5",
    verify_certs=True  # must be True
)
print(es.info())


@@ -0,0 +1,46 @@
# Download the tarball
wget https://dl.grafana.com/grafana-enterprise/release/12.3.1/grafana-enterprise_12.3.1_20271043721_linux_amd64.tar.gz
# Create the Grafana directories (data + config)
mkdir -p /data/grafana/
# Create a user account for Grafana
useradd -r -s /bin/false grafana
# Extract into /data/grafana/ (--strip-components=1 drops the tarball's top-level directory so conf/ and bin/ land directly under /data/grafana/)
tar -xzf grafana-enterprise_12.3.1_20271043721_linux_amd64.tar.gz -C /data/grafana/ --strip-components=1
# Change ownership of /data/grafana/ to the grafana user
chown -R grafana:grafana /data/grafana/
# Copy the default configuration file
cp /data/grafana/conf/defaults.ini /data/grafana/conf/grafana.ini
# Create the Grafana server systemd unit file
sudo touch /etc/systemd/system/grafana-server.service
[Unit]
Description=Grafana Server
After=network.target
[Service]
Type=simple
User=grafana
Group=grafana
ExecStart=/data/grafana/bin/grafana server --config=/data/grafana/conf/grafana.ini --homepath=/data/grafana
Restart=on-failure
[Install]
WantedBy=multi-user.target
# Enable the Grafana server systemd service
sudo systemctl daemon-reload
sudo systemctl start grafana-server
sudo systemctl enable grafana-server
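With this layout, datasources can be provisioned from files rather than the UI. A sketch, assuming the default provisioning path under the homepath and that Prometheus/Tempo run on the same host (swap in the real addresses otherwise):

# /data/grafana/conf/provisioning/datasources/observability.yaml
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://localhost:9090
  - name: Tempo
    type: tempo
    access: proxy
    url: http://localhost:3200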


@@ -0,0 +1,81 @@
# Create the directories
mkdir -p /data/prometheus/
mkdir -p /data/alertmanager/
# Download the tarballs
wget https://github.com/prometheus/prometheus/releases/download/v3.8.1/prometheus-3.8.1.linux-amd64.tar.gz
wget https://github.com/prometheus/alertmanager/releases/download/v0.30.0/alertmanager-0.30.0.linux-amd64.tar.gz
# Extract into the target directories (--strip-components=1 drops the versioned top-level directory)
tar -xzf prometheus-3.8.1.linux-amd64.tar.gz -C /data/prometheus/ --strip-components=1
tar -xzf alertmanager-0.30.0.linux-amd64.tar.gz -C /data/alertmanager/ --strip-components=1
# Create the system user (if it does not exist yet)
sudo useradd --no-create-home --shell /bin/false prometheus || true
# Set directory ownership
sudo chown -R prometheus:prometheus /data/prometheus
sudo chown -R prometheus:prometheus /data/alertmanager
# Create /etc/systemd/system/prometheus.service with the following content
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
# Note: --storage.tsdb.path sets the data directory; putting it under /data is recommended.
# --web.enable-remote-write-receiver is required so the OTel Collector can push metrics to /api/v1/write.
ExecStart=/data/prometheus/prometheus \
--config.file=/data/prometheus/prometheus.yml \
--storage.tsdb.path=/data/prometheus/data \
--web.console.templates=/data/prometheus/consoles \
--web.console.libraries=/data/prometheus/console_libraries \
--web.enable-remote-write-receiver
Restart=always
[Install]
WantedBy=multi-user.target
# Create /etc/systemd/system/alertmanager.service with the following content
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/data/alertmanager/alertmanager \
--config.file=/data/alertmanager/alertmanager.yml \
--storage.path=/data/alertmanager/data
Restart=always
[Install]
WantedBy=multi-user.target
# Point Prometheus at Alertmanager (in prometheus.yml)
# Alerting configuration
alerting:
alertmanagers:
- static_configs:
- targets:
        - localhost:9093  # default Alertmanager port
# Reload systemd
sudo systemctl daemon-reload
# Start and enable on boot
sudo systemctl enable --now prometheus
sudo systemctl enable --now alertmanager
# Check status
sudo systemctl status prometheus
sudo systemctl status alertmanager
# Config file checks
# Before restarting the services, the bundled tools can validate the syntax:
# Prometheus: /data/prometheus/promtool check config /data/prometheus/prometheus.yml
# Alertmanager: /data/alertmanager/amtool check-config /data/alertmanager/alertmanager.yml
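Once the collector's prometheusremotewrite exporter points at this instance (remote write needs the --web.enable-remote-write-receiver flag added above), delivery can be confirmed from the HTTP API:

# readiness
curl -s http://localhost:9090/-/ready
# the external label set by the collector should show up once samples arrive
curl -s 'http://localhost:9090/api/v1/label/k8s_cluster/values'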

OpenTelemetry/readme.txt Normal file

@@ -0,0 +1,53 @@
1. Storage
ES, Prometheus, and Tempo are all deployed as binaries outside the k8s cluster.
2. Collection, processing, relaying
The OpenTelemetry Collector components are deployed inside the k8s cluster:
1) The DaemonSet Collector collects node-/container-level metrics + logs.
2) The Deployment Collector receives data from the DaemonSet Collector, processes it, and relays it to storage; it also handles trace data.
3. The three data types
1) Metrics (end up in Prometheus)
Collector: OTel Collector (DaemonSet mode, inside k8s).
What is collected: node CPU/memory/disk (replaces node-exporter), container resource usage (replaces kubelet metrics), and custom metrics from business Pods (the apps must integrate the OTel SDK).
Processor: OTel Collector (Deployment mode, cluster-level aggregation).
Processing: normalize the metric format, enrich with k8s labels (cluster name, Pod name), batch.
Sender: OTel Collector.
Protocol: Prometheus Remote Write.
Receiver: Prometheus outside the cluster.
graph TD
A[k8s node] -->|collected by DaemonSet Collector| B[node CPU/memory/disk metrics]
C[k8s container] -->|collected by DaemonSet Collector| D[container usage metrics]
B -->|reported| E[Deployment Collector]
D -->|reported| E[Deployment Collector]
E -->|normalize + batch| F[Remote Write protocol]
F -->|forward| G[Prometheus outside the cluster]
2) Logs (end up in ES)
Collector: OTel Collector (DaemonSet mode).
What is collected: container logs under /var/log/containers on the k8s nodes (replaces Filebeat).
Processor: OTel Collector (Deployment mode).
Processing: parse log formats (JSON / regex), filter noisy logs, enrich with k8s resource labels.
Sender: OTel Collector.
Protocol: Elasticsearch API.
Receiver: ES outside the cluster.
3) Request traces (end up in Tempo)
Collector: two scenarios.
Infrastructure traces: the OTel Collector (Deployment mode) collects trace data from k8s components (e.g. Ingress/Nginx).
Business traces: applications integrate the OTel SDK (e.g. Java agent, Go SDK) and collect request traces inside the app (see the SDK env-var sketch at the end of this file).
Processor: OTel Collector (Deployment mode).
Processing: normalize the trace format, correlate k8s resource info, batch.
Sender: OTel Collector.
Protocol: OTLP (the OpenTelemetry protocol, gRPC/HTTP).
Receiver: Tempo outside the cluster.
graph LR
A[metrics receivers] -->|metrics pipeline| B[metrics processors] --> C[Prometheus exporter]
D[log receivers] -->|logs pipeline| E[log processors] --> F[ES exporter]
G[trace receivers] -->|traces pipeline| H[trace processors] --> I[Tempo exporter]
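For the "apps integrate the OTel SDK" part, the standard OTLP environment variables are usually enough. A sketch for a business Pod; the Service name for the Deployment Collector is an assumption (see the Service sketch next to the Deployment manifest):

env:
- name: OTEL_SERVICE_NAME
  value: my-app                                                 # hypothetical service name
- name: OTEL_EXPORTER_OTLP_ENDPOINT
  value: http://otel-collector-deployment.monitoring.svc:4318   # hypothetical in-cluster Service
- name: OTEL_EXPORTER_OTLP_PROTOCOL
  value: http/protobuf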


@@ -0,0 +1,58 @@
server:
  http_listen_port: 3200  # HTTP listen port
  grpc_listen_port: 9095  # gRPC listen port
distributor:
receivers:
otlp:
protocols:
grpc:
          endpoint: 0.0.0.0:4317  # OTLP gRPC listen address
http:
          endpoint: 0.0.0.0:4318  # OTLP HTTP listen address
ingester:
lifecycler:
ring:
      replication_factor: 1  # number of data replicas
  max_block_duration: 5m  # maximum block duration
  trace_idle_period: 10s  # flush a trace automatically after this idle period
compactor:
compaction:
    block_retention: 720h  # block retention: 720 hours (30 days)
    compacted_block_retention: 168h  # compacted block retention: 168 hours (7 days)
    max_compaction_objects: 1000000  # max objects per compaction
metrics_generator:
registry:
external_labels:
source: tempo
cluster: linux-microservices
storage:
path: /data/tempo/data/wal
remote_write:
- url: http://127.0.0.1:9090/api/v1/write
send_exemplars: true
storage:
trace:
backend: s3
s3:
endpoint: outscalelink-1324597558.cos.na-siliconvalley.myqcloud.com
bucket: outscalelink-1324597558
prefix: tempo-data/
forcepathstyle: true
enable_dual_stack: false
insecure: true
access_key: AKIDkgR4lHvU1QfieR7cxBLLTaUCh0S0dDev
secret_key: fAWjldKuPhz4wb6RedPzPccOwGOet9Ug
wal:
path: /data/tempo/data/wal
local:
path: /data/tempo/blocks
overrides:
metrics_generator_processors: [service-graphs, span-metrics]
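A hedged smoke test for this config: hand-post a single span over OTLP/HTTP and check that Tempo accepts it (the IDs and timestamps are made up; port 4318 must be reachable from where curl runs):

curl -s -X POST http://localhost:4318/v1/traces \
  -H 'Content-Type: application/json' \
  -d '{"resourceSpans":[{"resource":{"attributes":[{"key":"service.name","value":{"stringValue":"smoke-test"}}]},"scopeSpans":[{"spans":[{"traceId":"5b8efff798038103d269b633813fc60c","spanId":"eee19b7ec3c1b174","name":"smoke-span","kind":1,"startTimeUnixNano":"1736412000000000000","endTimeUnixNano":"1736412000500000000"}]}]}]}'
# an empty object or a {"partialSuccess":{}} response means the span was accepted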

OpenTelemetry/tempo/dm.sh Normal file

@@ -0,0 +1,96 @@
# Create a dedicated user first, then the directories, so the chown below succeeds
sudo useradd --no-create-home --shell /bin/false tempo || true
mkdir -p /data/tempo/{conf,data,metrics-generator}
mkdir -p /data/tempo/data/wal
mkdir -p /data/tempo/data/traces
mkdir -p /data/tempo/metrics-generator/wal
chown -R tempo:tempo /data/tempo
# Download the tarball
wget https://github.com/grafana/tempo/releases/download/v2.9.0/tempo_2.9.0_linux_amd64.tar.gz
# Extract
tar -xzf tempo_2.9.0_linux_amd64.tar.gz -C /data/tempo/
# Move the binaries to /usr/local/bin/
mv /data/tempo/tempo /data/tempo/tempo-cli /data/tempo/tempo-query /usr/local/bin/
# Check the version
tempo --version
# Create the config file (local disk as the storage backend)
vim local-tempo.yaml
server:
  http_listen_port: 3200  # Tempo HTTP port (used by Grafana)
  grpc_listen_port: 9095  # gRPC port (optional)
distributor:
  receivers:  # protocols for receiving OTel trace data (core)
    otlp:
      protocols:
        grpc:  # listens on 4317 (same as the OTel Collector, receives OTLP gRPC)
          endpoint: 0.0.0.0:4317
        http:  # listens on 4318 (receives OTLP HTTP)
endpoint: 0.0.0.0:4318
ingester:
  max_block_duration: 5m  # block duration (can be small in a test environment)
  trace_idle_period: 10s  # idle timeout for a trace
compactor:
compaction:
    block_retention: 720h  # keep traces for 30 days (expressed in hours; adjust as needed)
storage:
  trace:
    backend: local  # single-node local storage (swap for S3/MinIO in production)
    local:
      path: /data/tempo/data  # trace data directory
    wal:
      path: /data/tempo/data/wal  # write-ahead log directory (protects against data loss)
# Object-storage (COS bucket) variant of the config
vim cos-tempo.yaml
# Start in the foreground
/usr/local/bin/tempo \
-config.file=/data/tempo/conf/tempo.yaml \
-config.expand-env=true
# Check the service status
systemctl is-active tempo
# systemd service (run as a daemon)
vim /etc/systemd/system/tempo.service
[Unit]
Description=Grafana Tempo
Wants=network-online.target
After=network-online.target
[Service]
User=tempo
Group=tempo
Type=simple
# config.file points at the config path; mind the file name here
ExecStart=/usr/local/bin/tempo \
-config.file=/data/tempo/conf/tempo.yaml \
-config.expand-env=true
Restart=always
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
# Reload systemd
sudo systemctl daemon-reload
# Start and enable on boot
sudo systemctl enable --now tempo
# Check status
sudo systemctl status tempo
# View logs
journalctl -u tempo --no-pager -n 50
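Two quick liveness checks against Tempo's built-in HTTP endpoints once the unit is active:

curl -s http://localhost:3200/ready      # should return "ready"
curl -s http://localhost:3200/api/echo   # should return "echo"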