KubeZero/charts/kubezero-telemetry/values.yaml

657 lines
15 KiB
YAML

data-prepper:
enabled: false
image:
tag: 2.10.1
securityContext:
capabilities:
drop:
- ALL
#readOnlyRootFilesystem: true
#runAsNonRoot: true
#runAsUser: 1000
pipelineConfig:
config:
simple-sample-pipeline: null
otel-trace-pipeline:
workers: 1
delay: "100"
source:
otel_trace_source:
ssl: false
buffer:
bounded_blocking:
# default value is 12800
#buffer_size: 25600
# Default is 200.
# Make sure buffer_size >= workers * batch_size
#batch_size: 400
sink:
- pipeline:
name: "raw-traces-pipeline"
- pipeline:
name: "otel-service-map-pipeline"
raw-traces-pipeline:
workers: 1
delay: 3000
source:
pipeline:
name: "otel-trace-pipeline"
buffer:
bounded_blocking:
#buffer_size: 25600
#batch_size: 400
processor:
- otel_traces:
- otel_trace_group:
hosts: [ "https://telemetry:9200" ]
insecure: true
username: "admin"
password: "admin"
sink:
- opensearch:
hosts: ["https://telemetry:9200"]
username: "admin"
password: "admin"
insecure: true
index_type: trace-analytics-raw
otel-service-map-pipeline:
workers: 1
delay: 3000
source:
pipeline:
name: "otel-trace-pipeline"
processor:
- service_map:
window_duration: 180
buffer:
bounded_blocking:
#buffer_size: 25600
#batch_size: 400
sink:
- opensearch:
hosts: ["https://telemetry:9200"]
username: "admin"
password: "admin"
insecure: true
index_type: trace-analytics-service-map
#index: otel-v1-apm-span-%{yyyy.MM.dd}
#max_retries: 20
bulk_size: 4
config:
log4j2-rolling.properties: |
status = error
dest = err
name = PropertiesConfig
appender.console.type = Console
appender.console.name = STDOUT
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{ISO8601} [%t] %-5p %40C - %m%n
rootLogger.level = warn
rootLogger.appenderRef.stdout.ref = STDOUT
logger.pipeline.name = org.opensearch.dataprepper.pipeline
logger.pipeline.level = info
logger.parser.name = org.opensearch.dataprepper.parser
logger.parser.level = info
logger.plugins.name = org.opensearch.dataprepper.plugins
logger.plugins.level = info
opentelemetry-collector:
enabled: false
mode: deployment
image:
repository: "otel/opentelemetry-collector-contrib"
config:
extensions:
health_check:
endpoint: ${env:MY_POD_IP}:13133
exporters:
otlp/jaeger:
endpoint: telemetry-jaeger-collector:4317
tls:
insecure: true
otlp/data-prepper:
endpoint: telemetry-data-prepper:21890
tls:
insecure: true
receivers:
otlp:
protocols:
grpc:
endpoint: ${env:MY_POD_IP}:4317
http:
endpoint: ${env:MY_POD_IP}:4318
service:
telemetry:
metrics:
address: ${env:MY_POD_IP}:8888
extensions:
- health_check
pipelines:
traces:
exporters:
- otlp/jaeger
- otlp/data-prepper
processors:
- memory_limiter
- batch
receivers:
- otlp
ports:
jaeger-compact:
enabled: false
jaeger-thrift:
enabled: false
jaeger-grpc:
enabled: false
zipkin:
enabled: false
metrics:
enabled: true
serviceMonitor:
enabled: false
podDisruptionBudget:
enabled: false
# minAvailable: 2
# maxUnavailable: 1
jaeger:
enabled: false
agent:
enabled: false
collector:
service:
otlp:
grpc:
name: otlp-grpc
port: 4317
http:
name: otlp-http
port: 4318
serviceMonitor:
enabled: false
# https://www.jaegertracing.io/docs/1.53/deployment/#collector
storage:
type: elasticsearch
elasticsearch:
scheme: https
host: telemetry
user: admin
password: admin
cmdlineParams:
es.tls.enabled: ""
es.tls.skip-host-verify: ""
es.num-replicas: 1
es.num-shards: 2
#es.tags-as-fields.all: ""
provisionDataStore:
cassandra: false
elasticsearch: false
query:
agentSidecar:
enabled: false
serviceMonitor:
enabled: false
istio:
enabled: false
gateway: istio-ingress/private-ingressgateway
url: jaeger.example.com
opensearch:
version: 2.17.0
prometheus: false
# custom cluster settings
#settings:
# index.number_of_shards: 1
nodeSets: []
#- name: default-nodes
# replicas: 2
# storage:
# size: 16Gi
# class: my-fancy-SSDs
# zone: us-west-2a
# resources:
# limits:
# #cpu: 1
# memory: 2Gi
# requests:
# cpu: 500m
# memory: 2Gi
dashboard:
enabled: false
istio:
enabled: false
gateway: istio-ingress/private-ingressgateway
url: telemetry-dashboard.example.com
# New logging pipeline
fluentd:
enabled: false
image:
repository: public.ecr.aws/zero-downtime/fluentd-concenter
tag: v1.16.5-1-g09dc31c
istio:
enabled: false
kind: StatefulSet
replicaCount: 1
#plugins:
#- fluent-plugin-s3
source:
sharedKey: secretref+k8s://v1/Secret/kubezero/kubezero-secrets/telemetry.fluentd.source.sharedKey # "cloudbender"
output:
# Defaults to OpenSearch in same namespace
host: telemetry
user: admin
password: admin
service:
ports:
- name: tcp-forward
protocol: TCP
containerPort: 24224
- name: http-fluentd
protocol: TCP
containerPort: 9880
metrics:
serviceMonitor:
enabled: false
dashboards:
enabled: false
# No need for docker nor /var/log
mountVarLogDirectory: false
mountDockerContainersDirectory: false
# no rbac required until we need WebAuth identity for eg. s3
rbac:
create: false
resources:
requests:
cpu: 200m
memory: 256Mi
limits:
memory: 512Mi
persistence:
enabled: true
storageClass: ""
size: 1Gi
volumes:
- name: trust-store
secret:
secretName: telemetry-nodes-http-tls
items:
- key: tls.crt
path: ca.crt
volumeMounts:
- name: trust-store
mountPath: "/run/pki"
readOnly: true
securityContext:
capabilities:
drop:
- ALL
#readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 100
configMapConfigs:
- fluentd-prometheus-conf
fileConfigs:
00_system.conf: |-
<system>
root_dir /fluentd/log
log_level info
ignore_repeated_log_interval 60s
ignore_same_log_interval 60s
workers 1
</system>
01_sources.conf: |-
<source>
@type http
@label @KUBERNETES
port 9880
bind 0.0.0.0
keepalive_timeout 30
</source>
<source>
@type forward
@label @KUBERNETES
port 24224
bind 0.0.0.0
# skip_invalid_event true
send_keepalive_packet true
<security>
self_hostname "telemetry-fluentd"
shared_key {{ .Values.source.sharedKey }}
</security>
</source>
02_filters.conf: |-
<label @KUBERNETES>
# prevent log feedback loops, discard logs from our own pods
<match kube.logging.fluentd>
@type relabel
@label @FLUENT_LOG
</match>
# Exclude current fluent-bit multiline noise
# Still relevant ??
<filter kube.logging.fluent-bit>
@type grep
<exclude>
key log
pattern /could not append content to multiline context/
</exclude>
</filter>
# Generate Hash ID to break endless loop for already ingested events during retries
<filter **>
@type opensearch_genid
use_entire_record true
</filter>
# Route through DISPATCH for Prometheus metrics
<match **>
@type relabel
@label @DISPATCH
</match>
</label>
04_outputs.conf: |-
<label @OUTPUT>
<match **>
@id out_os
@type opensearch
# @log_level debug
include_tag_key true
id_key _hash
remove_keys _hash
write_operation create
# we have oj in the fluentd-concenter image
prefer_oj_serializer true
# KubeZero pipeline incl. GeoIP etc.
#pipeline fluentd
http_backend typhoeus
ca_file /run/pki/ca.crt
port 9200
scheme https
hosts {{ .Values.output.host }}
user {{ .Values.output.user }}
password {{ .Values.output.password }}
log_es_400_reason
logstash_format true
reconnect_on_error true
reload_on_failure true
request_timeout 300s
#sniffer_class_name Fluent::Plugin::OpenSearchSimpleSniffer
#with_transporter_log true
verify_es_version_at_startup false
default_opensearch_version 2
#suppress_type_name true
# Retry failed bulk requests
# https://github.com/uken/fluent-plugin-elasticsearch#unrecoverable-error-types
unrecoverable_error_types ["out_of_memory_error"]
bulk_message_request_threshold 1048576
<buffer>
@type file
flush_mode interval
flush_thread_count 2
flush_interval 10s
chunk_limit_size 2MB
total_limit_size 1GB
flush_at_shutdown true
retry_type exponential_backoff
retry_timeout 6h
overflow_action drop_oldest_chunk
disable_chunk_backup true
</buffer>
</match>
</label>
fluent-bit:
enabled: false
#image:
#repository: public.ecr.aws/zero-downtime/fluent-bit
#tag: 2.0.10
testFramework:
enabled: false
service:
internalTrafficPolicy: Local
extraPorts:
- name: otel
port: 4318
containerPort: 4318
protocol: TCP
serviceMonitor:
enabled: false
#rbac:
# nodeAccess: true
#hostNetwork: true
#dnsPolicy: ClusterFirstWithHostNet
tolerations:
- effect: NoSchedule
operator: Exists
resources:
requests:
cpu: 20m
memory: 48Mi
limits:
memory: 128Mi
config:
output:
host: telemetry-fluentd
sharedKey: secretref+k8s://v1/Secret/kubezero/kubezero-secrets/telemetry.fluentd.source.sharedKey
tls: false
output_otel:
host: telemetry-opentelemetry-collector
input:
memBufLimit: 16MB
refreshInterval: 5
logLevel: info
flushInterval: 5
#extraRecords:
# source.clustername: MyKubeCluster
service: |
[SERVICE]
Flush {{ .Values.config.flushInterval }}
Daemon Off
Log_Level {{ .Values.config.logLevel }}
Parsers_File parsers.conf
Parsers_File custom_parsers.conf
HTTP_Server On
HTTP_Listen 0.0.0.0
HTTP_Port {{ .Values.service.port }}
Health_Check On
inputs: |
[INPUT]
Name tail
Path /var/log/containers/*.log
# Exclude ourselves to current error spam, https://github.com/fluent/fluent-bit/issues/5769
# Todo: Rather limit / filter spam message than exclude all together -> ideally locally, next dataprepper
Exclude_Path *logging-fluent-bit*
multiline.parser cri
Tag cri.*
Skip_Long_Lines On
Skip_Empty_Lines On
DB /var/log/flb_kube.db
DB.Sync Normal
DB.locking true
# Buffer_Max_Size 1M
{{- with .Values.config.input }}
Mem_Buf_Limit {{ .memBufLimit }}
Refresh_Interval {{ .refreshInterval }}
{{- end }}
[INPUT]
Name opentelemetry
Tag otel
filters: |
[FILTER]
Name parser
Match cri.*
Parser cri-log
Key_Name log
[FILTER]
Name kubernetes
Match cri.*
Merge_Log On
Merge_Log_Key kube
Kube_Tag_Prefix cri.var.log.containers.
Keep_Log Off
Annotations Off
K8S-Logging.Parser Off
K8S-Logging.Exclude Off
Kube_Meta_Cache_TTL 3600s
Buffer_Size 0
#Use_Kubelet true
{{- if index .Values "config" "extraRecords" }}
[FILTER]
Name record_modifier
Match cri.*
{{- range $k,$v := index .Values "config" "extraRecords" }}
Record {{ $k }} {{ $v }}
{{- end }}
{{- end }}
[FILTER]
Name rewrite_tag
Match cri.*
Emitter_Name kube_tag_rewriter
Rule $kubernetes['pod_id'] .* kube.$kubernetes['namespace_name'].$kubernetes['container_name'] false
[FILTER]
Name lua
Match kube.*
script /fluent-bit/scripts/kubezero.lua
call nest_k8s_ns
outputs: |
[OUTPUT]
Match kube.*
Name forward
Host {{ .Values.config.output.host }}
Port 24224
Shared_Key {{ .Values.config.output.sharedKey }}
tls {{ ternary "on" "off" .Values.config.output.tls }}
Send_options true
Require_ack_response true
[OUTPUT]
Name opentelemetry
Match otel
Host {{ .Values.config.output_otel.host }}
Port 4318
#Metrics_uri /v1/metrics
Traces_uri /v1/traces
#Logs_uri /v1/logs
customParsers: |
[PARSER]
Name cri-log
Format regex
Regex ^(?<time>.+) (?<stream>stdout|stderr) (?<logtag>F|P) (?<log>.*)$
Time_Key time
Time_Format %Y-%m-%dT%H:%M:%S.%L%z
luaScripts:
kubezero.lua: |
function nest_k8s_ns(tag, timestamp, record)
if not record['kubernetes']['namespace_name'] then
return 0, 0, 0
end
new_record = {}
for key, val in pairs(record) do
if key == 'kube' then
new_record[key] = {}
new_record[key][record['kubernetes']['namespace_name']] = record[key]
else
new_record[key] = record[key]
end
end
return 1, timestamp, new_record
end
daemonSetVolumes:
- name: varlog
hostPath:
path: /var/log
- name: newlog
hostPath:
path: /var/lib/containers/logs
daemonSetVolumeMounts:
- name: varlog
mountPath: /var/log
- name: newlog
mountPath: /var/lib/containers/logs