fix[logging]: Use custom fluentd image, further tune fluentd ES connection

2021-09-03 19:21:14 +02:00 · 2021-09-03 19:21:14 +02:00 · f803531dc8
parent ab1e2c5922
commit f803531dc8
3 changed files with 28 additions and 21 deletions
--- a/charts/kubezero-logging/Chart.yaml
+++ b/charts/kubezero-logging/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-logging
 description: KubeZero Umbrella Chart for complete EFK stack
 type: application
-version: 0.7.10
+version: 0.7.12
 appVersion: 1.6.0
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
--- a/charts/kubezero-logging/README.md
+++ b/charts/kubezero-logging/README.md
@@ -1,6 +1,6 @@
 # kubezero-logging

-![Version: 0.7.6](https://img.shields.io/badge/Version-0.7.6-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.6.0](https://img.shields.io/badge/AppVersion-1.6.0-informational?style=flat-square)
+![Version: 0.7.12](https://img.shields.io/badge/Version-0.7.12-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.6.0](https://img.shields.io/badge/AppVersion-1.6.0-informational?style=flat-square)

 KubeZero Umbrella Chart for complete EFK stack

@@ -19,8 +19,8 @@ Kubernetes: `>= 1.18.0`
 | Repository | Name | Version |
 |------------|------|---------|
 |  | eck-operator | 1.6.0 |
-|  | fluent-bit | 0.15.15 |
-|  | fluentd | 0.2.6 |
+|  | fluent-bit | 0.16.3 |
+|  | fluentd | 0.2.10 |
 | https://zero-down-time.github.io/kubezero/ | kubezero-lib | >= 0.1.3 |

 ## Changes from upstream
@@ -78,7 +78,7 @@ Kubernetes: `>= 1.18.0`
 | fluent-bit.config.output.sharedKey | string | `"cloudbender"` |  |
 | fluent-bit.config.output.tls | bool | `false` |  |
 | fluent-bit.config.outputs | string | `"[OUTPUT]\n    Match *\n    Name forward\n    Host {{ .Values.config.output.host }}\n    Port 24224\n    Shared_Key {{ .Values.config.output.sharedKey }}\n    tls {{ ternary \"on\" \"off\" .Values.config.output.tls }}\n    Send_options true\n    Require_ack_response true\n"` |  |
-| fluent-bit.config.service | string | `"[SERVICE]\n    Flush {{ .Values.config.flushInterval }}\n    Daemon Off\n    Log_Level {{ .Values.config.logLevel }}\n    Parsers_File parsers.conf\n    Parsers_File custom_parsers.conf\n    HTTP_Server On\n    HTTP_Listen 0.0.0.0\n    HTTP_Port {{ .Values.service.port }}\n"` |  |
+| fluent-bit.config.service | string | `"[SERVICE]\n    Flush {{ .Values.config.flushInterval }}\n    Daemon Off\n    Log_Level {{ .Values.config.logLevel }}\n    Parsers_File parsers.conf\n    Parsers_File custom_parsers.conf\n    HTTP_Server On\n    HTTP_Listen 0.0.0.0\n    HTTP_Port {{ .Values.service.port }}\n    Health_Check On\n"` |  |
 | fluent-bit.daemonSetVolumeMounts[0].mountPath | string | `"/var/log"` |  |
 | fluent-bit.daemonSetVolumeMounts[0].name | string | `"varlog"` |  |
 | fluent-bit.daemonSetVolumeMounts[1].mountPath | string | `"/etc/machine-id"` |  |
@@ -90,7 +90,7 @@ Kubernetes: `>= 1.18.0`
 | fluent-bit.daemonSetVolumes[1].hostPath.type | string | `"File"` |  |
 | fluent-bit.daemonSetVolumes[1].name | string | `"etcmachineid"` |  |
 | fluent-bit.enabled | bool | `false` |  |
-| fluent-bit.image.tag | string | `"1.8.3"` |  |
+| fluent-bit.image.tag | string | `"1.8.5"` |  |
 | fluent-bit.luaScripts."kubezero.lua" | string | `"function nest_k8s_ns(tag, timestamp, record)\n    if not record['kubernetes']['namespace_name'] then\n        return 0, 0, 0\n    end\n    new_record = {}\n    for key, val in pairs(record) do\n        if key == 'kube' then\n            new_record[key] = {}\n            new_record[key][record['kubernetes']['namespace_name']] = record[key]\n        else\n            new_record[key] = record[key]\n        end\n    end\n    return 1, timestamp, new_record\nend\n"` |  |
 | fluent-bit.resources.limits.memory | string | `"64Mi"` |  |
 | fluent-bit.resources.requests.cpu | string | `"20m"` |  |
@@ -109,17 +109,18 @@ Kubernetes: `>= 1.18.0`
 | fluentd.env[1].name | string | `"OUTPUT_PASSWORD"` |  |
 | fluentd.env[1].valueFrom.secretKeyRef.key | string | `"elastic"` |  |
 | fluentd.env[1].valueFrom.secretKeyRef.name | string | `"logging-es-elastic-user"` |  |
-| fluentd.fileConfigs."00_system.conf" | string | `"<system>\n  root_dir /var/log/fluentd\n  log_level info\n  ignore_repeated_log_interval 60s\n  ignore_same_log_interval 60s\n  workers 2\n</system>"` |  |
+| fluentd.fileConfigs."00_system.conf" | string | `"<system>\n  root_dir /var/log/fluentd\n  log_level info\n  ignore_repeated_log_interval 60s\n  ignore_same_log_interval 60s\n  workers 1\n</system>"` |  |
 | fluentd.fileConfigs."01_sources.conf" | string | `"<source>\n  @type http\n  @label @KUBERNETES\n  port 9880\n  bind 0.0.0.0\n  keepalive_timeout 30\n</source>\n\n<source>\n  @type forward\n  @label @KUBERNETES\n  port 24224\n  bind 0.0.0.0\n  # skip_invalid_event true\n  send_keepalive_packet true\n  <security>\n    self_hostname \"#{ENV['HOSTNAME']}\"\n    shared_key {{ .Values.shared_key }}\n  </security>\n</source>"` |  |
-| fluentd.fileConfigs."02_filters.conf" | string | `"<label @KUBERNETES>\n  # prevent log feedback loops eg. ES has issues etc.\n  # discard logs from our own pods\n  <match kube.logging.fluentd>\n    @type relabel\n    @label @FLUENT_LOG\n  </match>\n\n  # Exclude current fluent-bit multiline noise\n  <filter kube.logging.fluent-bit>\n    @type grep\n    <exclude>\n      key log\n      pattern /could not append content to multiline context/\n    </exclude>\n  </filter>\n\n  <match **>\n    @type relabel\n    @label @DISPATCH\n  </match>\n</label>"` |  |
-| fluentd.fileConfigs."04_outputs.conf" | string | `"<label @OUTPUT>\n  <match **>\n    @id out_es\n    @type elasticsearch\n    # @log_level debug\n    include_tag_key true\n    id_key id\n    remove_keys id\n\n    # KubeZero pipeline incl. GeoIP etc.\n    pipeline fluentd\n\n    hosts \"{{ .Values.output.host }}\"\n    port 9200\n    scheme http\n    user elastic\n    password \"#{ENV['OUTPUT_PASSWORD']}\"\n\n    log_es_400_reason\n    logstash_format true\n    reconnect_on_error true\n    reload_on_failure true\n    request_timeout 60s\n    suppress_type_name true\n    slow_flush_log_threshold 55.0\n\n    # Retry failed bulk requests\n    # https://github.com/uken/fluent-plugin-elasticsearch#unrecoverable-error-types\n    unrecoverable_error_types [\"out_of_memory_error\"]\n    bulk_message_request_threshold 2097152\n\n    <buffer>\n      @type file\n\n      flush_mode interval\n      flush_thread_count 1\n      flush_interval 30s\n\n      chunk_limit_size 8MB\n      total_limit_size 2GB\n\n      flush_at_shutdown true\n      retry_type exponential_backoff\n      retry_timeout 6h\n      overflow_action drop_oldest_chunk\n      disable_chunk_backup true\n    </buffer>\n  </match>\n</label>"` |  |
-| fluentd.image.repository | string | `"fluent/fluentd-kubernetes-daemonset"` |  |
-| fluentd.image.tag | string | `"v1-debian-elasticsearch"` |  |
+| fluentd.fileConfigs."02_filters.conf" | string | `"<label @KUBERNETES>\n  # prevent log feedback loops eg. ES has issues etc.\n  # discard logs from our own pods\n  <match kube.logging.fluentd>\n    @type relabel\n    @label @FLUENT_LOG\n  </match>\n\n  # Exclude current fluent-bit multiline noise\n  <filter kube.logging.fluent-bit>\n    @type grep\n    <exclude>\n      key log\n      pattern /could not append content to multiline context/\n    </exclude>\n  </filter>\n\n  # Generate Hash ID to break endless loop for already ingested events during retries\n  <filter **>\n    @type elasticsearch_genid\n    use_entire_record true\n  </filter>\n\n  # Route through DISPATCH for Prometheus metrics\n  <match **>\n    @type relabel\n    @label @DISPATCH\n  </match>\n</label>"` |  |
+| fluentd.fileConfigs."04_outputs.conf" | string | `"<label @OUTPUT>\n  <match **>\n    @id out_es\n    @type elasticsearch\n    # @log_level debug\n    include_tag_key true\n\n    id_key _hash\n    remove_keys _hash\n    write_operation create\n\n    # KubeZero pipeline incl. GeoIP etc.\n    pipeline fluentd\n\n    hosts \"{{ .Values.output.host }}\"\n    port 9200\n    scheme http\n    user elastic\n    password \"#{ENV['OUTPUT_PASSWORD']}\"\n\n    log_es_400_reason\n    logstash_format true\n    reconnect_on_error true\n    reload_on_failure true\n    request_timeout 60s\n    slow_flush_log_threshold 55.0\n\n    with_transporter_log true\n\n    verify_es_version_at_startup false\n    default_elasticsearch_version 7\n    suppress_type_name true\n\n    # Retry failed bulk requests\n    # https://github.com/uken/fluent-plugin-elasticsearch#unrecoverable-error-types\n    unrecoverable_error_types [\"out_of_memory_error\"]\n    bulk_message_request_threshold 2097152\n\n    <buffer>\n      @type file\n\n      flush_mode interval\n      flush_thread_count 2\n      flush_interval 30s\n\n      chunk_limit_size 4MB\n      total_limit_size 2GB\n\n      flush_at_shutdown true\n      retry_type exponential_backoff\n      retry_timeout 6h\n      overflow_action drop_oldest_chunk\n      disable_chunk_backup true\n    </buffer>\n  </match>\n</label>"` |  |
+| fluentd.image.repository | string | `"public.ecr.aws/zero-downtime/fluentd-concenter"` |  |
+| fluentd.image.tag | string | `"v1.14.1"` |  |
 | fluentd.istio.enabled | bool | `false` |  |
 | fluentd.kind | string | `"Deployment"` |  |
 | fluentd.metrics.serviceMonitor.additionalLabels.release | string | `"metrics"` |  |
 | fluentd.metrics.serviceMonitor.enabled | bool | `false` |  |
 | fluentd.output.host | string | `"logging-es-http"` |  |
+| fluentd.podSecurityPolicy.enabled | bool | `false` |  |
 | fluentd.replicaCount | int | `1` |  |
 | fluentd.resources.limits.memory | string | `"512Mi"` |  |
 | fluentd.resources.requests.cpu | string | `"200m"` |  |
--- a/charts/kubezero-logging/values.yaml
+++ b/charts/kubezero-logging/values.yaml
@@ -45,9 +45,8 @@ kibana:
 fluentd:
  enabled: false
  image:
-    repository: fluent/fluentd-kubernetes-daemonset
-    # tag: v1-debian-elasticsearch
-    tag: v1.11.1-debian-elasticsearch7-1.3
+    repository: public.ecr.aws/zero-downtime/fluentd-concenter
+    tag: v1.14.1
  istio:
    enabled: false

@@ -123,7 +122,7 @@ fluentd:
  fileConfigs:
    00_system.conf: |-
      <system>
-        root_dir /var/log/fluentd
+        root_dir /fluentd/log
        log_level info
        ignore_repeated_log_interval 60s
        ignore_same_log_interval 60s
@@ -189,8 +188,10 @@ fluentd:
          @type elasticsearch
          # @log_level debug
          include_tag_key true
+
          id_key _hash
          remove_keys _hash
+          write_operation create

          # KubeZero pipeline incl. GeoIP etc.
          pipeline fluentd
@@ -205,24 +206,29 @@ fluentd:
          logstash_format true
          reconnect_on_error true
          reload_on_failure true
-          request_timeout 60s
-          suppress_type_name true
+          request_timeout 300s
          slow_flush_log_threshold 55.0

+          #with_transporter_log true
+
+          verify_es_version_at_startup false
+          default_elasticsearch_version 7
+          suppress_type_name true
+
          # Retry failed bulk requests
          # https://github.com/uken/fluent-plugin-elasticsearch#unrecoverable-error-types
          unrecoverable_error_types ["out_of_memory_error"]
-          bulk_message_request_threshold 2097152
+          bulk_message_request_threshold 1048576

          <buffer>
            @type file

            flush_mode interval
            flush_thread_count 2
-            flush_interval 30s
+            flush_interval 10s

-            chunk_limit_size 4MB
-            total_limit_size 2GB
+            chunk_limit_size 2MB
+            total_limit_size 1GB

            flush_at_shutdown true
            retry_type exponential_backoff