fix: various tweaks and fixes

Stefan Reimer 2024-12-04 18:40:10 +00:00
parent 2fa48e9fcd
commit 144d826aaf
14 changed files with 120 additions and 68 deletions

View File

@@ -227,7 +227,10 @@ cluster-autoscaler:
scan-interval: 30s
skip-nodes-with-local-storage: false
balance-similar-node-groups: true
ignore-daemonsets-utilization: true
ignore-taint: "node.cilium.io/agent-not-ready"
# Disable for non-clustered control-plane
# leader-elect: false
#securityContext:
# runAsNonRoot: true
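
For orientation: these keys are passed through the chart's extraArgs mechanism, so, assuming the upstream cluster-autoscaler chart's usual key-to-flag mapping, the container would end up with roughly:

  --scan-interval=30s
  --skip-nodes-with-local-storage=false
  --balance-similar-node-groups=true
  --ignore-daemonsets-utilization=true           # new: exclude DaemonSet pods from scale-down utilization
  --ignore-taint=node.cilium.io/agent-not-ready  # new: treat Cilium's startup taint as absent when simulating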

View File

@@ -92,10 +92,11 @@ Kubernetes: `>= 1.26.0`
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].target_matchers[0] | string | `"severity = info"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].source_matchers[0] | string | `"alertname = ClusterAutoscalerNodeGroupsEnabled"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].target_matchers[0] | string | `"alertname =~ \"KubeCPUOvercommit|KubeMemoryOvercommit\""` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"severity"` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"clusterName"` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"alertname"` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"severity"` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[2] | string | `"status"` | |
| kube-prometheus-stack.alertmanager.config.route.group_interval | string | `"5m"` | |
| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"10s"` | |
| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"15s"` | |
| kube-prometheus-stack.alertmanager.config.route.repeat_interval | string | `"4h"` | |
| kube-prometheus-stack.alertmanager.config.route.routes[0].matchers[0] | string | `"severity = none"` | |
| kube-prometheus-stack.alertmanager.config.route.routes[0].receiver | string | `"null"` | |
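
Reassembled from the rows above, the documented route now groups by alert identity and state rather than by cluster; the comments give the standard Alertmanager meaning of each timer:

  route:
    group_by: ['alertname', 'severity', 'status']
    group_wait: 15s       # delay before the first notification for a new group
    group_interval: 5m    # delay before notifying about alerts newly added to an existing group
    repeat_interval: 4h   # how often a still-firing group is re-sent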

View File

@@ -18,7 +18,7 @@
"subdir": "contrib/mixin"
}
},
"version": "19aa0dbe8fd6317a237bae9b6ea52a4f1b445b19",
"version": "a45cc4961e827009dba7d164e441a0ebe2eef870",
"sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws="
},
{
@@ -88,7 +88,7 @@
"subdir": "grafana-builder"
}
},
"version": "5a6b86b475e427b2dbd9e4af0bcafbb6da0507a5",
"version": "767befa8fb46a07be516dec2777d7d89909a529d",
"sum": "yxqWcq/N3E/a/XreeU6EuE6X7kYPnG0AspAQFKOjASo="
},
{
@@ -118,8 +118,8 @@
"subdir": ""
}
},
"version": "bdbf7f45cedf37d07567be7519fa4139043f9335",
"sum": "j4EAKfqkbPvBFGnBjt4hex2bdNHPpuFWrCxfq5L6EkU="
"version": "a3fbf21977deb89b7d843eb8371170c011ea6835",
"sum": "57zW2IGJ9zbYd8BI0qe6JkoWTRSMNiBUWC6+YcnEsWo="
},
{
"source": {
@@ -128,7 +128,7 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "17151aca659e0659259b5e1f5675acf849281ade",
"version": "32e7727ff4613b0f55dfc18aff15afb8c04d03c5",
"sum": "lO7jUSzAIy8Yk9pOWJIWgPRhubkWzVh56W6wtYfbVH4="
},
{
@@ -138,7 +138,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "17151aca659e0659259b5e1f5675acf849281ade",
"version": "32e7727ff4613b0f55dfc18aff15afb8c04d03c5",
"sum": "qclI7LwucTjBef3PkGBkKxF0mfZPbHnn4rlNWKGtR4c="
},
{
@@ -148,8 +148,8 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "c503e5cc5403dd5d56b1c0c5933827baee64aeaf",
"sum": "fJqINQiYJPmllXFFO+Hl5HrPYANMbhHFUQ28tl0Vi00="
"version": "7e5a571a3fb735c78e17c76a637eb7e8bb5dd086",
"sum": "uTw/Mj+X91S+oqUpAX81xcfWPDlox0tdSZY/YBw7nGE="
},
{
"source": {
@@ -158,7 +158,7 @@
"subdir": "jsonnet/mixin"
}
},
"version": "e951bd3037a053fea681510ccde211c28dc657e1",
"version": "a366602bacb2c8d773a9cee058b6971b8d2e3732",
"sum": "gi+knjdxs2T715iIQIntrimbHRgHnpM8IFBJDD1gYfs=",
"name": "prometheus-operator-mixin"
},
@@ -169,8 +169,8 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "e951bd3037a053fea681510ccde211c28dc657e1",
"sum": "YOJjmladGD1PcgNae0h88Mm235CsZSfwf2a4DIcMJFU="
"version": "a366602bacb2c8d773a9cee058b6971b8d2e3732",
"sum": "z0/lCiMusMHTqntsosMVGYkVcSZjCpyZBmUMVUsK5nA="
},
{
"source": {
@@ -179,7 +179,7 @@
"subdir": "doc/alertmanager-mixin"
}
},
"version": "f6b942cf9b3a503d59192eada300d2ad97cba82f",
"version": "0f65e8fa5fc72d2678655105c0213b416ca6f34c",
"sum": "Mf4h1BYLle2nrgjf/HXrBbl0Zk8N+xaoEM017o0BC+k=",
"name": "alertmanager"
},
@@ -190,7 +190,7 @@
"subdir": "docs/node-mixin"
}
},
"version": "49d177bf95417b117ab612a376e2434d5dd61c2d",
"version": "cf8c6891cc610e54f70383addd4bb6079f0add35",
"sum": "cQCW+1N0Xae5yXecCWDK2oAlN0luBS/5GrwBYSlaFms="
},
{
@@ -200,8 +200,8 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "789c9b1a5e455850ed9b3c89cafb37df75ce1e50",
"sum": "dYLcLzGH4yF3qB7OGC/7z4nqeTNjv42L7Q3BENU8XJI=",
"version": "b407c2930da4f50c0d17fc39404c6302a9eb740b",
"sum": "OYT5u3S8DbamuJV/v3gbWSteOvFzMeNwMj+u4Apk7jM=",
"name": "prometheus"
},
{
@@ -222,7 +222,7 @@
"subdir": "mixin"
}
},
"version": "f9da21ec0b28073875520159fe72ab744c255b2e",
"version": "7037331e6ea7dbe85a1b7af37bf8ea277a80663d",
"sum": "ieCD4eMgGbOlrI8GmckGPHBGQDcLasE1rULYq56W/bs=",
"name": "thanos-mixin"
}

View File

@@ -29,14 +29,43 @@ local etcdMixin = addMixin({
},
});
local kp = (import 'kube-prometheus/main.libsonnet') +
{
values+:: {
common+: {
namespace: 'monitoring',
},
},
};
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
common+: {
namespace: 'monitoring',
},
},
kubernetesControlPlane+: {
prometheusRule+: {
spec+: {
groups: [
(
if group.name == 'kubernetes-resources' then
group {
rules: [
{
alert: 'ClusterAutoscalerNodeGroupsEnabled',
expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 or vector(1)',
'for': '5m',
labels: {
severity: 'none',
},
annotations: {
description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group',
summary: 'Cluster Autoscaler found at least one node group.',
},
},
] + super.rules,
}
else
group
)
for group in super.groups
],
},
},
},
};
// We just want the Prometheus Rules
{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } +
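
The comprehension above touches only the kubernetes-resources group, prepending the inhibitor alert to super.rules and leaving every other group untouched. Note the `or vector(1)` fallback: it keeps the expression returning a series even when the autoscaler metric is absent, so the alert, and with it the inhibition, stays armed. Rendered, the injected rule comes out as:

  - alert: ClusterAutoscalerNodeGroupsEnabled
    expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 or vector(1)
    for: 5m
    labels:
      severity: none   # severity=none matches the null receiver route, so it never notifies; it exists only to inhibit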

View File

@@ -123,7 +123,7 @@
{
"alert": "KubeDaemonSetRolloutStuck",
"annotations": {
"description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.",
"description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15m.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck",
"summary": "DaemonSet rollout is stuck."
},
@@ -228,6 +228,18 @@
{
"name": "kubernetes-resources",
"rules": [
{
"alert": "ClusterAutoscalerNodeGroupsEnabled",
"annotations": {
"description": "Inhibitor rule if the Cluster Autoscaler found at least one node group",
"summary": "Cluster Autoscaler found at least one node group."
},
"expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0",
"for": "5m",
"labels": {
"severity": "none"
}
},
{
"alert": "KubeCPUOvercommit",
"annotations": {
@@ -506,7 +518,7 @@
{
"alert": "KubeClientCertificateExpiration",
"annotations": {
"description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.",
"description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration",
"summary": "Client certificate is about to expire."
},
@@ -519,7 +531,7 @@
{
"alert": "KubeClientCertificateExpiration",
"annotations": {
"description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.",
"description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration",
"summary": "Client certificate is about to expire."
},

View File

@@ -6,7 +6,7 @@
"app.kubernetes.io/component": "controller",
"app.kubernetes.io/name": "prometheus-operator",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "0.78.1",
"app.kubernetes.io/version": "0.78.2",
"prometheus": "k8s",
"role": "alert-rules"
},

View File

@@ -7,7 +7,7 @@
"app.kubernetes.io/instance": "k8s",
"app.kubernetes.io/name": "prometheus",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "2.55.1",
"app.kubernetes.io/version": "3.0.1",
"prometheus": "k8s",
"role": "alert-rules"
},
@@ -74,9 +74,9 @@
{
"alert": "PrometheusErrorSendingAlertsToSomeAlertmanagers",
"annotations": {
"description": "{{ printf \"%.1f\" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.",
"description": "{{ printf \"%.1f\" $value }}% of alerts sent by Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} were affected by errors.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers",
"summary": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager."
"summary": "More than 1% of alerts sent by Prometheus to a specific Alertmanager were affected by errors."
},
"expr": "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n",
"for": "15m",

View File

@@ -89,7 +89,7 @@ spec:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
@@ -166,6 +166,14 @@ spec:
severity: warning
- name: kubernetes-resources
rules:
- alert: ClusterAutoscalerNodeGroupsEnabled
annotations:
description: Inhibitor rule if the Cluster Autoscaler found at least one node group
summary: Cluster Autoscaler found at least one node group.
expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0
for: 5m
labels:
severity: none
- alert: KubeCPUOvercommit
annotations:
description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.
@@ -395,7 +403,7 @@ spec:
rules:
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: 'histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@@ -410,7 +418,7 @@ spec:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: 'histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400

View File

@@ -57,9 +57,9 @@ spec:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% of alerts sent by Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}} were affected by errors.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager were affected by errors.
expr: "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n"
for: 15m
labels:

View File

@@ -1,19 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "zdt-inhibitors" | trunc 63 | trimSuffix "-" }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "kubezero-lib.labels" . | nindent 4 }}
spec:
groups:
- name: zdt-inhibitors
rules:
- alert: ClusterAutoscalerNodeGroupsEnabled
annotations:
description: "This rule is meant to inhibit other rules and should not be forwarded.\nThe Cluster Autoscaler found at least one node group"
summary: Cluster Autoscaler found at least one node group.
expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0'
for: 15m
labels:
severity: none

View File

@@ -223,8 +223,8 @@ kube-prometheus-stack:
global:
resolve_timeout: 5m
route:
group_by: ['severity', 'clusterName']
group_wait: 10s
group_by: ['alertname', 'severity', 'status']
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
routes:
@@ -252,7 +252,7 @@ kube-prometheus-stack:
- alertname = InfoInhibitor
target_matchers:
- severity = info
# Disable cluster overcommiy alerts if we have cluster autoscaler available
# Disable cluster overcommit alerts if we have cluster autoscaler available
- source_matchers:
- alertname = ClusterAutoscalerNodeGroupsEnabled
target_matchers:
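
The hunk is cut off before the target matchers; per the inhibit_rules[3] rows documented further up, the complete rule reads:

  - source_matchers:
    - alertname = ClusterAutoscalerNodeGroupsEnabled
    target_matchers:
    - alertname =~ "KubeCPUOvercommit|KubeMemoryOvercommit"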

View File

@@ -6,7 +6,7 @@ clusterBackup:
{{- toYaml . | nindent 2 }}
{{- end }}
{{- if .Values.global.aws.region }}
{{- if eq .Values.global.platform "aws" }}
# AWS
extraEnv:
- name: AWS_DEFAULT_REGION
@@ -20,7 +20,7 @@ forseti:
{{- toYaml . | nindent 2 }}
{{- end }}
{{- if .Values.global.aws.region }}
{{- if eq .Values.global.platform "aws" }}
# AWS
aws:
region: {{ $.Values.global.aws.region }}
@@ -34,7 +34,7 @@ external-dns:
{{- toYaml . | nindent 2 }}
{{- end }}
{{- if .Values.global.aws.region }}
{{- if eq .Values.global.platform "aws" }}
# AWS
txtOwnerId: {{ .Values.global.clusterName }}
provider: aws
@@ -67,13 +67,18 @@ external-dns:
cluster-autoscaler:
enabled: {{ ternary "true" "false" (or (hasKey .Values.global.aws "region") (index .Values "addons" "cluster-autoscaler" "enabled")) }}
autoDiscovery:
clusterName: {{ .Values.global.clusterName }}
{{- if not .Values.global.highAvailable }}
extraArgs:
leader-elect: false
{{- end }}
{{- with omit (index .Values "addons" "cluster-autoscaler") "enabled" }}
{{- toYaml . | nindent 2 }}
{{- end }}
autoDiscovery:
clusterName: {{ .Values.global.clusterName }}
{{- with .Values.metrics }}
serviceMonitor:
enabled: {{ .enabled }}
@@ -82,7 +87,7 @@ cluster-autoscaler:
# enabled: {{ .enabled }}
{{- end }}
{{- if .Values.global.aws.region }}
{{- if eq .Values.global.platform "aws" }}
# AWS
awsRegion: {{ .Values.global.aws.region }}
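
The AWS gate changes from "a region is set" to an explicit platform switch. A sketch of the global values these conditionals now expect, with illustrative values; every key is taken from the template above:

  global:
    platform: aws            # now the sole trigger for the AWS-specific blocks
    clusterName: mycluster
    highAvailable: false     # when false, leader-elect: false is injected for cluster-autoscaler
    aws:
      region: eu-central-1   # still consumed for awsRegion / AWS_DEFAULT_REGION once platform is aws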

View File

@@ -20,6 +20,16 @@ cloudnative-pg:
{{- end }}
{{- end }}
{{- with index .Values "operators" "strimzi-kafka-operator" }}
strimzi-kafka-operator:
{{- toYaml . | nindent 2 }}
{{- with $.Values.metrics }}
monitoring:
podMonitorEnabled: {{ .enabled }}
{{- end }}
{{- end }}
{{- end }}
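
The new block forwards the chart-wide metrics toggle into the Strimzi subchart. Rendered with metrics.enabled set to true, the effective subchart values would be roughly:

  strimzi-kafka-operator:
    monitoring:
      podMonitorEnabled: true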

View File

@@ -1,5 +1,8 @@
{{- define "telemetry-values" }}
metrics:
enabled: {{ .Values.metrics.enabled }}
{{- if index .Values "telemetry" "fluent-bit" }}
fluent-bit:
{{- with index .Values.telemetry "fluent-bit" }}