fix: various tweaks and fixes

Stefan Reimer 2024-12-04 18:40:10 +00:00
parent 2fa48e9fcd
commit 144d826aaf
14 changed files with 120 additions and 68 deletions

View File

@@ -227,7 +227,10 @@ cluster-autoscaler:
       scan-interval: 30s
       skip-nodes-with-local-storage: false
       balance-similar-node-groups: true
+      ignore-daemonsets-utilization: true
       ignore-taint: "node.cilium.io/agent-not-ready"
+      # Disable for non-clustered control-plane
+      # leader-elect: false
     #securityContext:
     #  runAsNonRoot: true
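
The hunk above adds ignore-daemonsets-utilization and documents an optional leader-elect override for single-node control planes. A minimal sketch of the resulting flag block, assuming these entries sit under the chart's extraArgs map (the addons template later in this commit uses the same key):

cluster-autoscaler:
  extraArgs:
    scan-interval: 30s
    skip-nodes-with-local-storage: false
    balance-similar-node-groups: true
    ignore-daemonsets-utilization: true
    ignore-taint: "node.cilium.io/agent-not-ready"
    # Disable for non-clustered control-plane
    # leader-elect: false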

View File

@@ -92,10 +92,11 @@ Kubernetes: `>= 1.26.0`
 | kube-prometheus-stack.alertmanager.config.inhibit_rules[2].target_matchers[0] | string | `"severity = info"` | |
 | kube-prometheus-stack.alertmanager.config.inhibit_rules[3].source_matchers[0] | string | `"alertname = ClusterAutoscalerNodeGroupsEnabled"` | |
 | kube-prometheus-stack.alertmanager.config.inhibit_rules[3].target_matchers[0] | string | `"alertname =~ \"KubeCPUOvercommit|KubeMemoryOvercommit\""` | |
-| kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"severity"` | |
-| kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"clusterName"` | |
+| kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"alertname"` | |
+| kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"severity"` | |
+| kube-prometheus-stack.alertmanager.config.route.group_by[2] | string | `"status"` | |
 | kube-prometheus-stack.alertmanager.config.route.group_interval | string | `"5m"` | |
-| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"10s"` | |
+| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"15s"` | |
 | kube-prometheus-stack.alertmanager.config.route.repeat_interval | string | `"4h"` | |
 | kube-prometheus-stack.alertmanager.config.route.routes[0].matchers[0] | string | `"severity = none"` | |
 | kube-prometheus-stack.alertmanager.config.route.routes[0].receiver | string | `"null"` | |

View File

@@ -18,7 +18,7 @@
           "subdir": "contrib/mixin"
         }
       },
-      "version": "19aa0dbe8fd6317a237bae9b6ea52a4f1b445b19",
+      "version": "a45cc4961e827009dba7d164e441a0ebe2eef870",
       "sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws="
     },
     {
@@ -88,7 +88,7 @@
           "subdir": "grafana-builder"
         }
       },
-      "version": "5a6b86b475e427b2dbd9e4af0bcafbb6da0507a5",
+      "version": "767befa8fb46a07be516dec2777d7d89909a529d",
       "sum": "yxqWcq/N3E/a/XreeU6EuE6X7kYPnG0AspAQFKOjASo="
     },
     {
@@ -118,8 +118,8 @@
           "subdir": ""
         }
       },
-      "version": "bdbf7f45cedf37d07567be7519fa4139043f9335",
-      "sum": "j4EAKfqkbPvBFGnBjt4hex2bdNHPpuFWrCxfq5L6EkU="
+      "version": "a3fbf21977deb89b7d843eb8371170c011ea6835",
+      "sum": "57zW2IGJ9zbYd8BI0qe6JkoWTRSMNiBUWC6+YcnEsWo="
     },
     {
       "source": {
@@ -128,7 +128,7 @@
           "subdir": "jsonnet/kube-state-metrics"
         }
       },
-      "version": "17151aca659e0659259b5e1f5675acf849281ade",
+      "version": "32e7727ff4613b0f55dfc18aff15afb8c04d03c5",
       "sum": "lO7jUSzAIy8Yk9pOWJIWgPRhubkWzVh56W6wtYfbVH4="
     },
     {
@@ -138,7 +138,7 @@
           "subdir": "jsonnet/kube-state-metrics-mixin"
         }
       },
-      "version": "17151aca659e0659259b5e1f5675acf849281ade",
+      "version": "32e7727ff4613b0f55dfc18aff15afb8c04d03c5",
       "sum": "qclI7LwucTjBef3PkGBkKxF0mfZPbHnn4rlNWKGtR4c="
     },
     {
@@ -148,8 +148,8 @@
           "subdir": "jsonnet/kube-prometheus"
         }
       },
-      "version": "c503e5cc5403dd5d56b1c0c5933827baee64aeaf",
-      "sum": "fJqINQiYJPmllXFFO+Hl5HrPYANMbhHFUQ28tl0Vi00="
+      "version": "7e5a571a3fb735c78e17c76a637eb7e8bb5dd086",
+      "sum": "uTw/Mj+X91S+oqUpAX81xcfWPDlox0tdSZY/YBw7nGE="
     },
     {
       "source": {
@@ -158,7 +158,7 @@
           "subdir": "jsonnet/mixin"
         }
       },
-      "version": "e951bd3037a053fea681510ccde211c28dc657e1",
+      "version": "a366602bacb2c8d773a9cee058b6971b8d2e3732",
       "sum": "gi+knjdxs2T715iIQIntrimbHRgHnpM8IFBJDD1gYfs=",
       "name": "prometheus-operator-mixin"
     },
@@ -169,8 +169,8 @@
           "subdir": "jsonnet/prometheus-operator"
         }
       },
-      "version": "e951bd3037a053fea681510ccde211c28dc657e1",
-      "sum": "YOJjmladGD1PcgNae0h88Mm235CsZSfwf2a4DIcMJFU="
+      "version": "a366602bacb2c8d773a9cee058b6971b8d2e3732",
+      "sum": "z0/lCiMusMHTqntsosMVGYkVcSZjCpyZBmUMVUsK5nA="
     },
     {
       "source": {
@@ -179,7 +179,7 @@
           "subdir": "doc/alertmanager-mixin"
         }
       },
-      "version": "f6b942cf9b3a503d59192eada300d2ad97cba82f",
+      "version": "0f65e8fa5fc72d2678655105c0213b416ca6f34c",
       "sum": "Mf4h1BYLle2nrgjf/HXrBbl0Zk8N+xaoEM017o0BC+k=",
       "name": "alertmanager"
     },
@@ -190,7 +190,7 @@
           "subdir": "docs/node-mixin"
         }
       },
-      "version": "49d177bf95417b117ab612a376e2434d5dd61c2d",
+      "version": "cf8c6891cc610e54f70383addd4bb6079f0add35",
       "sum": "cQCW+1N0Xae5yXecCWDK2oAlN0luBS/5GrwBYSlaFms="
     },
     {
@@ -200,8 +200,8 @@
           "subdir": "documentation/prometheus-mixin"
         }
       },
-      "version": "789c9b1a5e455850ed9b3c89cafb37df75ce1e50",
-      "sum": "dYLcLzGH4yF3qB7OGC/7z4nqeTNjv42L7Q3BENU8XJI=",
+      "version": "b407c2930da4f50c0d17fc39404c6302a9eb740b",
+      "sum": "OYT5u3S8DbamuJV/v3gbWSteOvFzMeNwMj+u4Apk7jM=",
       "name": "prometheus"
     },
     {
@@ -222,7 +222,7 @@
           "subdir": "mixin"
         }
       },
-      "version": "f9da21ec0b28073875520159fe72ab744c255b2e",
+      "version": "7037331e6ea7dbe85a1b7af37bf8ea277a80663d",
       "sum": "ieCD4eMgGbOlrI8GmckGPHBGQDcLasE1rULYq56W/bs=",
       "name": "thanos-mixin"
     }

View File

@@ -29,14 +29,43 @@ local etcdMixin = addMixin({
   },
 });
-local kp = (import 'kube-prometheus/main.libsonnet') +
-  {
-    values+:: {
-      common+: {
-        namespace: 'monitoring',
-      },
-    },
-  };
+local kp = (import 'kube-prometheus/main.libsonnet') + {
+  values+:: {
+    common+: {
+      namespace: 'monitoring',
+    },
+  },
+  kubernetesControlPlane+: {
+    prometheusRule+: {
+      spec+: {
+        groups: [
+          (
+            if group.name == 'kubernetes-resources' then
+              group {
+                rules: [
+                  {
+                    alert: 'ClusterAutoscalerNodeGroupsEnabled',
+                    expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 or vector(1)',
+                    'for': '5m',
+                    labels: {
+                      severity: 'none',
+                    },
+                    annotations: {
+                      description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group',
+                      summary: 'Cluster Autoscaler found at least one node group.',
+                    },
+                  },
+                ] + super.rules,
+              }
+            else
+              group
+          )
+          for group in super.groups
+        ],
+      },
+    },
+  },
+};
 // We just want the Prometheus Rules
 { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } +
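
The jsonnet overlay above splices an inhibitor alert into kube-prometheus' existing kubernetes-resources rule group instead of shipping it as a separate PrometheusRule (the standalone zdt-inhibitors template is deleted further down). A sketch of the rule the overlay is intended to render; the "or vector(1)" tail guarantees the expression always returns at least one sample, so the inhibitor keeps firing even when the autoscaler metric is absent:

    - alert: ClusterAutoscalerNodeGroupsEnabled
      # "> 0" matches when at least one autoscaled node group is reported;
      # "or vector(1)" adds a constant sample as a fallback.
      expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 or vector(1)
      for: 5m
      labels:
        severity: none
      annotations:
        description: Inhibitor rule if the Cluster Autoscaler found at least one node group
        summary: Cluster Autoscaler found at least one node group.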

View File

@@ -123,7 +123,7 @@
         {
           "alert": "KubeDaemonSetRolloutStuck",
           "annotations": {
-            "description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.",
+            "description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15m.",
             "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck",
             "summary": "DaemonSet rollout is stuck."
           },
@@ -228,6 +228,18 @@
     {
       "name": "kubernetes-resources",
       "rules": [
+        {
+          "alert": "ClusterAutoscalerNodeGroupsEnabled",
+          "annotations": {
+            "description": "Inhibitor rule if the Cluster Autoscaler found at least one node group",
+            "summary": "Cluster Autoscaler found at least one node group."
+          },
+          "expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0",
+          "for": "5m",
+          "labels": {
+            "severity": "none"
+          }
+        },
         {
           "alert": "KubeCPUOvercommit",
           "annotations": {
@@ -506,7 +518,7 @@
         {
           "alert": "KubeClientCertificateExpiration",
           "annotations": {
-            "description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.",
+            "description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days on cluster {{ $labels.cluster }}.",
             "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration",
             "summary": "Client certificate is about to expire."
           },
@@ -519,7 +531,7 @@
         {
           "alert": "KubeClientCertificateExpiration",
           "annotations": {
-            "description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.",
+            "description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours on cluster {{ $labels.cluster }}.",
             "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration",
             "summary": "Client certificate is about to expire."
           },

View File

@@ -6,7 +6,7 @@
       "app.kubernetes.io/component": "controller",
       "app.kubernetes.io/name": "prometheus-operator",
       "app.kubernetes.io/part-of": "kube-prometheus",
-      "app.kubernetes.io/version": "0.78.1",
+      "app.kubernetes.io/version": "0.78.2",
       "prometheus": "k8s",
       "role": "alert-rules"
     },

View File

@@ -7,7 +7,7 @@
       "app.kubernetes.io/instance": "k8s",
       "app.kubernetes.io/name": "prometheus",
       "app.kubernetes.io/part-of": "kube-prometheus",
-      "app.kubernetes.io/version": "2.55.1",
+      "app.kubernetes.io/version": "3.0.1",
       "prometheus": "k8s",
       "role": "alert-rules"
     },
@@ -74,9 +74,9 @@
         {
           "alert": "PrometheusErrorSendingAlertsToSomeAlertmanagers",
           "annotations": {
-            "description": "{{ printf \"%.1f\" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.",
+            "description": "{{ printf \"%.1f\" $value }}% of alerts sent by Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} were affected by errors.",
             "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers",
-            "summary": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager."
+            "summary": "More than 1% of alerts sent by Prometheus to a specific Alertmanager were affected by errors."
           },
           "expr": "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n",
           "for": "15m",

View File

@@ -89,7 +89,7 @@ spec:
         severity: warning
     - alert: KubeDaemonSetRolloutStuck
       annotations:
-        description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.
+        description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15m.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
         summary: DaemonSet rollout is stuck.
       expr: "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
@@ -166,6 +166,14 @@ spec:
         severity: warning
   - name: kubernetes-resources
     rules:
+    - alert: ClusterAutoscalerNodeGroupsEnabled
+      annotations:
+        description: Inhibitor rule if the Cluster Autoscaler found at least one node group
+        summary: Cluster Autoscaler found at least one node group.
+      expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0
+      for: 5m
+      labels:
+        severity: none
     - alert: KubeCPUOvercommit
       annotations:
         description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.
@@ -395,7 +403,7 @@ spec:
     rules:
     - alert: KubeClientCertificateExpiration
       annotations:
-        description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
+        description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days on cluster {{`{{`}} $labels.cluster {{`}}`}}.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
         summary: Client certificate is about to expire.
       expr: 'histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@@ -410,7 +418,7 @@ spec:
         severity: warning
     - alert: KubeClientCertificateExpiration
       annotations:
-        description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.
+        description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours on cluster {{`{{`}} $labels.cluster {{`}}`}}.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
         summary: Client certificate is about to expire.
       expr: 'histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400

View File

@@ -57,9 +57,9 @@ spec:
         severity: warning
     - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
       annotations:
-        description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
+        description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% of alerts sent by Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}} were affected by errors.'
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
-        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
+        summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager were affected by errors.
       expr: "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n"
       for: 15m
       labels:

View File

@@ -1,19 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "zdt-inhibitors" | trunc 63 | trimSuffix "-" }}
-  namespace: {{ .Release.Namespace }}
-  labels:
-    {{- include "kubezero-lib.labels" . | nindent 4 }}
-spec:
-  groups:
-  - name: zdt-inhibitors
-    rules:
-    - alert: ClusterAutoscalerNodeGroupsEnabled
-      annotations:
-        description: "This rule is meant to inhibit other rules and should not be forwarded.\nThe Cluster Autoscaler found at least one node group"
-        summary: Cluster Autoscaler found at least one node group.
-      expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0'
-      for: 15m
-      labels:
-        severity: none

View File

@@ -223,8 +223,8 @@ kube-prometheus-stack:
       global:
         resolve_timeout: 5m
       route:
-        group_by: ['severity', 'clusterName']
-        group_wait: 10s
+        group_by: ['alertname', 'severity', 'status']
+        group_wait: 30s
         group_interval: 5m
         repeat_interval: 4h
         routes:
@@ -252,7 +252,7 @@ kube-prometheus-stack:
           - alertname = InfoInhibitor
           target_matchers:
           - severity = info
-        # Disable cluster overcommiy alerts if we have cluster autoscaler available
+        # Disable cluster overcommit alerts if we have cluster autoscaler available
        - source_matchers:
          - alertname = ClusterAutoscalerNodeGroupsEnabled
          target_matchers:
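
Putting the route and inhibit changes together, the resulting Alertmanager fragment looks roughly like the sketch below (reassembled from this hunk and the README rows above; receivers and the remaining routes and inhibit rules are omitted):

alertmanager:
  config:
    route:
      group_by: ['alertname', 'severity', 'status']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      routes:
        - receiver: 'null'
          matchers:
            - severity = none
    inhibit_rules:
      # Disable cluster overcommit alerts if we have cluster autoscaler available
      - source_matchers:
          - alertname = ClusterAutoscalerNodeGroupsEnabled
        target_matchers:
          - alertname =~ "KubeCPUOvercommit|KubeMemoryOvercommit"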

View File

@@ -6,7 +6,7 @@ clusterBackup:
 {{- toYaml . | nindent 2 }}
 {{- end }}
-{{- if .Values.global.aws.region }}
+{{- if eq .Values.global.platform "aws" }}
   # AWS
   extraEnv:
   - name: AWS_DEFAULT_REGION
@@ -20,7 +20,7 @@ forseti:
 {{- toYaml . | nindent 2 }}
 {{- end }}
-{{- if .Values.global.aws.region }}
+{{- if eq .Values.global.platform "aws" }}
   # AWS
   aws:
     region: {{ $.Values.global.aws.region }}
@@ -34,7 +34,7 @@ external-dns:
 {{- toYaml . | nindent 2 }}
 {{- end }}
-{{- if .Values.global.aws.region }}
+{{- if eq .Values.global.platform "aws" }}
   # AWS
   txtOwnerId: {{ .Values.global.clusterName }}
   provider: aws
@@ -67,13 +67,18 @@ external-dns:
 cluster-autoscaler:
   enabled: {{ ternary "true" "false" (or (hasKey .Values.global.aws "region") (index .Values "addons" "cluster-autoscaler" "enabled")) }}
+  autoDiscovery:
+    clusterName: {{ .Values.global.clusterName }}
+  {{- if not .Values.global.highAvailable }}
+  extraArgs:
+    leader-elect: false
+  {{- end }}
 {{- with omit (index .Values "addons" "cluster-autoscaler") "enabled" }}
 {{- toYaml . | nindent 2 }}
 {{- end }}
-  autoDiscovery:
-    clusterName: {{ .Values.global.clusterName }}
 {{- with .Values.metrics }}
   serviceMonitor:
     enabled: {{ .enabled }}
@@ -82,7 +87,7 @@ cluster-autoscaler:
   # enabled: {{ .enabled }}
 {{- end }}
-{{- if .Values.global.aws.region }}
+{{- if eq .Values.global.platform "aws" }}
   # AWS
   awsRegion: {{ .Values.global.aws.region }}
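
The template now keys its AWS-specific blocks off .Values.global.platform instead of the mere presence of .Values.global.aws.region, and drops leader election for non-HA clusters. A hypothetical globals section that would drive these conditionals (key names are taken from the template above; the values are illustrative only):

global:
  clusterName: example-cluster   # illustrative value
  platform: aws                  # enables the 'eq .Values.global.platform "aws"' blocks
  highAvailable: false           # false adds extraArgs leader-elect: false to cluster-autoscaler
  aws:
    region: eu-central-1         # illustrative; still consumed for AWS_DEFAULT_REGION / awsRegion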

View File

@@ -20,6 +20,16 @@ cloudnative-pg:
 {{- end }}
 {{- end }}
+{{- with index .Values "operators" "strimzi-kafka-operator" }}
+strimzi-kafka-operator:
+  {{- toYaml . | nindent 2 }}
+  {{- with $.Values.metrics }}
+  monitoring:
+    podMonitorEnabled: {{ .enabled }}
+  {{- end }}
+{{- end }}
 {{- end }}
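
The new block only renders when a strimzi-kafka-operator key exists under operators, and wires the umbrella chart's metrics toggle into the operator's podMonitor setting. A hypothetical values snippet that would activate it (the enabled flag is illustrative; any non-empty map under the key triggers the with-block):

metrics:
  enabled: true
operators:
  strimzi-kafka-operator:
    enabled: true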

View File

@@ -1,5 +1,8 @@
 {{- define "telemetry-values" }}
+metrics:
+  enabled: {{ .Values.metrics.enabled }}
 {{- if index .Values "telemetry" "fluent-bit" }}
 fluent-bit:
 {{- with index .Values.telemetry "fluent-bit" }}