diff --git a/charts/kubezero-metrics/README.md b/charts/kubezero-metrics/README.md index 47f98bf1..388eeae4 100644 --- a/charts/kubezero-metrics/README.md +++ b/charts/kubezero-metrics/README.md @@ -90,13 +90,11 @@ Kubernetes: `>= 1.26.0` | kube-prometheus-stack.alertmanager.config.inhibit_rules[2].equal[0] | string | `"namespace"` | | | kube-prometheus-stack.alertmanager.config.inhibit_rules[2].source_matchers[0] | string | `"alertname = InfoInhibitor"` | | | kube-prometheus-stack.alertmanager.config.inhibit_rules[2].target_matchers[0] | string | `"severity = info"` | | -| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].source_matchers[0] | string | `"alertname = ClusterAutoscalerNodeGroupsEnabled"` | | -| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].target_matchers[0] | string | `"alertname =~ \"KubeCPUOvercommit|KubeMemoryOvercommit\""` | | | kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"alertname"` | | | kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"severity"` | | | kube-prometheus-stack.alertmanager.config.route.group_by[2] | string | `"status"` | | | kube-prometheus-stack.alertmanager.config.route.group_interval | string | `"5m"` | | -| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"15s"` | | +| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"30s"` | | | kube-prometheus-stack.alertmanager.config.route.repeat_interval | string | `"4h"` | | | kube-prometheus-stack.alertmanager.config.route.routes[0].matchers[0] | string | `"severity = none"` | | | kube-prometheus-stack.alertmanager.config.route.routes[0].receiver | string | `"null"` | | diff --git a/charts/kubezero-metrics/jsonnet/jsonnetfile.lock.json b/charts/kubezero-metrics/jsonnet/jsonnetfile.lock.json index 64c88cb6..21f5dda7 100644 --- a/charts/kubezero-metrics/jsonnet/jsonnetfile.lock.json +++ b/charts/kubezero-metrics/jsonnet/jsonnetfile.lock.json @@ -18,7 +18,7 @@ "subdir": "contrib/mixin" } }, - "version": "a45cc4961e827009dba7d164e441a0ebe2eef870", + "version": "2b323071a8bd4f02ddaf63e0dfa1fd98c221dccb", "sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws=" }, { diff --git a/charts/kubezero-metrics/jsonnet/rules.libsonnet b/charts/kubezero-metrics/jsonnet/rules.libsonnet index fabe38cf..b7328c50 100644 --- a/charts/kubezero-metrics/jsonnet/rules.libsonnet +++ b/charts/kubezero-metrics/jsonnet/rules.libsonnet @@ -42,20 +42,22 @@ local kp = (import 'kube-prometheus/main.libsonnet') + { ( if group.name == 'kubernetes-resources' then group { - rules: [ - { - alert: 'ClusterAutoscalerNodeGroupsEnabled', - expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 or vector(1)', - 'for': '5m', - labels: { - severity: 'none', - }, - annotations: { - description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group', - summary: 'Cluster Autoscaler found at least one node group.', - }, + rules: std.filter( + function(rule) + rule.alert != 'KubeCPUOvercommit' && rule.alert != 'KubeMemoryOvercommit', + group.rules + ) + [{ + alert: 'ClusterAutoscalerNodeGroupsEnabled', + expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0', + 'for': '5m', + labels: { + severity: 'none', }, - ] + super.rules, + annotations: { + description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group', + summary: 'Cluster Autoscaler found at least one node group.', + }, + }], } else group diff --git a/charts/kubezero-metrics/jsonnet/rules/kubernetes-prometheusRule b/charts/kubezero-metrics/jsonnet/rules/kubernetes-prometheusRule index 0e0e0cc2..0909ee9b 100644 --- a/charts/kubezero-metrics/jsonnet/rules/kubernetes-prometheusRule +++ b/charts/kubezero-metrics/jsonnet/rules/kubernetes-prometheusRule @@ -228,44 +228,6 @@ { "name": "kubernetes-resources", "rules": [ - { - "alert": "ClusterAutoscalerNodeGroupsEnabled", - "annotations": { - "description": "Inhibitor rule if the Cluster Autoscaler found at least one node group", - "summary": "Cluster Autoscaler found at least one node group." - }, - "expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0", - "for": "5m", - "labels": { - "severity": "none" - } - }, - { - "alert": "KubeCPUOvercommit", - "annotations": { - "description": "Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.", - "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit", - "summary": "Cluster has overcommitted CPU resource requests." - }, - "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) > 0\nand\n(sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) > 0\n", - "for": "10m", - "labels": { - "severity": "warning" - } - }, - { - "alert": "KubeMemoryOvercommit", - "annotations": { - "description": "Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.", - "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit", - "summary": "Cluster has overcommitted memory resource requests." - }, - "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) > 0\nand\n(sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) > 0\n", - "for": "10m", - "labels": { - "severity": "warning" - } - }, { "alert": "KubeCPUQuotaOvercommit", "annotations": { @@ -343,6 +305,18 @@ "labels": { "severity": "info" } + }, + { + "alert": "ClusterAutoscalerNodeGroupsEnabled", + "annotations": { + "description": "Inhibitor rule if the Cluster Autoscaler found at least one node group", + "summary": "Cluster Autoscaler found at least one node group." + }, + "expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0", + "for": "5m", + "labels": { + "severity": "none" + } } ] }, diff --git a/charts/kubezero-metrics/templates/rules/kubernetes.yaml b/charts/kubezero-metrics/templates/rules/kubernetes.yaml index a40e75c5..79d0ffdf 100644 --- a/charts/kubezero-metrics/templates/rules/kubernetes.yaml +++ b/charts/kubezero-metrics/templates/rules/kubernetes.yaml @@ -166,44 +166,6 @@ spec: severity: warning - name: kubernetes-resources rules: - - alert: ClusterAutoscalerNodeGroupsEnabled - annotations: - description: Inhibitor rule if the Cluster Autoscaler found at least one node group - summary: Cluster Autoscaler found at least one node group. - expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 - for: 5m - labels: - severity: none - - alert: KubeCPUOvercommit - annotations: - description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 - - and - - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 - - ' - for: 10m - labels: - severity: warning - - alert: KubeMemoryOvercommit - annotations: - description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. - expr: 'sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 - - and - - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 - - ' - for: 10m - labels: - severity: warning - alert: KubeCPUQuotaOvercommit annotations: description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces. @@ -258,6 +220,14 @@ spec: for: 15m labels: severity: info + - alert: ClusterAutoscalerNodeGroupsEnabled + annotations: + description: Inhibitor rule if the Cluster Autoscaler found at least one node group + summary: Cluster Autoscaler found at least one node group. + expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 + for: 5m + labels: + severity: none - name: kubernetes-storage rules: - alert: KubePersistentVolumeFillingUp diff --git a/charts/kubezero-metrics/values.yaml b/charts/kubezero-metrics/values.yaml index f249045e..ae1620f6 100644 --- a/charts/kubezero-metrics/values.yaml +++ b/charts/kubezero-metrics/values.yaml @@ -252,11 +252,6 @@ kube-prometheus-stack: - alertname = InfoInhibitor target_matchers: - severity = info - # Disable cluster overcommit alerts if we have cluster autoscaler available - - source_matchers: - - alertname = ClusterAutoscalerNodeGroupsEnabled - target_matchers: - - alertname =~ "KubeCPUOvercommit|KubeMemoryOvercommit" alertmanagerSpec: # externalUrl: logFormat: json