fix: disable OverCommit alerts globally

This commit is contained in:
Stefan Reimer 2024-12-04 19:49:33 +00:00
parent 144d826aaf
commit fcb1ee5896
6 changed files with 37 additions and 98 deletions

View File

@ -90,13 +90,11 @@ Kubernetes: `>= 1.26.0`
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].equal[0] | string | `"namespace"` | | | kube-prometheus-stack.alertmanager.config.inhibit_rules[2].equal[0] | string | `"namespace"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].source_matchers[0] | string | `"alertname = InfoInhibitor"` | | | kube-prometheus-stack.alertmanager.config.inhibit_rules[2].source_matchers[0] | string | `"alertname = InfoInhibitor"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].target_matchers[0] | string | `"severity = info"` | | | kube-prometheus-stack.alertmanager.config.inhibit_rules[2].target_matchers[0] | string | `"severity = info"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].source_matchers[0] | string | `"alertname = ClusterAutoscalerNodeGroupsEnabled"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].target_matchers[0] | string | `"alertname =~ \"KubeCPUOvercommit|KubeMemoryOvercommit\""` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"alertname"` | | | kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"alertname"` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"severity"` | | | kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"severity"` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[2] | string | `"status"` | | | kube-prometheus-stack.alertmanager.config.route.group_by[2] | string | `"status"` | |
| kube-prometheus-stack.alertmanager.config.route.group_interval | string | `"5m"` | | | kube-prometheus-stack.alertmanager.config.route.group_interval | string | `"5m"` | |
| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"15s"` | | | kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"30s"` | |
| kube-prometheus-stack.alertmanager.config.route.repeat_interval | string | `"4h"` | | | kube-prometheus-stack.alertmanager.config.route.repeat_interval | string | `"4h"` | |
| kube-prometheus-stack.alertmanager.config.route.routes[0].matchers[0] | string | `"severity = none"` | | | kube-prometheus-stack.alertmanager.config.route.routes[0].matchers[0] | string | `"severity = none"` | |
| kube-prometheus-stack.alertmanager.config.route.routes[0].receiver | string | `"null"` | | | kube-prometheus-stack.alertmanager.config.route.routes[0].receiver | string | `"null"` | |

View File

@ -18,7 +18,7 @@
"subdir": "contrib/mixin" "subdir": "contrib/mixin"
} }
}, },
"version": "a45cc4961e827009dba7d164e441a0ebe2eef870", "version": "2b323071a8bd4f02ddaf63e0dfa1fd98c221dccb",
"sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws=" "sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws="
}, },
{ {

View File

@ -42,10 +42,13 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
( (
if group.name == 'kubernetes-resources' then if group.name == 'kubernetes-resources' then
group { group {
rules: [ rules: std.filter(
{ function(rule)
rule.alert != 'KubeCPUOvercommit' && rule.alert != 'KubeMemoryOvercommit',
group.rules
) + [{
alert: 'ClusterAutoscalerNodeGroupsEnabled', alert: 'ClusterAutoscalerNodeGroupsEnabled',
expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 or vector(1)', expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0',
'for': '5m', 'for': '5m',
labels: { labels: {
severity: 'none', severity: 'none',
@ -54,8 +57,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group', description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group',
summary: 'Cluster Autoscaler found at least one node group.', summary: 'Cluster Autoscaler found at least one node group.',
}, },
}, }],
] + super.rules,
} }
else else
group group

View File

@ -228,44 +228,6 @@
{ {
"name": "kubernetes-resources", "name": "kubernetes-resources",
"rules": [ "rules": [
{
"alert": "ClusterAutoscalerNodeGroupsEnabled",
"annotations": {
"description": "Inhibitor rule if the Cluster Autoscaler found at least one node group",
"summary": "Cluster Autoscaler found at least one node group."
},
"expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0",
"for": "5m",
"labels": {
"severity": "none"
}
},
{
"alert": "KubeCPUOvercommit",
"annotations": {
"description": "Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit",
"summary": "Cluster has overcommitted CPU resource requests."
},
"expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) > 0\nand\n(sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) > 0\n",
"for": "10m",
"labels": {
"severity": "warning"
}
},
{
"alert": "KubeMemoryOvercommit",
"annotations": {
"description": "Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit",
"summary": "Cluster has overcommitted memory resource requests."
},
"expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) > 0\nand\n(sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) > 0\n",
"for": "10m",
"labels": {
"severity": "warning"
}
},
{ {
"alert": "KubeCPUQuotaOvercommit", "alert": "KubeCPUQuotaOvercommit",
"annotations": { "annotations": {
@ -343,6 +305,18 @@
"labels": { "labels": {
"severity": "info" "severity": "info"
} }
},
{
"alert": "ClusterAutoscalerNodeGroupsEnabled",
"annotations": {
"description": "Inhibitor rule if the Cluster Autoscaler found at least one node group",
"summary": "Cluster Autoscaler found at least one node group."
},
"expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0",
"for": "5m",
"labels": {
"severity": "none"
}
} }
] ]
}, },

View File

@ -166,44 +166,6 @@ spec:
severity: warning severity: warning
- name: kubernetes-resources - name: kubernetes-resources
rules: rules:
- alert: ClusterAutoscalerNodeGroupsEnabled
annotations:
description: Inhibitor rule if the Cluster Autoscaler found at least one node group
summary: Cluster Autoscaler found at least one node group.
expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0
for: 5m
labels:
severity: none
- alert: KubeCPUOvercommit
annotations:
description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
and
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
'
for: 10m
labels:
severity: warning
- alert: KubeMemoryOvercommit
annotations:
description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: 'sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
and
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
'
for: 10m
labels:
severity: warning
- alert: KubeCPUQuotaOvercommit - alert: KubeCPUQuotaOvercommit
annotations: annotations:
description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces. description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces.
@ -258,6 +220,14 @@ spec:
for: 15m for: 15m
labels: labels:
severity: info severity: info
- alert: ClusterAutoscalerNodeGroupsEnabled
annotations:
description: Inhibitor rule if the Cluster Autoscaler found at least one node group
summary: Cluster Autoscaler found at least one node group.
expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0
for: 5m
labels:
severity: none
- name: kubernetes-storage - name: kubernetes-storage
rules: rules:
- alert: KubePersistentVolumeFillingUp - alert: KubePersistentVolumeFillingUp

View File

@ -252,11 +252,6 @@ kube-prometheus-stack:
- alertname = InfoInhibitor - alertname = InfoInhibitor
target_matchers: target_matchers:
- severity = info - severity = info
# Disable cluster overcommit alerts if we have cluster autoscaler available
- source_matchers:
- alertname = ClusterAutoscalerNodeGroupsEnabled
target_matchers:
- alertname =~ "KubeCPUOvercommit|KubeMemoryOvercommit"
alertmanagerSpec: alertmanagerSpec:
# externalUrl: # externalUrl:
logFormat: json logFormat: json