fix: disable OverCommit alerts globally
commit fcb1ee5896
parent 144d826aaf
@@ -90,13 +90,11 @@ Kubernetes: `>= 1.26.0`
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].equal[0] | string | `"namespace"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].source_matchers[0] | string | `"alertname = InfoInhibitor"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].target_matchers[0] | string | `"severity = info"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].source_matchers[0] | string | `"alertname = ClusterAutoscalerNodeGroupsEnabled"` | |
| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].target_matchers[0] | string | `"alertname =~ \"KubeCPUOvercommit|KubeMemoryOvercommit\""` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"alertname"` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"severity"` | |
| kube-prometheus-stack.alertmanager.config.route.group_by[2] | string | `"status"` | |
| kube-prometheus-stack.alertmanager.config.route.group_interval | string | `"5m"` | |
| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"15s"` | |
| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"30s"` | |
| kube-prometheus-stack.alertmanager.config.route.repeat_interval | string | `"4h"` | |
| kube-prometheus-stack.alertmanager.config.route.routes[0].matchers[0] | string | `"severity = none"` | |
| kube-prometheus-stack.alertmanager.config.route.routes[0].receiver | string | `"null"` | |
@@ -18,7 +18,7 @@
          "subdir": "contrib/mixin"
        }
      },
      "version": "a45cc4961e827009dba7d164e441a0ebe2eef870",
      "version": "2b323071a8bd4f02ddaf63e0dfa1fd98c221dccb",
      "sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws="
    },
    {
@@ -42,20 +42,22 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
(
  if group.name == 'kubernetes-resources' then
    group {
      rules: [
        {
          alert: 'ClusterAutoscalerNodeGroupsEnabled',
          expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 or vector(1)',
          'for': '5m',
          labels: {
            severity: 'none',
          },
          annotations: {
            description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group',
            summary: 'Cluster Autoscaler found at least one node group.',
          },
      rules: std.filter(
        function(rule)
          rule.alert != 'KubeCPUOvercommit' && rule.alert != 'KubeMemoryOvercommit',
        group.rules
      ) + [{
        alert: 'ClusterAutoscalerNodeGroupsEnabled',
        expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0',
        'for': '5m',
        labels: {
          severity: 'none',
        },
      ] + super.rules,
        annotations: {
          description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group',
          summary: 'Cluster Autoscaler found at least one node group.',
        },
      }],
    }
  else
    group
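The net effect of this hunk is that KubeCPUOvercommit and KubeMemoryOvercommit are filtered out of the generated kubernetes-resources group at the source, rather than being inhibited in Alertmanager. The standalone sketch below illustrates the filtering pattern only; it assumes the kubernetes mixin rules are exposed via kube-prometheus's `kubernetesControlPlane.prometheusRule` object (as in recent kube-prometheus releases), and the actual override wiring in this repository may differ:

```jsonnet
// Illustrative sketch, not the repository's exact jsonnet: drop the two
// cluster-wide overcommit alerts from the generated kubernetes-resources
// rule group. Assumes the standard kube-prometheus layout, where the
// kubernetes mixin lives under kubernetesControlPlane.prometheusRule.
local kp = (import 'kube-prometheus/main.libsonnet') + {
  kubernetesControlPlane+: {
    prometheusRule+: {
      spec+: {
        groups: std.map(
          function(group)
            if group.name == 'kubernetes-resources' then
              group {
                // Keep every rule except KubeCPUOvercommit and KubeMemoryOvercommit;
                // recording rules (no 'alert' field) pass through untouched.
                rules: std.filter(
                  function(rule)
                    !std.objectHas(rule, 'alert')
                    || (rule.alert != 'KubeCPUOvercommit' && rule.alert != 'KubeMemoryOvercommit'),
                  group.rules
                ),
              }
            else
              group,
          super.groups
        ),
      },
    },
  },
};

// Rendering this object yields a rule file without the two overcommit alerts,
// matching the generated JSON/YAML changes further down in this commit.
kp.kubernetesControlPlane.prometheusRule
```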
@@ -228,44 +228,6 @@
    {
      "name": "kubernetes-resources",
      "rules": [
        {
          "alert": "ClusterAutoscalerNodeGroupsEnabled",
          "annotations": {
            "description": "Inhibitor rule if the Cluster Autoscaler found at least one node group",
            "summary": "Cluster Autoscaler found at least one node group."
          },
          "expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0",
          "for": "5m",
          "labels": {
            "severity": "none"
          }
        },
        {
          "alert": "KubeCPUOvercommit",
          "annotations": {
            "description": "Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.",
            "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit",
            "summary": "Cluster has overcommitted CPU resource requests."
          },
          "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) > 0\nand\n(sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) > 0\n",
          "for": "10m",
          "labels": {
            "severity": "warning"
          }
        },
        {
          "alert": "KubeMemoryOvercommit",
          "annotations": {
            "description": "Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.",
            "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit",
            "summary": "Cluster has overcommitted memory resource requests."
          },
          "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) > 0\nand\n(sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) > 0\n",
          "for": "10m",
          "labels": {
            "severity": "warning"
          }
        },
        {
          "alert": "KubeCPUQuotaOvercommit",
          "annotations": {
@@ -343,6 +305,18 @@
          "labels": {
            "severity": "info"
          }
        },
        {
          "alert": "ClusterAutoscalerNodeGroupsEnabled",
          "annotations": {
            "description": "Inhibitor rule if the Cluster Autoscaler found at least one node group",
            "summary": "Cluster Autoscaler found at least one node group."
          },
          "expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0",
          "for": "5m",
          "labels": {
            "severity": "none"
          }
        }
      ]
    },
@@ -166,44 +166,6 @@ spec:
            severity: warning
      - name: kubernetes-resources
        rules:
        - alert: ClusterAutoscalerNodeGroupsEnabled
          annotations:
            description: Inhibitor rule if the Cluster Autoscaler found at least one node group
            summary: Cluster Autoscaler found at least one node group.
          expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0
          for: 5m
          labels:
            severity: none
        - alert: KubeCPUOvercommit
          annotations:
            description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
            summary: Cluster has overcommitted CPU resource requests.
          expr: 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0

            and

            (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0

            '
          for: 10m
          labels:
            severity: warning
        - alert: KubeMemoryOvercommit
          annotations:
            description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
            summary: Cluster has overcommitted memory resource requests.
          expr: 'sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0

            and

            (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0

            '
          for: 10m
          labels:
            severity: warning
        - alert: KubeCPUQuotaOvercommit
          annotations:
            description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces.
@@ -258,6 +220,14 @@ spec:
          for: 15m
          labels:
            severity: info
        - alert: ClusterAutoscalerNodeGroupsEnabled
          annotations:
            description: Inhibitor rule if the Cluster Autoscaler found at least one node group
            summary: Cluster Autoscaler found at least one node group.
          expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0
          for: 5m
          labels:
            severity: none
      - name: kubernetes-storage
        rules:
        - alert: KubePersistentVolumeFillingUp
@@ -252,11 +252,6 @@ kube-prometheus-stack:
        - alertname = InfoInhibitor
        target_matchers:
        - severity = info
      # Disable cluster overcommit alerts if we have cluster autoscaler available
      - source_matchers:
        - alertname = ClusterAutoscalerNodeGroupsEnabled
        target_matchers:
        - alertname =~ "KubeCPUOvercommit|KubeMemoryOvercommit"
    alertmanagerSpec:
      # externalUrl:
      logFormat: json