fix: disable OverCommit alerts globally
This commit is contained in:
parent
144d826aaf
commit
fcb1ee5896
@ -90,13 +90,11 @@ Kubernetes: `>= 1.26.0`
|
|||||||
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].equal[0] | string | `"namespace"` | |
|
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].equal[0] | string | `"namespace"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].source_matchers[0] | string | `"alertname = InfoInhibitor"` | |
|
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].source_matchers[0] | string | `"alertname = InfoInhibitor"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].target_matchers[0] | string | `"severity = info"` | |
|
| kube-prometheus-stack.alertmanager.config.inhibit_rules[2].target_matchers[0] | string | `"severity = info"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].source_matchers[0] | string | `"alertname = ClusterAutoscalerNodeGroupsEnabled"` | |
|
|
||||||
| kube-prometheus-stack.alertmanager.config.inhibit_rules[3].target_matchers[0] | string | `"alertname =~ \"KubeCPUOvercommit|KubeMemoryOvercommit\""` | |
|
|
||||||
| kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"alertname"` | |
|
| kube-prometheus-stack.alertmanager.config.route.group_by[0] | string | `"alertname"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"severity"` | |
|
| kube-prometheus-stack.alertmanager.config.route.group_by[1] | string | `"severity"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.route.group_by[2] | string | `"status"` | |
|
| kube-prometheus-stack.alertmanager.config.route.group_by[2] | string | `"status"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.route.group_interval | string | `"5m"` | |
|
| kube-prometheus-stack.alertmanager.config.route.group_interval | string | `"5m"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"15s"` | |
|
| kube-prometheus-stack.alertmanager.config.route.group_wait | string | `"30s"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.route.repeat_interval | string | `"4h"` | |
|
| kube-prometheus-stack.alertmanager.config.route.repeat_interval | string | `"4h"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.route.routes[0].matchers[0] | string | `"severity = none"` | |
|
| kube-prometheus-stack.alertmanager.config.route.routes[0].matchers[0] | string | `"severity = none"` | |
|
||||||
| kube-prometheus-stack.alertmanager.config.route.routes[0].receiver | string | `"null"` | |
|
| kube-prometheus-stack.alertmanager.config.route.routes[0].receiver | string | `"null"` | |
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
"subdir": "contrib/mixin"
|
"subdir": "contrib/mixin"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "a45cc4961e827009dba7d164e441a0ebe2eef870",
|
"version": "2b323071a8bd4f02ddaf63e0dfa1fd98c221dccb",
|
||||||
"sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws="
|
"sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -42,10 +42,13 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
|
|||||||
(
|
(
|
||||||
if group.name == 'kubernetes-resources' then
|
if group.name == 'kubernetes-resources' then
|
||||||
group {
|
group {
|
||||||
rules: [
|
rules: std.filter(
|
||||||
{
|
function(rule)
|
||||||
|
rule.alert != 'KubeCPUOvercommit' && rule.alert != 'KubeMemoryOvercommit',
|
||||||
|
group.rules
|
||||||
|
) + [{
|
||||||
alert: 'ClusterAutoscalerNodeGroupsEnabled',
|
alert: 'ClusterAutoscalerNodeGroupsEnabled',
|
||||||
expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0 or vector(1)',
|
expr: 'cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0',
|
||||||
'for': '5m',
|
'for': '5m',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'none',
|
severity: 'none',
|
||||||
@ -54,8 +57,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
|
|||||||
description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group',
|
description: 'Inhibitor rule if the Cluster Autoscaler found at least one node group',
|
||||||
summary: 'Cluster Autoscaler found at least one node group.',
|
summary: 'Cluster Autoscaler found at least one node group.',
|
||||||
},
|
},
|
||||||
},
|
}],
|
||||||
] + super.rules,
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
group
|
group
|
||||||
|
@ -228,44 +228,6 @@
|
|||||||
{
|
{
|
||||||
"name": "kubernetes-resources",
|
"name": "kubernetes-resources",
|
||||||
"rules": [
|
"rules": [
|
||||||
{
|
|
||||||
"alert": "ClusterAutoscalerNodeGroupsEnabled",
|
|
||||||
"annotations": {
|
|
||||||
"description": "Inhibitor rule if the Cluster Autoscaler found at least one node group",
|
|
||||||
"summary": "Cluster Autoscaler found at least one node group."
|
|
||||||
},
|
|
||||||
"expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0",
|
|
||||||
"for": "5m",
|
|
||||||
"labels": {
|
|
||||||
"severity": "none"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"alert": "KubeCPUOvercommit",
|
|
||||||
"annotations": {
|
|
||||||
"description": "Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.",
|
|
||||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit",
|
|
||||||
"summary": "Cluster has overcommitted CPU resource requests."
|
|
||||||
},
|
|
||||||
"expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) > 0\nand\n(sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) > 0\n",
|
|
||||||
"for": "10m",
|
|
||||||
"labels": {
|
|
||||||
"severity": "warning"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"alert": "KubeMemoryOvercommit",
|
|
||||||
"annotations": {
|
|
||||||
"description": "Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.",
|
|
||||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit",
|
|
||||||
"summary": "Cluster has overcommitted memory resource requests."
|
|
||||||
},
|
|
||||||
"expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) > 0\nand\n(sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) > 0\n",
|
|
||||||
"for": "10m",
|
|
||||||
"labels": {
|
|
||||||
"severity": "warning"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"alert": "KubeCPUQuotaOvercommit",
|
"alert": "KubeCPUQuotaOvercommit",
|
||||||
"annotations": {
|
"annotations": {
|
||||||
@ -343,6 +305,18 @@
|
|||||||
"labels": {
|
"labels": {
|
||||||
"severity": "info"
|
"severity": "info"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "ClusterAutoscalerNodeGroupsEnabled",
|
||||||
|
"annotations": {
|
||||||
|
"description": "Inhibitor rule if the Cluster Autoscaler found at least one node group",
|
||||||
|
"summary": "Cluster Autoscaler found at least one node group."
|
||||||
|
},
|
||||||
|
"expr": "cluster_autoscaler_node_groups_count{job=\"addons-aws-cluster-autoscaler\",node_group_type=\"autoscaled\"} > 0",
|
||||||
|
"for": "5m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "none"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -166,44 +166,6 @@ spec:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- name: kubernetes-resources
|
- name: kubernetes-resources
|
||||||
rules:
|
rules:
|
||||||
- alert: ClusterAutoscalerNodeGroupsEnabled
|
|
||||||
annotations:
|
|
||||||
description: Inhibitor rule if the Cluster Autoscaler found at least one node group
|
|
||||||
summary: Cluster Autoscaler found at least one node group.
|
|
||||||
expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: none
|
|
||||||
- alert: KubeCPUOvercommit
|
|
||||||
annotations:
|
|
||||||
description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
|
||||||
summary: Cluster has overcommitted CPU resource requests.
|
|
||||||
expr: 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
|
||||||
|
|
||||||
and
|
|
||||||
|
|
||||||
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
|
||||||
|
|
||||||
'
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeMemoryOvercommit
|
|
||||||
annotations:
|
|
||||||
description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
|
||||||
summary: Cluster has overcommitted memory resource requests.
|
|
||||||
expr: 'sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
|
||||||
|
|
||||||
and
|
|
||||||
|
|
||||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
|
||||||
|
|
||||||
'
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeCPUQuotaOvercommit
|
- alert: KubeCPUQuotaOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces.
|
description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces.
|
||||||
@ -258,6 +220,14 @@ spec:
|
|||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
- alert: ClusterAutoscalerNodeGroupsEnabled
|
||||||
|
annotations:
|
||||||
|
description: Inhibitor rule if the Cluster Autoscaler found at least one node group
|
||||||
|
summary: Cluster Autoscaler found at least one node group.
|
||||||
|
expr: cluster_autoscaler_node_groups_count{job="addons-aws-cluster-autoscaler",node_group_type="autoscaled"} > 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: none
|
||||||
- name: kubernetes-storage
|
- name: kubernetes-storage
|
||||||
rules:
|
rules:
|
||||||
- alert: KubePersistentVolumeFillingUp
|
- alert: KubePersistentVolumeFillingUp
|
||||||
|
@ -252,11 +252,6 @@ kube-prometheus-stack:
|
|||||||
- alertname = InfoInhibitor
|
- alertname = InfoInhibitor
|
||||||
target_matchers:
|
target_matchers:
|
||||||
- severity = info
|
- severity = info
|
||||||
# Disable cluster overcommit alerts if we have cluster autoscaler available
|
|
||||||
- source_matchers:
|
|
||||||
- alertname = ClusterAutoscalerNodeGroupsEnabled
|
|
||||||
target_matchers:
|
|
||||||
- alertname =~ "KubeCPUOvercommit|KubeMemoryOvercommit"
|
|
||||||
alertmanagerSpec:
|
alertmanagerSpec:
|
||||||
# externalUrl:
|
# externalUrl:
|
||||||
logFormat: json
|
logFormat: json
|
||||||
|
Loading…
Reference in New Issue
Block a user