fix(metrics): More fixes around AlertLabels

Stefan Reimer 2021-09-29 16:33:33 +02:00
parent 3056e1614f
commit c97c9e6d91
19 changed files with 20 additions and 524 deletions

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero-metrics
description: KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
type: application
version: 0.5.2
version: 0.5.4
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:

View File

@ -1,6 +1,6 @@
# kubezero-metrics
![Version: 0.5.2](https://img.shields.io/badge/Version-0.5.2-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
![Version: 0.5.4](https://img.shields.io/badge/Version-0.5.4-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.

View File

@ -1,16 +1,15 @@
#!/bin/bash -ex
JB='./jb-linux-amd64'
which jsonnet > /dev/null || { echo "Required jsonnet not found!"; exit 1;}
which jb > /dev/null || { echo "Required jb ( json-bundler ) not found!"; exit 1;}
which jsonnet > /dev/null || { echo "Required jsonnet not found!"; }
[ -x $JB ] || { wget https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.4.0/jb-linux-amd64 && chmod +x $JB; }
#which gojsontoyaml || go install github.com/brancz/gojsontoyaml@latest
# wget https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.4.0/jb-linux-amd64
[ -r jsonnetfile.json ] || $JB init
[ -r jsonnetfile.json ] || jb init
if [ -r jsonnetfile.lock.json ]; then
$JB update
jb update
else
$JB install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@main
jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@main
fi
rm -rf dashboards && mkdir -p dashboards

View File

@ -38,7 +38,7 @@
"subdir": "grafana-builder"
}
},
"version": "bc9b685050691a78ee414cd8f789857de0eabe8d",
"version": "b7eae75972a369bf8ebfb03dcb0d4c14464ef85a",
"sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8="
},
{
@ -98,7 +98,7 @@
"subdir": "jsonnet/mixin"
}
},
"version": "c6b5cc2e178002683ab3afe8d023a8bb2508ba19",
"version": "42fc15967e35e0cca68cf935f844086edbc82d0e",
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=",
"name": "prometheus-operator-mixin"
},
@ -109,7 +109,7 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "c6b5cc2e178002683ab3afe8d023a8bb2508ba19",
"version": "42fc15967e35e0cca68cf935f844086edbc82d0e",
"sum": "sECNXs/aIEreFUma1BWVyknBygqh3AVJEB3msmrAYYY="
},
{

View File

@ -2,8 +2,6 @@
rules:
- name: alertmanager
url: file://rules/alertmanager-prometheusRule
- name: etcd-mixin
url: file://rules/etcd-mixin-prometheusRule
- name: kube-prometheus
url: file://rules/kube-prometheus-prometheusRule
- name: kube-state-metrics
@ -16,3 +14,5 @@ rules:
url: file://rules/prometheus-operator-prometheusRule
- name: prometheus
url: file://rules/prometheus-prometheusRule
- name: etcd-mixin
url: file://rules/etcd-mixin-prometheusRule

View File

@ -1,6 +1,6 @@
configmap: grafana-dashboards-metrics
gzip: true
folder: Metrics
folder: KubeZero
dashboards:
- name: AlertManager
url: https://grafana.com/api/dashboards/9578/revisions/4/download

View File

@ -1,5 +1,6 @@
configmap: grafana-dashboards-zdt
gzip: true
folder: KubeZero
dashboards:
- name: home
url: file://zdt/home.json

View File

@ -61,19 +61,9 @@ for r in config['rules']:
# Encode {{ }} for helm
text = text.replace("{{", "{{`{{").replace("}}", "}}`}}").replace("{{`{{", "{{`{{`}}").replace("}}`}}", "{{`}}`}}")
text = textwrap.indent(text, ' '*2)
# add support for additionalRuleLabels from Helm values
helm_labels = '''{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
'''
helm_labels = textwrap.indent(helm_labels, ' '*8)
text = text.replace("labels:\n", "labels:\n{}".format(helm_labels))
rule += '''spec:\n'''
rule += text+'\n'
rule += textwrap.indent(text, ' '*2)+'\n'
if 'condition' in r:
rule += '{{- end }}'+'\n'
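For context on the hunk above: the brace-escaping chain that stays in the script wraps every `{{` and `}}` coming from the upstream mixins in Helm template literals, so `helm template` reproduces them verbatim instead of trying to evaluate them, and the rule body is then indented so it nests under the generated `spec:` key. A minimal standalone sketch of that transformation (the `helmify` helper and the sample rule are illustrative, not part of the repository's sync script):

import textwrap

def helmify(rule_yaml: str) -> str:
    # Escape Go-template braces for Helm: the replace chain turns every
    # `{{` into {{`{{`}} and every `}}` into {{`}}`}}, which Helm renders
    # back to literal `{{` / `}}` in the final manifest.
    text = rule_yaml.replace("{{", "{{`{{").replace("}}", "}}`}}")
    text = text.replace("{{`{{", "{{`{{`}}").replace("}}`}}", "{{`}}`}}")
    # Nest the escaped rule body under spec: (indentation width here is
    # illustrative; the real script assembles the full PrometheusRule).
    return "spec:\n" + textwrap.indent(text, "  ")

sample = (
    "groups:\n"
    "- name: demo\n"
    "  rules:\n"
    "  - alert: DemoAlert\n"
    "    annotations:\n"
    '      summary: "{{ $labels.pod }} is down"\n'
)
print(helmify(sample))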

File diff suppressed because one or more lines are too long

View File

@ -6,6 +6,8 @@ metadata:
labels:
grafana_dashboard: "1"
{{- include "kubezero-lib.labels" . | nindent 4 }}
annotations:
k8s-sidecar-target-directory: KubeZero
binaryData:
home.json.gz:
H4sIAAAAAAAC/9VVS2/UMBC+8yuMDwikdnezj1b0VrVCVLwKFJBoq8obTxJrHTvYTndb1P+O7byczULFDS6J55vxeB6fxz+fIISJENIQw6TQ+Aj9tJAFOdPGSpdeQjXqNcuScXMmrDLa61BKDNGyVDFYBT5XMgeTQalxYAOCLLnTG1VCgGeM7kBZLMWJ5FI5hypdkueTPTSNIvtZLPZQ9CJ0LUjuDz7uckHP0DEHZXohmLvC21Gis6UkiuJa9+D/1/b74MwxUGa2osWpAHNGLSJKzitEkSK7kJIbVlh84kHmTKYv/ZozsXJVvbz2YkEEcN3Wtalqv3qNd69JGHB6IkXC0rY51RZISMmNb9lDkKG8BaVsQatTm+xaf6nVnUvd95VZcRb4WLsE5gGwaZKr5TsnD3z7xA9aURZ9TnnQNtWAcNTCTy+/n16gtzKV188zYwp9NB7HVIzuQcl9KtfCsBxGtuZjojUYPV6VS3DKMbd79nVOON8/mI8KkaIrfE/NFX5xJb4Bjy35kJHojbX/bu3ROzCKxRo9RW0cQTK59PTDOVErdyweJFbwMmXiKyhtE3Kmh6PFaNb6wC7QV5by281z+OeMJWaoMJ5a+LUNNfBTs9PAxlRR1DH8S0SJFltMiaaPMGW2mykt47Zuye/Jk0hOQfkrGJ6Rk82ZgdwZz0L8RwnKnY/DbutMrl8DoUykejh0nPYTxJah/O4rgzXQ3TafgajYVSMhXA+Uhii1a6ch6W+q/SjDmp1DCp02o0wPieTGnJ/k/y+ZesAf2DT/ezYB0C+Kuzo142c4epigsBltcj6cCm0H3hID2qD3sNYosWMA+aFzat2gC7brfgtrWbfEfqunQUGiQLua4NmkaiXWcQY56VgxPaxgc8fr9qpVZdnjh50fecHtKyjS4YPePXEux06fVPMLC7nej7I6ZmxkjeHetoLFK1DdZvuCUxA9wjcJ3TA78NUtCZ49q1wEZI0mgTALhSjv1otgHYXCbBJqsm49DdZR/dBfN3nZJG46Vlw+ekro+CB0HJ4ynYcC7daHNIx3O5Zm6neF7RX7XgqvXyq51lZZK2ryOa45qjmmofY5waW/FPjg4wdIFy/f3VfobUul+ZOHX/6tTqj9CQAA
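The `gzip: true` setting in the dashboards config above is what produces the `home.json.gz` entry: the dashboard JSON is gzip-compressed and stored base64-encoded under the ConfigMap's `binaryData` (hence the `H4sI` prefix, the base64 form of the gzip magic bytes). A rough sketch of that packaging step (the helper name and the direct use of `zdt/home.json` are illustrative, not taken from the chart's tooling):

import base64
import gzip
import pathlib

def to_binarydata(dashboard_path: str) -> str:
    # gzip the dashboard JSON, then base64-encode it so it can be stored
    # under the ConfigMap's binaryData field (as home.json.gz above).
    raw = pathlib.Path(dashboard_path).read_bytes()
    return base64.b64encode(gzip.compress(raw)).decode("ascii")

# Example: print(to_binarydata("zdt/home.json"))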

View File

@ -23,9 +23,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
@ -35,9 +32,6 @@ spec:
expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n< on (namespace,service) group_left\n count by (namespace,service) (max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]))\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
@ -47,9 +41,6 @@ spec:
expr: "(\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n)\n> 0.01\n"
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
@ -59,9 +50,6 @@ spec:
expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n)\n> 0.01\n"
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
@ -71,9 +59,6 @@ spec:
expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n)\n> 0.01\n"
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
@ -83,9 +68,6 @@ spec:
expr: "count by (namespace,service) (\n count_values by (namespace,service) (\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\",namespace=\"monitoring\"})\n)\n!= 1\n"
for: 20m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: AlertmanagerClusterDown
annotations:
@ -95,9 +77,6 @@ spec:
expr: "(\n count by (namespace,service) (\n avg_over_time(up{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]) < 0.5\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n"
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
@ -107,8 +86,5 @@ spec:
expr: "(\n count by (namespace,service) (\n changes(process_start_time_seconds{job=\"alertmanager-main\",namespace=\"monitoring\"}[10m]) > 4\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n"
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical

View File

@ -16,9 +16,6 @@ spec:
expr: "max without (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n"
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: etcdInsufficientMembers
annotations:
@ -29,9 +26,6 @@ spec:
'
for: 3m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: etcdNoLeader
annotations:
@ -42,9 +36,6 @@ spec:
'
for: 1m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
@ -55,9 +46,6 @@ spec:
'
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: etcdGRPCRequestsSlow
annotations:
@ -70,9 +58,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
@ -85,9 +70,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
@ -98,9 +80,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: etcdHighFsyncDurations
annotations:
@ -113,9 +92,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: etcdHighFsyncDurations
annotations:
@ -127,9 +103,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: etcdHighCommitDurations
annotations:
@ -142,9 +115,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: etcdBackendQuotaLowSpace
annotations:
@ -154,9 +124,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
@ -166,8 +133,5 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning

View File

@ -17,9 +17,6 @@ spec:
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: Watchdog
annotations:
@ -38,9 +35,6 @@ spec:
summary: An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: none
- name: node-network
rules:
@ -54,9 +48,6 @@ spec:
'
for: 2m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- name: kube-prometheus-node-recording.rules
rules:

View File

@ -17,9 +17,6 @@ spec:
expr: "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
@ -29,9 +26,6 @@ spec:
expr: "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
@ -43,9 +37,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
@ -55,8 +46,5 @@ spec:
expr: "2^max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1\n -\nsum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"}) )\n!= 0\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical

View File

@ -23,9 +23,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubePodNotReady
annotations:
@ -35,9 +32,6 @@ spec:
expr: "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
@ -47,9 +41,6 @@ spec:
expr: "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
@ -59,9 +50,6 @@ spec:
expr: "(\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n >\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n) and (\n changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
@ -71,9 +59,6 @@ spec:
expr: "(\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
@ -83,9 +68,6 @@ spec:
expr: "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
@ -95,9 +77,6 @@ spec:
expr: "(\n max without (revision) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
@ -107,9 +86,6 @@ spec:
expr: "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeContainerWaiting
annotations:
@ -121,9 +97,6 @@ spec:
'
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
@ -133,9 +106,6 @@ spec:
expr: "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0\n"
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
@ -147,9 +117,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeJobCompletion
annotations:
@ -161,9 +128,6 @@ spec:
'
for: 12h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeJobFailed
annotations:
@ -175,9 +139,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
@ -187,9 +148,6 @@ spec:
expr: "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n >\nkube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n <\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}[15m]) == 0\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeHpaMaxedOut
annotations:
@ -199,9 +157,6 @@ spec:
expr: "kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- name: kubernetes-resources
rules:
@ -219,9 +174,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeMemoryOvercommit
annotations:
@ -237,9 +189,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeCPUQuotaOvercommit
annotations:
@ -249,9 +198,6 @@ spec:
expr: "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\nsum(kube_node_status_allocatable{resource=\"cpu\"})\n > 1.5\n"
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeMemoryQuotaOvercommit
annotations:
@ -261,9 +207,6 @@ spec:
expr: "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable{resource=\"memory\",job=\"kube-state-metrics\"})\n > 1.5\n"
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeQuotaAlmostFull
annotations:
@ -273,9 +216,6 @@ spec:
expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.9 < 1\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: info
- alert: KubeQuotaFullyUsed
annotations:
@ -285,9 +225,6 @@ spec:
expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n == 1\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: info
- alert: KubeQuotaExceeded
annotations:
@ -297,9 +234,6 @@ spec:
expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 1\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: CPUThrottlingHigh
annotations:
@ -309,9 +243,6 @@ spec:
expr: "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)\n > ( 25 / 100 )\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: info
- name: kubernetes-storage
rules:
@ -323,9 +254,6 @@ spec:
expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\n"
for: 1m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: KubePersistentVolumeFillingUp
annotations:
@ -335,9 +263,6 @@ spec:
expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\n"
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
@ -349,9 +274,6 @@ spec:
'
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- name: kubernetes-system
rules:
@ -365,9 +287,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeClientErrors
annotations:
@ -377,9 +296,6 @@ spec:
expr: "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance, job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))\n> 0.01\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- name: kube-apiserver-slos
rules:
@ -397,9 +313,6 @@ spec:
'
for: 2m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
long: 1h
severity: critical
short: 5m
@ -417,9 +330,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
long: 6h
severity: critical
short: 30m
@ -437,9 +347,6 @@ spec:
'
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
long: 1d
severity: warning
short: 2h
@ -457,9 +364,6 @@ spec:
'
for: 3h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
long: 3d
severity: warning
short: 6h
@ -474,9 +378,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
@ -487,9 +388,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: AggregatedAPIErrors
annotations:
@ -500,9 +398,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: AggregatedAPIDown
annotations:
@ -514,9 +409,6 @@ spec:
'
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeAPIDown
annotations:
@ -528,9 +420,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: KubeAPITerminatedRequests
annotations:
@ -542,9 +431,6 @@ spec:
'
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- name: kubernetes-system-kubelet
rules:
@ -558,9 +444,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeNodeUnreachable
annotations:
@ -572,9 +455,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeletTooManyPods
annotations:
@ -584,9 +464,6 @@ spec:
expr: "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: info
- alert: KubeNodeReadinessFlapping
annotations:
@ -598,9 +475,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
@ -612,9 +486,6 @@ spec:
'
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
@ -626,9 +497,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
@ -639,9 +507,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
@ -652,9 +517,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: KubeletServerCertificateExpiration
annotations:
@ -665,9 +527,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeletServerCertificateExpiration
annotations:
@ -678,9 +537,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: KubeletClientCertificateRenewalErrors
annotations:
@ -692,9 +548,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeletServerCertificateRenewalErrors
annotations:
@ -706,9 +559,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: KubeletDown
annotations:
@ -720,9 +570,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- name: kubernetes-system-scheduler
rules:
@ -736,9 +583,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- name: kubernetes-system-controller-manager
rules:
@ -752,108 +596,63 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- name: kube-apiserver-burnrate.rules
rules:
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: apiserver_request:burnrate1d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: apiserver_request:burnrate1h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: apiserver_request:burnrate2h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: apiserver_request:burnrate30m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: apiserver_request:burnrate3d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: apiserver_request:burnrate5m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: apiserver_request:burnrate6h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: apiserver_request:burnrate1d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: apiserver_request:burnrate1h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: apiserver_request:burnrate2h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: apiserver_request:burnrate30m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: apiserver_request:burnrate3d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: apiserver_request:burnrate5m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: apiserver_request:burnrate6h
- name: kube-apiserver-histogram.rules
@ -862,9 +661,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.99'
verb: read
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
@ -872,9 +668,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.99'
verb: write
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
@ -882,27 +675,18 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.99'
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.9'
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.5'
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- interval: 3m
@ -916,18 +700,12 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: code:apiserver_request_total:increase30d
- expr: 'sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: code:apiserver_request_total:increase30d
- expr: 'sum by (cluster, verb, scope) (increase(apiserver_request_duration_seconds_count[1h]))
@ -949,41 +727,26 @@ spec:
- expr: "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"\
})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: all
record: apiserver_request:availability30d
- expr: "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: apiserver_request:availability30d
- expr: "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: apiserver_request:availability30d
- expr: 'sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: read
record: code_resource:apiserver_request_total:rate5m
- expr: 'sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
verb: write
record: code_resource:apiserver_request_total:rate5m
- expr: 'sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
@ -1032,23 +795,14 @@ spec:
record: namespace_cpu:kube_pod_container_resource_limits:sum
- expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"},\n \"replicaset\", \"$1\", \"owner_name\", \"(.*)\"\n ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (\n 1, max by (replicaset, namespace, owner_name) (\n kube_replicaset_owner{job=\"kube-state-metrics\"}\n )\n ),\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
workload_type: deployment
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
workload_type: daemonset
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n"
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
workload_type: statefulset
record: namespace_workload_pod:kube_pod_owner:relabel
- name: kube-scheduler.rules
@ -1057,81 +811,54 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.99'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.99'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.99'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.9'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.9'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.9'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.5'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.5'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.5'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- name: node.rules
@ -1148,27 +875,18 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.99'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.9'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
quantile: '0.5'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile

View File

@ -17,9 +17,6 @@ spec:
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
@ -29,9 +26,6 @@ spec:
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
@ -41,9 +35,6 @@ spec:
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 30m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
@ -53,9 +44,6 @@ spec:
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 30m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
@ -65,9 +53,6 @@ spec:
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
@ -77,9 +62,6 @@ spec:
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
@ -89,9 +71,6 @@ spec:
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
@ -101,9 +80,6 @@ spec:
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
@ -115,9 +91,6 @@ spec:
'
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
@ -129,9 +102,6 @@ spec:
'
for: 1h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
@ -142,9 +112,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
@ -155,9 +122,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeClockSkewDetected
annotations:
@ -167,9 +131,6 @@ spec:
expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeClockNotSynchronising
annotations:
@ -185,9 +146,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeRAIDDegraded
annotations:
@ -199,9 +157,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
@ -212,9 +167,6 @@ spec:
'
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
@ -224,9 +176,6 @@ spec:
expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70\n)\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
@ -236,9 +185,6 @@ spec:
expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90\n)\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- name: node-exporter.rules
rules:


@ -19,9 +19,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
@ -33,9 +30,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusOperatorSyncFailed
annotations:
@ -47,9 +41,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
@ -61,9 +52,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
@ -75,9 +63,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusOperatorNotReady
annotations:
@ -89,9 +74,6 @@ spec:
'
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusOperatorRejectedResources
annotations:
@ -103,8 +85,5 @@ spec:
'
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning


@ -23,9 +23,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: PrometheusNotificationQueueRunningFull
annotations:
@ -35,9 +32,6 @@ spec:
expr: "# Without min_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m], 60 * 30)\n>\n min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
@ -47,9 +41,6 @@ spec:
expr: "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
@ -65,9 +56,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
@ -79,9 +67,6 @@ spec:
'
for: 4h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
@ -93,9 +78,6 @@ spec:
'
for: 4h
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
@ -105,9 +87,6 @@ spec:
expr: "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) <= 0\nand\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n )\n)\n"
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
@ -119,9 +98,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
@ -133,9 +109,6 @@ spec:
'
for: 10m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
@ -145,9 +118,6 @@ spec:
expr: "(\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n/\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n +\n (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n )\n)\n* 100\n> 1\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
@ -157,9 +127,6 @@ spec:
expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n- ignoring(remote_name, url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n> 120\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
@ -169,9 +136,6 @@ spec:
expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n>\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusRuleFailures
annotations:
@ -183,9 +147,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
@ -197,9 +158,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
@ -211,9 +169,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusLabelLimitHit
annotations:
@ -225,9 +180,6 @@ spec:
'
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: warning
- alert: PrometheusTargetSyncFailure
annotations:
@ -239,9 +191,6 @@ spec:
'
for: 5m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
@ -251,8 +200,5 @@ spec:
expr: "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n)\n* 100\n> 3\n"
for: 15m
labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
severity: critical


@ -1,7 +1,3 @@
#additionalRuleLabels:
# clusterName: myTestCluster
# awsRegion: eu-central-1
kube-prometheus-stack:
global:
rbac:
@ -105,7 +101,7 @@ kube-prometheus-stack:
logFormat: json
# externalUrl:
# gather all monitors
# gather ALL monitors
podMonitorSelectorNilUsesHelmValues: false
serviceMonitorSelectorNilUsesHelmValues: false
ruleSelectorNilUsesHelmValues: false
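The three ...SelectorNilUsesHelmValues: false switches above live under prometheus.prometheusSpec in the kube-prometheus-stack values. Set to false, a nil selector makes Prometheus pick up every PodMonitor, ServiceMonitor and PrometheusRule in the cluster instead of only objects labelled for this Helm release, which is what the "gather ALL monitors" comment refers to. A rough sketch of the surrounding values layout (keys abbreviated):

  kube-prometheus-stack:
    prometheus:
      prometheusSpec:
        logFormat: json
        # false = select ALL monitors/rules, not only release-labelled ones
        podMonitorSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        ruleSelectorNilUsesHelmValues: false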