fix(metrics): More fixes around AlertLabels
This commit is contained in:
parent
439cd6d49e
commit
06eda41283
@ -2,7 +2,7 @@ apiVersion: v2
|
||||
name: kubezero-metrics
|
||||
description: KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
|
||||
type: application
|
||||
version: 0.5.2
|
||||
version: 0.5.4
|
||||
home: https://kubezero.com
|
||||
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
|
||||
keywords:
|
||||
|
@ -1,6 +1,6 @@
|
||||
# kubezero-metrics
|
||||
|
||||
![Version: 0.5.2](https://img.shields.io/badge/Version-0.5.2-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
|
||||
![Version: 0.5.4](https://img.shields.io/badge/Version-0.5.4-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
|
||||
|
||||
KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
|
||||
|
||||
|
@ -1,16 +1,15 @@
|
||||
#!/bin/bash -ex
|
||||
|
||||
JB='./jb-linux-amd64'
|
||||
which jsonnet > /dev/null || { echo "Required jsonnet not found!"; exit 1;}
|
||||
which jb > /dev/null || { echo "Required jb ( json-bundler ) not found!"; exit 1;}
|
||||
|
||||
which jsonnet > /dev/null || { echo "Required jsonnet not found!"; }
|
||||
[ -x $JB ] || { wget https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.4.0/jb-linux-amd64 && chmod +x $JB; }
|
||||
#which gojsontoyaml || go install github.com/brancz/gojsontoyaml@latest
|
||||
# wget https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.4.0/jb-linux-amd64
|
||||
|
||||
[ -r jsonnetfile.json ] || $JB init
|
||||
[ -r jsonnetfile.json ] || jb init
|
||||
if [ -r jsonnetfile.lock.json ]; then
|
||||
$JB update
|
||||
jb update
|
||||
else
|
||||
$JB install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@main
|
||||
jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@main
|
||||
fi
|
||||
|
||||
rm -rf dashboards && mkdir -p dashboards
|
||||
|
@ -38,7 +38,7 @@
|
||||
"subdir": "grafana-builder"
|
||||
}
|
||||
},
|
||||
"version": "bc9b685050691a78ee414cd8f789857de0eabe8d",
|
||||
"version": "b7eae75972a369bf8ebfb03dcb0d4c14464ef85a",
|
||||
"sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8="
|
||||
},
|
||||
{
|
||||
@ -98,7 +98,7 @@
|
||||
"subdir": "jsonnet/mixin"
|
||||
}
|
||||
},
|
||||
"version": "c6b5cc2e178002683ab3afe8d023a8bb2508ba19",
|
||||
"version": "42fc15967e35e0cca68cf935f844086edbc82d0e",
|
||||
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=",
|
||||
"name": "prometheus-operator-mixin"
|
||||
},
|
||||
@ -109,7 +109,7 @@
|
||||
"subdir": "jsonnet/prometheus-operator"
|
||||
}
|
||||
},
|
||||
"version": "c6b5cc2e178002683ab3afe8d023a8bb2508ba19",
|
||||
"version": "42fc15967e35e0cca68cf935f844086edbc82d0e",
|
||||
"sum": "sECNXs/aIEreFUma1BWVyknBygqh3AVJEB3msmrAYYY="
|
||||
},
|
||||
{
|
||||
|
@ -2,8 +2,6 @@
|
||||
rules:
|
||||
- name: alertmanager
|
||||
url: file://rules/alertmanager-prometheusRule
|
||||
- name: etcd-mixin
|
||||
url: file://rules/etcd-mixin-prometheusRule
|
||||
- name: kube-prometheus
|
||||
url: file://rules/kube-prometheus-prometheusRule
|
||||
- name: kube-state-metrics
|
||||
@ -16,3 +14,5 @@ rules:
|
||||
url: file://rules/prometheus-operator-prometheusRule
|
||||
- name: prometheus
|
||||
url: file://rules/prometheus-prometheusRule
|
||||
- name: etcd-mixin
|
||||
url: file://rules/etcd-mixin-prometheusRule
|
||||
|
@ -1,6 +1,6 @@
|
||||
configmap: grafana-dashboards-metrics
|
||||
gzip: true
|
||||
folder: Metrics
|
||||
folder: KubeZero
|
||||
dashboards:
|
||||
- name: AlertManager
|
||||
url: https://grafana.com/api/dashboards/9578/revisions/4/download
|
||||
|
@ -1,5 +1,6 @@
|
||||
configmap: grafana-dashboards-zdt
|
||||
gzip: true
|
||||
folder: KubeZero
|
||||
dashboards:
|
||||
- name: home
|
||||
url: file://zdt/home.json
|
||||
|
@ -61,19 +61,9 @@ for r in config['rules']:
|
||||
|
||||
# Escape literal {{ and }} so Helm renders them verbatim instead of parsing them as template actions
|
||||
text = text.replace("{{", "{{`{{").replace("}}", "}}`}}").replace("{{`{{", "{{`{{`}}").replace("}}`}}", "{{`}}`}}")
|
||||
text = textwrap.indent(text, ' '*2)
|
||||
|
||||
# add support for additionalRuleLabels from Helm values
|
||||
helm_labels = '''{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
'''
|
||||
helm_labels = textwrap.indent(helm_labels, ' '*8)
|
||||
|
||||
text = text.replace("labels:\n", "labels:\n{}".format(helm_labels))
|
||||
|
||||
rule += '''spec:\n'''
|
||||
rule += text+'\n'
|
||||
rule += textwrap.indent(text, ' '*2)+'\n'
|
||||
|
||||
if 'condition' in r:
|
||||
rule += '{{- end }}'+'\n'
|
||||
|
File diff suppressed because one or more lines are too long
@ -6,6 +6,8 @@ metadata:
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
{{- include "kubezero-lib.labels" . | nindent 4 }}
|
||||
annotations:
|
||||
k8s-sidecar-target-directory: KubeZero
|
||||
binaryData:
|
||||
home.json.gz:
|
||||
H4sIAAAAAAAC/9VVS2/UMBC+8yuMDwikdnezj1b0VrVCVLwKFJBoq8obTxJrHTvYTndb1P+O7byczULFDS6J55vxeB6fxz+fIISJENIQw6TQ+Aj9tJAFOdPGSpdeQjXqNcuScXMmrDLa61BKDNGyVDFYBT5XMgeTQalxYAOCLLnTG1VCgGeM7kBZLMWJ5FI5hypdkueTPTSNIvtZLPZQ9CJ0LUjuDz7uckHP0DEHZXohmLvC21Gis6UkiuJa9+D/1/b74MwxUGa2osWpAHNGLSJKzitEkSK7kJIbVlh84kHmTKYv/ZozsXJVvbz2YkEEcN3Wtalqv3qNd69JGHB6IkXC0rY51RZISMmNb9lDkKG8BaVsQatTm+xaf6nVnUvd95VZcRb4WLsE5gGwaZKr5TsnD3z7xA9aURZ9TnnQNtWAcNTCTy+/n16gtzKV188zYwp9NB7HVIzuQcl9KtfCsBxGtuZjojUYPV6VS3DKMbd79nVOON8/mI8KkaIrfE/NFX5xJb4Bjy35kJHojbX/bu3ROzCKxRo9RW0cQTK59PTDOVErdyweJFbwMmXiKyhtE3Kmh6PFaNb6wC7QV5by281z+OeMJWaoMJ5a+LUNNfBTs9PAxlRR1DH8S0SJFltMiaaPMGW2mykt47Zuye/Jk0hOQfkrGJ6Rk82ZgdwZz0L8RwnKnY/DbutMrl8DoUykejh0nPYTxJah/O4rgzXQ3TafgajYVSMhXA+Uhii1a6ch6W+q/SjDmp1DCp02o0wPieTGnJ/k/y+ZesAf2DT/ezYB0C+Kuzo142c4epigsBltcj6cCm0H3hID2qD3sNYosWMA+aFzat2gC7brfgtrWbfEfqunQUGiQLua4NmkaiXWcQY56VgxPaxgc8fr9qpVZdnjh50fecHtKyjS4YPePXEux06fVPMLC7nej7I6ZmxkjeHetoLFK1DdZvuCUxA9wjcJ3TA78NUtCZ49q1wEZI0mgTALhSjv1otgHYXCbBJqsm49DdZR/dBfN3nZJG46Vlw+ekro+CB0HJ4ynYcC7daHNIx3O5Zm6neF7RX7XgqvXyq51lZZK2ryOa45qjmmofY5waW/FPjg4wdIFy/f3VfobUul+ZOHX/6tTqj9CQAA
|
||||
|
@ -23,9 +23,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerMembersInconsistent
|
||||
annotations:
|
||||
@ -35,9 +32,6 @@ spec:
|
||||
expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n< on (namespace,service) group_left\n count by (namespace,service) (max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]))\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerFailedToSendAlerts
|
||||
annotations:
|
||||
@ -47,9 +41,6 @@ spec:
|
||||
expr: "(\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n)\n> 0.01\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
@ -59,9 +50,6 @@ spec:
|
||||
expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n)\n> 0.01\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
@ -71,9 +59,6 @@ spec:
|
||||
expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n)\n> 0.01\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: AlertmanagerConfigInconsistent
|
||||
annotations:
|
||||
@ -83,9 +68,6 @@ spec:
|
||||
expr: "count by (namespace,service) (\n count_values by (namespace,service) (\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\",namespace=\"monitoring\"})\n)\n!= 1\n"
|
||||
for: 20m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerClusterDown
|
||||
annotations:
|
||||
@ -95,9 +77,6 @@ spec:
|
||||
expr: "(\n count by (namespace,service) (\n avg_over_time(up{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]) < 0.5\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerClusterCrashlooping
|
||||
annotations:
|
||||
@ -107,8 +86,5 @@ spec:
|
||||
expr: "(\n count by (namespace,service) (\n changes(process_start_time_seconds{job=\"alertmanager-main\",namespace=\"monitoring\"}[10m]) > 4\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
|
||||
|
@ -16,9 +16,6 @@ spec:
|
||||
expr: "max without (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n"
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
@ -29,9 +26,6 @@ spec:
|
||||
'
|
||||
for: 3m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
@ -42,9 +36,6 @@ spec:
|
||||
'
|
||||
for: 1m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
@ -55,9 +46,6 @@ spec:
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
@ -70,9 +58,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
@ -85,9 +70,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
@ -98,9 +80,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
@ -113,9 +92,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
@ -127,9 +103,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
@ -142,9 +115,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdBackendQuotaLowSpace
|
||||
annotations:
|
||||
@ -154,9 +124,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
@ -166,8 +133,5 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
|
||||
|
@ -17,9 +17,6 @@ spec:
|
||||
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
@ -38,9 +35,6 @@ spec:
|
||||
summary: An alert that should always be firing to certify that Alertmanager is working properly.
|
||||
expr: vector(1)
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: none
|
||||
- name: node-network
|
||||
rules:
|
||||
@ -54,9 +48,6 @@ spec:
|
||||
'
|
||||
for: 2m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- name: kube-prometheus-node-recording.rules
|
||||
rules:
|
||||
|
@ -17,9 +17,6 @@ spec:
|
||||
expr: "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
@ -29,9 +26,6 @@ spec:
|
||||
expr: "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsShardingMismatch
|
||||
annotations:
|
||||
@ -43,9 +37,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsShardsMissing
|
||||
annotations:
|
||||
@ -55,8 +46,5 @@ spec:
|
||||
expr: "2^max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1\n -\nsum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"}) )\n!= 0\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
|
||||
|
@ -23,9 +23,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
@ -35,9 +32,6 @@ spec:
|
||||
expr: "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
@ -47,9 +41,6 @@ spec:
|
||||
expr: "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
@ -59,9 +50,6 @@ spec:
|
||||
expr: "(\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n >\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n) and (\n changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
@ -71,9 +59,6 @@ spec:
|
||||
expr: "(\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
@ -83,9 +68,6 @@ spec:
|
||||
expr: "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
@ -95,9 +77,6 @@ spec:
|
||||
expr: "(\n max without (revision) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
@ -107,9 +86,6 @@ spec:
|
||||
expr: "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
@ -121,9 +97,6 @@ spec:
|
||||
'
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
@ -133,9 +106,6 @@ spec:
|
||||
expr: "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0\n"
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
@ -147,9 +117,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeJobCompletion
|
||||
annotations:
|
||||
@ -161,9 +128,6 @@ spec:
|
||||
'
|
||||
for: 12h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
@ -175,9 +139,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
@ -187,9 +148,6 @@ spec:
|
||||
expr: "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n >\nkube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n <\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}[15m]) == 0\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
@ -199,9 +157,6 @@ spec:
|
||||
expr: "kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- name: kubernetes-resources
|
||||
rules:
|
||||
@ -219,9 +174,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
@ -237,9 +189,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
@ -249,9 +198,6 @@ spec:
|
||||
expr: "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\nsum(kube_node_status_allocatable{resource=\"cpu\"})\n > 1.5\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
@ -261,9 +207,6 @@ spec:
|
||||
expr: "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable{resource=\"memory\",job=\"kube-state-metrics\"})\n > 1.5\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
@ -273,9 +216,6 @@ spec:
|
||||
expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.9 < 1\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: info
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
@ -285,9 +225,6 @@ spec:
|
||||
expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n == 1\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: info
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
@ -297,9 +234,6 @@ spec:
|
||||
expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 1\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
@ -309,9 +243,6 @@ spec:
|
||||
expr: "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)\n > ( 25 / 100 )\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: info
|
||||
- name: kubernetes-storage
|
||||
rules:
|
||||
@ -323,9 +254,6 @@ spec:
|
||||
expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\n"
|
||||
for: 1m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
@ -335,9 +263,6 @@ spec:
|
||||
expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
@ -349,9 +274,6 @@ spec:
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- name: kubernetes-system
|
||||
rules:
|
||||
@ -365,9 +287,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeClientErrors
|
||||
annotations:
|
||||
@ -377,9 +296,6 @@ spec:
|
||||
expr: "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance, job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))\n> 0.01\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- name: kube-apiserver-slos
|
||||
rules:
|
||||
@ -397,9 +313,6 @@ spec:
|
||||
'
|
||||
for: 2m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
long: 1h
|
||||
severity: critical
|
||||
short: 5m
|
||||
@ -417,9 +330,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
long: 6h
|
||||
severity: critical
|
||||
short: 30m
|
||||
@ -437,9 +347,6 @@ spec:
|
||||
'
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
long: 1d
|
||||
severity: warning
|
||||
short: 2h
|
||||
@ -457,9 +364,6 @@ spec:
|
||||
'
|
||||
for: 3h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
long: 3d
|
||||
severity: warning
|
||||
short: 6h
|
||||
@ -474,9 +378,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
@ -487,9 +388,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AggregatedAPIErrors
|
||||
annotations:
|
||||
@ -500,9 +398,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: AggregatedAPIDown
|
||||
annotations:
|
||||
@ -514,9 +409,6 @@ spec:
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
@ -528,9 +420,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubeAPITerminatedRequests
|
||||
annotations:
|
||||
@ -542,9 +431,6 @@ spec:
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- name: kubernetes-system-kubelet
|
||||
rules:
|
||||
@ -558,9 +444,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeNodeUnreachable
|
||||
annotations:
|
||||
@ -572,9 +455,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeletTooManyPods
|
||||
annotations:
|
||||
@ -584,9 +464,6 @@ spec:
|
||||
expr: "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: info
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
@ -598,9 +475,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
@ -612,9 +486,6 @@ spec:
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
@ -626,9 +497,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
@ -639,9 +507,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
@ -652,9 +517,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
@ -665,9 +527,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
@ -678,9 +537,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubeletClientCertificateRenewalErrors
|
||||
annotations:
|
||||
@ -692,9 +548,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeletServerCertificateRenewalErrors
|
||||
annotations:
|
||||
@ -706,9 +559,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: KubeletDown
|
||||
annotations:
|
||||
@ -720,9 +570,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- name: kubernetes-system-scheduler
|
||||
rules:
|
||||
@ -736,9 +583,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- name: kubernetes-system-controller-manager
|
||||
rules:
|
||||
@ -752,108 +596,63 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- name: kube-apiserver-burnrate.rules
|
||||
rules:
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: apiserver_request:burnrate6h
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: apiserver_request:burnrate6h
|
||||
- name: kube-apiserver-histogram.rules
|
||||
@ -862,9 +661,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.99'
|
||||
verb: read
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
@ -872,9 +668,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.99'
|
||||
verb: write
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
@ -882,27 +675,18 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- interval: 3m
|
||||
@ -916,18 +700,12 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: 'sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: 'sum by (cluster, verb, scope) (increase(apiserver_request_duration_seconds_count[1h]))
|
||||
@ -949,41 +727,26 @@ spec:
|
||||
- expr: "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"\
|
||||
})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: all
|
||||
record: apiserver_request:availability30d
|
||||
- expr: "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: apiserver_request:availability30d
|
||||
- expr: "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: apiserver_request:availability30d
|
||||
- expr: 'sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: read
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: 'sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
verb: write
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: 'sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
|
||||
@ -1032,23 +795,14 @@ spec:
|
||||
record: namespace_cpu:kube_pod_container_resource_limits:sum
|
||||
- expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"},\n \"replicaset\", \"$1\", \"owner_name\", \"(.*)\"\n ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (\n 1, max by (replicaset, namespace, owner_name) (\n kube_replicaset_owner{job=\"kube-state-metrics\"}\n )\n ),\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
workload_type: deployment
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
workload_type: daemonset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n"
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
workload_type: statefulset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- name: kube-scheduler.rules
|
||||
@ -1057,81 +811,54 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- name: node.rules
|
||||
@ -1148,27 +875,18 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.99'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.9'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: 'histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
quantile: '0.5'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
|
||||
|
@ -17,9 +17,6 @@ spec:
|
||||
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
@ -29,9 +26,6 @@ spec:
|
||||
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
@ -41,9 +35,6 @@ spec:
|
||||
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 30m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
@ -53,9 +44,6 @@ spec:
|
||||
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 30m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
@ -65,9 +53,6 @@ spec:
|
||||
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
@ -77,9 +62,6 @@ spec:
|
||||
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
@ -89,9 +71,6 @@ spec:
|
||||
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
@ -101,9 +80,6 @@ spec:
|
||||
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
@ -115,9 +91,6 @@ spec:
|
||||
'
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
@ -129,9 +102,6 @@ spec:
|
||||
'
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
@ -142,9 +112,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
@ -155,9 +122,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
@ -167,9 +131,6 @@ spec:
|
||||
expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
@ -185,9 +146,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
@ -199,9 +157,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
@ -212,9 +167,6 @@ spec:
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
@ -224,9 +176,6 @@ spec:
|
||||
expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
@ -236,9 +185,6 @@ spec:
|
||||
expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- name: node-exporter.rules
|
||||
rules:
|
||||
|
@ -19,9 +19,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorWatchErrors
|
||||
annotations:
|
||||
@ -33,9 +30,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorSyncFailed
|
||||
annotations:
|
||||
@ -47,9 +41,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorReconcileErrors
|
||||
annotations:
|
||||
@ -61,9 +52,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorNodeLookupErrors
|
||||
annotations:
|
||||
@ -75,9 +63,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorNotReady
|
||||
annotations:
|
||||
@ -89,9 +74,6 @@ spec:
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorRejectedResources
|
||||
annotations:
|
||||
@ -103,8 +85,5 @@ spec:
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
|
||||
|
@ -23,9 +23,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: PrometheusNotificationQueueRunningFull
|
||||
annotations:
|
||||
@ -35,9 +32,6 @@ spec:
|
||||
expr: "# Without min_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m], 60 * 30)\n>\n min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
|
||||
annotations:
|
||||
@ -47,9 +41,6 @@ spec:
|
||||
expr: "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusNotConnectedToAlertmanagers
|
||||
annotations:
|
||||
@ -65,9 +56,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusTSDBReloadsFailing
|
||||
annotations:
|
||||
@ -79,9 +67,6 @@ spec:
|
||||
'
|
||||
for: 4h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusTSDBCompactionsFailing
|
||||
annotations:
|
||||
@ -93,9 +78,6 @@ spec:
|
||||
'
|
||||
for: 4h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusNotIngestingSamples
|
||||
annotations:
|
||||
@ -105,9 +87,6 @@ spec:
|
||||
expr: "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) <= 0\nand\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n )\n)\n"
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusDuplicateTimestamps
|
||||
annotations:
|
||||
@ -119,9 +98,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOutOfOrderTimestamps
|
||||
annotations:
|
||||
@ -133,9 +109,6 @@ spec:
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusRemoteStorageFailures
|
||||
annotations:
|
||||
@ -145,9 +118,6 @@ spec:
|
||||
expr: "(\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n/\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n +\n (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n )\n)\n* 100\n> 1\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: PrometheusRemoteWriteBehind
|
||||
annotations:
|
||||
@ -157,9 +127,6 @@ spec:
|
||||
expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n- ignoring(remote_name, url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n> 120\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: PrometheusRemoteWriteDesiredShards
|
||||
annotations:
|
||||
@ -169,9 +136,6 @@ spec:
|
||||
expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n>\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusRuleFailures
|
||||
annotations:
|
||||
@ -183,9 +147,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: PrometheusMissingRuleEvaluations
|
||||
annotations:
|
||||
@ -197,9 +158,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusTargetLimitHit
|
||||
annotations:
|
||||
@ -211,9 +169,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusLabelLimitHit
|
||||
annotations:
|
||||
@ -225,9 +180,6 @@ spec:
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusTargetSyncFailure
|
||||
annotations:
|
||||
@ -239,9 +191,6 @@ spec:
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||
annotations:
|
||||
@ -251,8 +200,5 @@ spec:
|
||||
expr: "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n)\n* 100\n> 3\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
|
||||
|
@ -1,7 +1,3 @@
|
||||
#additionalRuleLabels:
|
||||
# clusterName: myTestCluster
|
||||
# awsRegion: eu-central-1
|
||||
|
||||
kube-prometheus-stack:
|
||||
global:
|
||||
rbac:
|
||||
@ -105,7 +101,7 @@ kube-prometheus-stack:
|
||||
logFormat: json
|
||||
# externalUrl:
|
||||
|
||||
# gather all monitors
|
||||
# gather ALL monitors
|
||||
podMonitorSelectorNilUsesHelmValues: false
|
||||
serviceMonitorSelectorNilUsesHelmValues: false
|
||||
ruleSelectorNilUsesHelmValues: false
|
||||
|
Loading…
Reference in New Issue
Block a user