chore: metrics cleanup
This commit is contained in:
parent
8949fd1147
commit
c10853f2d2
@ -1,614 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'alertmanager-overview' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "alertmanager-overview" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
labels:
|
||||
{{- if $.Values.grafana.sidecar.dashboards.label }}
|
||||
{{ $.Values.grafana.sidecar.dashboards.label }}: {{ ternary $.Values.grafana.sidecar.dashboards.labelValue "1" (not (empty $.Values.grafana.sidecar.dashboards.labelValue)) | quote }}
|
||||
{{- end }}
|
||||
app: {{ template "kube-prometheus-stack.name" $ }}-grafana
|
||||
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
|
||||
data:
|
||||
alertmanager-overview.json: |-
|
||||
{
|
||||
"__inputs": [
|
||||
|
||||
],
|
||||
"__requires": [
|
||||
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
|
||||
]
|
||||
},
|
||||
"editable": false,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 1,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"refresh": "30s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"description": "current set of alerts stored in the Alertmanager",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": false,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(alertmanager_alerts{namespace=~\"$namespace\",service=~\"$service\"}) by (namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Alerts",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "none",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "none",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"description": "rate of successful and invalid alerts received by the Alertmanager",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": false,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_alerts_received_total{namespace=~\"$namespace\",service=~\"$service\"}[$__rate_interval])) by (namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Received",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=~\"$namespace\",service=~\"$service\"}[$__rate_interval])) by (namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Invalid",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Alerts receive rate",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Alerts",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"description": "rate of successful and invalid notifications sent by the Alertmanager",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": false,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "integration",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_notifications_total{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Total",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_notifications_failed_total{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Failed",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$integration: Notifications Send Rate",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"description": "latency of notifications sent by the Alertmanager",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 5,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": false,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "integration",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,service,instance)\n) \n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} 99th Percentile",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,service,instance)\n) \n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Median",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (namespace,service,instance)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Average",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$integration: Notification Duration",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Notifications",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"alertmanager-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "Prometheus",
|
||||
"value": "Prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": "Data Source",
|
||||
"name": "datasource",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "",
|
||||
"value": ""
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "namespace",
|
||||
"multi": false,
|
||||
"name": "namespace",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "label_values(alertmanager_alerts, namespace)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "",
|
||||
"value": ""
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "service",
|
||||
"multi": false,
|
||||
"name": "service",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "label_values(alertmanager_alerts, service)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "all",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 2,
|
||||
"includeAll": true,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "integration",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "{{ .Values.grafana.defaultDashboardsTimezone }}",
|
||||
"title": "Alertmanager / Overview",
|
||||
"uid": "alertmanager-overview",
|
||||
"version": 0
|
||||
}
|
||||
{{- end }}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,636 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'grafana-overview' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "grafana-overview" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
labels:
|
||||
{{- if $.Values.grafana.sidecar.dashboards.label }}
|
||||
{{ $.Values.grafana.sidecar.dashboards.label }}: {{ ternary $.Values.grafana.sidecar.dashboards.labelValue "1" (not (empty $.Values.grafana.sidecar.dashboards.labelValue)) | quote }}
|
||||
{{- end }}
|
||||
app: {{ template "kube-prometheus-stack.name" $ }}-grafana
|
||||
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
|
||||
data:
|
||||
grafana-overview.json: |-
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"target": {
|
||||
"limit": 100,
|
||||
"matchAny": false,
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"type": "dashboard"
|
||||
},
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": 3085,
|
||||
"iteration": 1631554945276,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
|
||||
],
|
||||
"noValue": "0",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"mean"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"text": {
|
||||
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "8.1.3",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Firing Alerts",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"mean"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"text": {
|
||||
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "8.1.3",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Dashboards",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"align": null,
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"mappings": [
|
||||
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"pluginVersion": "8.1.3",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "grafana_build_info{job=~\"$job\", instance=~\"$instance\"}",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Build Info",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {
|
||||
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"branch": true,
|
||||
"container": true,
|
||||
"goversion": true,
|
||||
"namespace": true,
|
||||
"pod": true,
|
||||
"revision": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Time": 7,
|
||||
"Value": 11,
|
||||
"branch": 4,
|
||||
"container": 8,
|
||||
"edition": 2,
|
||||
"goversion": 6,
|
||||
"instance": 1,
|
||||
"job": 0,
|
||||
"namespace": 9,
|
||||
"pod": 10,
|
||||
"revision": 5,
|
||||
"version": 3
|
||||
},
|
||||
"renameByName": {
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"links": [
|
||||
|
||||
]
|
||||
},
|
||||
"overrides": [
|
||||
|
||||
]
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "8.1.3",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (status_code) (irate(grafana_http_request_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[1m])) ",
|
||||
"interval": "",
|
||||
"legendFormat": "{{`{{`}}status_code{{`}}`}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [
|
||||
|
||||
],
|
||||
"timeShift": null,
|
||||
"title": "RPS",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"$$hashKey": "object:157",
|
||||
"format": "reqps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"$$hashKey": "object:158",
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"links": [
|
||||
|
||||
]
|
||||
},
|
||||
"overrides": [
|
||||
|
||||
]
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 5
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "8.1.3",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "histogram_quantile(0.99, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1",
|
||||
"interval": "",
|
||||
"legendFormat": "99th Percentile",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "histogram_quantile(0.50, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1",
|
||||
"interval": "",
|
||||
"legendFormat": "50th Percentile",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "sum(irate(grafana_http_request_duration_seconds_sum{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) * 1 / sum(irate(grafana_http_request_duration_seconds_count{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval]))",
|
||||
"interval": "",
|
||||
"legendFormat": "Average",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [
|
||||
|
||||
],
|
||||
"timeShift": null,
|
||||
"title": "Request Latency",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"$$hashKey": "object:210",
|
||||
"format": "ms",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"$$hashKey": "object:211",
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 30,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "dev-cortex",
|
||||
"value": "dev-cortex"
|
||||
},
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "prometheus",
|
||||
"queryValue": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": [
|
||||
"default/grafana"
|
||||
],
|
||||
"value": [
|
||||
"default/grafana"
|
||||
]
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"definition": "label_values(grafana_build_info, job)",
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": {
|
||||
"query": "label_values(grafana_build_info, job)",
|
||||
"refId": "Billing Admin-job-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 0,
|
||||
"tagValuesQuery": "",
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"definition": "label_values(grafana_build_info, instance)",
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": {
|
||||
"query": "label_values(grafana_build_info, instance)",
|
||||
"refId": "Billing Admin-instance-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 0,
|
||||
"tagValuesQuery": "",
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
]
|
||||
},
|
||||
"timezone": "{{ .Values.grafana.defaultDashboardsTimezone }}",
|
||||
"title": "Grafana Overview",
|
||||
"uid": "6be0s85Mk",
|
||||
"version": 2
|
||||
}
|
||||
{{- end }}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,587 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "persistentvolumesusage" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
labels:
|
||||
{{- if $.Values.grafana.sidecar.dashboards.label }}
|
||||
{{ $.Values.grafana.sidecar.dashboards.label }}: {{ ternary $.Values.grafana.sidecar.dashboards.labelValue "1" (not (empty $.Values.grafana.sidecar.dashboards.labelValue)) | quote }}
|
||||
{{- end }}
|
||||
app: {{ template "kube-prometheus-stack.name" $ }}-grafana
|
||||
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
|
||||
data:
|
||||
persistentvolumesusage.json: |-
|
||||
{
|
||||
"__inputs": [
|
||||
|
||||
],
|
||||
"__requires": [
|
||||
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
|
||||
]
|
||||
},
|
||||
"editable": false,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"refresh": "10s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 2,
|
||||
"interval": "1m",
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 9,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Used Space",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Free Space",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Volume Space Usage",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"rgba(50, 172, 45, 0.97)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(245, 54, 54, 0.9)"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "percent",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": true,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 3,
|
||||
"interval": "1m",
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"rightSide": true
|
||||
},
|
||||
"links": [
|
||||
|
||||
],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 3,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max without(instance,node) (\n(\n topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "80, 90",
|
||||
"title": "Volume Space Usage",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "N/A",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 4,
|
||||
"interval": "1m",
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 9,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Used inodes",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": " Free inodes",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Volume inodes Usage",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "none",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "none",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"rgba(50, 172, 45, 0.97)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(245, 54, 54, 0.9)"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "percent",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": true,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 5,
|
||||
"interval": "1m",
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"rightSide": true
|
||||
},
|
||||
"links": [
|
||||
|
||||
],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 3,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "80, 90",
|
||||
"title": "Volume inodes Usage",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "N/A",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"kubernetes-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "default",
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": "Data Source",
|
||||
"name": "datasource",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": {{ if .Values.grafana.sidecar.dashboards.multicluster.global.enabled }}0{{ else }}2{{ end }},
|
||||
"includeAll": false,
|
||||
"label": "cluster",
|
||||
"multi": false,
|
||||
"name": "cluster",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Namespace",
|
||||
"multi": false,
|
||||
"name": "namespace",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"}, namespace)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "PersistentVolumeClaim",
|
||||
"multi": false,
|
||||
"name": "volume",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\"}, persistentvolumeclaim)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-7d",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "{{ .Values.grafana.defaultDashboardsTimezone }}",
|
||||
"title": "Kubernetes / Persistent Volumes",
|
||||
"uid": "919b92a8e8041bd567af9edab12c840c",
|
||||
"version": 0
|
||||
}
|
||||
{{- end }}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,215 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
|
||||
{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: alertmanager.rules
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.AlertmanagerFailedReload | default false) }}
|
||||
- alert: AlertmanagerFailedReload
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerfailedreload
|
||||
summary: Reloading an Alertmanager configuration has failed.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.AlertmanagerMembersInconsistent | default false) }}
|
||||
- alert: AlertmanagerMembersInconsistent
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagermembersinconsistent
|
||||
summary: A member of an Alertmanager cluster has not found all other cluster members.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
< on (namespace,service) group_left
|
||||
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.AlertmanagerFailedToSendAlerts | default false) }}
|
||||
- alert: AlertmanagerFailedToSendAlerts
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerfailedtosendalerts
|
||||
summary: An Alertmanager instance failed to send notifications.
|
||||
expr: |-
|
||||
(
|
||||
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
> 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterFailedToSendAlerts | default false) }}
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts
|
||||
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
|
||||
expr: |-
|
||||
min by (namespace,service, integration) (
|
||||
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterFailedToSendAlerts | default false) }}
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts
|
||||
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
|
||||
expr: |-
|
||||
min by (namespace,service, integration) (
|
||||
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.AlertmanagerConfigInconsistent | default false) }}
|
||||
- alert: AlertmanagerConfigInconsistent
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerconfiginconsistent
|
||||
summary: Alertmanager instances within the same cluster have different configurations.
|
||||
expr: |-
|
||||
count by (namespace,service) (
|
||||
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
|
||||
)
|
||||
!= 1
|
||||
for: 20m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterDown | default false) }}
|
||||
- alert: AlertmanagerClusterDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterdown
|
||||
summary: Half or more of the Alertmanager instances within the same cluster are down.
|
||||
expr: |-
|
||||
(
|
||||
count by (namespace,service) (
|
||||
avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5
|
||||
)
|
||||
/
|
||||
count by (namespace,service) (
|
||||
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
|
||||
)
|
||||
)
|
||||
>= 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterCrashlooping | default false) }}
|
||||
- alert: AlertmanagerClusterCrashlooping
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclustercrashlooping
|
||||
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
|
||||
expr: |-
|
||||
(
|
||||
count by (namespace,service) (
|
||||
changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4
|
||||
)
|
||||
/
|
||||
count by (namespace,service) (
|
||||
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
|
||||
)
|
||||
)
|
||||
>= 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,46 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'config-reloaders' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheusOperator-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.configReloaders }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "config-reloaders" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: config-reloaders
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.ConfigReloaderSidecarErrors | default false) }}
|
||||
- alert: ConfigReloaderSidecarErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'Errors encountered while the {{`{{`}}$labels.pod{{`}}`}} config-reloader sidecar attempts to sync config in {{`{{`}}$labels.namespace{{`}}`}} namespace.
|
||||
|
||||
As a result, configuration for service running in {{`{{`}}$labels.pod{{`}}`}} may be stale and cannot be updated anymore.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/configreloadersidecarerrors
|
||||
summary: config-reloader sidecar has not had a successful reload for 10m
|
||||
expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,294 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/etcd/main/contrib/mixin/mixin.libsonnet
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: etcd
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.etcdMembersDown | default false) }}
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
|
||||
summary: etcd cluster members are down.
|
||||
expr: |-
|
||||
max without (endpoint) (
|
||||
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count without (To) (
|
||||
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdInsufficientMembers | default false) }}
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
|
||||
summary: etcd cluster has insufficient number of members.
|
||||
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdNoLeader | default false) }}
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
|
||||
summary: etcd cluster has no leader.
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfLeaderChanges | default false) }}
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
summary: etcd cluster has high number of leader changes.
|
||||
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedGRPCRequests | default false) }}
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedGRPCRequests | default false) }}
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdGRPCRequestsSlow | default false) }}
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}} for {{`{{`}} $labels.grpc_method {{`}}`}} method.'
|
||||
summary: etcd grpc requests are slow
|
||||
expr: |-
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdMemberCommunicationSlow | default false) }}
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster member communication is slow.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedProposals | default false) }}
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster has high number of proposal failures.
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighFsyncDurations | default false) }}
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighFsyncDurations | default false) }}
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighCommitDurations | default false) }}
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster 99th percentile commit durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdDatabaseQuotaLowSpace | default false) }}
|
||||
- alert: etcdDatabaseQuotaLowSpace
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
|
||||
summary: etcd cluster database is running full.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdExcessiveDatabaseGrowth | default false) }}
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.'
|
||||
summary: etcd cluster database growing very fast.
|
||||
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdDatabaseHighFragmentationRatio | default false) }}
|
||||
- alert: etcdDatabaseHighFragmentationRatio
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size in use on instance {{`{{`}} $labels.instance {{`}}`}} is {{`{{`}} $value | humanizePercentage {{`}}`}} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
|
||||
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
||||
summary: etcd database size in use is less than 50% of the actual allocated storage.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,98 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubePrometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: general.rules
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.TargetDown | default false) }}
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/targetdown
|
||||
summary: One or more targets are unreachable.
|
||||
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.Watchdog | default false) }}
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
|
||||
'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/watchdog
|
||||
summary: An alert that should always be firing to certify that Alertmanager is working properly.
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: none
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.InfoInhibitor | default false) }}
|
||||
- alert: InfoInhibitor
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'This is an alert that is used to inhibit info alerts.
|
||||
|
||||
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
|
||||
|
||||
other alerts.
|
||||
|
||||
This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a
|
||||
|
||||
severity of ''warning'' or ''critical'' starts firing on the same namespace.
|
||||
|
||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
||||
|
||||
'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/infoinhibitor
|
||||
summary: Info-level alert inhibition.
|
||||
expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
||||
labels:
|
||||
severity: none
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,173 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
sum by (cluster, namespace, pod, container) (
|
||||
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
|
||||
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
|
||||
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
|
||||
- expr: |-
|
||||
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_working_set_bytes
|
||||
- expr: |-
|
||||
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_rss
|
||||
- expr: |-
|
||||
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_cache
|
||||
- expr: |-
|
||||
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_swap
|
||||
- expr: |-
|
||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
||||
group_left() max by (namespace, pod, cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
|
||||
- expr: |-
|
||||
sum by (namespace, cluster) (
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, container, cluster) (
|
||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
|
||||
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace_memory:kube_pod_container_resource_requests:sum
|
||||
- expr: |-
|
||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
||||
group_left() max by (namespace, pod, cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
|
||||
- expr: |-
|
||||
sum by (namespace, cluster) (
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, container, cluster) (
|
||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
|
||||
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace_cpu:kube_pod_container_resource_requests:sum
|
||||
- expr: |-
|
||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
||||
group_left() max by (namespace, pod, cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
|
||||
- expr: |-
|
||||
sum by (namespace, cluster) (
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, container, cluster) (
|
||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
|
||||
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace_memory:kube_pod_container_resource_limits:sum
|
||||
- expr: |-
|
||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
||||
group_left() max by (namespace, pod, cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
|
||||
- expr: |-
|
||||
sum by (namespace, cluster) (
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, container, cluster) (
|
||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
|
||||
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace_cpu:kube_pod_container_resource_limits:sum
|
||||
- expr: |-
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
||||
"replicaset", "$1", "owner_name", "(.*)"
|
||||
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
|
||||
1, max by (replicaset, namespace, owner_name) (
|
||||
kube_replicaset_owner{job="kube-state-metrics"}
|
||||
)
|
||||
),
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- expr: |-
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- expr: |-
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- expr: |-
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: job
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
{{- end }}
|
@ -1,136 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverAvailability }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-availability.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- interval: 3m
|
||||
name: kube-apiserver-availability.rules
|
||||
rules:
|
||||
- expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
|
||||
labels:
|
||||
verb: read
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
labels:
|
||||
verb: write
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: sum by (cluster, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h]))
|
||||
record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h
|
||||
- expr: sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[30d]) * 24 * 30)
|
||||
record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d
|
||||
- expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h]))
|
||||
record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h
|
||||
- expr: sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
|
||||
record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d
|
||||
- expr: |-
|
||||
1 - (
|
||||
(
|
||||
# write too slow
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
-
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
|
||||
) +
|
||||
(
|
||||
# read too slow
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"})
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
|
||||
)
|
||||
) +
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d)
|
||||
labels:
|
||||
verb: all
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |-
|
||||
1 - (
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"})
|
||||
-
|
||||
(
|
||||
# too slow
|
||||
(
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |-
|
||||
1 - (
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
-
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:availability30d
|
||||
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
{{- end }}
|
@ -1,328 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-apiserver-burnrate.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverBurnrate }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-burnrate.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-apiserver-burnrate.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate6h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate6h
|
||||
{{- end }}
|
@ -1,37 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-apiserver-histogram.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverHistogram }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-histogram.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-apiserver-histogram.rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
verb: read
|
||||
record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
verb: write
|
||||
record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile
|
||||
{{- end }}
|
@ -1,115 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-slos" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-apiserver-slos
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAPIErrorBudgetBurn | default false) }}
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The API server is burning too much error budget.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
|
||||
summary: The API server is burning too much error budget.
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
|
||||
for: 2m
|
||||
labels:
|
||||
long: 1h
|
||||
severity: critical
|
||||
short: 5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAPIErrorBudgetBurn | default false) }}
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The API server is burning too much error budget.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
|
||||
summary: The API server is burning too much error budget.
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
|
||||
for: 15m
|
||||
labels:
|
||||
long: 6h
|
||||
severity: critical
|
||||
short: 30m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAPIErrorBudgetBurn | default false) }}
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The API server is burning too much error budget.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
|
||||
summary: The API server is burning too much error budget.
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
|
||||
for: 1h
|
||||
labels:
|
||||
long: 1d
|
||||
severity: warning
|
||||
short: 2h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAPIErrorBudgetBurn | default false) }}
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The API server is burning too much error budget.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
|
||||
summary: The API server is burning too much error budget.
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
|
||||
for: 3h
|
||||
labels:
|
||||
long: 3d
|
||||
severity: warning
|
||||
short: 6h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,31 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubePrometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusGeneral }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-general.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-prometheus-general.rules
|
||||
rules:
|
||||
- expr: count without(instance, pod, node) (up == 1)
|
||||
record: count:up1
|
||||
- expr: count without(instance, pod, node) (up == 0)
|
||||
record: count:up0
|
||||
{{- end }}
|
@ -1,39 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubePrometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-prometheus-node-recording.rules
|
||||
rules:
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
|
||||
record: instance:node_cpu:rate:sum
|
||||
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_receive_bytes:rate:sum
|
||||
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_transmit_bytes:rate:sum
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
|
||||
record: instance:node_cpu:ratio
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
|
||||
record: cluster:node_cpu:sum_rate5m
|
||||
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
||||
record: cluster:node_cpu:ratio
|
||||
{{- end }}
|
@ -1,63 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeSchedulerRecording }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-scheduler.rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
{{- end }}
|
@ -1,107 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubeStateMetrics-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubeStateMetrics }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-state-metrics" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-state-metrics
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsListErrors | default false) }}
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors
|
||||
summary: kube-state-metrics is experiencing errors in list operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsWatchErrors | default false) }}
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors
|
||||
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsShardingMismatch | default false) }}
|
||||
- alert: KubeStateMetricsShardingMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch
|
||||
summary: kube-state-metrics sharding is misconfigured.
|
||||
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsShardsMissing | default false) }}
|
||||
- alert: KubeStateMetricsShardsMissing
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing
|
||||
summary: kube-state-metrics shards are missing.
|
||||
expr: |-
|
||||
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
|
||||
-
|
||||
sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) )
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,39 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubelet.enabled .Values.defaultRules.rules.kubelet }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubelet.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubelet.rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
{{- end }}
|
@ -1,375 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }}
|
||||
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-apps
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubePodCrashLooping | default false) }}
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping
|
||||
summary: Pod is crash looping.
|
||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) >= 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubePodNotReady | default false) }}
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready
|
||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||
expr: |-
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by(namespace, pod, cluster) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown|Failed"}
|
||||
) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
|
||||
1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDeploymentGenerationMismatch | default false) }}
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch
|
||||
summary: Deployment generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDeploymentReplicasMismatch | default false) }}
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
>
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStatefulSetReplicasMismatch | default false) }}
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStatefulSetGenerationMismatch | default false) }}
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch
|
||||
summary: StatefulSet generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStatefulSetUpdateNotRolledOut | default false) }}
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout
|
||||
summary: StatefulSet update has not been rolled out.
|
||||
expr: |-
|
||||
(
|
||||
max without (revision) (
|
||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
unless
|
||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
)
|
||||
*
|
||||
(
|
||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDaemonSetRolloutStuck | default false) }}
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck
|
||||
summary: DaemonSet rollout is stuck.
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) or (
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeContainerWaiting | default false) }}
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting
|
||||
summary: Pod container waiting longer than 1 hour
|
||||
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDaemonSetNotScheduled | default false) }}
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled
|
||||
summary: DaemonSet pods are not scheduled.
|
||||
expr: |-
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
-
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDaemonSetMisScheduled | default false) }}
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled
|
||||
summary: DaemonSet pods are misscheduled.
|
||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeJobNotCompleted | default false) }}
|
||||
- alert: KubeJobNotCompleted
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobnotcompleted
|
||||
summary: Job did not complete in time
|
||||
expr: |-
|
||||
time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
and
|
||||
kube_job_status_active{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0) > 43200
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeJobFailed | default false) }}
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed
|
||||
summary: Job failed to complete.
|
||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeHpaReplicasMismatch | default false) }}
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch
|
||||
summary: HPA has not matched desired number of replicas.
|
||||
expr: |-
|
||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
>
|
||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
<
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
||||
and
|
||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeHpaMaxedOut | default false) }}
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout
|
||||
summary: HPA is running at max replicas
|
||||
expr: |-
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
==
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,193 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-resources
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeCPUOvercommit | default false) }}
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Cluster has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeMemoryOvercommit | default false) }}
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Cluster has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeCPUQuotaOvercommit | default false) }}
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Cluster has overcommitted CPU resource requests for Namespaces.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"}))
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeMemoryQuotaOvercommit | default false) }}
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Cluster has overcommitted memory resource requests for Namespaces.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"}))
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeQuotaAlmostFull | default false) }}
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull
|
||||
summary: Namespace quota is going to be full.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 0.9 < 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeQuotaFullyUsed | default false) }}
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused
|
||||
summary: Namespace quota is fully used.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
== 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeQuotaExceeded | default false) }}
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded
|
||||
summary: Namespace quota has exceeded the limits.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.CPUThrottlingHigh | default false) }}
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh
|
||||
summary: Processes experience elevated CPU throttling.
|
||||
expr: |-
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
|
||||
/
|
||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
|
||||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,161 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }}
|
||||
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-storage
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeFillingUp | default false) }}
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
unless on(namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on(namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeFillingUp | default false) }}
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on(namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on(namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeInodesFillingUp | default false) }}
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
unless on(namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on(namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeInodesFillingUp | default false) }}
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on(namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on(namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeErrors | default false) }}
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors
|
||||
summary: PersistentVolume is having issues with provisioning.
|
||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,130 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-apiserver" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-apiserver
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeClientCertificateExpiration | default false) }}
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeClientCertificateExpiration | default false) }}
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAggregatedAPIErrors | default false) }}
|
||||
- alert: KubeAggregatedAPIErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapierrors
|
||||
summary: Kubernetes aggregated API has reported errors.
|
||||
expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAggregatedAPIDown | default false) }}
|
||||
- alert: KubeAggregatedAPIDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown
|
||||
summary: Kubernetes aggregated API is down.
|
||||
expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeApiServer.enabled }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAPIDown | default false) }}
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: KubeAPI has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="apiserver"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAPITerminatedRequests | default false) }}
|
||||
- alert: KubeAPITerminatedRequests
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests
|
||||
summary: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
|
||||
expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,46 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeControllerManager.enabled .Values.defaultRules.rules.kubeControllerManager }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-controller-manager" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-controller-manager
|
||||
rules:
|
||||
{{- if .Values.kubeControllerManager.enabled }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeControllerManagerDown | default false) }}
|
||||
- alert: KubeControllerManagerDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: KubeControllerManager has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kube-controller-manager"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,44 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-kube-proxy' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeProxy.enabled .Values.defaultRules.rules.kubeProxy }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-kube-proxy" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-kube-proxy
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeProxyDown | default false) }}
|
||||
- alert: KubeProxyDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: KubeProxy has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeproxydown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kube-proxy"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,253 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-kubelet" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-kubelet
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeNodeNotReady | default false) }}
|
||||
- alert: KubeNodeNotReady
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready
|
||||
summary: Node is not ready.
|
||||
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeNodeUnreachable | default false) }}
|
||||
- alert: KubeNodeUnreachable
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable
|
||||
summary: Node is unreachable.
|
||||
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletTooManyPods | default false) }}
|
||||
- alert: KubeletTooManyPods
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods
|
||||
summary: Kubelet is running at capacity.
|
||||
expr: |-
|
||||
count by(cluster, node) (
|
||||
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
|
||||
)
|
||||
/
|
||||
max by(cluster, node) (
|
||||
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
|
||||
) > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeNodeReadinessFlapping | default false) }}
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping
|
||||
summary: Node readiness status is flapping.
|
||||
expr: sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletPlegDurationHigh | default false) }}
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh
|
||||
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
|
||||
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletPodStartUpLatencyHigh | default false) }}
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh
|
||||
summary: Kubelet Pod startup latency is too high.
|
||||
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
|
||||
summary: Kubelet client certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
|
||||
summary: Kubelet client certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
|
||||
summary: Kubelet server certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
|
||||
summary: Kubelet server certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateRenewalErrors | default false) }}
|
||||
- alert: KubeletClientCertificateRenewalErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors
|
||||
summary: Kubelet has failed to renew its client certificate.
|
||||
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateRenewalErrors | default false) }}
|
||||
- alert: KubeletServerCertificateRenewalErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors
|
||||
summary: Kubelet has failed to renew its server certificate.
|
||||
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheusOperator.kubeletService.enabled }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletDown | default false) }}
|
||||
- alert: KubeletDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,46 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeSchedulerAlerting }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-scheduler" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-scheduler
|
||||
rules:
|
||||
{{- if .Values.kubeScheduler.enabled }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeSchedulerDown | default false) }}
|
||||
- alert: KubeSchedulerDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: KubeScheduler has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kube-scheduler"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,65 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeVersionMismatch | default false) }}
|
||||
- alert: KubeVersionMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch
|
||||
summary: Different semantic versions of Kubernetes components running.
|
||||
expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeClientErrors | default false) }}
|
||||
- alert: KubeClientErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclienterrors
|
||||
summary: Kubernetes API server client is experiencing errors.
|
||||
expr: |-
|
||||
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace)
|
||||
/
|
||||
sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,89 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/nodeExporter-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.nodeExporterRecording }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-exporter.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
count without (cpu, mode) (
|
||||
node_cpu_seconds_total{job="node-exporter",mode="idle"}
|
||||
)
|
||||
record: instance:node_num_cpu:sum
|
||||
- expr: |-
|
||||
1 - avg without (cpu) (
|
||||
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
|
||||
)
|
||||
record: instance:node_cpu_utilisation:rate5m
|
||||
- expr: |-
|
||||
(
|
||||
node_load1{job="node-exporter"}
|
||||
/
|
||||
instance:node_num_cpu:sum{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_load1_per_cpu:ratio
|
||||
- expr: |-
|
||||
1 - (
|
||||
(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"}
|
||||
or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Cached_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_MemFree_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
)
|
||||
/
|
||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_memory_utilisation:ratio
|
||||
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
|
||||
record: instance:node_vmstat_pgmajfault:rate5m
|
||||
- expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate5m
|
||||
- expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_bytes_excluding_lo:rate5m
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_drop_excluding_lo:rate5m
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate5m
|
||||
{{- end }}
|
@ -1,398 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/nodeExporter-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.nodeExporterAlerting }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-exporter" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemSpaceFillingUp | default false) }}
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemSpaceFillingUp | default false) }}
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfSpace | default false) }}
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfSpace | default false) }}
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemFilesFillingUp | default false) }}
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemFilesFillingUp | default false) }}
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfFiles | default false) }}
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfFiles | default false) }}
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeNetworkReceiveErrs | default false) }}
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeNetworkTransmitErrs | default false) }}
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeHighNumberConntrackEntriesUsed | default false) }}
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack are getting close to the limit.
|
||||
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeTextFileCollectorScrapeError | default false) }}
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Node Exporter text file collector failed to scrape.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: node_textfile_scrape_error{job="node-exporter"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeClockSkewDetected | default false) }}
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: |-
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeClockNotSynchronising | default false) }}
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: |-
|
||||
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
||||
and
|
||||
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeRAIDDegraded | default false) }}
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded
|
||||
summary: RAID Array is degraded
|
||||
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeRAIDDiskFailure | default false) }}
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddiskfailure
|
||||
summary: Failed device in RAID array
|
||||
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFileDescriptorLimit | default false) }}
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFileDescriptorLimit | default false) }}
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,44 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubePrometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-network
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.NodeNetworkInterfaceFlapping | default false) }}
|
||||
- alert: NodeNetworkInterfaceFlapping
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping
|
||||
summary: Network interface is often changing its status
|
||||
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,55 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
topk by(cluster, namespace, pod) (1,
|
||||
max by (cluster, node, namespace, pod) (
|
||||
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
|
||||
))
|
||||
record: 'node_namespace_pod:kube_pod_info:'
|
||||
- expr: |-
|
||||
count by (cluster, node) (sum by (node, cpu) (
|
||||
node_cpu_seconds_total{job="node-exporter"}
|
||||
* on (namespace, pod) group_left(node)
|
||||
topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
|
||||
))
|
||||
record: node:node_num_cpu:sum
|
||||
- expr: |-
|
||||
sum(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"} or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"} +
|
||||
node_memory_Cached_bytes{job="node-exporter"} +
|
||||
node_memory_MemFree_bytes{job="node-exporter"} +
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
) by (cluster)
|
||||
record: :node_memory_MemAvailable_bytes:sum
|
||||
- expr: |-
|
||||
sum(rate(node_cpu_seconds_total{job="node-exporter",mode!="idle",mode!="iowait",mode!="steal"}[5m])) /
|
||||
count(sum(node_cpu_seconds_total{job="node-exporter"}) by (cluster, instance, cpu))
|
||||
record: cluster:node_cpu:ratio_rate5m
|
||||
{{- end }}
|
@ -1,148 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheusOperator-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }}
|
||||
{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: prometheus-operator
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorListErrors | default false) }}
|
||||
- alert: PrometheusOperatorListErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorlisterrors
|
||||
summary: Errors while performing list operations in controller.
|
||||
expr: (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorWatchErrors | default false) }}
|
||||
- alert: PrometheusOperatorWatchErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorwatcherrors
|
||||
summary: Errors while performing watch operations in controller.
|
||||
expr: (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.4
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorSyncFailed | default false) }}
|
||||
- alert: PrometheusOperatorSyncFailed
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorsyncfailed
|
||||
summary: Last controller reconciliation failed
|
||||
expr: min_over_time(prometheus_operator_syncs{status="failed",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorReconcileErrors | default false) }}
|
||||
- alert: PrometheusOperatorReconcileErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorreconcileerrors
|
||||
summary: Errors while reconciling controller.
|
||||
expr: (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorNodeLookupErrors | default false) }}
|
||||
- alert: PrometheusOperatorNodeLookupErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornodelookuperrors
|
||||
summary: Errors while reconciling Prometheus.
|
||||
expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorNotReady | default false) }}
|
||||
- alert: PrometheusOperatorNotReady
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornotready
|
||||
summary: Prometheus operator not ready
|
||||
expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorRejectedResources | default false) }}
|
||||
- alert: PrometheusOperatorRejectedResources
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorrejectedresources
|
||||
summary: Resources rejected by Prometheus operator
|
||||
expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,448 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
|
||||
{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: prometheus
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusBadConfig | default false) }}
|
||||
- alert: PrometheusBadConfig
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusbadconfig
|
||||
summary: Failed Prometheus configuration reload.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusNotificationQueueRunningFull | default false) }}
|
||||
- alert: PrometheusNotificationQueueRunningFull
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusnotificationqueuerunningfull
|
||||
summary: Prometheus alert notification queue predicted to run full in less than 30m.
|
||||
expr: |-
|
||||
# Without min_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30)
|
||||
>
|
||||
min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusErrorSendingAlertsToSomeAlertmanagers | default false) }}
|
||||
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuserrorsendingalertstosomealertmanagers
|
||||
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
|
||||
expr: |-
|
||||
(
|
||||
rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
* 100
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusNotConnectedToAlertmanagers | default false) }}
|
||||
- alert: PrometheusNotConnectedToAlertmanagers
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusnotconnectedtoalertmanagers
|
||||
summary: Prometheus is not connected to any Alertmanagers.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusTSDBReloadsFailing | default false) }}
|
||||
- alert: PrometheusTSDBReloadsFailing
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbreloadsfailing
|
||||
summary: Prometheus has issues reloading blocks from disk.
|
||||
expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusTSDBCompactionsFailing | default false) }}
|
||||
- alert: PrometheusTSDBCompactionsFailing
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbcompactionsfailing
|
||||
summary: Prometheus has issues compacting blocks.
|
||||
expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusNotIngestingSamples | default false) }}
|
||||
- alert: PrometheusNotIngestingSamples
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusnotingestingsamples
|
||||
summary: Prometheus is not ingesting samples.
|
||||
expr: |-
|
||||
(
|
||||
rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
|
||||
and
|
||||
(
|
||||
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
|
||||
or
|
||||
sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
|
||||
)
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusDuplicateTimestamps | default false) }}
|
||||
- alert: PrometheusDuplicateTimestamps
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusduplicatetimestamps
|
||||
summary: Prometheus is dropping samples with duplicate timestamps.
|
||||
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOutOfOrderTimestamps | default false) }}
|
||||
- alert: PrometheusOutOfOrderTimestamps
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusoutofordertimestamps
|
||||
summary: Prometheus drops samples with out-of-order timestamps.
|
||||
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusRemoteStorageFailures | default false) }}
|
||||
- alert: PrometheusRemoteStorageFailures
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusremotestoragefailures
|
||||
summary: Prometheus fails to send samples to remote storage.
|
||||
expr: |-
|
||||
(
|
||||
(rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
|
||||
/
|
||||
(
|
||||
(rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
|
||||
+
|
||||
(rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
|
||||
)
|
||||
)
|
||||
* 100
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusRemoteWriteBehind | default false) }}
|
||||
- alert: PrometheusRemoteWriteBehind
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusremotewritebehind
|
||||
summary: Prometheus remote write is behind.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
- ignoring(remote_name, url) group_right
|
||||
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
> 120
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusRemoteWriteDesiredShards | default false) }}
|
||||
- alert: PrometheusRemoteWriteDesiredShards
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusremotewritedesiredshards
|
||||
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
max_over_time(prometheus_remote_storage_shards_desired{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
>
|
||||
max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusRuleFailures | default false) }}
|
||||
- alert: PrometheusRuleFailures
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusrulefailures
|
||||
summary: Prometheus is failing rule evaluations.
|
||||
expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusMissingRuleEvaluations | default false) }}
|
||||
- alert: PrometheusMissingRuleEvaluations
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusmissingruleevaluations
|
||||
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
||||
expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusTargetLimitHit | default false) }}
|
||||
- alert: PrometheusTargetLimitHit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetlimithit
|
||||
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
|
||||
expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusLabelLimitHit | default false) }}
|
||||
- alert: PrometheusLabelLimitHit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuslabellimithit
|
||||
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
|
||||
expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusScrapeBodySizeLimitHit | default false) }}
|
||||
- alert: PrometheusScrapeBodySizeLimitHit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed {{`{{`}} printf "%.0f" $value {{`}}`}} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapebodysizelimithit
|
||||
summary: Prometheus has dropped some targets that exceeded body size limit.
|
||||
expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusScrapeSampleLimitHit | default false) }}
|
||||
- alert: PrometheusScrapeSampleLimitHit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed {{`{{`}} printf "%.0f" $value {{`}}`}} scrapes in the last 5m because some targets exceeded the configured sample_limit.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapesamplelimithit
|
||||
summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
|
||||
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusTargetSyncFailure | default false) }}
|
||||
- alert: PrometheusTargetSyncFailure
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} printf "%.0f" $value {{`}}`}} targets in Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} have failed to sync because invalid configuration was supplied.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetsyncfailure
|
||||
summary: Prometheus has failed to sync targets.
|
||||
expr: increase(prometheus_target_sync_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[30m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusHighQueryLoad | default false) }}
|
||||
- alert: PrometheusHighQueryLoad
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheushighqueryload
|
||||
summary: Prometheus is reaching its maximum capacity serving concurrent requests.
|
||||
expr: avg_over_time(prometheus_engine_queries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.8
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusErrorSendingAlertsToAnyAlertmanager | default false) }}
|
||||
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuserrorsendingalertstoanyalertmanager
|
||||
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
||||
expr: |-
|
||||
min without (alertmanager) (
|
||||
rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
|
||||
)
|
||||
* 100
|
||||
> 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,22 +1,15 @@
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
|
||||
#PG_VER=$(yq eval '.dependencies[] | select(.name=="prometheus-pushgateway") | .version' Chart.yaml)
|
||||
|
||||
helm dep update
|
||||
|
||||
VERSION=$(yq eval '.dependencies[] | select(.name=="kube-prometheus-stack") | .version' Chart.yaml)
|
||||
rm -rf charts/kube-prometheus-stack
|
||||
helm pull prometheus-community/kube-prometheus-stack --untar --untardir charts --version $VERSION
|
||||
|
||||
#rm -rf charts/prometheus-pushgateway
|
||||
#helm pull prometheus-community/prometheus-pushgateway --untar --untardir charts --version $PG_VER
|
||||
|
||||
# workaround for https://github.com/prometheus-community/helm-charts/issues/1500
|
||||
patch -p0 -i zdt.patch --no-backup-if-mismatch
|
||||
|
||||
#patch -p0 -i zdt-pushgateway.patch --no-backup-if-mismatch
|
||||
|
||||
# Create ZDT dashboard, alerts etc configmaps
|
||||
cd jsonnet && make
|
||||
|
||||
|
@ -1,66 +0,0 @@
|
||||
diff -tubr charts/prometheus-pushgateway/templates/deployment.yaml charts/prometheus-pushgateway.zdt/templates/deployment.yaml
|
||||
--- charts/prometheus-pushgateway/templates/deployment.yaml 2021-07-21 15:04:50.196722434 +0200
|
||||
+++ charts/prometheus-pushgateway.zdt/templates/deployment.yaml 2021-07-21 14:51:39.618598904 +0200
|
||||
@@ -2,6 +2,7 @@
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ template "prometheus-pushgateway.fullname" . }}
|
||||
+ namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{ template "prometheus-pushgateway.defaultLabels" merge (dict "extraLabels" .Values.podLabels) . }}
|
||||
spec:
|
||||
diff -tubr charts/prometheus-pushgateway/templates/ingress.yaml charts/prometheus-pushgateway.zdt/templates/ingress.yaml
|
||||
--- charts/prometheus-pushgateway/templates/ingress.yaml 2021-07-21 15:04:50.196722434 +0200
|
||||
+++ charts/prometheus-pushgateway.zdt/templates/ingress.yaml 2021-07-21 14:51:49.698702493 +0200
|
||||
@@ -16,6 +16,7 @@
|
||||
labels:
|
||||
{{ template "prometheus-pushgateway.defaultLabels" merge (dict "extraLabels" dict) . }}
|
||||
name: {{ template "prometheus-pushgateway.fullname" . }}
|
||||
+ namespace: {{ .Release.Namespace }}
|
||||
spec:
|
||||
rules:
|
||||
{{- range $host := .Values.ingress.hosts }}
|
||||
diff -tubr charts/prometheus-pushgateway/templates/pdb.yaml charts/prometheus-pushgateway.zdt/templates/pdb.yaml
|
||||
--- charts/prometheus-pushgateway/templates/pdb.yaml 2021-07-21 15:04:50.196722434 +0200
|
||||
+++ charts/prometheus-pushgateway.zdt/templates/pdb.yaml 2021-07-21 14:52:17.902325652 +0200
|
||||
@@ -3,6 +3,7 @@
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
name: {{ template "prometheus-pushgateway.fullname" . }}
|
||||
+ namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{ template "prometheus-pushgateway.defaultLabels" merge (dict "extraLabels" .Values.podLabels) . }}
|
||||
spec:
|
||||
diff -tubr charts/prometheus-pushgateway/templates/pushgateway-pvc.yaml charts/prometheus-pushgateway.zdt/templates/pushgateway-pvc.yaml
|
||||
--- charts/prometheus-pushgateway/templates/pushgateway-pvc.yaml 2021-07-21 15:04:50.196722434 +0200
|
||||
+++ charts/prometheus-pushgateway.zdt/templates/pushgateway-pvc.yaml 2021-07-21 14:52:31.325796911 +0200
|
||||
@@ -10,6 +10,7 @@
|
||||
labels:
|
||||
{{ template "prometheus-pushgateway.defaultLabels" merge (dict "extraLabels" .Values.persistentVolumeLabels) . }}
|
||||
name: {{ template "prometheus-pushgateway.fullname" . }}
|
||||
+ namespace: {{ .Release.Namespace }}
|
||||
spec:
|
||||
accessModes:
|
||||
{{ toYaml .Values.persistentVolume.accessModes | indent 4 }}
|
||||
diff -tubr charts/prometheus-pushgateway/templates/service.yaml charts/prometheus-pushgateway.zdt/templates/service.yaml
|
||||
--- charts/prometheus-pushgateway/templates/service.yaml 2021-07-21 15:04:50.196722434 +0200
|
||||
+++ charts/prometheus-pushgateway.zdt/templates/service.yaml 2021-07-21 14:52:39.355879424 +0200
|
||||
@@ -2,6 +2,7 @@
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ template "prometheus-pushgateway.fullname" . }}
|
||||
+ namespace: {{ .Release.Namespace }}
|
||||
annotations:
|
||||
{{ .Values.serviceAnnotations | toYaml | indent 4 }}
|
||||
labels:
|
||||
diff -tubr charts/prometheus-pushgateway/templates/serviceaccount.yaml charts/prometheus-pushgateway.zdt/templates/serviceaccount.yaml
|
||||
--- charts/prometheus-pushgateway/templates/serviceaccount.yaml 2021-07-21 15:04:50.196722434 +0200
|
||||
+++ charts/prometheus-pushgateway.zdt/templates/serviceaccount.yaml 2021-07-21 14:52:44.682600827 +0200
|
||||
@@ -3,6 +3,7 @@
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: {{ template "prometheus-pushgateway.serviceAccountName" . }}
|
||||
+ namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{ template "prometheus-pushgateway.defaultLabels" merge (dict "extraLabels" .Values.serviceAccountLabels) . }}
|
||||
{{- end -}}
|
Loading…
Reference in New Issue
Block a user