feat: metrics version bump

Stefan Reimer 2025-02-12 19:17:27 +00:00
parent 99996b606f
commit 3343d51d09
41 changed files with 3437 additions and 445 deletions

View File

@ -5,22 +5,22 @@ folder: Istio
condition: '.Values.istiod.telemetry.enabled'
dashboards:
- name: istio-control-plane
url: https://grafana.com/api/dashboards/7645/revisions/229/download
url: https://grafana.com/api/dashboards/7645/revisions/239/download
tags:
- Istio
- name: istio-mesh
url: https://grafana.com/api/dashboards/7639/revisions/229/download
url: https://grafana.com/api/dashboards/7639/revisions/239/download
tags:
- Istio
- name: istio-service
url: https://grafana.com/api/dashboards/7636/revisions/229/download
url: https://grafana.com/api/dashboards/7636/revisions/239/download
tags:
- Istio
- name: istio-workload
url: https://grafana.com/api/dashboards/7630/revisions/229/download
url: https://grafana.com/api/dashboards/7630/revisions/239/download
tags:
- Istio
- name: istio-performance
url: https://grafana.com/api/dashboards/11829/revisions/229/download
url: https://grafana.com/api/dashboards/11829/revisions/239/download
tags:
- Istio
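All five Istio dashboard entries above move from revision 229 to revision 239 of their upstream grafana.com dashboards. Each entry follows the same download-URL pattern; the sketch below is illustrative only, with <id> and <rev> as placeholders rather than values from this commit:

- name: <dashboard-name>
  url: https://grafana.com/api/dashboards/<id>/revisions/<rev>/download
  tags:
    - Istio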

File diff suppressed because one or more lines are too long

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero-metrics
description: KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
type: application
version: 0.10.4
version: 0.11.0
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:
@ -29,4 +29,4 @@ dependencies:
version: 3.0.0
repository: https://prometheus-community.github.io/helm-charts
condition: prometheus-pushgateway.enabled
kubeVersion: ">= 1.26.0"
kubeVersion: ">= 1.30.0-0"
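=">
Alongside the chart version bump from 0.10.4 to 0.11.0, the minimum supported Kubernetes version rises from 1.26 to 1.30. A minimal sketch of the relevant Chart.yaml fields, assuming Helm's standard semver constraint handling (the trailing "-0" lets pre-release or vendor-suffixed server versions such as v1.30.1-eks-... satisfy the constraint, which a plain ">= 1.30.0" would reject):

# sketch, assuming Masterminds/semver constraint semantics used by Helm
version: 0.11.0
kubeVersion: ">= 1.30.0-0"   # "-0" admits pre-release/suffixed 1.30+ server versions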

View File

@ -1,6 +1,6 @@
# kubezero-metrics
![Version: 0.10.2](https://img.shields.io/badge/Version-0.10.2-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
![Version: 0.10.4](https://img.shields.io/badge/Version-0.10.4-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
@ -14,14 +14,14 @@ KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all
## Requirements
Kubernetes: `>= 1.26.0`
Kubernetes: `>= 1.30.0-0`
| Repository | Name | Version |
|------------|------|---------|
| https://cdn.zero-downtime.net/charts/ | kubezero-lib | >= 0.1.6 |
| https://prometheus-community.github.io/helm-charts | kube-prometheus-stack | 66.1.1 |
| https://prometheus-community.github.io/helm-charts | kube-prometheus-stack | 69.2.3 |
| https://prometheus-community.github.io/helm-charts | prometheus-adapter | 4.11.0 |
| https://prometheus-community.github.io/helm-charts | prometheus-pushgateway | 2.15.0 |
| https://prometheus-community.github.io/helm-charts | prometheus-pushgateway | 3.0.0 |
## Values

View File

@ -30,7 +30,7 @@
"options": {
"content": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only."
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"title": "Notice",
"type": "text"
},
@ -54,7 +54,7 @@
},
"id": 2,
"interval": "1m",
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -100,7 +100,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -134,7 +134,7 @@
},
"id": 4,
"interval": "1m",
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -232,7 +232,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -276,7 +276,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -319,7 +319,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -353,7 +353,7 @@
},
"id": 8,
"interval": "1m",
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -451,7 +451,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -495,7 +495,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -538,7 +538,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -581,7 +581,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -624,7 +624,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -670,7 +670,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -712,7 +712,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -755,7 +755,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -797,7 +797,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {

View File

@ -46,7 +46,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -92,7 +92,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -163,7 +163,7 @@
"y": 9
},
"id": 3,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -328,7 +328,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -374,7 +374,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -420,7 +420,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -466,7 +466,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -512,7 +512,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -558,7 +558,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -604,7 +604,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -650,7 +650,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -696,7 +696,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -742,7 +742,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {

View File

@ -35,7 +35,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -86,7 +86,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -137,7 +137,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -188,7 +188,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -239,7 +239,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -314,15 +314,15 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[$__rate_interval])) by (verb, url, le))",
"legendFormat": "{{verb}} {{url}}"
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[$__rate_interval])) by (verb, le))",
"legendFormat": "{{verb}}"
}
],
"title": "Post Request Latency 99th Quantile",
@ -365,15 +365,15 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, url, le))",
"legendFormat": "{{verb}} {{url}}"
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, le))",
"legendFormat": "{{verb}}"
}
],
"title": "Get Request Latency 99th Quantile",
@ -416,7 +416,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -467,7 +467,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -518,7 +518,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {

View File

@ -35,7 +35,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -70,7 +70,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -105,7 +105,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -140,7 +140,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -175,7 +175,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -210,7 +210,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -260,7 +260,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -319,7 +319,7 @@
"y": 12
},
"id": 8,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -476,7 +476,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -571,7 +571,7 @@
"y": 24
},
"id": 10,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -748,7 +748,7 @@
"y": 30
},
"id": 11,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -892,7 +892,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -943,7 +943,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -994,7 +994,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1045,7 +1045,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1096,7 +1096,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1147,7 +1147,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1198,7 +1198,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1249,7 +1249,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1300,7 +1300,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1351,7 +1351,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1422,7 +1422,7 @@
"y": 96
},
"id": 22,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {

View File

@ -35,7 +35,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -70,7 +70,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -105,7 +105,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -140,7 +140,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -242,7 +242,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -317,7 +317,7 @@
"y": 14
},
"id": 6,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -500,7 +500,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -578,7 +578,7 @@
"y": 28
},
"id": 8,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -768,7 +768,7 @@
"y": 35
},
"id": 9,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -912,7 +912,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -963,7 +963,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1014,7 +1014,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1065,7 +1065,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1116,7 +1116,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1167,7 +1167,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1218,7 +1218,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1269,7 +1269,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1340,7 +1340,7 @@
"y": 70
},
"id": 18,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {

View File

@ -94,7 +94,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -161,7 +161,7 @@
"y": 6
},
"id": 2,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -323,7 +323,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -342,6 +342,109 @@
"legendFormat": "{{pod}}"
}
],
"title": "Memory Usage (w/cache)",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true,
"stacking": {
"mode": "normal"
}
},
"unit": "bytes"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "max capacity"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
},
{
"id": "custom.stacking",
"value": {
"mode": "none"
}
},
{
"id": "custom.hideFrom",
"value": {
"legend": false,
"tooltip": true,
"viz": false
}
},
{
"id": "custom.lineStyle",
"value": {
"dash": [
10,
10
],
"fill": "dash"
}
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 18
},
"id": 4,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_node_status_capacity{cluster=\"$cluster\", job=\"kube-state-metrics\", node=~\"$node\", resource=\"memory\"})",
"legendFormat": "max capacity"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(node_namespace_pod_container:container_memory_rss{cluster=\"$cluster\", node=~\"$node\", container!=\"\"}) by (pod)",
"legendFormat": "{{pod}}"
}
],
"title": "Memory Usage (w/o cache)",
"type": "timeseries"
},
@ -390,10 +493,10 @@
"h": 6,
"w": 24,
"x": 0,
"y": 18
"y": 24
},
"id": 4,
"pluginVersion": "v11.1.0",
"id": 5,
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {

View File

@ -102,7 +102,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -208,7 +208,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -250,7 +250,7 @@
"y": 14
},
"id": 3,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -433,7 +433,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -494,7 +494,7 @@
"y": 28
},
"id": 5,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -664,7 +664,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -715,7 +715,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -766,7 +766,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -817,7 +817,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -868,7 +868,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -919,7 +919,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -970,7 +970,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1029,7 +1029,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1088,7 +1088,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1139,7 +1139,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1193,7 +1193,7 @@
"y": 70
},
"id": 16,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {

View File

@ -0,0 +1,675 @@
{
"editable": false,
"panels": [
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "none"
}
},
"gridPos": {
"h": 3,
"w": 4,
"x": 0,
"y": 0
},
"id": 1,
"interval": "1m",
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "1 - avg(rate(windows_cpu_time_total{cluster=\"$cluster\", job=\"kubernetes-windows-exporter\", mode=\"idle\"}[$__rate_interval]))",
"instant": true
}
],
"title": "CPU Utilisation",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
}
},
"gridPos": {
"h": 3,
"w": 4,
"x": 4,
"y": 0
},
"id": 2,
"interval": "1m",
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_cpu_cores_request{cluster=\"$cluster\"}) / sum(node:windows_node_num_cpu:sum{cluster=\"$cluster\"})",
"instant": true
}
],
"title": "CPU Requests Commitment",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
}
},
"gridPos": {
"h": 3,
"w": 4,
"x": 8,
"y": 0
},
"id": 3,
"interval": "1m",
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_cpu_cores_limit{cluster=\"$cluster\"}) / sum(node:windows_node_num_cpu:sum{cluster=\"$cluster\"})",
"instant": true
}
],
"title": "CPU Limits Commitment",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
}
},
"gridPos": {
"h": 3,
"w": 4,
"x": 12,
"y": 0
},
"id": 4,
"interval": "1m",
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "1 - sum(:windows_node_memory_MemFreeCached_bytes:sum{cluster=\"$cluster\"}) / sum(:windows_node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})",
"instant": true
}
],
"title": "Memory Utilisation",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
}
},
"gridPos": {
"h": 3,
"w": 4,
"x": 16,
"y": 0
},
"id": 5,
"interval": "1m",
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_memory_request{cluster=\"$cluster\"}) / sum(:windows_node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})",
"instant": true
}
],
"title": "Memory Requests Commitment",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
}
},
"gridPos": {
"h": 3,
"w": 4,
"x": 20,
"y": 0
},
"id": 6,
"interval": "1m",
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_memory_limit{cluster=\"$cluster\"}) / sum(:windows_node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})",
"instant": true
}
],
"title": "Memory Limits Commitment",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
}
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 7
},
"id": 7,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)",
"legendFormat": "__auto"
}
],
"title": "CPU Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/%/"
},
"properties": [
{
"id": "unit",
"value": "percentunit"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Namespace"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Drill down to pods",
"url": "/d/490b402361724ab1d4c45666c1fa9b6f/k8s-resources-windows-namespace?${datasource:queryparam}&var-cluster=$cluster&var-namespace=${__data.fields.Namespace}"
}
]
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 14
},
"id": 8,
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_cpu_cores_request{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_windows_container_resource_cpu_cores_request{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_cpu_cores_limit{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_windows_container_resource_cpu_cores_limit{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
}
],
"title": "CPU Quota",
"transformations": [
{
"id": "joinByField",
"options": {
"byField": "namespace",
"mode": "outer"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Time 1": true,
"Time 2": true,
"Time 3": true,
"Time 4": true,
"Time 5": true
},
"indexByName": {
"Time 1": 0,
"Time 2": 1,
"Time 3": 2,
"Time 4": 3,
"Time 5": 4,
"Value #A": 6,
"Value #B": 7,
"Value #C": 8,
"Value #D": 9,
"Value #E": 10,
"namespace": 5
},
"renameByName": {
"Value #A": "CPU Usage",
"Value #B": "CPU Requests",
"Value #C": "CPU Requests %",
"Value #D": "CPU Limits",
"Value #E": "CPU Limits %",
"namespace": "Namespace"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "decbytes"
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 21
},
"id": 9,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\"}) by (namespace)",
"legendFormat": "__auto"
}
],
"title": "Memory Usage (Private Working Set)",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/%/"
},
"properties": [
{
"id": "unit",
"value": "percentunit"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Memory Usage"
},
"properties": [
{
"id": "unit",
"value": "decbytes"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Memory Requests"
},
"properties": [
{
"id": "unit",
"value": "decbytes"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Memory Limits"
},
"properties": [
{
"id": "unit",
"value": "decbytes"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Namespace"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Drill down to pods",
"url": "/d/490b402361724ab1d4c45666c1fa9b6f/k8s-resources-windows-namespace?${datasource:queryparam}&var-cluster=$cluster&var-namespace=${__data.fields.Namespace}"
}
]
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 28
},
"id": 10,
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_memory_request{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_windows_container_resource_memory_request{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_memory_limit{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_windows_container_resource_memory_limit{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true
}
],
"title": "Memory Requests by Namespace",
"transformations": [
{
"id": "joinByField",
"options": {
"byField": "namespace",
"mode": "outer"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Time 1": true,
"Time 2": true,
"Time 3": true,
"Time 4": true,
"Time 5": true
},
"indexByName": {
"Time 1": 0,
"Time 2": 1,
"Time 3": 2,
"Time 4": 3,
"Time 5": 4,
"Value #A": 6,
"Value #B": 7,
"Value #C": 8,
"Value #D": 9,
"Value #E": 10,
"namespace": 5
},
"renameByName": {
"Value #A": "Memory Usage",
"Value #B": "Memory Requests",
"Value #C": "Memory Requests %",
"Value #D": "Memory Limits",
"Value #E": "Memory Limits %",
"namespace": "Namespace"
}
}
}
],
"type": "table"
}
],
"refresh": "10s",
"schemaVersion": 39,
"tags": [
"kubernetes"
],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "default",
"value": "default"
},
"hide": 0,
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"regex": "",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"hide": 2,
"label": "cluster",
"name": "cluster",
"query": "label_values(up{job=\"kubernetes-windows-exporter\"}, cluster)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timezone": "utc",
"title": "Compute Resources / Cluster(Windows)",
"uid": "4d08557fd9391b100730f2494bccac68"
}
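The new Windows compute-resources dashboard above assumes recording rules such as node:windows_node_num_cpu:sum and :windows_node_memory_MemTotal_bytes:sum, plus metrics scraped under a job named kubernetes-windows-exporter. A minimal sketch of such a scrape job via the kube-prometheus-stack additionalScrapeConfigs value, assuming windows_exporter listens on its default port 9182; the relabeling and port here are illustrative assumptions, not taken from this commit:

additionalScrapeConfigs:
  - job_name: kubernetes-windows-exporter      # job label expected by the dashboard queries
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      # keep only Windows nodes (node label kubernetes.io/os=windows)
      - source_labels: [__meta_kubernetes_node_label_kubernetes_io_os]
        regex: windows
        action: keep
      # point the scrape at the windows_exporter port instead of the kubelet port
      - source_labels: [__address__]
        regex: (.+?)(:\d+)?
        replacement: ${1}:9182                 # default windows_exporter port (assumption)
        target_label: __address__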

View File

@ -0,0 +1,442 @@
{
"editable": false,
"panels": [
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
}
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"legendFormat": "__auto"
}
],
"title": "CPU Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/%/"
},
"properties": [
{
"id": "unit",
"value": "percentunit"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Pod"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Drill down to pods",
"url": "/d/40597a704a610e936dc6ed374a7ce023/k8s-resources-windows-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"
}
]
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 7
},
"id": 2,
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_cpu_cores_request{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_windows_container_resource_cpu_cores_request{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_cpu_cores_limit{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_windows_container_resource_cpu_cores_limit{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
}
],
"title": "CPU Quota",
"transformations": [
{
"id": "joinByField",
"options": {
"byField": "pod",
"mode": "outer"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Time 1": true,
"Time 2": true,
"Time 3": true,
"Time 4": true,
"Time 5": true
},
"indexByName": {
"Time 1": 0,
"Time 2": 1,
"Time 3": 2,
"Time 4": 3,
"Time 5": 4,
"Value #A": 6,
"Value #B": 7,
"Value #C": 8,
"Value #D": 9,
"Value #E": 10,
"pod": 5
},
"renameByName": {
"Value #A": "CPU Usage",
"Value #B": "CPU Requests",
"Value #C": "CPU Requests %",
"Value #D": "CPU Limits",
"Value #E": "CPU Limits %",
"pod": "Pod"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "decbytes"
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 14
},
"id": 3,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"legendFormat": "__auto"
}
],
"title": "Memory Usage (Private Working Set)",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/%/"
},
"properties": [
{
"id": "unit",
"value": "percentunit"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Pod"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Drill down to pods",
"url": "/d/40597a704a610e936dc6ed374a7ce023/k8s-resources-windows-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"
}
]
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 21
},
"id": 4,
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_memory_request{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_windows_container_resource_memory_request{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_memory_limit{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_windows_container_resource_memory_limit{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true
}
],
"title": "Memory Quota",
"transformations": [
{
"id": "joinByField",
"options": {
"byField": "pod",
"mode": "outer"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Time 1": true,
"Time 2": true,
"Time 3": true,
"Time 4": true,
"Time 5": true
},
"indexByName": {
"Time 1": 0,
"Time 2": 1,
"Time 3": 2,
"Time 4": 3,
"Time 5": 4,
"Value #A": 6,
"Value #B": 7,
"Value #C": 8,
"Value #D": 9,
"Value #E": 10,
"pod": 5
},
"renameByName": {
"Value #A": "Memory Usage",
"Value #B": "Memory Requests",
"Value #C": "Memory Requests %",
"Value #D": "Memory Limits",
"Value #E": "Memory Limits %",
"pod": "Pod"
}
}
}
],
"type": "table"
}
],
"refresh": "10s",
"schemaVersion": 39,
"tags": [
"kubernetes"
],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "default",
"value": "default"
},
"hide": 0,
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"regex": "",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"hide": 2,
"label": "cluster",
"name": "cluster",
"query": "label_values(up{job=\"kubernetes-windows-exporter\"}, cluster)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"hide": 0,
"label": "namespace",
"name": "namespace",
"query": "label_values(windows_pod_container_available{cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timezone": "utc",
"title": "Compute Resources / Namespace(Windows)",
"uid": "490b402361724ab1d4c45666c1fa9b6f"
}

View File

@ -0,0 +1,497 @@
{
"editable": false,
"panels": [
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
}
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"legendFormat": "__auto"
}
],
"title": "CPU Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/%/"
},
"properties": [
{
"id": "unit",
"value": "percentunit"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Namespace"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Drill down to pods",
"url": "/d/490b402361724ab1d4c45666c1fa9b6f/k8s-resources-windows-namespace?${datasource:queryparam}&var-cluster=$cluster&var-namespace=${__data.fields.Namespace}"
}
]
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 7
},
"id": 2,
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_cpu_cores_request{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_windows_container_resource_cpu_cores_request{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_cpu_cores_limit{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_windows_container_resource_cpu_cores_limit{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
}
],
"title": "CPU Quota",
"transformations": [
{
"id": "joinByField",
"options": {
"byField": "container",
"mode": "outer"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Time 1": true,
"Time 2": true,
"Time 3": true,
"Time 4": true,
"Time 5": true
},
"indexByName": {
"Time 1": 0,
"Time 2": 1,
"Time 3": 2,
"Time 4": 3,
"Time 5": 4,
"Value #A": 6,
"Value #B": 7,
"Value #C": 8,
"Value #D": 9,
"Value #E": 10,
"container": 5
},
"renameByName": {
"Value #A": "CPU Usage",
"Value #B": "CPU Requests",
"Value #C": "CPU Requests %",
"Value #D": "CPU Limits",
"Value #E": "CPU Limits %",
"container": "Container"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "decbytes"
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 14
},
"id": 3,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"legendFormat": "__auto"
}
],
"title": "Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/%/"
},
"properties": [
{
"id": "unit",
"value": "percentunit"
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 21
},
"id": 4,
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_memory_request{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_windows_container_resource_memory_request{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_windows_container_resource_memory_limit{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(windows_container_private_working_set_usage{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_windows_container_resource_memory_limit{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true
}
],
"title": "Memory Quota",
"transformations": [
{
"id": "joinByField",
"options": {
"byField": "container",
"mode": "outer"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Time 1": true,
"Time 2": true,
"Time 3": true,
"Time 4": true,
"Time 5": true
},
"indexByName": {
"Time 1": 0,
"Time 2": 1,
"Time 3": 2,
"Time 4": 3,
"Time 5": 4,
"Value #A": 6,
"Value #B": 7,
"Value #C": 8,
"Value #D": 9,
"Value #E": 10,
"container": 5
},
"renameByName": {
"Value #A": "Memory Usage",
"Value #B": "Memory Requests",
"Value #C": "Memory Requests %",
"Value #D": "Memory Limits",
"Value #E": "Memory Limits %",
"container": "Container"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "bytes"
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 28
},
"id": 5,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sort_desc(sum by (container) (rate(windows_container_network_received_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval])))",
"legendFormat": "Received : {{ container }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sort_desc(sum by (container) (rate(windows_container_network_transmitted_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval])))",
"legendFormat": "Transmitted : {{ container }}"
}
],
"title": "Network I/O",
"type": "timeseries"
}
],
"refresh": "10s",
"schemaVersion": 39,
"tags": [
"kubernetes"
],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "default",
"value": "default"
},
"hide": 0,
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"regex": "",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"hide": 2,
"label": "cluster",
"name": "cluster",
"query": "label_values(up{job=\"kubernetes-windows-exporter\"}, cluster)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"hide": 0,
"label": "namespace",
"name": "namespace",
"query": "label_values(windows_pod_container_available{cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"hide": 0,
"label": "pod",
"name": "pod",
"query": "label_values(windows_pod_container_available{cluster=\"$cluster\",namespace=\"$namespace\"}, pod)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timezone": "utc",
"title": "Compute Resources / Pod(Windows)",
"uid": "40597a704a610e936dc6ed374a7ce023"
}

View File

@ -50,7 +50,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -109,7 +109,7 @@
"y": 7
},
"id": 2,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -240,7 +240,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -302,7 +302,7 @@
"y": 21
},
"id": 4,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -453,7 +453,7 @@
"y": 28
},
"id": 5,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -597,7 +597,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -648,7 +648,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -699,7 +699,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -750,7 +750,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -801,7 +801,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -852,7 +852,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -903,7 +903,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -954,7 +954,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {

View File

@ -102,7 +102,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -189,7 +189,7 @@
"y": 7
},
"id": 2,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -397,7 +397,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -487,7 +487,7 @@
"y": 21
},
"id": 4,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -663,7 +663,7 @@
"y": 28
},
"id": 5,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -807,7 +807,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -858,7 +858,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -909,7 +909,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -960,7 +960,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1011,7 +1011,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1062,7 +1062,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1113,7 +1113,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1164,7 +1164,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {

View File

@ -0,0 +1,404 @@
{
"editable": false,
"panels": [
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_cpu_utilisation:avg1m{cluster=\"$cluster\"} * node:windows_node_num_cpu:sum{cluster=\"$cluster\"} / scalar(sum(node:windows_node_num_cpu:sum{cluster=\"$cluster\"}))",
"legendFormat": "{{instance}}"
}
],
"title": "CPU Utilisation",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 7
},
"id": 2,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_memory_utilisation:ratio{cluster=\"$cluster\"}",
"legendFormat": "{{instance}}"
}
],
"title": "Memory Utilisation",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "short"
}
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 7
},
"id": 3,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_memory_swap_io_pages:irate{cluster=\"$cluster\"}",
"legendFormat": "{{instance}}"
}
],
"title": "Memory Saturation (Swap I/O Pages)",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 14
},
"id": 4,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_disk_utilisation:avg_irate{cluster=\"$cluster\"} / scalar(node:windows_node:sum{cluster=\"$cluster\"})",
"legendFormat": "{{instance}}"
}
],
"title": "Disk IO Utilisation",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "Bps"
}
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 21
},
"id": 5,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_net_utilisation:sum_irate{cluster=\"$cluster\"}",
"legendFormat": "{{instance}}"
}
],
"title": "Net Utilisation (Transmitted)",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "Bps"
}
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 21
},
"id": 6,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_net_saturation:sum_irate{cluster=\"$cluster\"}",
"legendFormat": "{{instance}}"
}
],
"title": "Net Utilisation (Dropped)",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 28
},
"id": 7,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (instance)(node:windows_node_filesystem_usage:{cluster=\"$cluster\"})",
"legendFormat": "{{instance}}"
}
],
"title": "Disk Capacity",
"type": "timeseries"
}
],
"refresh": "10s",
"schemaVersion": 39,
"tags": [
"kubernetes"
],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "default",
"value": "default"
},
"hide": 0,
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"regex": "",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"hide": 2,
"label": "cluster",
"name": "cluster",
"query": "label_values(up{job=\"kubernetes-windows-exporter\"}, cluster)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timezone": "utc",
"title": "USE Method / Cluster(Windows)",
"uid": "53a43377ec9aaf2ff64dfc7a1f539334"
}

View File

@ -0,0 +1,615 @@
{
"editable": false,
"panels": [
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_cpu_utilisation:avg1m{cluster=\"$cluster\", instance=\"$instance\"}",
"legendFormat": "Utilisation"
}
],
"title": "CPU Utilisation",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (core) (irate(windows_cpu_time_total{cluster=\"$cluster\", job=\"kubernetes-windows-exporter\", mode!=\"idle\", instance=\"$instance\"}[$__rate_interval]))",
"legendFormat": "{{core}}"
}
],
"title": "CPU Usage Per Core",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 7
},
"id": 3,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_memory_utilisation:{cluster=\"$cluster\", instance=\"$instance\"}",
"legendFormat": "Memory"
}
],
"title": "Memory Utilisation %",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "bytes"
}
},
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"y": 7
},
"id": 4,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "max(\n windows_os_visible_memory_bytes{cluster=\"$cluster\", job=\"kubernetes-windows-exporter\", instance=\"$instance\"}\n - windows_memory_available_bytes{cluster=\"$cluster\", job=\"kubernetes-windows-exporter\", instance=\"$instance\"}\n)\n",
"legendFormat": "memory used"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "max(node:windows_node_memory_totalCached_bytes:sum{cluster=\"$cluster\", instance=\"$instance\"})",
"legendFormat": "memory cached"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "max(windows_memory_available_bytes{cluster=\"$cluster\", job=\"kubernetes-windows-exporter\", instance=\"$instance\"})",
"legendFormat": "memory free"
}
],
"title": "Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "short"
}
},
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 7
},
"id": 5,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_memory_swap_io_pages:irate{cluster=\"$cluster\", instance=\"$instance\"}",
"legendFormat": "Swap IO"
}
],
"title": "Memory Saturation (Swap I/O) Pages",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 14
},
"id": 6,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_disk_utilisation:avg_irate{cluster=\"$cluster\", instance=\"$instance\"}",
"legendFormat": "Utilisation"
}
],
"title": "Disk IO Utilisation",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "bytes"
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/io time/"
},
"properties": [
{
"id": "unit",
"value": "ms"
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 14
},
"id": 7,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "max(rate(windows_logical_disk_read_bytes_total{cluster=\"$cluster\", job=\"kubernetes-windows-exporter\", instance=\"$instance\"}[$__rate_interval]))",
"legendFormat": "read"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "max(rate(windows_logical_disk_write_bytes_total{cluster=\"$cluster\", job=\"kubernetes-windows-exporter\", instance=\"$instance\"}[$__rate_interval]))",
"legendFormat": "written"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "max(rate(windows_logical_disk_read_seconds_total{cluster=\"$cluster\", job=\"kubernetes-windows-exporter\", instance=\"$instance\"}[$__rate_interval]) + rate(windows_logical_disk_write_seconds_total{cluster=\"$cluster\", job=\"kubernetes-windows-exporter\", instance=\"$instance\"}[$__rate_interval]))",
"legendFormat": "io time"
}
],
"title": "Disk IO",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 21
},
"id": 8,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_filesystem_usage:{cluster=\"$cluster\", instance=\"$instance\"}",
"legendFormat": "{{volume}}"
}
],
"title": "Disk Utilisation",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "Bps"
}
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 28
},
"id": 9,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_net_utilisation:sum_irate{cluster=\"$cluster\", instance=\"$instance\"}",
"legendFormat": "Utilisation"
}
],
"title": "Net Utilisation (Transmitted)",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"showPoints": "never",
"spanNulls": true
},
"unit": "Bps"
}
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 28
},
"id": 10,
"interval": "1m",
"options": {
"legend": {
"asTable": true,
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "node:windows_node_net_saturation:sum_irate{cluster=\"$cluster\", instance=\"$instance\"}",
"legendFormat": "Saturation"
}
],
"title": "Net Saturation (Dropped)",
"type": "timeseries"
}
],
"refresh": "10s",
"schemaVersion": 39,
"tags": [
"kubernetes"
],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "default",
"value": "default"
},
"hide": 0,
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"regex": "",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"hide": 2,
"label": "cluster",
"name": "cluster",
"query": "label_values(up{job=\"kubernetes-windows-exporter\"}, cluster)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"hide": 0,
"label": "instance",
"name": "instance",
"query": "label_values(windows_system_system_up_time{cluster=\"$cluster\"}, instance)",
"refresh": 2,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timezone": "utc",
"title": "USE Method / Node(Windows)",
"uid": "96e7484b0bb53b74fbc2bcb7723cd40b"
}


@ -35,7 +35,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -70,7 +70,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -105,7 +105,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -140,7 +140,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -175,7 +175,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -210,7 +210,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -261,7 +261,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -312,7 +312,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -363,7 +363,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -414,7 +414,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -473,7 +473,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -532,7 +532,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -583,7 +583,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -634,7 +634,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -685,7 +685,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -736,7 +736,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -787,7 +787,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -838,7 +838,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -889,7 +889,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -940,7 +940,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1015,15 +1015,15 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\"}[$__rate_interval])) by (instance, verb, url, le))",
"legendFormat": "{{instance}} {{verb}} {{url}}"
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\"}[$__rate_interval])) by (instance, verb, le))",
"legendFormat": "{{instance}} {{verb}}"
}
],
"title": "Request duration 99th quantile",
@ -1066,7 +1066,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1117,7 +1117,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -1168,7 +1168,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {


@ -53,7 +53,7 @@
"y": 0
},
"id": 1,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -106,7 +106,7 @@
"y": 0
},
"id": 2,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -177,7 +177,7 @@
"y": 9
},
"id": 3,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -316,7 +316,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -362,7 +362,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -408,7 +408,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -454,7 +454,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -500,7 +500,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -546,7 +546,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {


@ -39,7 +39,7 @@
"displayMode": "basic",
"showUnfilled": false
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -78,7 +78,7 @@
"displayMode": "basic",
"showUnfilled": false
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -149,7 +149,7 @@
"y": 9
},
"id": 3,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -335,7 +335,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -386,7 +386,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -437,7 +437,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -488,7 +488,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -539,7 +539,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -590,7 +590,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -641,7 +641,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -692,7 +692,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {


@ -50,7 +50,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -112,7 +112,7 @@
},
"id": 2,
"interval": "1m",
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -162,7 +162,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -224,7 +224,7 @@
},
"id": 4,
"interval": "1m",
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {


@ -53,7 +53,7 @@
"y": 0
},
"id": 1,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -106,7 +106,7 @@
"y": 0
},
"id": 2,
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -152,7 +152,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -198,7 +198,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -244,7 +244,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -290,7 +290,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -336,7 +336,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -382,7 +382,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {


@ -35,7 +35,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -86,7 +86,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -137,7 +137,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -188,7 +188,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -239,7 +239,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -290,7 +290,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -365,15 +365,15 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[$__rate_interval])) by (verb, url, le))",
"legendFormat": "{{verb}} {{url}}"
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[$__rate_interval])) by (verb, le))",
"legendFormat": "{{verb}}"
}
],
"title": "Post Request Latency 99th Quantile",
@ -416,15 +416,15 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, url, le))",
"legendFormat": "{{verb}} {{url}}"
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, le))",
"legendFormat": "{{verb}}"
}
],
"title": "Get Request Latency 99th Quantile",
@ -467,7 +467,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -518,7 +518,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -569,7 +569,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {


@ -35,7 +35,7 @@
"options": {
"colorMode": "none"
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -86,7 +86,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -161,7 +161,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -236,7 +236,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -311,15 +311,15 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[$__rate_interval])) by (verb, url, le))",
"legendFormat": "{{verb}} {{url}}"
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[$__rate_interval])) by (verb, le))",
"legendFormat": "{{verb}}"
}
],
"title": "Post Request Latency 99th Quantile",
@ -362,15 +362,15 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, url, le))",
"legendFormat": "{{verb}} {{url}}"
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, le))",
"legendFormat": "{{verb}}"
}
],
"title": "Get Request Latency 99th Quantile",
@ -413,7 +413,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -464,7 +464,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -515,7 +515,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {


@ -39,7 +39,7 @@
"displayMode": "basic",
"showUnfilled": false
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -78,7 +78,7 @@
"displayMode": "basic",
"showUnfilled": false
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -117,7 +117,7 @@
"displayMode": "basic",
"showUnfilled": false
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -156,7 +156,7 @@
"displayMode": "basic",
"showUnfilled": false
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -207,7 +207,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -258,7 +258,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -309,7 +309,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -360,7 +360,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -411,7 +411,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {
@ -462,7 +462,7 @@
"mode": "single"
}
},
"pluginVersion": "v11.1.0",
"pluginVersion": "v11.4.0",
"targets": [
{
"datasource": {


@ -18,8 +18,8 @@
"subdir": "contrib/mixin"
}
},
"version": "2b323071a8bd4f02ddaf63e0dfa1fd98c221dccb",
"sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws="
"version": "f30cbaac111aa01a310fe75360c759cdd4d9cd14",
"sum": "XmXkOCriQIZmXwlIIFhqlJMa0e6qGWdxZD+ZDYaN0Po="
},
{
"source": {
@ -41,16 +41,6 @@
"version": "a1d61cce1da59c71409b99b5c7568511fec661ea",
"sum": "342u++/7rViR/zj2jeJOjshzglkZ1SY+hFNuyCBFMdc="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet-lib.git",
"subdir": "grafonnet-7.0"
}
},
"version": "a1d61cce1da59c71409b99b5c7568511fec661ea",
"sum": "gCtR9s/4D5fxU9aKXg0Bru+/njZhA0YjLjPiASc61FM="
},
{
"source": {
"git": {
@ -58,8 +48,8 @@
"subdir": "gen/grafonnet-latest"
}
},
"version": "82a19822e54a0a12a51e24dbd48fcde717dc0864",
"sum": "64fMUPI3frXGj4X1FqFd1t7r04w3CUSmXaDcJ23EYbQ="
"version": "d20e609202733790caf5b554c9945d049f243ae3",
"sum": "V9vAj21qJOc2DlMPDgB1eEjSQU4A+sAA4AXuJ6bd4xc="
},
{
"source": {
@ -68,18 +58,18 @@
"subdir": "gen/grafonnet-v10.0.0"
}
},
"version": "82a19822e54a0a12a51e24dbd48fcde717dc0864",
"version": "d20e609202733790caf5b554c9945d049f243ae3",
"sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet.git",
"subdir": "gen/grafonnet-v11.1.0"
"subdir": "gen/grafonnet-v11.4.0"
}
},
"version": "82a19822e54a0a12a51e24dbd48fcde717dc0864",
"sum": "41w7p/rwrNsITqNHMXtGSJAfAyKmnflg6rFhKBduUxM="
"version": "d20e609202733790caf5b554c9945d049f243ae3",
"sum": "aVAX09paQYNOoCSKVpuk1exVIyBoMt/C50QJI+Q/3nA="
},
{
"source": {
@ -88,7 +78,7 @@
"subdir": "grafana-builder"
}
},
"version": "767befa8fb46a07be516dec2777d7d89909a529d",
"version": "d6c38bb26f576b128cadca4137d73a037afdd872",
"sum": "yxqWcq/N3E/a/XreeU6EuE6X7kYPnG0AspAQFKOjASo="
},
{
@ -108,8 +98,8 @@
"subdir": ""
}
},
"version": "63d430b69a95741061c2f7fc9d84b1a778511d9c",
"sum": "qiZi3axUSXCVzKUF83zSAxklwrnitMmrDK4XAfjPMdE="
"version": "1199b50e9d2ff53d4bb5fb2304ad1fb69d38e609",
"sum": "LfbgcJbilu4uBdKYZSvmkoOTPwEAzg10L3/VqKAIWtA="
},
{
"source": {
@ -118,8 +108,8 @@
"subdir": ""
}
},
"version": "a3fbf21977deb89b7d843eb8371170c011ea6835",
"sum": "57zW2IGJ9zbYd8BI0qe6JkoWTRSMNiBUWC6+YcnEsWo="
"version": "e27267571be06c2bdc3d2fd8dbd70161cd709cb4",
"sum": "je1RPCp2aFNefYs5Q57Q5wDm93p8pL4pdBtA5rC7jLA="
},
{
"source": {
@ -128,8 +118,8 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "32e7727ff4613b0f55dfc18aff15afb8c04d03c5",
"sum": "lO7jUSzAIy8Yk9pOWJIWgPRhubkWzVh56W6wtYfbVH4="
"version": "2a95d4649b2fea55799032fb9c0b571c4ba7f776",
"sum": "3bioG7CfTfY9zeu5xU4yon6Zt3kYvNkyl492nOhQxnM="
},
{
"source": {
@ -138,7 +128,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "32e7727ff4613b0f55dfc18aff15afb8c04d03c5",
"version": "2a95d4649b2fea55799032fb9c0b571c4ba7f776",
"sum": "qclI7LwucTjBef3PkGBkKxF0mfZPbHnn4rlNWKGtR4c="
},
{
@ -148,8 +138,8 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "7e5a571a3fb735c78e17c76a637eb7e8bb5dd086",
"sum": "uTw/Mj+X91S+oqUpAX81xcfWPDlox0tdSZY/YBw7nGE="
"version": "1eea946a1532f1e8cccfceea98d907bf3a10b1d9",
"sum": "17LhiwefVfoNDsF3DcFZw/UL4PMU7YpNNUaOdaYd1gE="
},
{
"source": {
@ -158,7 +148,7 @@
"subdir": "jsonnet/mixin"
}
},
"version": "a366602bacb2c8d773a9cee058b6971b8d2e3732",
"version": "465bcbaf2a727c942e7f923aacfb9dff9af8d4a1",
"sum": "gi+knjdxs2T715iIQIntrimbHRgHnpM8IFBJDD1gYfs=",
"name": "prometheus-operator-mixin"
},
@ -169,8 +159,8 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "a366602bacb2c8d773a9cee058b6971b8d2e3732",
"sum": "z0/lCiMusMHTqntsosMVGYkVcSZjCpyZBmUMVUsK5nA="
"version": "465bcbaf2a727c942e7f923aacfb9dff9af8d4a1",
"sum": "LctDdofQostvviE5y8vpRKWGGO1ZKO3dgJe7P9xifW0="
},
{
"source": {
@ -179,7 +169,7 @@
"subdir": "doc/alertmanager-mixin"
}
},
"version": "0f65e8fa5fc72d2678655105c0213b416ca6f34c",
"version": "b5d1a64ad5bb0ff879705714d1e40cea82efbd5c",
"sum": "Mf4h1BYLle2nrgjf/HXrBbl0Zk8N+xaoEM017o0BC+k=",
"name": "alertmanager"
},
@ -190,8 +180,8 @@
"subdir": "docs/node-mixin"
}
},
"version": "cf8c6891cc610e54f70383addd4bb6079f0add35",
"sum": "cQCW+1N0Xae5yXecCWDK2oAlN0luBS/5GrwBYSlaFms="
"version": "11365f97bef6cb0e6259d536a7e21c49e3f5c065",
"sum": "xYj6VYFT/eafsbleNlC+Z2VfLy1CndyYrJs9BcTmnX8="
},
{
"source": {
@ -200,8 +190,8 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "b407c2930da4f50c0d17fc39404c6302a9eb740b",
"sum": "OYT5u3S8DbamuJV/v3gbWSteOvFzMeNwMj+u4Apk7jM=",
"version": "509b978f0d675b4c9b3ccf8c0fc06961b0f03e8f",
"sum": "2c+wttfee9TwuQJZIkNV7Tekem74Qgc7iZ842P28rNw=",
"name": "prometheus"
},
{
@ -222,7 +212,7 @@
"subdir": "mixin"
}
},
"version": "7037331e6ea7dbe85a1b7af37bf8ea277a80663d",
"version": "346d18bb0f8011c63d7106de494cf3b9253161a1",
"sum": "ieCD4eMgGbOlrI8GmckGPHBGQDcLasE1rULYq56W/bs=",
"name": "thanos-mixin"
}


@ -6,5 +6,5 @@ dashboards:
url: https://grafana.com/api/dashboards/9578/revisions/4/download
tags: []
- name: Prometheus
url: https://grafana.com/api/dashboards/3662/revisions/2/download
url: https://grafana.com/api/dashboards/19105/revisions/7/download
tags: []


@ -7,7 +7,7 @@
"app.kubernetes.io/instance": "main",
"app.kubernetes.io/name": "alertmanager",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "0.27.0",
"app.kubernetes.io/version": "0.28.0",
"prometheus": "k8s",
"role": "alert-rules"
},


@ -20,9 +20,9 @@
"summary": "etcd cluster members are down."
},
"expr": "max without (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n",
"for": "10m",
"for": "20m",
"labels": {
"severity": "critical"
"severity": "warning"
}
},
{


@ -6,7 +6,7 @@
"app.kubernetes.io/component": "exporter",
"app.kubernetes.io/name": "kube-state-metrics",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "2.14.0",
"app.kubernetes.io/version": "2.15.0",
"prometheus": "k8s",
"role": "alert-rules"
},


@ -19,7 +19,7 @@
{
"alert": "KubePodCrashLooping",
"annotations": {
"description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: \"CrashLoopBackOff\").",
"description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: \"CrashLoopBackOff\") on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping",
"summary": "Pod is crash looping."
},
@ -32,7 +32,7 @@
{
"alert": "KubePodNotReady",
"annotations": {
"description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.",
"description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready",
"summary": "Pod has been in a non-ready state for more than 15 minutes."
},
@ -45,7 +45,7 @@
{
"alert": "KubeDeploymentGenerationMismatch",
"annotations": {
"description": "Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.",
"description": "Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch",
"summary": "Deployment generation mismatch due to possible roll-back"
},
@ -58,7 +58,7 @@
{
"alert": "KubeDeploymentReplicasMismatch",
"annotations": {
"description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.",
"description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch",
"summary": "Deployment has not matched the expected number of replicas."
},
@ -71,7 +71,7 @@
{
"alert": "KubeDeploymentRolloutStuck",
"annotations": {
"description": "Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.",
"description": "Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck",
"summary": "Deployment rollout is not progressing."
},
@ -84,7 +84,7 @@
{
"alert": "KubeStatefulSetReplicasMismatch",
"annotations": {
"description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.",
"description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch",
"summary": "StatefulSet has not matched the expected number of replicas."
},
@ -97,7 +97,7 @@
{
"alert": "KubeStatefulSetGenerationMismatch",
"annotations": {
"description": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.",
"description": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch",
"summary": "StatefulSet generation mismatch due to possible roll-back"
},
@ -110,7 +110,7 @@
{
"alert": "KubeStatefulSetUpdateNotRolledOut",
"annotations": {
"description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.",
"description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout",
"summary": "StatefulSet update has not been rolled out."
},
@ -123,7 +123,7 @@
{
"alert": "KubeDaemonSetRolloutStuck",
"annotations": {
"description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15m.",
"description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15m on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck",
"summary": "DaemonSet rollout is stuck."
},
@ -136,7 +136,7 @@
{
"alert": "KubeContainerWaiting",
"annotations": {
"description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: \"{{ $labels.reason }}\").",
"description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: \"{{ $labels.reason }}\") on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting",
"summary": "Pod container waiting longer than 1 hour"
},
@ -149,7 +149,7 @@
{
"alert": "KubeDaemonSetNotScheduled",
"annotations": {
"description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.",
"description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled",
"summary": "DaemonSet pods are not scheduled."
},
@ -162,7 +162,7 @@
{
"alert": "KubeDaemonSetMisScheduled",
"annotations": {
"description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.",
"description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled",
"summary": "DaemonSet pods are misscheduled."
},
@ -175,7 +175,7 @@
{
"alert": "KubeJobNotCompleted",
"annotations": {
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ \"43200\" | humanizeDuration }} to complete.",
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ \"43200\" | humanizeDuration }} to complete on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted",
"summary": "Job did not complete in time"
},
@ -187,7 +187,7 @@
{
"alert": "KubeJobFailed",
"annotations": {
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.",
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed",
"summary": "Job failed to complete."
},
@ -200,7 +200,7 @@
{
"alert": "KubeHpaReplicasMismatch",
"annotations": {
"description": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.",
"description": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch",
"summary": "HPA has not matched desired number of replicas."
},
@ -213,7 +213,7 @@
{
"alert": "KubeHpaMaxedOut",
"annotations": {
"description": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.",
"description": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout",
"summary": "HPA is running at max replicas"
},
@ -257,7 +257,7 @@
{
"alert": "KubeQuotaAlmostFull",
"annotations": {
"description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.",
"description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull",
"summary": "Namespace quota is going to be full."
},
@ -270,7 +270,7 @@
{
"alert": "KubeQuotaFullyUsed",
"annotations": {
"description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.",
"description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused",
"summary": "Namespace quota is fully used."
},
@ -283,7 +283,7 @@
{
"alert": "KubeQuotaExceeded",
"annotations": {
"description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.",
"description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded",
"summary": "Namespace quota has exceeded the limits."
},
@ -296,7 +296,7 @@
{
"alert": "CPUThrottlingHigh",
"annotations": {
"description": "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.",
"description": "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }} on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh",
"summary": "Processes experience elevated CPU throttling."
},
@ -396,7 +396,7 @@
{
"alert": "KubeVersionMismatch",
"annotations": {
"description": "There are {{ $value }} different semantic versions of Kubernetes components running.",
"description": "There are {{ $value }} different semantic versions of Kubernetes components running on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch",
"summary": "Different semantic versions of Kubernetes components running."
},
@ -409,7 +409,7 @@
{
"alert": "KubeClientErrors",
"annotations": {
"description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'",
"description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors",
"summary": "Kubernetes API server client is experiencing errors."
},
@ -427,7 +427,7 @@
{
"alert": "KubeAPIErrorBudgetBurn",
"annotations": {
"description": "The API server is burning too much error budget.",
"description": "The API server is burning too much error budget on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn",
"summary": "The API server is burning too much error budget."
},
@ -442,7 +442,7 @@
{
"alert": "KubeAPIErrorBudgetBurn",
"annotations": {
"description": "The API server is burning too much error budget.",
"description": "The API server is burning too much error budget on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn",
"summary": "The API server is burning too much error budget."
},
@ -457,7 +457,7 @@
{
"alert": "KubeAPIErrorBudgetBurn",
"annotations": {
"description": "The API server is burning too much error budget.",
"description": "The API server is burning too much error budget on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn",
"summary": "The API server is burning too much error budget."
},
@ -472,7 +472,7 @@
{
"alert": "KubeAPIErrorBudgetBurn",
"annotations": {
"description": "The API server is burning too much error budget.",
"description": "The API server is burning too much error budget on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn",
"summary": "The API server is burning too much error budget."
},
@ -518,11 +518,12 @@
{
"alert": "KubeAggregatedAPIErrors",
"annotations": {
"description": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.",
"description": "Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors",
"summary": "Kubernetes aggregated API has reported errors."
},
"expr": "sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job=\"apiserver\"}[10m])) > 4\n",
"expr": "sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job=\"apiserver\"}[1m])) > 0\n",
"for": "10m",
"labels": {
"severity": "warning"
}
@ -530,7 +531,7 @@
{
"alert": "KubeAggregatedAPIDown",
"annotations": {
"description": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.",
"description": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown",
"summary": "Kubernetes aggregated API is down."
},
@ -556,7 +557,7 @@
{
"alert": "KubeAPITerminatedRequests",
"annotations": {
"description": "The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.",
"description": "The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests",
"summary": "The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests."
},
@ -574,11 +575,11 @@
{
"alert": "KubeNodeNotReady",
"annotations": {
"description": "{{ $labels.node }} has been unready for more than 15 minutes.",
"description": "{{ $labels.node }} has been unready for more than 15 minutes on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready",
"summary": "Node is not ready."
},
"expr": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n",
"expr": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\nand on (cluster, node)\nkube_node_spec_unschedulable{job=\"kube-state-metrics\"} == 0\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -587,7 +588,7 @@
{
"alert": "KubeNodeUnreachable",
"annotations": {
"description": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.",
"description": "{{ $labels.node }} is unreachable and some workloads may be rescheduled on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable",
"summary": "Node is unreachable."
},
@ -600,11 +601,11 @@
{
"alert": "KubeletTooManyPods",
"annotations": {
"description": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
"description": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods",
"summary": "Kubelet is running at capacity."
},
"expr": "count by(cluster, node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(cluster, node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n",
"expr": "(\n max by (cluster, instance) (\n kubelet_running_pods{job=\"kubelet\", metrics_path=\"/metrics\"} > 1\n )\n * on (cluster, instance) group_left(node)\n max by (cluster, instance, node) (\n kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"}\n )\n)\n/ on (cluster, node) group_left()\nmax by (cluster, node) (\n kube_node_status_capacity{job=\"kube-state-metrics\", resource=\"pods\"} != 1\n) > 0.95\n",
"for": "15m",
"labels": {
"severity": "info"
@ -613,11 +614,11 @@
{
"alert": "KubeNodeReadinessFlapping",
"annotations": {
"description": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.",
"description": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping",
"summary": "Node readiness status is flapping."
},
"expr": "sum(changes(kube_node_status_condition{job=\"kube-state-metrics\",status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) > 2\n",
"expr": "sum(changes(kube_node_status_condition{job=\"kube-state-metrics\",status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) > 2\nand on (cluster, node)\nkube_node_spec_unschedulable{job=\"kube-state-metrics\"} == 0\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -626,7 +627,7 @@
{
"alert": "KubeletPlegDurationHigh",
"annotations": {
"description": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.",
"description": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }} on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh",
"summary": "Kubelet Pod Lifecycle Event Generator is taking too long to relist."
},
@ -639,7 +640,7 @@
{
"alert": "KubeletPodStartUpLatencyHigh",
"annotations": {
"description": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.",
"description": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }} on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh",
"summary": "Kubelet Pod startup latency is too high."
},
@ -652,7 +653,7 @@
{
"alert": "KubeletClientCertificateExpiration",
"annotations": {
"description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration",
"summary": "Kubelet client certificate is about to expire."
},
@ -664,7 +665,7 @@
{
"alert": "KubeletClientCertificateExpiration",
"annotations": {
"description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration",
"summary": "Kubelet client certificate is about to expire."
},
@ -676,7 +677,7 @@
{
"alert": "KubeletServerCertificateExpiration",
"annotations": {
"description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration",
"summary": "Kubelet server certificate is about to expire."
},
@ -688,7 +689,7 @@
{
"alert": "KubeletServerCertificateExpiration",
"annotations": {
"description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration",
"summary": "Kubelet server certificate is about to expire."
},
@ -700,7 +701,7 @@
{
"alert": "KubeletClientCertificateRenewalErrors",
"annotations": {
"description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes) on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors",
"summary": "Kubelet has failed to renew its client certificate."
},
@ -713,7 +714,7 @@
{
"alert": "KubeletServerCertificateRenewalErrors",
"annotations": {
"description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes) on cluster {{ $labels.cluster }}.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors",
"summary": "Kubelet has failed to renew its server certificate."
},
@ -809,25 +810,25 @@
"record": "cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h"
},
{
"expr": "sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le=\"+Inf\"} * 24 * 30)\n",
"expr": "sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le=\"+Inf\"})\n",
"record": "cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d"
},
{
"expr": "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"30\"})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n",
"expr": "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=~\"1(\\\\.0)?\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=~\"5(\\\\.0)?\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=~\"30(\\\\.0)?\"})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n",
"labels": {
"verb": "all"
},
"record": "apiserver_request:availability30d"
},
{
"expr": "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"30\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n",
"expr": "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=~\"5(\\\\.0)?\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=~\"30(\\\\.0)?\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:availability30d"
},
{
"expr": "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n",
"expr": "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=~\"1(\\\\.0)?\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n",
"labels": {
"verb": "write"
},
@ -869,98 +870,98 @@
"name": "kube-apiserver-burnrate.rules",
"rules": [
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate1d"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate1h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate2h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate30m"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate3d"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate5m"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate6h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate1d"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate1h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate2h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate30m"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate3d"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate5m"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n",
"labels": {
"verb": "write"
},
@ -1237,6 +1238,144 @@
"record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile"
}
]
},
{
"name": "windows.node.rules",
"rules": [
{
"expr": "count by (cluster) (\n windows_system_system_up_time{job=\"kubernetes-windows-exporter\"}\n)\n",
"record": "node:windows_node:sum"
},
{
"expr": "count by (cluster, instance) (sum by (cluster, instance, core) (\n windows_cpu_time_total{job=\"kubernetes-windows-exporter\"}\n))\n",
"record": "node:windows_node_num_cpu:sum"
},
{
"expr": "1 - avg by (cluster) (rate(windows_cpu_time_total{job=\"kubernetes-windows-exporter\",mode=\"idle\"}[1m]))\n",
"record": ":windows_node_cpu_utilisation:avg1m"
},
{
"expr": "1 - avg by (cluster, instance) (\n rate(windows_cpu_time_total{job=\"kubernetes-windows-exporter\",mode=\"idle\"}[1m])\n)\n",
"record": "node:windows_node_cpu_utilisation:avg1m"
},
{
"expr": "1 -\nsum by (cluster) (windows_memory_available_bytes{job=\"kubernetes-windows-exporter\"})\n/\nsum by (cluster) (windows_os_visible_memory_bytes{job=\"kubernetes-windows-exporter\"})\n",
"record": ":windows_node_memory_utilisation:"
},
{
"expr": "sum by (cluster) (windows_memory_available_bytes{job=\"kubernetes-windows-exporter\"} + windows_memory_cache_bytes{job=\"kubernetes-windows-exporter\"})\n",
"record": ":windows_node_memory_MemFreeCached_bytes:sum"
},
{
"expr": "(windows_memory_cache_bytes{job=\"kubernetes-windows-exporter\"} + windows_memory_modified_page_list_bytes{job=\"kubernetes-windows-exporter\"} + windows_memory_standby_cache_core_bytes{job=\"kubernetes-windows-exporter\"} + windows_memory_standby_cache_normal_priority_bytes{job=\"kubernetes-windows-exporter\"} + windows_memory_standby_cache_reserve_bytes{job=\"kubernetes-windows-exporter\"})\n",
"record": "node:windows_node_memory_totalCached_bytes:sum"
},
{
"expr": "sum by (cluster) (windows_os_visible_memory_bytes{job=\"kubernetes-windows-exporter\"})\n",
"record": ":windows_node_memory_MemTotal_bytes:sum"
},
{
"expr": "sum by (cluster, instance) (\n (windows_memory_available_bytes{job=\"kubernetes-windows-exporter\"})\n)\n",
"record": "node:windows_node_memory_bytes_available:sum"
},
{
"expr": "sum by (cluster, instance) (\n windows_os_visible_memory_bytes{job=\"kubernetes-windows-exporter\"}\n)\n",
"record": "node:windows_node_memory_bytes_total:sum"
},
{
"expr": "(node:windows_node_memory_bytes_total:sum - node:windows_node_memory_bytes_available:sum)\n/\nscalar(sum(node:windows_node_memory_bytes_total:sum))\n",
"record": "node:windows_node_memory_utilisation:ratio"
},
{
"expr": "1 - (node:windows_node_memory_bytes_available:sum / node:windows_node_memory_bytes_total:sum)\n",
"record": "node:windows_node_memory_utilisation:"
},
{
"expr": "irate(windows_memory_swap_page_operations_total{job=\"kubernetes-windows-exporter\"}[5m])\n",
"record": "node:windows_node_memory_swap_io_pages:irate"
},
{
"expr": "avg by (cluster) (irate(windows_logical_disk_read_seconds_total{job=\"kubernetes-windows-exporter\"}[1m]) +\n irate(windows_logical_disk_write_seconds_total{job=\"kubernetes-windows-exporter\"}[1m])\n )\n",
"record": ":windows_node_disk_utilisation:avg_irate"
},
{
"expr": "avg by (cluster, instance) (\n (irate(windows_logical_disk_read_seconds_total{job=\"kubernetes-windows-exporter\"}[1m]) +\n irate(windows_logical_disk_write_seconds_total{job=\"kubernetes-windows-exporter\"}[1m]))\n)\n",
"record": "node:windows_node_disk_utilisation:avg_irate"
},
{
"expr": "max by (cluster,instance,volume)(\n (windows_logical_disk_size_bytes{job=\"kubernetes-windows-exporter\"}\n- windows_logical_disk_free_bytes{job=\"kubernetes-windows-exporter\"})\n/ windows_logical_disk_size_bytes{job=\"kubernetes-windows-exporter\"}\n)\n",
"record": "node:windows_node_filesystem_usage:"
},
{
"expr": "max by (cluster, instance, volume) (windows_logical_disk_free_bytes{job=\"kubernetes-windows-exporter\"} / windows_logical_disk_size_bytes{job=\"kubernetes-windows-exporter\"})\n",
"record": "node:windows_node_filesystem_avail:"
},
{
"expr": "sum by (cluster) (irate(windows_net_bytes_total{job=\"kubernetes-windows-exporter\"}[1m]))\n",
"record": ":windows_node_net_utilisation:sum_irate"
},
{
"expr": "sum by (cluster, instance) (\n (irate(windows_net_bytes_total{job=\"kubernetes-windows-exporter\"}[1m]))\n)\n",
"record": "node:windows_node_net_utilisation:sum_irate"
},
{
"expr": "sum by (cluster) (irate(windows_net_packets_received_discarded_total{job=\"kubernetes-windows-exporter\"}[1m])) +\nsum by (cluster) (irate(windows_net_packets_outbound_discarded_total{job=\"kubernetes-windows-exporter\"}[1m]))\n",
"record": ":windows_node_net_saturation:sum_irate"
},
{
"expr": "sum by (cluster, instance) (\n (irate(windows_net_packets_received_discarded_total{job=\"kubernetes-windows-exporter\"}[1m]) +\n irate(windows_net_packets_outbound_discarded_total{job=\"kubernetes-windows-exporter\"}[1m]))\n)\n",
"record": "node:windows_node_net_saturation:sum_irate"
}
]
},
{
"name": "windows.pod.rules",
"rules": [
{
"expr": "windows_container_available{job=\"kubernetes-windows-exporter\", container_id != \"\"} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job=\"kube-state-metrics\", container_id != \"\"}) by(container, container_id, pod, namespace, cluster)\n",
"record": "windows_pod_container_available"
},
{
"expr": "windows_container_cpu_usage_seconds_total{job=\"kubernetes-windows-exporter\", container_id != \"\"} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job=\"kube-state-metrics\", container_id != \"\"}) by(container, container_id, pod, namespace, cluster)\n",
"record": "windows_container_total_runtime"
},
{
"expr": "windows_container_memory_usage_commit_bytes{job=\"kubernetes-windows-exporter\", container_id != \"\"} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job=\"kube-state-metrics\", container_id != \"\"}) by(container, container_id, pod, namespace, cluster)\n",
"record": "windows_container_memory_usage"
},
{
"expr": "windows_container_memory_usage_private_working_set_bytes{job=\"kubernetes-windows-exporter\", container_id != \"\"} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job=\"kube-state-metrics\", container_id != \"\"}) by(container, container_id, pod, namespace, cluster)\n",
"record": "windows_container_private_working_set_usage"
},
{
"expr": "windows_container_network_receive_bytes_total{job=\"kubernetes-windows-exporter\", container_id != \"\"} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job=\"kube-state-metrics\", container_id != \"\"}) by(container, container_id, pod, namespace, cluster)\n",
"record": "windows_container_network_received_bytes_total"
},
{
"expr": "windows_container_network_transmit_bytes_total{job=\"kubernetes-windows-exporter\", container_id != \"\"} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job=\"kube-state-metrics\", container_id != \"\"}) by(container, container_id, pod, namespace, cluster)\n",
"record": "windows_container_network_transmitted_bytes_total"
},
{
"expr": "max by (cluster, namespace, pod, container) (\n kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"}\n) * on(container,pod,namespace,cluster) (windows_pod_container_available)\n",
"record": "kube_pod_windows_container_resource_memory_request"
},
{
"expr": "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on(container,pod,namespace,cluster) (windows_pod_container_available)\n",
"record": "kube_pod_windows_container_resource_memory_limit"
},
{
"expr": "max by (cluster, namespace, pod, container) (\n kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"}\n) * on(container,pod,namespace,cluster) (windows_pod_container_available)\n",
"record": "kube_pod_windows_container_resource_cpu_cores_request"
},
{
"expr": "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on(container,pod,namespace,cluster) (windows_pod_container_available)\n",
"record": "kube_pod_windows_container_resource_cpu_cores_limit"
},
{
"expr": "sum by (cluster, namespace, pod, container) (\n rate(windows_container_total_runtime{}[5m])\n)\n",
"record": "namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate"
}
]
}
]
}
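
Note on the SLI rules above: the le=~"1(\\.0)?" selectors are regular-expression matchers rather than exact matches. A minimal PromQL sketch of the two bucket-label shapes such a matcher accepts (the metric name is taken from the rules above; that newer Prometheus releases expose the boundary float-formatted as "1.0" is an assumption here, not stated in this chart):

    apiserver_request_sli_duration_seconds_bucket{le="1"}    # integer-formatted bucket boundary
    apiserver_request_sli_duration_seconds_bucket{le="1.0"}  # float-formatted bucket boundary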

View File

@ -151,7 +151,7 @@
{
"alert": "NodeHighNumberConntrackEntriesUsed",
"annotations": {
"description": "{{ $value | humanizePercentage }} of conntrack entries are used.",
"description": "{{ $labels.instance }} {{ $value | humanizePercentage }} of conntrack entries are used.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused",
"summary": "Number of conntrack are getting close to the limit."
},
@ -256,7 +256,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage",
"summary": "High CPU usage."
},
"expr": "sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\"}[2m]))) * 100 > 90\n",
"expr": "sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode!~\"idle|iowait\"}[2m]))) * 100 > 90\n",
"for": "15m",
"labels": {
"severity": "info"

View File

@ -6,7 +6,7 @@
"app.kubernetes.io/component": "controller",
"app.kubernetes.io/name": "prometheus-operator",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "0.78.2",
"app.kubernetes.io/version": "0.80.0",
"prometheus": "k8s",
"role": "alert-rules"
},

View File

@ -7,7 +7,7 @@
"app.kubernetes.io/instance": "k8s",
"app.kubernetes.io/name": "prometheus",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "3.0.1",
"app.kubernetes.io/version": "3.1.0",
"prometheus": "k8s",
"role": "alert-rules"
},

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -14,9 +14,9 @@ spec:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
summary: etcd cluster members are down.
expr: "max without (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n"
for: 10m
for: 20m
labels:
severity: critical
severity: warning
- alert: etcdInsufficientMembers
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'

View File

@ -11,7 +11,7 @@ spec:
rules:
- alert: KubePodCrashLooping
annotations:
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").'
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff") on cluster {{`{{`}} $labels.cluster {{`}}`}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
summary: Pod is crash looping.
expr: 'max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1
@ -22,7 +22,7 @@ spec:
severity: warning
- alert: KubePodNotReady
annotations:
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: "sum by (namespace, pod, cluster) (\n max by(namespace, pod, cluster) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown|Failed\"}\n ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (\n 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n"
@ -31,7 +31,7 @@ spec:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back
expr: "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n"
@ -40,7 +40,7 @@ spec:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: "(\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n >\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n) and (\n changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n"
@ -49,7 +49,7 @@ spec:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
description: Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes.
description: Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
summary: Deployment rollout is not progressing.
expr: 'kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics"}
@ -62,7 +62,7 @@ spec:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
summary: StatefulSet has not matched the expected number of replicas.
expr: "(\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n"
@ -71,7 +71,7 @@ spec:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n"
@ -80,7 +80,7 @@ spec:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: "(\n max by(namespace, statefulset, job, cluster) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
@ -89,7 +89,7 @@ spec:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15m.
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15m on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
@ -98,7 +98,7 @@ spec:
severity: warning
- alert: KubeContainerWaiting
annotations:
description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. (reason: "{{`{{`}} $labels.reason {{`}}`}}").'
description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. (reason: "{{`{{`}} $labels.reason {{`}}`}}") on cluster {{`{{`}} $labels.cluster {{`}}`}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: 'kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics"} > 0
@ -109,7 +109,7 @@ spec:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled on cluster {{`{{`}} $labels.cluster {{`}}`}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0\n"
@ -118,7 +118,7 @@ spec:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run on cluster {{`{{`}} $labels.cluster {{`}}`}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@ -129,7 +129,7 @@ spec:
severity: warning
- alert: KubeJobNotCompleted
annotations:
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: "time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"}\n and\nkube_job_status_active{job=\"kube-state-metrics\"} > 0) > 43200\n"
@ -137,7 +137,7 @@ spec:
severity: warning
- alert: KubeJobFailed
annotations:
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
summary: Job failed to complete.
expr: 'kube_job_failed{job="kube-state-metrics"} > 0
@ -148,7 +148,7 @@ spec:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
summary: HPA has not matched desired number of replicas.
expr: "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n >\nkube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n <\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}[15m]) == 0\n"
@ -157,7 +157,7 @@ spec:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
summary: HPA is running at max replicas
expr: "kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}\n"
@ -186,7 +186,7 @@ spec:
severity: warning
- alert: KubeQuotaAlmostFull
annotations:
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
summary: Namespace quota is going to be full.
expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.9 < 1\n"
@ -195,7 +195,7 @@ spec:
severity: info
- alert: KubeQuotaFullyUsed
annotations:
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
summary: Namespace quota is fully used.
expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n == 1\n"
@ -204,7 +204,7 @@ spec:
severity: info
- alert: KubeQuotaExceeded
annotations:
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
summary: Namespace quota has exceeded the limits.
expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 1\n"
@ -213,7 +213,7 @@ spec:
severity: warning
- alert: CPUThrottlingHigh
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}} on cluster {{`{{`}} $labels.cluster {{`}}`}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)\n /\nsum(increase(container_cpu_cfs_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)\n > ( 25 / 100 )\n"
@ -281,7 +281,7 @@ spec:
rules:
- alert: KubeVersionMismatch
annotations:
description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: 'count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
@ -292,7 +292,7 @@ spec:
severity: warning
- alert: KubeClientErrors
annotations:
description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.'
description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: "(sum(rate(rest_client_requests_total{job=\"apiserver\",code=~\"5..\"}[5m])) by (cluster, instance, job, namespace)\n /\nsum(rate(rest_client_requests_total{job=\"apiserver\"}[5m])) by (cluster, instance, job, namespace))\n> 0.01\n"
@ -303,7 +303,7 @@ spec:
rules:
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
description: The API server is burning too much error budget on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: 'sum by(cluster) (apiserver_request:burnrate1h) > (14.40 * 0.01000)
@ -320,7 +320,7 @@ spec:
short: 5m
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
description: The API server is burning too much error budget on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: 'sum by(cluster) (apiserver_request:burnrate6h) > (6.00 * 0.01000)
@ -337,7 +337,7 @@ spec:
short: 30m
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
description: The API server is burning too much error budget on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: 'sum by(cluster) (apiserver_request:burnrate1d) > (3.00 * 0.01000)
@ -354,7 +354,7 @@ spec:
short: 2h
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
description: The API server is burning too much error budget on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: 'sum by(cluster) (apiserver_request:burnrate3d) > (1.00 * 0.01000)
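The multipliers in the KubeAPIErrorBudgetBurn expressions above follow the usual multi-window burn-rate scheme: 0.01000 is the 1% error budget implied by a 99% availability target over the 30-day window used by the availability30d rules further down, and the leading factor scales it per window. A quick worked example for the 1h alert, under those assumptions:
    burn rate 14.4  →  14.4 × 1% ≈ 14.4% of requests failing or too slow over the last hour
    at that rate the 30-day budget lasts 720 h / 14.4 = 50 h, so one hour consumes about 2% of it
Each alert is also paired with a shorter window (the short: labels above), the standard way to let these alerts clear quickly once the burn stops.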
@ -403,17 +403,18 @@ spec:
severity: critical
- alert: KubeAggregatedAPIErrors
annotations:
description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.
description: Kubernetes aggregated API {{`{{`}} $labels.instance {{`}}`}}/{{`{{`}} $labels.name {{`}}`}} has reported {{`{{`}} $labels.reason {{`}}`}} errors on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
expr: 'sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4
expr: 'sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0
'
for: 10m
labels:
severity: warning
- alert: KubeAggregatedAPIDown
annotations:
description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.
description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
expr: '(1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
@ -435,7 +436,7 @@ spec:
severity: critical
- alert: KubeAPITerminatedRequests
annotations:
description: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
description: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
summary: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
expr: 'sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
@ -448,18 +449,22 @@ spec:
rules:
- alert: KubeNodeNotReady
annotations:
description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes on cluster {{`{{`}} $labels.cluster {{`}}`}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
summary: Node is not ready.
expr: 'kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
and on (cluster, node)
kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
'
for: 15m
labels:
severity: warning
- alert: KubeNodeUnreachable
annotations:
description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled on cluster {{`{{`}} $labels.cluster {{`}}`}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
summary: Node is unreachable.
expr: '(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
@ -470,27 +475,31 @@ spec:
severity: warning
- alert: KubeletTooManyPods
annotations:
description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
summary: Kubelet is running at capacity.
expr: "count by(cluster, node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(cluster, node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n"
expr: "(\n max by (cluster, instance) (\n kubelet_running_pods{job=\"kubelet\", metrics_path=\"/metrics\"} > 1\n )\n * on (cluster, instance) group_left(node)\n max by (cluster, instance, node) (\n kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"}\n )\n)\n/ on (cluster, node) group_left()\nmax by (cluster, node) (\n kube_node_status_capacity{job=\"kube-state-metrics\", resource=\"pods\"} != 1\n) > 0.95\n"
for: 15m
labels:
severity: info
- alert: KubeNodeReadinessFlapping
annotations:
description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: 'sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2
and on (cluster, node)
kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
'
for: 15m
labels:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}} on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
expr: 'node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@ -501,7 +510,7 @@ spec:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}} on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: 'histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
@ -512,7 +521,7 @@ spec:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}} on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: 'kubelet_certificate_manager_client_ttl_seconds < 604800
@ -522,7 +531,7 @@ spec:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}} on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: 'kubelet_certificate_manager_client_ttl_seconds < 86400
@ -532,7 +541,7 @@ spec:
severity: critical
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}} on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: 'kubelet_certificate_manager_server_ttl_seconds < 604800
@ -542,7 +551,7 @@ spec:
severity: warning
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}} on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: 'kubelet_certificate_manager_server_ttl_seconds < 86400
@ -552,7 +561,7 @@ spec:
severity: critical
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes) on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
summary: Kubelet has failed to renew its client certificate.
expr: 'increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
@ -563,7 +572,7 @@ spec:
severity: warning
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes) on cluster {{`{{`}} $labels.cluster {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
expr: 'increase(kubelet_server_expiration_renew_errors[5m]) > 0
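For reference, the thresholds in the Kubelet certificate-expiration alerts above are plain second counts:
    604800 s = 7 × 24 × 3600 s = 7 days  (severity: warning)
     86400 s = 24 × 3600 s     = 1 day   (severity: critical)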
@ -640,20 +649,20 @@ spec:
'
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
- expr: 'sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le="+Inf"} * 24 * 30)
- expr: 'sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le="+Inf"})
'
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
- expr: "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"\
cluster\",le=\"30\"})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n"
- expr: "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=~\"1(\\\\.0)?\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=~\"5(\\\\.0)?\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\
LIST|GET\",scope=\"cluster\",le=~\"30(\\\\.0)?\"})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n"
labels:
verb: all
record: apiserver_request:availability30d
- expr: "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"30\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n"
- expr: "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=~\"5(\\\\.0)?\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=~\"30(\\\\.0)?\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n"
labels:
verb: read
record: apiserver_request:availability30d
- expr: "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n"
- expr: "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=~\"1(\\\\.0)?\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n"
labels:
verb: write
record: apiserver_request:availability30d
@ -687,66 +696,66 @@ spec:
record: code_verb:apiserver_request_total:increase1h
- name: kube-apiserver-burnrate.rules
rules:
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n"
labels:
verb: read
record: apiserver_request:burnrate1d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n"
labels:
verb: read
record: apiserver_request:burnrate1h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n"
labels:
verb: read
record: apiserver_request:burnrate2h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"\
,code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n"
labels:
verb: read
record: apiserver_request:burnrate30m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n"
labels:
verb: read
record: apiserver_request:burnrate3d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n"
labels:
verb: read
record: apiserver_request:burnrate5m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=~\"5(\\\\.0)?\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=~\"30(\\\\.0)?\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n"
labels:
verb: read
record: apiserver_request:burnrate6h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n"
labels:
verb: write
record: apiserver_request:burnrate1d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n"
labels:
verb: write
record: apiserver_request:burnrate1h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n"
labels:
verb: write
record: apiserver_request:burnrate2h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n"
labels:
verb: write
record: apiserver_request:burnrate30m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n"
labels:
verb: write
record: apiserver_request:burnrate3d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n"
labels:
verb: write
record: apiserver_request:burnrate5m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=~\"1(\\\\.0)?\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n"
labels:
verb: write
record: apiserver_request:burnrate6h
@ -916,4 +925,122 @@ spec:
labels:
quantile: '0.5'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- name: windows.node.rules
rules:
- expr: "count by (cluster) (\n windows_system_system_up_time{job=\"kubernetes-windows-exporter\"}\n)\n"
record: node:windows_node:sum
- expr: "count by (cluster, instance) (sum by (cluster, instance, core) (\n windows_cpu_time_total{job=\"kubernetes-windows-exporter\"}\n))\n"
record: node:windows_node_num_cpu:sum
- expr: '1 - avg by (cluster) (rate(windows_cpu_time_total{job="kubernetes-windows-exporter",mode="idle"}[1m]))
'
record: :windows_node_cpu_utilisation:avg1m
- expr: "1 - avg by (cluster, instance) (\n rate(windows_cpu_time_total{job=\"kubernetes-windows-exporter\",mode=\"idle\"}[1m])\n)\n"
record: node:windows_node_cpu_utilisation:avg1m
- expr: '1 -
sum by (cluster) (windows_memory_available_bytes{job="kubernetes-windows-exporter"})
/
sum by (cluster) (windows_os_visible_memory_bytes{job="kubernetes-windows-exporter"})
'
record: ':windows_node_memory_utilisation:'
- expr: 'sum by (cluster) (windows_memory_available_bytes{job="kubernetes-windows-exporter"} + windows_memory_cache_bytes{job="kubernetes-windows-exporter"})
'
record: :windows_node_memory_MemFreeCached_bytes:sum
- expr: '(windows_memory_cache_bytes{job="kubernetes-windows-exporter"} + windows_memory_modified_page_list_bytes{job="kubernetes-windows-exporter"} + windows_memory_standby_cache_core_bytes{job="kubernetes-windows-exporter"} + windows_memory_standby_cache_normal_priority_bytes{job="kubernetes-windows-exporter"} + windows_memory_standby_cache_reserve_bytes{job="kubernetes-windows-exporter"})
'
record: node:windows_node_memory_totalCached_bytes:sum
- expr: 'sum by (cluster) (windows_os_visible_memory_bytes{job="kubernetes-windows-exporter"})
'
record: :windows_node_memory_MemTotal_bytes:sum
- expr: "sum by (cluster, instance) (\n (windows_memory_available_bytes{job=\"kubernetes-windows-exporter\"})\n)\n"
record: node:windows_node_memory_bytes_available:sum
- expr: "sum by (cluster, instance) (\n windows_os_visible_memory_bytes{job=\"kubernetes-windows-exporter\"}\n)\n"
record: node:windows_node_memory_bytes_total:sum
- expr: '(node:windows_node_memory_bytes_total:sum - node:windows_node_memory_bytes_available:sum)
/
scalar(sum(node:windows_node_memory_bytes_total:sum))
'
record: node:windows_node_memory_utilisation:ratio
- expr: '1 - (node:windows_node_memory_bytes_available:sum / node:windows_node_memory_bytes_total:sum)
'
record: 'node:windows_node_memory_utilisation:'
- expr: 'irate(windows_memory_swap_page_operations_total{job="kubernetes-windows-exporter"}[5m])
'
record: node:windows_node_memory_swap_io_pages:irate
- expr: "avg by (cluster) (irate(windows_logical_disk_read_seconds_total{job=\"kubernetes-windows-exporter\"}[1m]) +\n irate(windows_logical_disk_write_seconds_total{job=\"kubernetes-windows-exporter\"}[1m])\n )\n"
record: :windows_node_disk_utilisation:avg_irate
- expr: "avg by (cluster, instance) (\n (irate(windows_logical_disk_read_seconds_total{job=\"kubernetes-windows-exporter\"}[1m]) +\n irate(windows_logical_disk_write_seconds_total{job=\"kubernetes-windows-exporter\"}[1m]))\n)\n"
record: node:windows_node_disk_utilisation:avg_irate
- expr: "max by (cluster,instance,volume)(\n (windows_logical_disk_size_bytes{job=\"kubernetes-windows-exporter\"}\n- windows_logical_disk_free_bytes{job=\"kubernetes-windows-exporter\"})\n/ windows_logical_disk_size_bytes{job=\"kubernetes-windows-exporter\"}\n)\n"
record: 'node:windows_node_filesystem_usage:'
- expr: 'max by (cluster, instance, volume) (windows_logical_disk_free_bytes{job="kubernetes-windows-exporter"} / windows_logical_disk_size_bytes{job="kubernetes-windows-exporter"})
'
record: 'node:windows_node_filesystem_avail:'
- expr: 'sum by (cluster) (irate(windows_net_bytes_total{job="kubernetes-windows-exporter"}[1m]))
'
record: :windows_node_net_utilisation:sum_irate
- expr: "sum by (cluster, instance) (\n (irate(windows_net_bytes_total{job=\"kubernetes-windows-exporter\"}[1m]))\n)\n"
record: node:windows_node_net_utilisation:sum_irate
- expr: 'sum by (cluster) (irate(windows_net_packets_received_discarded_total{job="kubernetes-windows-exporter"}[1m])) +
sum by (cluster) (irate(windows_net_packets_outbound_discarded_total{job="kubernetes-windows-exporter"}[1m]))
'
record: :windows_node_net_saturation:sum_irate
- expr: "sum by (cluster, instance) (\n (irate(windows_net_packets_received_discarded_total{job=\"kubernetes-windows-exporter\"}[1m]) +\n irate(windows_net_packets_outbound_discarded_total{job=\"kubernetes-windows-exporter\"}[1m]))\n)\n"
record: node:windows_node_net_saturation:sum_irate
- name: windows.pod.rules
rules:
- expr: 'windows_container_available{job="kubernetes-windows-exporter", container_id != ""} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by(container, container_id, pod, namespace, cluster)
'
record: windows_pod_container_available
- expr: 'windows_container_cpu_usage_seconds_total{job="kubernetes-windows-exporter", container_id != ""} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by(container, container_id, pod, namespace, cluster)
'
record: windows_container_total_runtime
- expr: 'windows_container_memory_usage_commit_bytes{job="kubernetes-windows-exporter", container_id != ""} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by(container, container_id, pod, namespace, cluster)
'
record: windows_container_memory_usage
- expr: 'windows_container_memory_usage_private_working_set_bytes{job="kubernetes-windows-exporter", container_id != ""} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by(container, container_id, pod, namespace, cluster)
'
record: windows_container_private_working_set_usage
- expr: 'windows_container_network_receive_bytes_total{job="kubernetes-windows-exporter", container_id != ""} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by(container, container_id, pod, namespace, cluster)
'
record: windows_container_network_received_bytes_total
- expr: 'windows_container_network_transmit_bytes_total{job="kubernetes-windows-exporter", container_id != ""} * on(container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by(container, container_id, pod, namespace, cluster)
'
record: windows_container_network_transmitted_bytes_total
- expr: "max by (cluster, namespace, pod, container) (\n kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"}\n) * on(container,pod,namespace,cluster) (windows_pod_container_available)\n"
record: kube_pod_windows_container_resource_memory_request
- expr: 'kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on(container,pod,namespace,cluster) (windows_pod_container_available)
'
record: kube_pod_windows_container_resource_memory_limit
- expr: "max by (cluster, namespace, pod, container) (\n kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"}\n) * on(container,pod,namespace,cluster) (windows_pod_container_available)\n"
record: kube_pod_windows_container_resource_cpu_cores_request
- expr: 'kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on(container,pod,namespace,cluster) (windows_pod_container_available)
'
record: kube_pod_windows_container_resource_cpu_cores_limit
- expr: "sum by (cluster, namespace, pod, container) (\n rate(windows_container_total_runtime{}[5m])\n)\n"
record: namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate
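The windows.node.rules and windows.pod.rules groups above are recording rules only. As an illustration of how such a recorded series could be alerted on separately, a minimal sketch of a standalone PrometheusRule — the object name, threshold and severity are assumptions; only the node:windows_node_filesystem_usage: series comes from the rules above:

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: windows-node-disk-usage        # illustrative, not shipped by this chart
  namespace: monitoring
spec:
  groups:
    - name: windows.node.custom
      rules:
        - alert: WindowsNodeDiskSpaceLow
          # series pre-computed by windows.node.rules above (used/size per volume)
          expr: 'node:windows_node_filesystem_usage: > 0.9'
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: Windows node volume is more than 90% full.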

View File

@ -105,7 +105,7 @@ spec:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
description: '{{`{{`}} $labels.instance {{`}}`}} {{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: '(node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
@ -193,7 +193,7 @@ spec:
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
summary: High CPU usage.
expr: 'sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90
expr: 'sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!~"idle|iowait"}[2m]))) * 100 > 90
'
for: 15m

View File

@ -101,7 +101,7 @@ operators:
metrics:
enabled: false
namespace: monitoring
targetRevision: 0.10.2
targetRevision: 0.11.0
istio:
grafana: {}
prometheus: {}
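The metrics module defaults to enabled: false above, so a cluster opts in through its own values. A minimal sketch of such an override, reusing only the keys visible in the snippet (their exact nesting in the full values file is assumed):

metrics:
  enabled: true                  # switch the Prometheus/Grafana/Alertmanager stack on for this cluster
  namespace: monitoring
  targetRevision: 0.11.0         # optional pin; 0.11.0 is the revision set in this commit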