fix: metrics version bump, fix for inter Grafana links

This commit is contained in:
Stefan Reimer 2021-09-27 17:34:47 +02:00
parent cb008678c4
commit 27f8954a88
56 changed files with 247 additions and 40913 deletions

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero-metrics
description: KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
type: application
version: 0.4.7
version: 0.4.8
@ -17,7 +17,7 @@ dependencies:
version: ">= 0.1.4"
- name: kube-prometheus-stack
version: 18.0.5
version: 18.1.0
# Switch back to upstream once all alerts are fixed, e.g. etcd gRPC
# repository:
- name: prometheus-adapter

View File

@ -19,7 +19,7 @@ dependencies:
- condition: grafana.enabled
name: grafana
version: 6.15.*
version: 6.16.*
description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide easy to
operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus
@ -47,4 +47,4 @@ sources:
type: application
version: 18.0.5
version: 18.1.0

View File

@ -1,5 +1,5 @@
apiVersion: v2
appVersion: 8.1.0
appVersion: 8.1.5
description: The leading tool for querying and visualizing time series and metrics.
@ -19,4 +19,4 @@ name: grafana
type: application
version: 6.15.0
version: 6.16.9

View File

@ -90,7 +90,7 @@ This version requires Helm >= 3.1.0.
| `tolerations` | Toleration labels for pod assignment | `[]` |
| `affinity` | Affinity settings for pod assignment | `{}` |
| `extraInitContainers` | Init containers to add to the grafana pod | `{}` |
| `extraContainers` | Sidecar containers to add to the grafana pod | `{}` |
| `extraContainers` | Sidecar containers to add to the grafana pod | `""` |
| `extraContainerVolumes` | Volumes that can be mounted in sidecar containers | `[]` |
| `extraLabels` | Custom labels for all manifests | `{}` |
| `schedulerName` | Name of the k8s scheduler (other than default) | `nil` |
@ -137,7 +137,7 @@ This version requires Helm >= 3.1.0.
| `podLabels` | Pod labels | `{}` |
| `podPortName` | Name of the grafana port on the pod | `grafana` |
| `sidecar.image.repository` | Sidecar image repository | `` |
| `sidecar.image.tag` | Sidecar image tag | `1.12.2` |
| `sidecar.image.tag` | Sidecar image tag | `1.12.3` |
| `sidecar.image.sha` | Sidecar image sha (optional) | `""` |
| `sidecar.imagePullPolicy` | Sidecar image pull policy | `IfNotPresent` |
| `sidecar.resources` | Sidecar resources | `{}` |
@ -490,10 +490,10 @@ Include in the `extraSecretMounts` configuration flag:
- extraSecretMounts:
- name: auth-generic-oauth-secret-mount
secretName: auth-generic-oauth-secret
defaultMode: 0440
mountPath: /etc/secrets/auth_generic_oauth
readOnly: true
secretName: auth-generic-oauth-secret
defaultMode: 0440
mountPath: /etc/secrets/auth_generic_oauth
readOnly: true
### extraSecretMounts using a Container Storage Interface (CSI) provider

View File

@ -265,19 +265,25 @@ containers:
{{- end }}
{{- end }}
{{- if .Values.datasources }}
{{- range (keys .Values.datasources | sortAlpha) }}
- name: config
mountPath: "/etc/grafana/provisioning/datasources/datasources.yaml"
subPath: datasources.yaml
mountPath: "/etc/grafana/provisioning/datasources/{{ . }}"
subPath: {{ . | quote }}
{{- end }}
{{- end }}
{{- if .Values.notifiers }}
{{- range (keys .Values.notifiers | sortAlpha) }}
- name: config
mountPath: "/etc/grafana/provisioning/notifiers/notifiers.yaml"
subPath: notifiers.yaml
mountPath: "/etc/grafana/provisioning/notifiers/{{ . }}"
subPath: {{ . | quote }}
{{- end }}
{{- end }}
{{- if .Values.dashboardProviders }}
{{- range (keys .Values.dashboardProviders | sortAlpha) }}
- name: config
mountPath: "/etc/grafana/provisioning/dashboards/dashboardproviders.yaml"
subPath: dashboardproviders.yaml
mountPath: "/etc/grafana/provisioning/dashboards/{{ . }}"
subPath: {{ . | quote }}
{{- end }}
{{- end }}
{{- if .Values.sidecar.dashboards.enabled }}
- name: sc-dashboard-volume
@ -376,16 +382,22 @@ containers:
- name: "{{ tpl $key $ }}"
value: "{{ tpl (print $value) $ }}"
{{- end }}
{{- if .Values.envFromSecret }}
{{- if or .Values.envFromSecret (or .Values.envRenderSecret .Values.envFromSecrets) }}
{{- if .Values.envFromSecret }}
- secretRef:
name: {{ tpl .Values.envFromSecret . }}
{{- end }}
{{- if .Values.envRenderSecret }}
- secretRef:
name: {{ template "grafana.fullname" . }}-env
{{- end }}
{{- range .Values.envFromSecrets }}
- secretRef:
name: {{ .name }}
optional: {{ .optional | default false }}
{{- end }}
{{- end }}
{{ toYaml .Values.livenessProbe | indent 6 }}

View File

@ -30,7 +30,7 @@ spec:
hostIPC: false
hostPID: false
rule: 'MustRunAsNonRoot'
rule: 'RunAsAny'
rule: 'RunAsAny'

View File

@ -70,7 +70,7 @@ livenessProbe:
repository: grafana/grafana
tag: 8.1.0
tag: 8.1.5
sha: ""
pullPolicy: IfNotPresent
@ -241,7 +241,8 @@ affinity: {}
extraInitContainers: []
## Enable and specify additional containers in extraContainers. This is meant to allow adding an authentication proxy to a grafana pod
extraContainers: |
extraContainers: ""
# extraContainers: |
# - name: proxy
# image:
# args:
@ -383,6 +384,12 @@ envFromSecret: ""
## This can be useful for auth tokens, etc
envRenderSecret: {}
## The names of secrets in the same kubernetes namespace which contain values to be added to the environment
## Each entry must contain a name key and may set an optional key stating whether the secret is allowed to be missing (defaults to false).
envFromSecrets: []
## - name: secret-name
## optional: true
# Inject Kubernetes services as environment variables.
# See
enableServiceLinks: true
@ -608,7 +615,7 @@ smtp:
tag: 1.12.2
tag: 1.12.3
sha: ""
imagePullPolicy: IfNotPresent
resources: {}
@ -705,7 +712,7 @@ imageRenderer:
# extra environment variables
# RENDERING_ARGS: --disable-gpu,--window-size=1280x758
# RENDERING_ARGS: --no-sandbox,--disable-gpu,--window-size=1280x758
# RENDERING_MODE: clustered
# image-renderer deployment serviceAccount
serviceAccountName: ""

View File

@ -101,7 +101,7 @@ spec:
- {key: app, operator: In, values: [alertmanager]}
- {key: prometheus, operator: In, values: [{{ template "kube-prometheus-stack.fullname" . }}-alertmanager]}
- {key: alertmanager, operator: In, values: [{{ template "kube-prometheus-stack.fullname" . }}-alertmanager]}
{{- end }}
{{- end }}
{{- if .Values.alertmanager.alertmanagerSpec.tolerations }}

View File

@ -14,10 +14,13 @@ spec:
{{- if .Values.kubeStateMetrics.serviceMonitor.interval }}
interval: {{ .Values.kubeStateMetrics.serviceMonitor.interval }}
{{- end }}
{{- if .Values.kubeStateMetrics.serviceMonitor.scrapeTimeout }}
scrapeTimeout: {{ .Values.kubeStateMetrics.serviceMonitor.scrapeTimeout }}
{{- end }}
{{- if .Values.kubeStateMetrics.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeStateMetrics.serviceMonitor.proxyUrl}}
{{- end }}
honorLabels: true
honorLabels: {{ .Values.kubeStateMetrics.serviceMonitor.honorLabels }}
{{- if .Values.kubeStateMetrics.serviceMonitor.metricRelabelings }}
{{ tpl (toYaml .Values.kubeStateMetrics.serviceMonitor.metricRelabelings | indent 4) . }}
@ -34,7 +37,7 @@ spec:
{{- if .Values.kubeStateMetrics.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeStateMetrics.serviceMonitor.proxyUrl}}
{{- end }}
honorLabels: true
honorLabels: {{ .Values.kubeStateMetrics.serviceMonitor.honorLabels }}
{{- if .Values.kubeStateMetrics.serviceMonitor.metricRelabelings }}
{{ tpl (toYaml .Values.kubeStateMetrics.serviceMonitor.metricRelabelings | indent 4) . }}

View File

@ -1,610 +0,0 @@
{{- /*
Generated from 'alertmanager-overview' from
Do not change in-place! In order to change this file first read following link:
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
namespace: {{ template "kube-prometheus-stack.namespace" . }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "alertmanager-overview" | trunc 63 | trimSuffix "-" }}
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
{{- if $.Values.grafana.sidecar.dashboards.label }}
{{ $.Values.grafana.sidecar.dashboards.label }}: "1"
{{- end }}
app: {{ template "" $ }}-grafana
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
alertmanager-overview.json: |-
"__inputs": [
"__requires": [
"annotations": {
"list": [
"editable": false,
"gnetId": null,
"graphTooltip": 1,
"hideControls": false,
"id": null,
"links": [
"refresh": "30s",
"rows": [
"collapse": false,
"collapsed": false,
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 2,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}}",
"refId": "A"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Alerts",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 3,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Received",
"refId": "A"
"expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Invalid",
"refId": "B"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Alerts receive rate",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Alerts",
"titleSize": "h6",
"type": "row"
"collapse": false,
"collapsed": false,
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 4,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "integration",
"seriesOverrides": [
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Total",
"refId": "A"
"expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Failed",
"refId": "B"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "$integration: Notifications Send Rate",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 5,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "integration",
"seriesOverrides": [
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
"expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} 99th Percentile",
"refId": "A"
"expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Median",
"refId": "B"
"expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Average",
"refId": "C"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "$integration: Notification Duration",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Notifications",
"titleSize": "h6",
"type": "row"
"schemaVersion": 14,
"style": "dark",
"tags": [
"templating": {
"list": [
"current": {
"text": "Prometheus",
"value": "Prometheus"
"hide": 0,
"label": null,
"name": "datasource",
"options": [
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
"allValue": null,
"current": {
"text": "",
"value": ""
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "namespace",
"options": [
"query": "label_values(alertmanager_alerts, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"allValue": null,
"current": {
"text": "",
"value": ""
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "service",
"options": [
"query": "label_values(alertmanager_alerts, service)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"allValue": null,
"current": {
"text": "all",
"value": "$__all"
"datasource": "$datasource",
"hide": 2,
"includeAll": true,
"label": null,
"multi": false,
"name": "integration",
"options": [
"query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"time": {
"from": "now-1h",
"to": "now"
"timepicker": {
"refresh_intervals": [
"time_options": [
"timezone": "utc",
"title": "Alertmanager / Overview",
"uid": "alertmanager-overview",
"version": 0
{{- end }}

View File

@ -1,978 +0,0 @@
{{- /*
Generated from 'k8s-resources-node' from
Do not change in-place! In order to change this file first read following link:
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
namespace: {{ template "kube-prometheus-stack.namespace" . }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-node" | trunc 63 | trimSuffix "-" }}
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
{{- if $.Values.grafana.sidecar.dashboards.label }}
{{ $.Values.grafana.sidecar.dashboards.label }}: "1"
{{- end }}
app: {{ template "" $ }}-grafana
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
k8s-resources-node.json: |-
"annotations": {
"list": [
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"links": [
"refresh": "10s",
"rows": [
"collapse": false,
"height": "250px",
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
"lines": true,
"linewidth": 0,
"links": [
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
"spaceLength": 10,
"span": 12,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
"legendLink": null,
"step": 10
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "CPU Usage",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "CPU Usage",
"titleSize": "h6"
"collapse": false,
"height": "250px",
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
"spaceLength": 10,
"span": 12,
"stack": false,
"steppedLine": false,
"styles": [
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
"alias": "CPU Usage",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #A",
"thresholds": [
"type": "number",
"unit": "short"
"alias": "CPU Requests",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #B",
"thresholds": [
"type": "number",
"unit": "short"
"alias": "CPU Requests %",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #C",
"thresholds": [
"type": "number",
"unit": "percentunit"
"alias": "CPU Limits",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #D",
"thresholds": [
"type": "number",
"unit": "short"
"alias": "CPU Limits %",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #E",
"thresholds": [
"type": "number",
"unit": "percentunit"
"alias": "Pod",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "pod",
"thresholds": [
"type": "number",
"unit": "short"
"alias": "",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"pattern": "/.*/",
"thresholds": [
"type": "string",
"unit": "short"
"targets": [
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 10
"expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 10
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "C",
"step": 10
"expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "D",
"step": 10
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "E",
"step": 10
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "CPU Quota",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
"transform": "table",
"type": "table",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "CPU Quota",
"titleSize": "h6"
"collapse": false,
"height": "250px",
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 3,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
"lines": true,
"linewidth": 0,
"links": [
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
"spaceLength": 10,
"span": 12,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\", container!=\"\"}) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
"legendLink": null,
"step": 10
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Memory Usage (w/o cache)",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Memory Usage",
"titleSize": "h6"
"collapse": false,
"height": "250px",
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
"spaceLength": 10,
"span": 12,
"stack": false,
"steppedLine": false,
"styles": [
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
"alias": "Memory Usage",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #A",
"thresholds": [
"type": "number",
"unit": "bytes"
"alias": "Memory Requests",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #B",
"thresholds": [
"type": "number",
"unit": "bytes"
"alias": "Memory Requests %",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #C",
"thresholds": [
"type": "number",
"unit": "percentunit"
"alias": "Memory Limits",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #D",
"thresholds": [
"type": "number",
"unit": "bytes"
"alias": "Memory Limits %",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #E",
"thresholds": [
"type": "number",
"unit": "percentunit"
"alias": "Memory Usage (RSS)",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #F",
"thresholds": [
"type": "number",
"unit": "bytes"
"alias": "Memory Usage (Cache)",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #G",
"thresholds": [
"type": "number",
"unit": "bytes"
"alias": "Memory Usage (Swap)",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #H",
"thresholds": [
"type": "number",
"unit": "bytes"
"alias": "Pod",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "pod",
"thresholds": [
"type": "number",
"unit": "short"
"alias": "",
"colorMode": null,
"colors": [
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"pattern": "/.*/",
"thresholds": [
"type": "string",
"unit": "short"
"targets": [
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 10
"expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 10
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "C",
"step": 10
"expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "D",
"step": 10
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "E",
"step": 10
"expr": "sum(node_namespace_pod_container:container_memory_rss{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "F",
"step": 10
"expr": "sum(node_namespace_pod_container:container_memory_cache{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "G",
"step": 10
"expr": "sum(node_namespace_pod_container:container_memory_swap{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "H",
"step": 10
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Memory Quota",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
"transform": "table",
"type": "table",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Memory Quota",
"titleSize": "h6"
"schemaVersion": 14,
"style": "dark",
"tags": [
"templating": {
"list": [
"current": {
"text": "default",
"value": "default"
"hide": 0,
"label": null,
"name": "datasource",
"options": [
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
"allValue": null,
"current": {
"text": "",
"value": ""
"datasource": "$datasource",
"hide": {{ if }}0{{ else }}2{{ end }},
"includeAll": false,
"label": null,
"multi": false,
"name": "cluster",
"options": [
"query": "label_values(kube_pod_info, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"allValue": null,
"current": {
"text": "",
"value": ""
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": true,
"name": "node",
"options": [
"query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"time": {
"from": "now-1h",
"to": "now"
"timepicker": {
"refresh_intervals": [
"time_options": [
"timezone": "UTC",
"title": "Kubernetes / Compute Resources / Node (Pods)",
"uid": "200ac8fdbfbb74b39aff88118e4d1c2c",
"version": 0
{{- end }}

View File

@ -1,991 +0,0 @@
{{- /*
Generated from the upstream 'nodes' Grafana dashboard definition.
Do not change in-place!
NOTE(review): the source URL and the link describing how to change this file
are missing from this copy - restore them from the upstream chart.
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- /*
Render only when Grafana is enabled (or dashboard deployment is forced), the
default dashboards are enabled, and the target Kubernetes version is >= 1.14.
*/ -}}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "nodes" | trunc 63 | trimSuffix "-" }}
  annotations:
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
  labels:
    {{- /* Optional label whose key comes from grafana.sidecar.dashboards.label. */}}
    {{- if $.Values.grafana.sidecar.dashboards.label }}
    {{ $.Values.grafana.sidecar.dashboards.label }}: "1"
    {{- end }}
    {{- /* NOTE(review): helper name restored (was an empty template name) - confirm against _helpers.tpl. */}}
    app: {{ template "kube-prometheus-stack.name" $ }}-grafana
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
data:
  nodes.json: |-
"__inputs": [
"__requires": [
"annotations": {
"list": [
"editable": false,
"gnetId": null,
"graphTooltip": 1,
"hideControls": false,
"id": null,
"links": [
"refresh": "30s",
"rows": [
"collapse": false,
"collapsed": false,
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 2,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__rate_interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n",
"format": "time_series",
"intervalFactor": 5,
"legendFormat": "{{`{{`}}cpu{{`}}`}}",
"refId": "A"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "CPU Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "percentunit",
"label": null,
"logBase": 1,
"max": 1,
"min": 0,
"show": true
"format": "percentunit",
"label": null,
"logBase": 1,
"max": 1,
"min": 0,
"show": true
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"id": 3,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
"expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "1m load average",
"refId": "A"
"expr": "node_load5{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "5m load average",
"refId": "B"
"expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "15m load average",
"refId": "C"
"expr": "count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", mode=\"idle\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "logical cores",
"refId": "D"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Load Average",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
"collapse": false,
"collapsed": false,
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 4,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"span": 9,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory used",
"refId": "A"
"expr": "node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory buffers",
"refId": "B"
"expr": "node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory cached",
"refId": "C"
"expr": "node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory free",
"refId": "D"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Memory Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
"datasource": "$datasource",
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 5,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
"tableColumn": "",
"targets": [
"expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"})\n/\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"})\n* 100\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "80, 90",
"title": "Memory Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "N/A",
"value": "null"
"valueName": "current"
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
"collapse": false,
"collapsed": false,
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"id": 6,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"alias": "/ read| written/",
"yaxis": 1
"alias": "/ io time/",
"yaxis": 2
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}} read",
"refId": "A"
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}} written",
"refId": "B"
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}} io time",
"refId": "C"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Disk I/O",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 7,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"alias": "used",
"color": "#E0B400"
"alias": "available",
"color": "#73BF69"
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "used",
"refId": "A"
"expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "available",
"refId": "B"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Disk Space Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
"collapse": false,
"collapsed": false,
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"id": 8,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
"expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}}",
"refId": "A"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Network Received",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"id": 9,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
"expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}}",
"refId": "A"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Network Transmitted",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
"schemaVersion": 14,
"style": "dark",
"tags": [
"templating": {
"list": [
"current": {
"text": "Prometheus",
"value": "Prometheus"
"hide": 0,
"label": null,
"name": "datasource",
"options": [
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
"allValue": null,
"current": {
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "instance",
"options": [
"query": "label_values(node_exporter_build_info{job=\"node-exporter\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 0,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"time": {
"from": "now-1h",
"to": "now"
"timepicker": {
"refresh_intervals": [
"time_options": [
"timezone": "utc",
"title": "Node Exporter / Nodes",
"version": 0
{{- end }}

View File

@ -1,577 +0,0 @@
{{- /*
Generated from the upstream 'persistentvolumesusage' Grafana dashboard definition.
Do not change in-place!
NOTE(review): the source URL and the link describing how to change this file
are missing from this copy - restore them from the upstream chart.
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- /*
Render only when Grafana is enabled (or dashboard deployment is forced), the
default dashboards are enabled, and the target Kubernetes version is >= 1.14.
*/ -}}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "persistentvolumesusage" | trunc 63 | trimSuffix "-" }}
  annotations:
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
  labels:
    {{- /* Optional label whose key comes from grafana.sidecar.dashboards.label. */}}
    {{- if $.Values.grafana.sidecar.dashboards.label }}
    {{ $.Values.grafana.sidecar.dashboards.label }}: "1"
    {{- end }}
    {{- /* NOTE(review): helper name restored (was an empty template name) - confirm against _helpers.tpl. */}}
    app: {{ template "kube-prometheus-stack.name" $ }}-grafana
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
data:
  persistentvolumesusage.json: |-
"__inputs": [
"__requires": [
"annotations": {
"list": [
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [
"refresh": "10s",
"rows": [
"collapse": false,
"collapsed": false,
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 2,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": true
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"span": 9,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Used Space",
"refId": "A"
"expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Free Space",
"refId": "B"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Volume Space Usage",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
"datasource": "$datasource",
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 3,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
"tableColumn": "",
"targets": [
"expr": "max without(instance,node) (\n(\n topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "80, 90",
"title": "Volume Space Usage",
"tooltip": {
"shared": false
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "N/A",
"value": "null"
"valueName": "current"
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
"collapse": false,
"collapsed": false,
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 4,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": true
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"span": 9,
"stack": true,
"steppedLine": false,
"targets": [
"expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Used inodes",
"refId": "A"
"expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": " Free inodes",
"refId": "B"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Volume inodes Usage",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
"datasource": "$datasource",
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 5,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
"tableColumn": "",
"targets": [
"expr": "max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "80, 90",
"title": "Volume inodes Usage",
"tooltip": {
"shared": false
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "N/A",
"value": "null"
"valueName": "current"
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
"schemaVersion": 14,
"style": "dark",
"tags": [
"templating": {
"list": [
"current": {
"text": "default",
"value": "default"
"hide": 0,
"label": null,
"name": "datasource",
"options": [
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
"allValue": null,
"current": {
"datasource": "$datasource",
{{- /* NOTE(review): the `if` condition was empty (a template parse error); presumably the multicluster dashboards toggle - confirm the exact key in values.yaml. Renders 0 when set, otherwise 2. */}}
"hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }},
"includeAll": false,
"label": "cluster",
"multi": false,
"name": "cluster",
"options": [
"query": "label_values(kubelet_volume_stats_capacity_bytes, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"allValue": null,
"current": {
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "namespace",
"options": [
"query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"}, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"allValue": null,
"current": {
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": "PersistentVolumeClaim",
"multi": false,
"name": "volume",
"options": [
"query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\"}, persistentvolumeclaim)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"time": {
"from": "now-7d",
"to": "now"
"timepicker": {
"refresh_intervals": [
"time_options": [
"timezone": "UTC",
"title": "Kubernetes / Persistent Volumes",
"uid": "919b92a8e8041bd567af9edab12c840c",
"version": 0
{{- end }}

View File

@ -1,928 +0,0 @@
{{- /*
Generated from 'statefulset' from
Do not change in-place! In order to change this file first read following link:
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
namespace: {{ template "kube-prometheus-stack.namespace" . }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "statefulset" | trunc 63 | trimSuffix "-" }}
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
{{- if $.Values.grafana.sidecar.dashboards.label }}
{{ $.Values.grafana.sidecar.dashboards.label }}: "1"
{{- end }}
app: {{ template "" $ }}-grafana
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
statefulset.json: |-
"__inputs": [
"__requires": [
"annotations": {
"list": [
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [
"refresh": "",
"rows": [
"collapse": false,
"collapsed": false,
"panels": [
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(237, 129, 40, 0.89)",
"datasource": "$datasource",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 2,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "cores",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"lineColor": "rgb(31, 120, 193)",
"show": true
"tableColumn": "",
"targets": [
"expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", container!=\"\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "",
"title": "CPU",
"tooltip": {
"shared": false
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "0",
"value": "null"
"valueName": "current"
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(237, 129, 40, 0.89)",
"datasource": "$datasource",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 3,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "GB",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"lineColor": "rgb(31, 120, 193)",
"show": true
"tableColumn": "",
"targets": [
"expr": "sum(container_memory_usage_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", container!=\"\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}) / 1024^3",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "",
"title": "Memory",
"tooltip": {
"shared": false
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "0",
"value": "null"
"valueName": "current"
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(237, 129, 40, 0.89)",
"datasource": "$datasource",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 4,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "Bps",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"lineColor": "rgb(31, 120, 193)",
"show": true
"tableColumn": "",
"targets": [
"expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",pod=~\"$statefulset.*\"}[3m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "",
"title": "Network",
"tooltip": {
"shared": false
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "0",
"value": "null"
"valueName": "current"
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
"collapse": false,
"collapsed": false,
"height": "100px",
"panels": [
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(237, 129, 40, 0.89)",
"datasource": "$datasource",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 5,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
"tableColumn": "",
"targets": [
"expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "",
"title": "Desired Replicas",
"tooltip": {
"shared": false
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "0",
"value": "null"
"valueName": "current"
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(237, 129, 40, 0.89)",
"datasource": "$datasource",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 6,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
"tableColumn": "",
"targets": [
"expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "",
"title": "Replicas of current version",
"tooltip": {
"shared": false
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "0",
"value": "null"
"valueName": "current"
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(237, 129, 40, 0.89)",
"datasource": "$datasource",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 7,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
"tableColumn": "",
"targets": [
"expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "",
"title": "Observed Generation",
"tooltip": {
"shared": false
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "0",
"value": "null"
"valueName": "current"
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(237, 129, 40, 0.89)",
"datasource": "$datasource",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
"gridPos": {
"id": 8,
"interval": null,
"links": [
"mappingType": 1,
"mappingTypes": [
"name": "value to text",
"value": 1
"name": "range to text",
"value": 2
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
"from": "null",
"text": "N/A",
"to": "null"
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
"tableColumn": "",
"targets": [
"expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
"thresholds": "",
"title": "Metadata Generation",
"tooltip": {
"shared": false
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
"op": "=",
"text": "0",
"value": "null"
"valueName": "current"
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
"collapse": false,
"collapsed": false,
"panels": [
"aliasColors": {
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"id": 9,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sideWidth": null,
"total": false,
"values": false
"lines": true,
"linewidth": 1,
"links": [
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
"expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "replicas specified",
"refId": "A"
"expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "replicas created",
"refId": "B"
"expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "ready",
"refId": "C"
"expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "replicas of current version",
"refId": "D"
"expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "updated",
"refId": "E"
"thresholds": [
"timeFrom": null,
"timeShift": null,
"title": "Replicas",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
"yaxes": [
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
"schemaVersion": 14,
"style": "dark",
"tags": [
"templating": {
"list": [
"current": {
"text": "default",
"value": "default"
"hide": 0,
"label": null,
"name": "datasource",
"options": [
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
"allValue": null,
"current": {
"datasource": "$datasource",
"hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }},
"includeAll": false,
"label": "cluster",
"multi": false,
"name": "cluster",
"options": [
"query": "label_values(kube_statefulset_metadata_generation, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"allValue": null,
"current": {
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "namespace",
"options": [
"query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"allValue": null,
"current": {
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": "Name",
"multi": false,
"name": "statefulset",
"options": [
"query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, statefulset)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
"tagsQuery": "",
"type": "query",
"useTags": false
"time": {
"from": "now-1h",
"to": "now"
"timepicker": {
"refresh_intervals": [
"time_options": [
"timezone": "UTC",
"title": "Kubernetes / StatefulSets",
"uid": "a31c1f46e6f727cb37c0d731a7245005",
"version": 0
{{- end }}

View File

@ -71,6 +71,6 @@ spec:
{{- end -}}
{{- if .Values.prometheus.thanosIngress.tls }}
{{ toYaml .Values.prometheus.thanosIngress.tls | indent 4 }}
{{ tpl (toYaml .Values.prometheus.thanosIngress.tls | indent 4) . }}
{{- end -}}
{{- end -}}

View File

@ -341,6 +341,18 @@ spec:
{{- if .Values.prometheus.prometheusSpec.enforcedSampleLimit }}
enforcedSampleLimit: {{ .Values.prometheus.prometheusSpec.enforcedSampleLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedTargetLimit }}
enforcedTargetLimit: {{ .Values.prometheus.prometheusSpec.enforcedTargetLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedLabelLimit }}
enforcedLabelLimit: {{ .Values.prometheus.prometheusSpec.enforcedLabelLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedLabelNameLengthLimit }}
enforcedLabelNameLengthLimit: {{ .Values.prometheus.prometheusSpec.enforcedLabelNameLengthLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedLabelValueLengthLimit}}
enforcedLabelValueLengthLimit: {{ .Values.prometheus.prometheusSpec.enforcedLabelValueLengthLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.allowOverlappingBlocks }}
allowOverlappingBlocks: {{ .Values.prometheus.prometheusSpec.allowOverlappingBlocks }}
{{- end }}

View File

@ -24,6 +24,12 @@ spec:
{{- if eq .Values.prometheus.thanosService.type "NodePort" }}
nodePort: {{ .Values.prometheus.thanosService.nodePort }}
{{- end }}
- name: {{ .Values.prometheus.thanosService.httpPortName }}
port: {{ .Values.prometheus.thanosService.httpPort }}
targetPort: {{ .Values.prometheus.thanosService.targetHttpPort }}
{{- if eq .Values.prometheus.thanosService.type "NodePort" }}
nodePort: {{ .Values.prometheus.thanosService.httpNodePort }}
{{- end }}
selector: prometheus
prometheus: {{ template "kube-prometheus-stack.fullname" . }}-prometheus

View File

@ -31,6 +31,12 @@ spec:
{{- if eq .Values.prometheus.thanosServiceExternal.type "NodePort" }}
nodePort: {{ .Values.prometheus.thanosServiceExternal.nodePort }}
{{- end }}
- name: {{ .Values.prometheus.thanosServiceExternal.httpPortName }}
port: {{ .Values.prometheus.thanosServiceExternal.httpPort }}
targetPort: {{ .Values.prometheus.thanosServiceExternal.targetHttpPort }}
{{- if eq .Values.prometheus.thanosServiceExternal.type "NodePort" }}
nodePort: {{ .Values.prometheus.thanosServiceExternal.httpNodePort }}
{{- end }}
selector: prometheus
prometheus: {{ template "kube-prometheus-stack.fullname" . }}-prometheus

View File

@ -0,0 +1,41 @@
{{- if and .Values.prometheus.thanosService.enabled .Values.prometheus.thanosServiceMonitor.enabled }}
kind: ServiceMonitor
name: {{ template "kube-prometheus-stack.fullname" . }}-thanos-discovery
namespace: {{ template "kube-prometheus-stack.namespace" . }}
app: {{ template "" . }}-thanos-discovery
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
app: {{ template "" . }}-thanos-discovery
release: {{ $.Release.Name | quote }}
- {{ printf "%s" (include "kube-prometheus-stack.namespace" .) | quote }}
- port: {{ .Values.prometheus.thanosService.httpPortName }}
{{- if .Values.prometheus.thanosServiceMonitor.interval }}
interval: {{ .Values.prometheus.thanosServiceMonitor.interval }}
{{- end }}
{{- if .Values.prometheus.thanosServiceMonitor.scheme }}
scheme: {{ .Values.prometheus.thanosServiceMonitor.scheme }}
{{- end }}
{{- if .Values.prometheus.thanosServiceMonitor.tlsConfig }}
tlsConfig: {{ toYaml .Values.prometheus.thanosServiceMonitor.tlsConfig | nindent 6 }}
{{- end }}
{{- if .Values.prometheus.thanosServiceMonitor.bearerTokenFile }}
bearerTokenFile: {{ .Values.prometheus.thanosServiceMonitor.bearerTokenFile }}
{{- end }}
path: "/metrics"
{{- if .Values.prometheus.thanosServiceMonitor.metricRelabelings }}
{{ tpl (toYaml .Values.prometheus.thanosServiceMonitor.metricRelabelings | indent 6) . }}
{{- end }}
{{- if .Values.prometheus.thanosServiceMonitor.relabelings }}
{{ toYaml .Values.prometheus.thanosServiceMonitor.relabelings | indent 6 }}
{{- end }}
{{- end }}

View File

@ -631,10 +631,15 @@ grafana:
forceDeployDashboards: false
## Deploy default dashboards.
## Deploy default dashboards
defaultDashboardsEnabled: true
## Timezone for the default dashboards
## Other options are: browser or a specific timezone, e.g. Europe/Luxembourg
defaultDashboardsTimezone: utc
adminPassword: prom-operator
@ -1247,6 +1252,9 @@ kubeStateMetrics:
## Scrape interval. If not set, the Prometheus default scrape interval is used.
interval: ""
## Scrape Timeout. If not set, the Prometheus default scrape timeout is used.
scrapeTimeout: ""
## proxyUrl: URL of a proxy that should be used for scraping.
proxyUrl: ""
@ -1274,6 +1282,9 @@ kubeStateMetrics:
# replacement: $1
# action: replace
# Keep labels from scraped data, overriding server-side labels
honorLabels: true
# Enable self metrics configuration for Service Monitor
enabled: false
@ -1643,18 +1654,50 @@ prometheus:
enabled: false
annotations: {}
labels: {}
portName: grpc
port: 10901
targetPort: "grpc"
clusterIP: "None"
## Service type
type: ClusterIP
## Port to expose on each node
## gRPC port config
portName: grpc
port: 10901
targetPort: "grpc"
## HTTP port config (for metrics)
httpPortName: http
httpPort: 10902
targetHttpPort: "http"
## ClusterIP to assign
# Default is to make this a headless service ("None")
clusterIP: "None"
## Port to expose on each node, if service type is NodePort
nodePort: 30901
httpNodePort: 30902
# ServiceMonitor to scrape Sidecar metrics
# Needs thanosService to be enabled as well
enabled: false
interval: ""
## scheme: HTTP scheme to use for scraping. Can be used with `tlsConfig` for example if using istio mTLS.
scheme: ""
## tlsConfig: TLS configuration to use when scraping the endpoint. For example if using istio mTLS.
## Of type:
tlsConfig: {}
## Metric relabel configs to apply to samples before ingestion.
metricRelabelings: []
## Relabel configs to apply to targets before scraping.
relabelings: []
# Service for external access to sidecar
# Enabling this creates a service to expose thanos-sidecar outside the cluster.
@ -1662,11 +1705,18 @@ prometheus:
enabled: false
annotations: {}
labels: {}
loadBalancerIP: ""
loadBalancerSourceRanges: []
## gRPC port config
portName: grpc
port: 10901
targetPort: "grpc"
loadBalancerIP: ""
loadBalancerSourceRanges: []
## HTTP port config (for metrics)
httpPortName: http
httpPort: 10902
targetHttpPort: "http"
## Service type
@ -1675,6 +1725,7 @@ prometheus:
## Port to expose on each node, if service type is NodePort
nodePort: 30901
httpNodePort: 30902
## Configuration for Prometheus service
@ -2422,6 +2473,28 @@ prometheus:
## number of samples/series under the desired limit. Note that if SampleLimit is lower, that value will be taken instead.
enforcedSampleLimit: false
## EnforcedTargetLimit defines a global limit on the number of scraped targets. This overrides any TargetLimit set
## per ServiceMonitor and/or PodMonitor. It is meant to be used by admins to enforce the TargetLimit to keep the overall
## number of targets under the desired limit. Note that if TargetLimit is lower, that value will be taken instead, except
## if either value is zero, in which case the non-zero value will be used. If both values are zero, no limit is enforced.
enforcedTargetLimit: false
## Per-scrape limit on number of labels that will be accepted for a sample. If more than this number of labels are present
## post metric-relabeling, the entire scrape will be treated as failed. 0 means no limit. Only valid in Prometheus versions
## 2.27.0 and newer.
enforcedLabelLimit: false
## Per-scrape limit on the length of label names that will be accepted for a sample. If a label name is longer than this number
## post metric-relabeling, the entire scrape will be treated as failed. 0 means no limit. Only valid in Prometheus versions
## 2.27.0 and newer.
enforcedLabelNameLengthLimit: false
## Per-scrape limit on the length of label values that will be accepted for a sample. If a label value is longer than this
## number post metric-relabeling, the entire scrape will be treated as failed. 0 means no limit. Only valid in Prometheus
## versions 2.27.0 and newer.
enforcedLabelValueLengthLimit: false
## AllowOverlappingBlocks enables vertical compaction and vertical query merge in Prometheus. This is still experimental
## in Prometheus so it may change in any upcoming release.
allowOverlappingBlocks: false

View File

@ -8,8 +8,8 @@
"subdir": "grafonnet"
"version": "05fb200ee1a1816fc1b4c522071d5606d8dd71c1",
"sum": "mEoObbqbyVaXrHFEJSM2Nad31tOvadzIevWuyNHHBgI="
"version": "19b27b272abf4263af1365ec485784c49815a332",
"sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w="
"source": {
@ -18,7 +18,7 @@
"subdir": "grafana-builder"
"version": "ff22d1d6698573e7cb76228198edfa2b2f632dcc",
"version": "14bc3cec71f32ba3cc490a47a22004a744b6bb4e",
"sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8="
@ -28,8 +28,8 @@
"subdir": ""
"version": "2b27a09a667091cef74776b690ccceaf55995e29",
"sum": "j2jPdrcM3iuaUK+6V9jWn2M3Fapr0KtI8FZ1KQoHIGA="
"version": "ff4641bcd83314c955150bea6b147df9ca335c4a",
"sum": "oUVGwcCbmdH8qz9B+lbRawI9s23GY9HeW7MwYZRbZ/0="
"legacyImports": false

View File

@ -1,685 +0,0 @@
- "name": "kubernetes-apps"
- "alert": "KubePodCrashLooping"
"description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} times / 10 minutes."
"runbook_url": ""
"summary": "Pod is crash looping."
"expr": |
increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) > 0
kube_pod_container_status_waiting{job="kube-state-metrics"} == 1
"for": "15m"
"severity": "warning"
- "alert": "KubePodNotReady"
"description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes."
"runbook_url": ""
"summary": "Pod has been in a non-ready state for more than 15 minutes."
"expr": |
sum by (namespace, pod) (
max by(namespace, pod) (
kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
) > 0
"for": "15m"
"severity": "warning"
- "alert": "KubeDeploymentGenerationMismatch"
"description": "Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back."
"runbook_url": ""
"summary": "Deployment generation mismatch due to possible roll-back"
"expr": |
"for": "15m"
"severity": "warning"
- "alert": "KubeDeploymentReplicasMismatch"
"description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes."
"runbook_url": ""
"summary": "Deployment has not matched the expected number of replicas."
"expr": |
) and (
"for": "15m"
"severity": "warning"
- "alert": "KubeStatefulSetReplicasMismatch"
"description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes."
"runbook_url": ""
"summary": "Deployment has not matched the expected number of replicas."
"expr": |
) and (
"for": "15m"
"severity": "warning"
- "alert": "KubeStatefulSetGenerationMismatch"
"description": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back."
"runbook_url": ""
"summary": "StatefulSet generation mismatch due to possible roll-back"
"expr": |
"for": "15m"
"severity": "warning"
- "alert": "KubeStatefulSetUpdateNotRolledOut"
"description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out."
"runbook_url": ""
"summary": "StatefulSet update has not been rolled out."
"expr": |
max without (revision) (
) and (
"for": "15m"
"severity": "warning"
- "alert": "KubeDaemonSetRolloutStuck"
"description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes."
"runbook_url": ""
"summary": "DaemonSet rollout is stuck."
"expr": |
) or (
) or (
) or (
) and (
"for": "15m"
"severity": "warning"
- "alert": "KubeContainerWaiting"
"description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour."
"runbook_url": ""
"summary": "Pod container waiting longer than 1 hour"
"expr": |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
"for": "1h"
"severity": "warning"
- "alert": "KubeDaemonSetNotScheduled"
"description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled."
"runbook_url": ""
"summary": "DaemonSet pods are not scheduled."
"expr": |
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
"for": "10m"
"severity": "warning"
- "alert": "KubeDaemonSetMisScheduled"
"description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run."
"runbook_url": ""
"summary": "DaemonSet pods are misscheduled."
"expr": |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
"for": "15m"
"severity": "warning"
- "alert": "KubeJobCompletion"
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete."
"runbook_url": ""
"summary": "Job did not complete in time"
"expr": |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
"for": "12h"
"severity": "warning"
- "alert": "KubeJobFailed"
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert."
"runbook_url": ""
"summary": "Job failed to complete."
"expr": |
kube_job_failed{job="kube-state-metrics"} > 0
"for": "15m"
"severity": "warning"
- "alert": "KubeHpaReplicasMismatch"
"description": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes."
"runbook_url": ""
"summary": "HPA has not matched descired number of replicas."
"expr": |
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0
"for": "15m"
"severity": "warning"
- "alert": "KubeHpaMaxedOut"
"description": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes."
"runbook_url": ""
"summary": "HPA is running at max replicas"
"expr": |
"for": "15m"
"severity": "warning"
- "name": "kubernetes-resources"
- "alert": "KubeCPUOvercommit"
"description": "Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure."
"runbook_url": ""
"summary": "Cluster has overcommitted CPU resource requests."
"expr": |
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
(sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
"for": "10m"
"severity": "warning"
- "alert": "KubeMemoryOvercommit"
"description": "Cluster has overcommitted memory resource requests for Pods by {{ $value }} bytes and cannot tolerate node failure."
"runbook_url": ""
"summary": "Cluster has overcommitted memory resource requests."
"expr": |
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
(sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
"for": "10m"
"severity": "warning"
- "alert": "KubeCPUQuotaOvercommit"
"description": "Cluster has overcommitted CPU resource requests for Namespaces."
"runbook_url": ""
"summary": "Cluster has overcommitted CPU resource requests."
"expr": |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
> 1.5
"for": "5m"
"severity": "warning"
- "alert": "KubeMemoryQuotaOvercommit"
"description": "Cluster has overcommitted memory resource requests for Namespaces."
"runbook_url": ""
"summary": "Cluster has overcommitted memory resource requests."
"expr": |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
> 1.5
"for": "5m"
"severity": "warning"
- "alert": "KubeQuotaAlmostFull"
"description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota."
"runbook_url": ""
"summary": "Namespace quota is going to be full."
"expr": |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 0.9 < 1
"for": "15m"
"severity": "info"
- "alert": "KubeQuotaFullyUsed"
"description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota."
"runbook_url": ""
"summary": "Namespace quota is fully used."
"expr": |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
== 1
"for": "15m"
"severity": "info"
- "alert": "KubeQuotaExceeded"
"description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota."
"runbook_url": ""
"summary": "Namespace quota has exceeded the limits."
"expr": |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 1
"for": "15m"
"severity": "warning"
- "alert": "CPUThrottlingHigh"
"description": "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}."
"runbook_url": ""
"summary": "Processes experience elevated CPU throttling."
"expr": |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
> ( 25 / 100 )
"for": "15m"
"severity": "info"
- "name": "kubernetes-storage"
- "alert": "KubePersistentVolumeFillingUp"
"description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free."
"runbook_url": ""
"summary": "PersistentVolume is filling up."
"expr": |
) < 0.03
kubelet_volume_stats_used_bytes{job="kubelet"} > 0
"for": "1m"
"severity": "critical"
- "alert": "KubePersistentVolumeFillingUp"
"description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available."
"runbook_url": ""
"summary": "PersistentVolume is filling up."
"expr": |
) < 0.15
kubelet_volume_stats_used_bytes{job="kubelet"} > 0
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
"for": "1h"
"severity": "warning"
- "alert": "KubePersistentVolumeErrors"
"description": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}."
"runbook_url": ""
"summary": "PersistentVolume is having issues with provisioning."
"expr": |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
"for": "5m"
"severity": "critical"
- "name": "kubernetes-system"
- "alert": "KubeVersionMismatch"
"description": "There are {{ $value }} different semantic versions of Kubernetes components running."
"runbook_url": ""
"summary": "Different semantic versions of Kubernetes components running."
"expr": |
count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
"for": "15m"
"severity": "warning"
- "alert": "KubeClientErrors"
"description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'"
"runbook_url": ""
"summary": "Kubernetes API server client is experiencing errors."
"expr": |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, namespace)
sum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))
> 0.01
"for": "15m"
"severity": "warning"
- "name": "kube-apiserver-slos"
- "alert": "KubeAPIErrorBudgetBurn"
"description": "The API server is burning too much error budget."
"runbook_url": ""
"summary": "The API server is burning too much error budget."
"expr": |
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
"for": "2m"
"long": "1h"
"severity": "critical"
"short": "5m"
- "alert": "KubeAPIErrorBudgetBurn"
"description": "The API server is burning too much error budget."
"runbook_url": ""
"summary": "The API server is burning too much error budget."
"expr": |
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
"for": "15m"
"long": "6h"
"severity": "critical"
"short": "30m"
- "alert": "KubeAPIErrorBudgetBurn"
"description": "The API server is burning too much error budget."
"runbook_url": ""
"summary": "The API server is burning too much error budget."
"expr": |
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
"for": "1h"
"long": "1d"
"severity": "warning"
"short": "2h"
- "alert": "KubeAPIErrorBudgetBurn"
"description": "The API server is burning too much error budget."
"runbook_url": ""
"summary": "The API server is burning too much error budget."
"expr": |
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
"for": "3h"
"long": "3d"
"severity": "warning"
"short": "6h"
- "name": "kubernetes-system-apiserver"
- "alert": "KubeClientCertificateExpiration"
"description": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days."
"runbook_url": ""
"summary": "Client certificate is about to expire."
"expr": |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
"severity": "warning"
- "alert": "KubeClientCertificateExpiration"
"description": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours."
"runbook_url": ""
"summary": "Client certificate is about to expire."
"expr": |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
"severity": "critical"
- "alert": "AggregatedAPIErrors"
"description": "An aggregated API {{ $ }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m."
"runbook_url": ""
"summary": "An aggregated API has reported errors."
"expr": |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
"severity": "warning"
- "alert": "AggregatedAPIDown"
"description": "An aggregated API {{ $ }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m."
"runbook_url": ""
"summary": "An aggregated API is down."
"expr": |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
"for": "5m"
"severity": "warning"
- "alert": "KubeAPIDown"
"description": "KubeAPI has disappeared from Prometheus target discovery."
"runbook_url": ""
"summary": "Target disappeared from Prometheus target discovery."
"expr": |
absent(up{job="kube-apiserver"} == 1)
"for": "15m"
"severity": "critical"
- "alert": "KubeAPITerminatedRequests"
"description": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests."
"runbook_url": ""
"summary": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests."
"expr": |
sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) ) > 0.20
"for": "5m"
"severity": "warning"
- "name": "kubernetes-system-kubelet"
- "alert": "KubeNodeNotReady"
"description": "{{ $labels.node }} has been unready for more than 15 minutes."
"runbook_url": ""
"summary": "Node is not ready."
"expr": |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
"for": "15m"
"severity": "warning"
- "alert": "KubeNodeUnreachable"
"description": "{{ $labels.node }} is unreachable and some workloads may be rescheduled."
"runbook_url": ""
"summary": "Node is unreachable."
"expr": |
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
"for": "15m"
"severity": "warning"
- "alert": "KubeletTooManyPods"
"description": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity."
"runbook_url": ""
"summary": "Kubelet is running at capacity."
"expr": |
count by(node) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
)
/
max by(node) (
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
) > 0.95
"for": "15m"
"severity": "info"
- "alert": "KubeNodeReadinessFlapping"
"description": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes."
"runbook_url": ""
"summary": "Node readiness status is flapping."
"expr": |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
"for": "15m"
"severity": "warning"
- "alert": "KubeletPlegDurationHigh"
"description": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}."
"runbook_url": ""
"summary": "Kubelet Pod Lifecycle Event Generator is taking too long to relist."
"expr": |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
"for": "5m"
"severity": "warning"
- "alert": "KubeletPodStartUpLatencyHigh"
"description": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}."
"runbook_url": ""
"summary": "Kubelet Pod startup latency is too high."
"expr": |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
"for": "15m"
"severity": "warning"
- "alert": "KubeletClientCertificateExpiration"
"description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}."
"runbook_url": ""
"summary": "Kubelet client certificate is about to expire."
"expr": |
kubelet_certificate_manager_client_ttl_seconds < 604800
"severity": "warning"
- "alert": "KubeletClientCertificateExpiration"
"description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}."
"runbook_url": ""
"summary": "Kubelet client certificate is about to expire."
"expr": |
kubelet_certificate_manager_client_ttl_seconds < 86400
"severity": "critical"
- "alert": "KubeletServerCertificateExpiration"
"description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}."
"runbook_url": ""
"summary": "Kubelet server certificate is about to expire."
"expr": |
kubelet_certificate_manager_server_ttl_seconds < 604800
"severity": "warning"
- "alert": "KubeletServerCertificateExpiration"
"description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}."
"runbook_url": ""
"summary": "Kubelet server certificate is about to expire."
"expr": |
kubelet_certificate_manager_server_ttl_seconds < 86400
"severity": "critical"
- "alert": "KubeletClientCertificateRenewalErrors"
"description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)."
"runbook_url": ""
"summary": "Kubelet has failed to renew its client certificate."
"expr": |
increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
"for": "15m"
"severity": "warning"
- "alert": "KubeletServerCertificateRenewalErrors"
"description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)."
"runbook_url": ""
"summary": "Kubelet has failed to renew its server certificate."
"expr": |
increase(kubelet_server_expiration_renew_errors[5m]) > 0
"for": "15m"
"severity": "warning"
- "alert": "KubeletDown"
"description": "Kubelet has disappeared from Prometheus target discovery."
"runbook_url": ""
"summary": "Target disappeared from Prometheus target discovery."
"expr": |
absent(up{job="kubelet"} == 1)
"for": "15m"
"severity": "critical"
- "name": "kubernetes-system-scheduler"
- "alert": "KubeSchedulerDown"
"description": "KubeScheduler has disappeared from Prometheus target discovery."
"runbook_url": ""
"summary": "Target disappeared from Prometheus target discovery."
"expr": |
absent(up{job="kube-scheduler"} == 1)
"for": "15m"
"severity": "critical"
- "name": "kubernetes-system-controller-manager"
- "alert": "KubeControllerManagerDown"
"description": "KubeControllerManager has disappeared from Prometheus target discovery."
"runbook_url": ""
"summary": "Target disappeared from Prometheus target discovery."
"expr": |
absent(up{job="kube-controller-manager"} == 1)
"for": "15m"
"severity": "critical"

View File

@ -616,7 +616,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"pattern": "Value #A",
"thresholds": [ ],
"type": "number",
@ -631,7 +631,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to workloads",
"linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"linkUrl": "/d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"pattern": "Value #B",
"thresholds": [ ],
"type": "number",
@ -721,7 +721,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"pattern": "namespace",
"thresholds": [ ],
"type": "number",
@ -987,7 +987,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"pattern": "Value #A",
"thresholds": [ ],
"type": "number",
@ -1002,7 +1002,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to workloads",
"linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"linkUrl": "/d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1",
"pattern": "Value #B",
"thresholds": [ ],
"type": "number",
@ -1092,7 +1092,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"pattern": "namespace",
"thresholds": [ ],
"type": "number",
@ -1361,7 +1361,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"pattern": "namespace",
"thresholds": [ ],
"type": "number",
@ -2445,7 +2445,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"pattern": "namespace",
"thresholds": [ ],
"type": "number",

View File

@ -577,7 +577,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [ ],
"type": "number",
@ -984,7 +984,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [ ],
"type": "number",
@ -1262,7 +1262,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [ ],
"type": "number",
@ -2182,7 +2182,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [ ],
"type": "number",

View File

@ -222,7 +222,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [ ],
"type": "number",
@ -545,7 +545,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [ ],
"type": "number",
@ -796,7 +796,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [ ],
"type": "number",

View File

@ -276,7 +276,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2",
"linkUrl": "/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2",
"pattern": "workload",
"thresholds": [ ],
"type": "number",
@ -677,7 +677,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2",
"linkUrl": "/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2",
"pattern": "workload",
"thresholds": [ ],
"type": "number",
@ -952,7 +952,7 @@
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$type",
"linkUrl": "/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$type",
"pattern": "workload",
"thresholds": [ ],
"type": "number",

View File

@ -43,7 +43,8 @@
"fields": "",
"values": false
"textMode": "auto"
"pluginVersion": "7",
"targets": [
@ -91,7 +92,8 @@
"fields": "",
"values": false
"textMode": "auto"
"pluginVersion": "7",
"targets": [
@ -139,7 +141,8 @@
"fields": "",
"values": false
"textMode": "auto"
"pluginVersion": "7",
"targets": [
@ -187,7 +190,8 @@
"fields": "",
"values": false
"textMode": "auto"
"pluginVersion": "7",
"targets": [
@ -235,7 +239,8 @@
"fields": "",
"values": false
"textMode": "auto"
"pluginVersion": "7",
"targets": [
@ -283,7 +288,8 @@
"fields": "",
"values": false
"textMode": "auto"
"pluginVersion": "7",
"targets": [

View File

@ -1,685 +0,0 @@
- "name": "kube-apiserver-burnrate.rules"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d]))
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
"verb": "read"
"record": "apiserver_request:burnrate1d"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h]))
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
"verb": "read"
"record": "apiserver_request:burnrate1h"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h]))
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
"verb": "read"
"record": "apiserver_request:burnrate2h"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m]))
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
"verb": "read"
"record": "apiserver_request:burnrate30m"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d]))
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
"verb": "read"
"record": "apiserver_request:burnrate3d"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m]))
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
"verb": "read"
"record": "apiserver_request:burnrate5m"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h]))
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
"verb": "read"
"record": "apiserver_request:burnrate6h"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
"verb": "write"
"record": "apiserver_request:burnrate1d"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
"verb": "write"
"record": "apiserver_request:burnrate1h"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
"verb": "write"
"record": "apiserver_request:burnrate2h"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
"verb": "write"
"record": "apiserver_request:burnrate30m"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
"verb": "write"
"record": "apiserver_request:burnrate3d"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
"verb": "write"
"record": "apiserver_request:burnrate5m"
- "expr": |
# too slow
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
"verb": "write"
"record": "apiserver_request:burnrate6h"
- "name": "kube-apiserver-histogram.rules"
- "expr": |
histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET"}[5m]))) > 0
"quantile": "0.99"
"verb": "read"
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
"quantile": "0.99"
"verb": "write"
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
"quantile": "0.99"
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
"quantile": "0.9"
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
"quantile": "0.5"
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
- "interval": "3m"
"name": "kube-apiserver-availability.rules"
- "expr": |
avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
"record": "code_verb:apiserver_request_total:increase30d"
- "expr": |
sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
"verb": "read"
"record": "code:apiserver_request_total:increase30d"
- "expr": |
sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
"verb": "write"
"record": "code:apiserver_request_total:increase30d"
- "expr": |
sum by (cluster, verb, scope) (increase(apiserver_request_duration_seconds_count[1h]))
"record": "cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h"
- "expr": |
sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[30d]) * 24 * 30)
"record": "cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d"
- "expr": |
sum by (cluster, verb, scope, le) (increase(apiserver_request_duration_seconds_bucket[1h]))
"record": "cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h"
- "expr": |
sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
"record": "cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d"
- "expr": |
1 - (
# write too slow
sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
) +
# read too slow
sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"})
sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="40"})
) +
# errors
sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
sum by (cluster) (code:apiserver_request_total:increase30d)
"verb": "all"
"record": "apiserver_request:availability30d"
- "expr": |
1 - (
sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"})
# too slow
sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="40"})
# errors
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
"verb": "read"
"record": "apiserver_request:availability30d"
- "expr": |
1 - (
# too slow
sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
# errors
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
"verb": "write"
"record": "apiserver_request:availability30d"
- "expr": |
sum by (cluster,code,resource) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
"verb": "read"
"record": "code_resource:apiserver_request_total:rate5m"
- "expr": |
sum by (cluster,code,resource) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
"verb": "write"
"record": "code_resource:apiserver_request_total:rate5m"
- "expr": |
sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
"record": "code_verb:apiserver_request_total:increase1h"
- "expr": |
sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
"record": "code_verb:apiserver_request_total:increase1h"
- "expr": |
sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
"record": "code_verb:apiserver_request_total:increase1h"
- "expr": |
sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
"record": "code_verb:apiserver_request_total:increase1h"
- "name": "k8s.rules"
- "expr": |
sum by (cluster, namespace, pod, container) (
irate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m])
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
"record": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate"
- "expr": |
container_memory_working_set_bytes{job="cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info{node!=""})
"record": "node_namespace_pod_container:container_memory_working_set_bytes"
- "expr": |
container_memory_rss{job="cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info{node!=""})
"record": "node_namespace_pod_container:container_memory_rss"
- "expr": |
container_memory_cache{job="cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info{node!=""})
"record": "node_namespace_pod_container:container_memory_cache"
- "expr": |
container_memory_swap{job="cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info{node!=""})
"record": "node_namespace_pod_container:container_memory_swap"
- "expr": |
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
"record": "cluster:namespace:pod_memory:active:kube_pod_container_resource_requests"
- "expr": |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
"record": "namespace_memory:kube_pod_container_resource_requests:sum"
- "expr": |
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
"record": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests"
- "expr": |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
"record": "namespace_cpu:kube_pod_container_resource_requests:sum"
- "expr": |
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
"record": "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits"
- "expr": |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
"record": "namespace_memory:kube_pod_container_resource_limits:sum"
- "expr": |
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
"record": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits"
- "expr": |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
"record": "namespace_cpu:kube_pod_container_resource_limits:sum"
- "expr": |
max by (cluster, namespace, workload, pod) (
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
1, max by (replicaset, namespace, owner_name) (
"workload", "$1", "owner_name", "(.*)"
"workload_type": "deployment"
"record": "namespace_workload_pod:kube_pod_owner:relabel"
- "expr": |
max by (cluster, namespace, workload, pod) (
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
"workload_type": "daemonset"
"record": "namespace_workload_pod:kube_pod_owner:relabel"
- "expr": |
max by (cluster, namespace, workload, pod) (
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
"workload_type": "statefulset"
"record": "namespace_workload_pod:kube_pod_owner:relabel"
- "name": "kube-scheduler.rules"
- "expr": |
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
"quantile": "0.99"
"record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
"quantile": "0.99"
"record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
"quantile": "0.99"
"record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
"quantile": "0.9"
"record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
"quantile": "0.9"
"record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
"quantile": "0.9"
"record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
"quantile": "0.5"
"record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
"quantile": "0.5"
"record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
"quantile": "0.5"
"record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile"
- "name": "node.rules"
- "expr": |
topk by(namespace, pod) (1,
max by (node, namespace, pod) (
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
"record": "node_namespace_pod:kube_pod_info:"
- "expr": |
count by (cluster, node) (sum by (node, cpu) (
* on (namespace, pod) group_left(node)
topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
"record": "node:node_num_cpu:sum"
- "expr": |
node_memory_MemAvailable_bytes{job="node-exporter"} or
node_memory_Buffers_bytes{job="node-exporter"} +
node_memory_Cached_bytes{job="node-exporter"} +
node_memory_MemFree_bytes{job="node-exporter"} +
) by (cluster)
"record": ":node_memory_MemAvailable_bytes:sum"
- "name": "kubelet.rules"
- "expr": |
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"})
"quantile": "0.99"
"record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"})
"quantile": "0.9"
"record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile"
- "expr": |
histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"})
"quantile": "0.5"
"record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile"

View File

@ -4,8 +4,10 @@ local kubernetes = import "kubernetes-mixin/mixin.libsonnet";
kubernetes {
_config+:: {
kubeApiserverSelector: 'job="apiserver"',
grafanaK8s+:: {
dashboardNamePrefix: '',
linkPrefix: '',
dashboardNamePrefix: '',
dashboardTags: ['kubernetes'],

View File

@ -54,7 +54,7 @@ metadata:
namespace: {{ .Release.Namespace }}
grafana_dashboard: "1"
{{ include "kubezero-lib.labels" . | indent 4 }}
{{- include "kubezero-lib.labels" . | nindent 4 }}
''' % config
# Put all dashboards into a folder ?

View File

@ -5,7 +5,7 @@ metadata:
namespace: {{ .Release.Namespace }}
grafana_dashboard: "1"
{{ include "kubezero-lib.labels" . | indent 4 }}
{{- include "kubezero-lib.labels" . | nindent 4 }}
k8s-sidecar-target-directory: Kubernetes
@ -22,19 +22,19 @@ binaryData:

View File

@ -5,7 +5,7 @@ metadata:
namespace: {{ .Release.Namespace }}
grafana_dashboard: "1"
{{ include "kubezero-lib.labels" . | indent 4 }}
{{- include "kubezero-lib.labels" . | nindent 4 }}
k8s-sidecar-target-directory: Metrics

View File

@ -5,7 +5,7 @@ metadata:
namespace: {{ .Release.Namespace }}
grafana_dashboard: "1"
{{ include "kubezero-lib.labels" . | indent 4 }}
{{- include "kubezero-lib.labels" . | nindent 4 }}

View File

@ -19,10 +19,10 @@ patch -p0 -i zdt-pushgateway.patch --no-backup-if-mismatch
cd dashboards
# Patch for the apiserver dashboard
patch -p1 -i ../zdt-apiserver-dashboard.patch --no-backup-if-mismatch
../ metrics-dashboards.yaml ../templates/grafana-dashboards-metrics.yaml
../ k8s-dashboards.yaml ../templates/grafana-dashboards-k8s.yaml
../ zdt-dashboards.yaml ../templates/grafana-dashboards-zdt.yaml
cd -
# Delete not used upstream dashboards
rm -rf charts/kube-prometheus-stack/templates/grafana/dashboards-1.14

View File

@ -1,67 +0,0 @@
diff --git a/kube-mixin/apiserver.json b/kube-mixin/apiserver.json
index 9830c36..1c940dc 100644
--- a/kube-mixin/apiserver.json
+++ b/kube-mixin/apiserver.json
@@ -968,7 +968,7 @@
"steppedLine": false,
"targets": [
- "expr": "sum(rate(workqueue_adds_total{job=\"kube-apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)",
+ "expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{name}}",
@@ -1049,7 +1049,7 @@
"steppedLine": false,
"targets": [
- "expr": "sum(rate(workqueue_depth{job=\"kube-apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)",
+ "expr": "sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{name}}",
@@ -1130,7 +1130,7 @@
"steppedLine": false,
"targets": [
- "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"kube-apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{name}}",
@@ -1224,7 +1224,7 @@
"steppedLine": false,
"targets": [
- "expr": "process_resident_memory_bytes{job=\"kube-apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}",
+ "expr": "process_resident_memory_bytes{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@@ -1305,7 +1305,7 @@
"steppedLine": false,
"targets": [
- "expr": "rate(process_cpu_seconds_total{job=\"kube-apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])",
+ "expr": "rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@@ -1386,7 +1386,7 @@
"steppedLine": false,
"targets": [
- "expr": "go_goroutines{job=\"kube-apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}",
+ "expr": "go_goroutines{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@@ -1490,7 +1490,7 @@
"multi": false,
"name": "instance",
"options": [ ],
- "query": "label_values(apiserver_request_total{job=\"kube-apiserver\", cluster=\"$cluster\"}, instance)",
+ "query": "label_values(apiserver_request_total{job=\"apiserver\", cluster=\"$cluster\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 1,