feat: another round of upgrade to 1.23.11

This commit is contained in:
Stefan Reimer 2022-09-16 11:21:38 +02:00
parent 1f7971220e
commit c2b929f952
97 changed files with 2458 additions and 862 deletions

View File

@ -133,6 +133,9 @@ control_plane_upgrade kubeadm_upgrade
kubectl delete ds kube-multus-ds -n kube-system
# Required due to chart upgrade to 4.X part of prometheus-stack 40.X
kubectl delete daemonset metrics-prometheus-node-exporter -n monitoring
control_plane_upgrade "apply_network, apply_addons"
kubectl rollout restart daemonset/calico-node -n kube-system

View File

@ -39,4 +39,4 @@ Kubernetes: `>= 1.18.0`
| service.port | int | `3310` | The port to be used by the clamav service |
----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.9.1](https://github.com/norwoodj/helm-docs/releases/v1.9.1)
Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubeadm
description: KubeZero Kubeadm cluster config
type: application
version: 1.23.10
version: 1.23.11
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:

View File

@ -3,7 +3,7 @@ name: kubezero-addons
description: KubeZero umbrella chart for various optional cluster addons
type: application
version: 0.6.2
appVersion: v1.23.10
appVersion: v1.23.11
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero-metrics
description: KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
type: application
version: 0.8.1
version: 0.8.4
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:
@ -18,7 +18,7 @@ dependencies:
version: ">= 0.1.5"
repository: https://cdn.zero-downtime.net/charts/
- name: kube-prometheus-stack
version: 39.9.0
version: 40.0.0
# Switch back to upstream once all alerts are fixed eg. etcd gpcr
# repository: https://prometheus-community.github.io/helm-charts
- name: prometheus-adapter

View File

@ -6,20 +6,20 @@ annotations:
url: https://github.com/prometheus-operator/kube-prometheus
artifacthub.io/operator: "true"
apiVersion: v2
appVersion: 0.58.0
appVersion: 0.59.1
dependencies:
- condition: kubeStateMetrics.enabled
name: kube-state-metrics
repository: https://prometheus-community.github.io/helm-charts
version: 4.15.*
version: 4.18.*
- condition: nodeExporter.enabled
name: prometheus-node-exporter
repository: https://prometheus-community.github.io/helm-charts
version: 3.3.*
version: 4.2.*
- condition: grafana.enabled
name: grafana
repository: https://grafana.github.io/helm-charts
version: 6.32.*
version: 6.38.*
description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide easy to
operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus
@ -51,4 +51,4 @@ sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
type: application
version: 39.9.0
version: 40.0.0

View File

@ -80,6 +80,33 @@ _See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade/) for command documen
A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an incompatible breaking change needing manual actions.
### From 39.x to 40.x
This version upgrades Prometheus-Operator to v0.59.1, Prometheus to v2.38.0, kube-state-metrics to v2.6.0 and Thanos to v0.28.0.
This version also upgrades the Helm charts of kube-state-metrics to 4.18.0 and prometheus-node-exporter to 4.2.0.
Run these commands to update the CRDs before applying the upgrade.
```console
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
```
Starting from prometheus-node-exporter version 4.0.0, the `node exporter` chart is using the [Kubernetes recommended labels](https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/). Therefore you have to delete the daemonset before you upgrade.
```console
kubectl delete daemonset -l app=prometheus-node-exporter
helm upgrade -i kube-prometheus-stack prometheus-community/kube-prometheus-stack
```
If you use your own custom [ServiceMonitor](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#servicemonitor) or [PodMonitor](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#podmonitor), please ensure to upgrade their `selector` fields accordingly to the new labels.
### From 38.x to 39.x
This upgraded prometheus-operator to v0.58.0 and prometheus to v2.37.0

View File

@ -1,5 +1,5 @@
apiVersion: v2
appVersion: 9.0.5
appVersion: 9.1.4
description: The leading tool for querying and visualizing time series and metrics.
home: https://grafana.net
icon: https://raw.githubusercontent.com/grafana/grafana/master/public/img/logo_transparent_400x.png
@ -19,4 +19,4 @@ name: grafana
sources:
- https://github.com/grafana/grafana
type: application
version: 6.32.10
version: 6.38.0

View File

@ -67,6 +67,7 @@ This version requires Helm >= 3.1.0.
| `service.type` | Kubernetes service type | `ClusterIP` |
| `service.port` | Kubernetes port where service is exposed | `80` |
| `service.portName` | Name of the port on the service | `service` |
| `service.appProtocol` | Adds the appProtocol field to the service | `` |
| `service.targetPort` | Internal service is port | `3000` |
| `service.nodePort` | Kubernetes service nodePort | `nil` |
| `service.annotations` | Service annotations (can be templated) | `{}` |
@ -127,6 +128,7 @@ This version requires Helm >= 3.1.0.
| `extraEmptyDirMounts` | Additional grafana server emptyDir volume mounts | `[]` |
| `plugins` | Plugins to be loaded along with Grafana | `[]` |
| `datasources` | Configure grafana datasources (passed through tpl) | `{}` |
| `alerting` | Configure grafana alerting (passed through tpl) | `{}` |
| `notifiers` | Configure grafana notifiers | `{}` |
| `dashboardProviders` | Configure grafana dashboard providers | `{}` |
| `dashboards` | Dashboards to import | `{}` |
@ -233,8 +235,9 @@ This version requires Helm >= 3.1.0.
| `imageRenderer.priorityClassName` | image-renderer deployment priority class | `''` |
| `imageRenderer.service.enabled` | Enable the image-renderer service | `true` |
| `imageRenderer.service.portName` | image-renderer service port name | `http` |
| `imageRenderer.service.port` | image-renderer service port used by both service and deployment | `8081` |
| `imageRenderer.grafanaProtocol` | Protocol to use for image renderer callback url | `http` |
| `imageRenderer.service.port` | image-renderer port used by deployment | `8081` |
| `imageRenderer.service.targetPort` | image-renderer service port used by service | `8081` |
| `imageRenderer.appProtocol` | Adds the appProtocol field to the service | `` |
| `imageRenderer.grafanaSubPath` | Grafana sub path to use for image renderer callback url | `''` |
| `imageRenderer.podPortName` | name of the image-renderer port on the pod | `http` |
| `imageRenderer.revisionHistoryLimit` | number of image-renderer replica sets to keep | `10` |
@ -273,7 +276,7 @@ ingress:
### Example of extraVolumeMounts
Volume can be type persistentVolumeClaim or hostPath but not both at same time.
If none existingClaim or hostPath argument is givent then type is emptyDir.
If neither existingClaim or hostPath argument is given then type is emptyDir.
```yaml
- extraVolumeMounts:
@ -482,7 +485,7 @@ grafana.ini:
## How to securely reference secrets in grafana.ini
This example uses Grafana uses [file providers](https://grafana.com/docs/grafana/latest/administration/configuration/#file-provider) for secret values and the `extraSecretMounts` configuration flag (Additional grafana server secret mounts) to mount the secrets.
This example uses Grafana [file providers](https://grafana.com/docs/grafana/latest/administration/configuration/#file-provider) for secret values and the `extraSecretMounts` configuration flag (Additional grafana server secret mounts) to mount the secrets.
In grafana.ini:

View File

@ -141,22 +141,11 @@ Return the appropriate apiVersion for ingress.
{{- end -}}
{{- end -}}
{{/*
Return the appropriate apiVersion for podSecurityPolicy.
*/}}
{{- define "grafana.podSecurityPolicy.apiVersion" -}}
{{- if and (.Capabilities.APIVersions.Has "policy/v1beta1") (semverCompare ">= 1.16-0" .Capabilities.KubeVersion.Version) -}}
{{- print "policy/v1beta1" -}}
{{- else -}}
{{- print "extensions/v1beta1" -}}
{{- end -}}
{{- end -}}
{{/*
Return the appropriate apiVersion for podDisruptionBudget.
*/}}
{{- define "grafana.podDisruptionBudget.apiVersion" -}}
{{- if and (.Capabilities.APIVersions.Has "policy/v1") (semverCompare ">= 1.21-0" .Capabilities.KubeVersion.Version) -}}
{{- if $.Capabilities.APIVersions.Has "policy/v1/PodDisruptionBudget" -}}
{{- print "policy/v1" -}}
{{- else -}}
{{- print "policy/v1beta1" -}}

View File

@ -1,17 +1,16 @@
{{- define "grafana.pod" -}}
{{- if .Values.schedulerName }}
schedulerName: "{{ .Values.schedulerName }}"
{{- end }}
serviceAccountName: {{ template "grafana.serviceAccountName" . }}
automountServiceAccountToken: {{ .Values.serviceAccount.autoMount }}
{{- if .Values.securityContext }}
{{- with .Values.securityContext }}
securityContext:
{{ toYaml .Values.securityContext | indent 2 }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- if .Values.hostAliases }}
{{- with .Values.hostAliases }}
hostAliases:
{{ toYaml .Values.hostAliases | indent 2 }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- if .Values.priorityClassName }}
priorityClassName: {{ .Values.priorityClassName }}
@ -31,8 +30,10 @@ initContainers:
runAsNonRoot: false
runAsUser: 0
command: ["chown", "-R", "{{ .Values.securityContext.runAsUser }}:{{ .Values.securityContext.runAsGroup }}", "/var/lib/grafana"]
{{- with .Values.initChownData.resources }}
resources:
{{ toYaml .Values.initChownData.resources | indent 6 }}
{{- toYaml . | nindent 6 }}
{{- end }}
volumeMounts:
- name: storage
mountPath: "/var/lib/grafana"
@ -50,13 +51,19 @@ initContainers:
imagePullPolicy: {{ .Values.downloadDashboardsImage.pullPolicy }}
command: ["/bin/sh"]
args: [ "-c", "mkdir -p /var/lib/grafana/dashboards/default && /bin/sh -x /etc/grafana/download_dashboards.sh" ]
{{- with .Values.downloadDashboards.resources }}
resources:
{{ toYaml .Values.downloadDashboards.resources | indent 6 }}
{{- toYaml . | nindent 6 }}
{{- end }}
env:
{{- range $key, $value := .Values.downloadDashboards.env }}
- name: "{{ $key }}"
value: "{{ $value }}"
{{- end }}
{{- with .Values.downloadDashboards.securityContext }}
securityContext:
{{- toYaml . | nindent 6 }}
{{- end }}
{{- if .Values.downloadDashboards.envFromSecret }}
envFrom:
- secretRef:
@ -86,6 +93,14 @@ initContainers:
{{- end }}
imagePullPolicy: {{ .Values.sidecar.imagePullPolicy }}
env:
{{- range $key, $value := .Values.sidecar.datasources.env }}
- name: "{{ $key }}"
value: "{{ $value }}"
{{- end }}
{{- if .Values.sidecar.datasources.ignoreAlreadyProcessed }}
- name: IGNORE_ALREADY_PROCESSED
value: "true"
{{- end }}
- name: METHOD
value: "LIST"
- name: LABEL
@ -94,6 +109,10 @@ initContainers:
- name: LABEL_VALUE
value: {{ quote .Values.sidecar.datasources.labelValue }}
{{- end }}
{{- if or .Values.sidecar.logLevel .Values.sidecar.datasources.logLevel }}
- name: LOG_LEVEL
value: {{ default .Values.sidecar.logLevel .Values.sidecar.datasources.logLevel }}
{{- end }}
- name: FOLDER
value: "/etc/grafana/provisioning/datasources"
- name: RESOURCE
@ -110,12 +129,14 @@ initContainers:
- name: SKIP_TLS_VERIFY
value: "{{ .Values.sidecar.skipTlsVerify }}"
{{- end }}
{{- with .Values.sidecar.resources }}
resources:
{{ toYaml .Values.sidecar.resources | indent 6 }}
{{- if .Values.sidecar.securityContext }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.securityContext }}
securityContext:
{{- toYaml .Values.sidecar.securityContext | nindent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
volumeMounts:
- name: sc-datasources-volume
mountPath: "/etc/grafana/provisioning/datasources"
@ -129,10 +150,26 @@ initContainers:
{{- end }}
imagePullPolicy: {{ .Values.sidecar.imagePullPolicy }}
env:
{{- range $key, $value := .Values.sidecar.notifiers.env }}
- name: "{{ $key }}"
value: "{{ $value }}"
{{- end }}
{{- if .Values.sidecar.notifiers.ignoreAlreadyProcessed }}
- name: IGNORE_ALREADY_PROCESSED
value: "true"
{{- end }}
- name: METHOD
value: LIST
- name: LABEL
value: "{{ .Values.sidecar.notifiers.label }}"
{{- if .Values.sidecar.notifiers.labelValue }}
- name: LABEL_VALUE
value: {{ quote .Values.sidecar.notifiers.labelValue }}
{{- end }}
{{- if or .Values.sidecar.logLevel .Values.sidecar.notifiers.logLevel }}
- name: LOG_LEVEL
value: {{ default .Values.sidecar.logLevel .Values.sidecar.notifiers.logLevel }}
{{- end }}
- name: FOLDER
value: "/etc/grafana/provisioning/notifiers"
- name: RESOURCE
@ -149,20 +186,22 @@ initContainers:
- name: SKIP_TLS_VERIFY
value: "{{ .Values.sidecar.skipTlsVerify }}"
{{- end }}
{{- if .Values.sidecar.livenessProbe }}
{{- with .Values.sidecar.livenessProbe }}
livenessProbe:
{{ toYaml .Values.livenessProbe | indent 6 }}
{{- end }}
{{- if .Values.sidecar.readinessProbe }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.readinessProbe }}
readinessProbe:
{{ toYaml .Values.readinessProbe | indent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.resources }}
resources:
{{ toYaml .Values.sidecar.resources | indent 6 }}
{{- if .Values.sidecar.securityContext }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.securityContext }}
securityContext:
{{- toYaml .Values.sidecar.securityContext | nindent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
volumeMounts:
- name: sc-notifiers-volume
mountPath: "/etc/grafana/provisioning/notifiers"
@ -190,6 +229,14 @@ containers:
{{- end }}
imagePullPolicy: {{ .Values.sidecar.imagePullPolicy }}
env:
{{- range $key, $value := .Values.sidecar.dashboards.env }}
- name: "{{ $key }}"
value: "{{ $value }}"
{{- end }}
{{- if .Values.sidecar.dashboards.ignoreAlreadyProcessed }}
- name: IGNORE_ALREADY_PROCESSED
value: "true"
{{- end }}
- name: METHOD
value: {{ .Values.sidecar.dashboards.watchMethod }}
- name: LABEL
@ -198,9 +245,9 @@ containers:
- name: LABEL_VALUE
value: {{ quote .Values.sidecar.dashboards.labelValue }}
{{- end }}
{{- if .Values.sidecar.logLevel }}
{{- if or .Values.sidecar.logLevel .Values.sidecar.dashboards.logLevel }}
- name: LOG_LEVEL
value: {{ quote .Values.sidecar.logLevel }}
value: {{ default .Values.sidecar.logLevel .Values.sidecar.dashboards.logLevel }}
{{- end }}
- name: FOLDER
value: "{{ .Values.sidecar.dashboards.folder }}{{- with .Values.sidecar.dashboards.defaultFolderName }}/{{ . }}{{- end }}"
@ -227,27 +274,35 @@ containers:
value: "{{ .Values.sidecar.dashboards.script }}"
{{- end }}
{{- if .Values.sidecar.dashboards.watchServerTimeout }}
{{- if ne .Values.sidecar.dashboards.watchMethod "WATCH" }}
{{- fail (printf "Cannot use .Values.sidecar.dashboards.watchServerTimeout with .Values.sidecar.dashboards.watchMethod %s" .Values.sidecar.dashboards.watchMethod) }}
{{- end }}
- name: WATCH_SERVER_TIMEOUT
value: "{{ .Values.sidecar.dashboards.watchServerTimeout }}"
{{- end }}
{{- if .Values.sidecar.dashboards.watchClientTimeout }}
{{- if ne .Values.sidecar.dashboards.watchMethod "WATCH" }}
{{- fail (printf "Cannot use .Values.sidecar.dashboards.watchClientTimeout with .Values.sidecar.dashboards.watchMethod %s" .Values.sidecar.dashboards.watchMethod) }}
{{- end }}
- name: WATCH_CLIENT_TIMEOUT
value: "{{ .Values.sidecar.dashboards.watchClientTimeout }}"
{{- end }}
{{- if .Values.sidecar.livenessProbe }}
{{- with .Values.sidecar.livenessProbe }}
livenessProbe:
{{ toYaml .Values.livenessProbe | indent 6 }}
{{- end }}
{{- if .Values.sidecar.readinessProbe }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.readinessProbe }}
readinessProbe:
{{ toYaml .Values.readinessProbe | indent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.resources }}
resources:
{{ toYaml .Values.sidecar.resources | indent 6 }}
{{- if .Values.sidecar.securityContext }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.securityContext }}
securityContext:
{{- toYaml .Values.sidecar.securityContext | nindent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
volumeMounts:
- name: sc-dashboard-volume
mountPath: {{ .Values.sidecar.dashboards.folder | quote }}
@ -264,6 +319,14 @@ containers:
{{- end }}
imagePullPolicy: {{ .Values.sidecar.imagePullPolicy }}
env:
{{- range $key, $value := .Values.sidecar.datasources.env }}
- name: "{{ $key }}"
value: "{{ $value }}"
{{- end }}
{{- if .Values.sidecar.datasources.ignoreAlreadyProcessed }}
- name: IGNORE_ALREADY_PROCESSED
value: "true"
{{- end }}
- name: METHOD
value: {{ .Values.sidecar.datasources.watchMethod }}
- name: LABEL
@ -272,6 +335,10 @@ containers:
- name: LABEL_VALUE
value: {{ quote .Values.sidecar.datasources.labelValue }}
{{- end }}
{{- if or .Values.sidecar.logLevel .Values.sidecar.datasources.logLevel }}
- name: LOG_LEVEL
value: {{ default .Values.sidecar.logLevel .Values.sidecar.datasources.logLevel }}
{{- end }}
- name: FOLDER
value: "/etc/grafana/provisioning/datasources"
- name: RESOURCE
@ -288,6 +355,10 @@ containers:
- name: SKIP_TLS_VERIFY
value: "{{ .Values.sidecar.skipTlsVerify }}"
{{- end }}
{{- if .Values.sidecar.datasources.script }}
- name: SCRIPT
value: "{{ .Values.sidecar.datasources.script }}"
{{- end }}
{{- if and (not .Values.env.GF_SECURITY_ADMIN_USER) (not .Values.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION) }}
- name: REQ_USERNAME
valueFrom:
@ -308,20 +379,36 @@ containers:
- name: REQ_METHOD
value: POST
{{- end }}
{{- if .Values.sidecar.livenessProbe }}
{{- if .Values.sidecar.datasources.watchServerTimeout }}
{{- if ne .Values.sidecar.datasources.watchMethod "WATCH" }}
{{- fail (printf "Cannot use .Values.sidecar.datasources.watchServerTimeout with .Values.sidecar.datasources.watchMethod %s" .Values.sidecar.datasources.watchMethod) }}
{{- end }}
- name: WATCH_SERVER_TIMEOUT
value: "{{ .Values.sidecar.datasources.watchServerTimeout }}"
{{- end }}
{{- if .Values.sidecar.datasources.watchClientTimeout }}
{{- if ne .Values.sidecar.datasources.watchMethod "WATCH" }}
{{- fail (printf "Cannot use .Values.sidecar.datasources.watchClientTimeout with .Values.sidecar.datasources.watchMethod %s" .Values.sidecar.datasources.watchMethod) }}
{{- end }}
- name: WATCH_CLIENT_TIMEOUT
value: "{{ .Values.sidecar.datasources.watchClientTimeout }}"
{{- end }}
{{- with .Values.sidecar.livenessProbe }}
livenessProbe:
{{ toYaml .Values.livenessProbe | indent 6 }}
{{- end }}
{{- if .Values.sidecar.readinessProbe }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.readinessProbe }}
readinessProbe:
{{ toYaml .Values.readinessProbe | indent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.resources }}
resources:
{{ toYaml .Values.sidecar.resources | indent 6 }}
{{- if .Values.sidecar.securityContext }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.securityContext }}
securityContext:
{{- toYaml .Values.sidecar.securityContext | nindent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
volumeMounts:
- name: sc-datasources-volume
mountPath: "/etc/grafana/provisioning/datasources"
@ -335,6 +422,14 @@ containers:
{{- end }}
imagePullPolicy: {{ .Values.sidecar.imagePullPolicy }}
env:
{{- range $key, $value := .Values.sidecar.plugins.env }}
- name: "{{ $key }}"
value: "{{ $value }}"
{{- end }}
{{- if .Values.sidecar.plugins.ignoreAlreadyProcessed }}
- name: IGNORE_ALREADY_PROCESSED
value: "true"
{{- end }}
- name: METHOD
value: {{ .Values.sidecar.plugins.watchMethod }}
- name: LABEL
@ -343,6 +438,10 @@ containers:
- name: LABEL_VALUE
value: {{ quote .Values.sidecar.plugins.labelValue }}
{{- end }}
{{- if or .Values.sidecar.logLevel .Values.sidecar.plugins.logLevel }}
- name: LOG_LEVEL
value: {{ default .Values.sidecar.logLevel .Values.sidecar.plugins.logLevel }}
{{- end }}
- name: FOLDER
value: "/etc/grafana/provisioning/plugins"
- name: RESOURCE
@ -355,6 +454,10 @@ containers:
- name: NAMESPACE
value: "{{ .Values.sidecar.plugins.searchNamespace | join "," }}"
{{- end }}
{{- if .Values.sidecar.plugins.script }}
- name: SCRIPT
value: "{{ .Values.sidecar.plugins.script }}"
{{- end }}
{{- if .Values.sidecar.skipTlsVerify }}
- name: SKIP_TLS_VERIFY
value: "{{ .Values.sidecar.skipTlsVerify }}"
@ -379,20 +482,36 @@ containers:
- name: REQ_METHOD
value: POST
{{- end }}
{{- if .Values.sidecar.livenessProbe }}
{{- if .Values.sidecar.plugins.watchServerTimeout }}
{{- if ne .Values.sidecar.plugins.watchMethod "WATCH" }}
{{- fail (printf "Cannot use .Values.sidecar.plugins.watchServerTimeout with .Values.sidecar.plugins.watchMethod %s" .Values.sidecar.plugins.watchMethod) }}
{{- end }}
- name: WATCH_SERVER_TIMEOUT
value: "{{ .Values.sidecar.plugins.watchServerTimeout }}"
{{- end }}
{{- if .Values.sidecar.plugins.watchClientTimeout }}
{{- if ne .Values.sidecar.plugins.watchMethod "WATCH" }}
{{- fail (printf "Cannot use .Values.sidecar.plugins.watchClientTimeout with .Values.sidecar.plugins.watchMethod %s" .Values.sidecar.plugins.watchMethod) }}
{{- end }}
- name: WATCH_CLIENT_TIMEOUT
value: "{{ .Values.sidecar.plugins.watchClientTimeout }}"
{{- end }}
{{- with .Values.sidecar.livenessProbe }}
livenessProbe:
{{ toYaml .Values.livenessProbe | indent 6 }}
{{- end }}
{{- if .Values.sidecar.readinessProbe }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.readinessProbe }}
readinessProbe:
{{ toYaml .Values.readinessProbe | indent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.resources }}
resources:
{{ toYaml .Values.sidecar.resources | indent 6 }}
{{- if .Values.sidecar.securityContext }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.sidecar.securityContext }}
securityContext:
{{- toYaml .Values.sidecar.securityContext | nindent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
volumeMounts:
- name: sc-plugins-volume
mountPath: "/etc/grafana/provisioning/plugins"
@ -410,10 +529,10 @@ containers:
- {{ . }}
{{- end }}
{{- end}}
{{- if .Values.containerSecurityContext }}
{{- with .Values.containerSecurityContext }}
securityContext:
{{- toYaml .Values.containerSecurityContext | nindent 6 }}
{{- end }}
{{- toYaml . | nindent 6 }}
{{- end }}
volumeMounts:
- name: config
mountPath: "/etc/grafana/grafana.ini"
@ -466,6 +585,13 @@ containers:
subPath: {{ . | quote }}
{{- end }}
{{- end }}
{{- if .Values.alerting }}
{{- range (keys .Values.alerting | sortAlpha) }}
- name: config
mountPath: "/etc/grafana/provisioning/alerting/{{ . }}"
subPath: {{ . | quote }}
{{- end }}
{{- end }}
{{- if .Values.dashboardProviders }}
{{- range (keys .Values.dashboardProviders | sortAlpha) }}
- name: config
@ -511,11 +637,8 @@ containers:
mountPath: {{ .mountPath }}
{{- end }}
ports:
- name: {{ .Values.service.portName }}
containerPort: {{ .Values.service.port }}
protocol: TCP
- name: {{ .Values.podPortName }}
containerPort: 3000
containerPort: {{ .Values.service.targetPort }}
protocol: TCP
env:
{{- if and (not .Values.env.GF_SECURITY_ADMIN_USER) (not .Values.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION) }}
@ -595,30 +718,40 @@ containers:
optional: {{ .optional | default false }}
{{- end }}
{{- end }}
{{- with .Values.livenessProbe }}
livenessProbe:
{{ toYaml .Values.livenessProbe | indent 6 }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.readinessProbe }}
readinessProbe:
{{ toYaml .Values.readinessProbe | indent 6 }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- if .Values.lifecycleHooks }}
lifecycle: {{ tpl (.Values.lifecycleHooks | toYaml) . | nindent 6 }}
{{- end }}
{{- with .Values.resources }}
resources:
{{ toYaml .Values.resources | indent 6 }}
{{- toYaml . | nindent 6 }}
{{- end }}
{{- with .Values.extraContainers }}
{{ tpl . $ | indent 2 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{ toYaml . | indent 2 }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- $root := . }}
{{- with .Values.affinity }}
affinity:
{{ tpl (toYaml .) $root | indent 2 }}
{{- end }}
{{- with .Values.topologySpreadConstraints }}
topologySpreadConstraints:
{{- toYaml . | nindent 2 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{ toYaml . | indent 2 }}
{{- toYaml . | nindent 2 }}
{{- end }}
volumes:
- name: config
@ -744,6 +877,10 @@ volumes:
{{- else if .hostPath }}
hostPath:
path: {{ .hostPath }}
{{- else if .csi }}
csi:
data:
{{ toYaml .data | nindent 6 }}
{{- else }}
emptyDir: {}
{{- end }}
@ -753,6 +890,6 @@ volumes:
emptyDir: {}
{{- end -}}
{{- if .Values.extraContainerVolumes }}
{{ toYaml .Values.extraContainerVolumes | indent 2 }}
{{ tpl (toYaml .Values.extraContainerVolumes) . | indent 2 }}
{{- end }}
{{- end }}

View File

@ -56,6 +56,14 @@ data:
{{- end -}}
{{- end -}}
{{- if .Values.alerting }}
{{ $root := . }}
{{- range $key, $value := .Values.alerting }}
{{ $key }}: |
{{ tpl $value $root | indent 4 }}
{{- end -}}
{{- end -}}
{{- if .Values.dashboardProviders }}
{{- range $key, $value := .Values.dashboardProviders }}
{{ $key }}: |
@ -86,6 +94,12 @@ data:
{{- if $value.token }}
-H "Authorization: token {{ $value.token }}" \
{{- end }}
{{- if $value.bearerToken }}
-H "Authorization: Bearer {{ $value.bearerToken }}" \
{{- end }}
{{- if $value.gitlabToken }}
-H "PRIVATE-TOKEN: {{ $value.gitlabToken }}" \
{{- end }}
-H "Content-Type: application/json;charset=UTF-8" \
{{ end }}
{{- $dpPath := "" -}}

View File

@ -46,5 +46,5 @@ spec:
{{ toYaml . | indent 8 }}
{{- end }}
spec:
{{- include "grafana.pod" . | nindent 6 }}
{{- include "grafana.pod" . | indent 6 }}
{{- end }}

View File

@ -18,5 +18,5 @@ spec:
ports:
- protocol: TCP
port: 3000
targetPort: 3000
targetPort: {{ .Values.service.targetPort }}
{{- end }}

View File

@ -77,7 +77,7 @@ spec:
{{- end}}
ports:
- name: {{ .Values.imageRenderer.service.portName }}
containerPort: {{ .Values.imageRenderer.service.port }}
containerPort: {{ .Values.imageRenderer.service.targetPort }}
protocol: TCP
livenessProbe:
httpGet:
@ -85,7 +85,7 @@ spec:
port: {{ .Values.imageRenderer.service.portName }}
env:
- name: HTTP_PORT
value: {{ .Values.imageRenderer.service.port | quote }}
value: {{ .Values.imageRenderer.service.targetPort | quote }}
{{- range $key, $value := .Values.imageRenderer.env }}
- name: {{ $key | quote }}
value: {{ $value | quote }}

View File

@ -19,7 +19,7 @@ spec:
- Ingress
ingress:
- ports:
- port: {{ .Values.imageRenderer.service.port }}
- port: {{ .Values.imageRenderer.service.targetPort }}
protocol: TCP
from:
- namespaceSelector:

View File

@ -24,6 +24,9 @@ spec:
port: {{ .Values.imageRenderer.service.port }}
protocol: TCP
targetPort: {{ .Values.imageRenderer.service.targetPort }}
{{- if .Values.imageRenderer.appProtocol }}
appProtocol: {{ .Values.imageRenderer.appProtocol }}
{{- end }}
selector:
{{- include "grafana.imageRenderer.selectorLabels" . | nindent 4 }}
{{ end }}

View File

@ -6,13 +6,13 @@ metadata:
namespace: {{ template "grafana.namespace" . }}
labels:
{{- include "grafana.labels" . | nindent 4 }}
{{- if .Values.labels }}
{{ toYaml .Values.labels | indent 4 }}
{{- end }}
{{- with .Values.annotations }}
{{- with .Values.labels }}
{{ toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.annotations }}
annotations:
{{ toYaml . | indent 4 }}
{{- end }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
policyTypes:
{{- if .Values.networkPolicy.ingress }}
@ -39,9 +39,9 @@ spec:
- podSelector:
matchLabels:
{{ template "grafana.fullname" . }}-client: "true"
{{- if .Values.networkPolicy.explicitNamespacesSelector }}
namespaceSelector:
{{ toYaml .Values.networkPolicy.explicitNamespacesSelector | indent 12 }}
{{- with .Values.networkPolicy.explicitNamespacesSelector }}
- namespaceSelector:
{{- toYaml . | nindent 12 }}
{{- end }}
- podSelector:
matchLabels:

View File

@ -1,5 +1,6 @@
{{- if .Values.rbac.pspEnabled }}
apiVersion: {{ include "grafana.podSecurityPolicy.apiVersion" . }}
{{- if .Capabilities.APIVersions.Has "policy/v1beta1/PodSecurityPolicy" }}
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
name: {{ template "grafana.fullname" . }}
@ -47,3 +48,4 @@ spec:
max: 65535
readOnlyRootFilesystem: false
{{- end }}
{{- end }}

View File

@ -9,9 +9,10 @@ metadata:
{{- if .Values.service.labels }}
{{ toYaml .Values.service.labels | indent 4 }}
{{- end }}
{{- $root := . }}
{{- with .Values.service.annotations }}
annotations:
{{ toYaml . | indent 4 }}
{{ tpl (toYaml . | indent 4) $root }}
{{- end }}
spec:
{{- if (or (eq .Values.service.type "ClusterIP") (empty .Values.service.type)) }}
@ -40,11 +41,14 @@ spec:
port: {{ .Values.service.port }}
protocol: TCP
targetPort: {{ .Values.service.targetPort }}
{{ if (and (eq .Values.service.type "NodePort") (not (empty .Values.service.nodePort))) }}
{{- if .Values.service.appProtocol }}
appProtocol: {{ .Values.service.appProtocol }}
{{- end }}
{{- if (and (eq .Values.service.type "NodePort") (not (empty .Values.service.nodePort))) }}
nodePort: {{.Values.service.nodePort}}
{{ end }}
{{ end }}
{{- if .Values.extraExposePorts }}
{{- tpl (toYaml .Values.extraExposePorts) . | indent 4 }}
{{- tpl (toYaml .Values.extraExposePorts) . | nindent 4 }}
{{- end }}
selector:
{{- include "grafana.selectorLabels" . | nindent 4 }}

View File

@ -5,7 +5,7 @@ kind: ServiceMonitor
metadata:
name: {{ template "grafana.fullname" . }}
{{- if .Values.serviceMonitor.namespace }}
namespace: {{ .Values.serviceMonitor.namespace }}
namespace: {{ tpl .Values.serviceMonitor.namespace . }}
{{- else }}
namespace: {{ template "grafana.namespace" . }}
{{- end }}

View File

@ -11,7 +11,7 @@ data:
@test "Test Health" {
url="http://{{ template "grafana.fullname" . }}/api/health"
code=$(wget --server-response --spider --timeout 10 --tries 1 ${url} 2>&1 | awk '/^ HTTP/{print $2}')
code=$(wget --server-response --spider --timeout 90 --tries 10 ${url} 2>&1 | awk '/^ HTTP/{print $2}')
[ "$code" == "200" ]
}
{{- end }}

View File

@ -1,4 +1,5 @@
{{- if and .Values.testFramework.enabled .Values.rbac.pspEnabled }}
{{- if .Capabilities.APIVersions.Has "policy/v1beta1/PodSecurityPolicy" }}
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
@ -27,3 +28,4 @@ spec:
- csi
- secret
{{- end }}
{{- end }}

View File

@ -136,6 +136,7 @@ downloadDashboards:
env: {}
envFromSecret: ""
resources: {}
securityContext: {}
## Pod Annotations
# podAnnotations: {}
@ -158,9 +159,12 @@ service:
port: 80
targetPort: 3000
# targetPort: 4181 To be used with a proxy extraContainer
## Service annotations. Can be templated.
annotations: {}
labels: {}
portName: service
# Adds the appProtocol field to the service. This allows to work with istio protocol selection. Ex: "http" or "tcp"
appProtocol: ""
serviceMonitor:
## If true, a ServiceMonitor CRD is created for a prometheus operator
@ -249,6 +253,11 @@ tolerations: []
##
affinity: {}
## Topology Spread Constraints
## ref: https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/
##
topologySpreadConstraints: []
## Additional init containers (evaluated as template)
## ref: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/
##
@ -468,6 +477,13 @@ extraVolumeMounts: []
# mountPath: /mnt/volume1
# readOnly: true
# hostPath: /usr/shared/
# - name: grafana-secrets
# csi: true
# data:
# driver: secrets-store.csi.k8s.io
# readOnly: true
# volumeAttributes:
# secretProviderClass: "grafana-env-spc"
## Container Lifecycle Hooks. Execute a specific bash command or make an HTTP request
lifecycleHooks: {}
@ -502,6 +518,71 @@ datasources: {}
# authType: default
# defaultRegion: us-east-1
## Configure grafana alerting (can be templated)
## ref: http://docs.grafana.org/administration/provisioning/#alerting
##
alerting: {}
# rules.yaml: |
# apiVersion: 1
# groups:
# - orgId: 1
# name: {{ .Chart.Name }}_my_rule_group
# folder: my_first_folder
# interval: 60s
# rules:
# - uid: my_id_1
# title: my_first_rule
# condition: A
# data:
# - refId: A
# datasourceUid: '-100'
# model:
# conditions:
# - evaluator:
# params:
# - 3
# type: gt
# operator:
# type: and
# query:
# params:
# - A
# reducer:
# type: last
# type: query
# datasource:
# type: __expr__
# uid: '-100'
# expression: 1==0
# intervalMs: 1000
# maxDataPoints: 43200
# refId: A
# type: math
# dashboardUid: my_dashboard
# panelId: 123
# noDataState: Alerting
# for: 60s
# annotations:
# some_key: some_value
# labels:
# team: sre_team_1
# contactpoints.yaml: |
# apiVersion: 1
# contactPoints:
# - orgId: 1
# name: cp_1
# receivers:
# - uid: first_uid
# type: pagerduty
# settings:
# integrationKey: XXX
# severity: critical
# class: ping failure
# component: Grafana
# group: app-stack
# summary: |
# {{ `{{ template "default.message" . }}` }}
## Configure notifiers
## ref: http://docs.grafana.org/administration/provisioning/#alert-notification-channels
##
@ -562,6 +643,12 @@ dashboards: {}
# url: https://example.com/repository/test-b64.json
# token: ''
# b64content: true
# local-dashboard-gitlab:
# url: https://example.com/repository/test-gitlab.json
# gitlabToken: ''
# local-dashboard-bitbucket:
# url: https://example.com/repository/test-bitbucket.json
# bearerToken: ''
## Reference to external ConfigMap per provider. Use provider name as key and ConfigMap name as value.
## A provider dashboards must be defined either by external ConfigMaps or in values.yaml, not in both.
@ -590,6 +677,8 @@ grafana.ini:
mode: console
grafana_net:
url: https://grafana.net
server:
domain: "{{ if (and .Values.ingress.enabled .Values.ingress.hosts) }}{{ .Values.ingress.hosts | first }}{{ end }}"
## grafana Authentication can be enabled with the following values on grafana.ini
# server:
# The full public facing url you use in browser, used for redirects and emails
@ -667,15 +756,21 @@ sidecar:
enableUniqueFilenames: false
readinessProbe: {}
livenessProbe: {}
# Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL.
logLevel: INFO
# Log level default for all sidecars. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL. Defaults to INFO
# logLevel: INFO
dashboards:
enabled: false
# Additional environment variables for the dashboards sidecar
env: {}
# Do not reprocess already processed unchanged resources on k8s API reconnect.
# ignoreAlreadyProcessed: true
SCProvider: true
# label that the configmaps with dashboards are marked with
label: grafana_dashboard
# value of label that the configmaps with dashboards are set to
labelValue: ""
# Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL.
# logLevel: INFO
# folder in the pod that should hold the collected dashboards (unless `defaultFolderName` is set)
folder: /tmp/dashboards
# The default folder name, it will create a subfolder under the `folder` and put dashboards in there instead
@ -725,10 +820,16 @@ sidecar:
sizeLimit: {}
datasources:
enabled: false
# Additional environment variables for the datasourcessidecar
env: {}
# Do not reprocess already processed unchanged resources on k8s API reconnect.
# ignoreAlreadyProcessed: true
# label that the configmaps with datasources are marked with
label: grafana_datasource
# value of label that the configmaps with datasources are set to
labelValue: ""
# Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL.
# logLevel: INFO
# If specified, the sidecar will search for datasource config-maps inside this namespace.
# Otherwise the namespace in which the sidecar is running will be used.
# It's also possible to specify ALL to search in all namespaces
@ -737,8 +838,20 @@ sidecar:
watchMethod: WATCH
# search in configmap, secret or both
resource: both
# watchServerTimeout: request to the server, asking it to cleanly close the connection after that.
# defaults to 60sec; much higher values like 3600 seconds (1h) are feasible for non-Azure K8S
# watchServerTimeout: 3600
#
# watchClientTimeout: is a client-side timeout, configuring your local socket.
# If you have a network outage dropping all packets with no RST/FIN,
# this is how long your client waits before realizing & dropping the connection.
# defaults to 66sec (sic!)
# watchClientTimeout: 60
#
# Endpoint to send request to reload datasources
reloadURL: "http://localhost:3000/api/admin/provisioning/datasources/reload"
# Absolute path to shell script to execute after a datasource got reloaded
script: null
skipReload: false
# Deploy the datasource sidecar as an initContainer in addition to a container.
# This is needed if skipReload is true, to load any datasources defined at startup time.
@ -747,10 +860,16 @@ sidecar:
sizeLimit: {}
plugins:
enabled: false
# Additional environment variables for the plugins sidecar
env: {}
# Do not reprocess already processed unchanged resources on k8s API reconnect.
# ignoreAlreadyProcessed: true
# label that the configmaps with plugins are marked with
label: grafana_plugin
# value of label that the configmaps with plugins are set to
labelValue: ""
# Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL.
# logLevel: INFO
# If specified, the sidecar will search for plugin config-maps inside this namespace.
# Otherwise the namespace in which the sidecar is running will be used.
# It's also possible to specify ALL to search in all namespaces
@ -759,8 +878,20 @@ sidecar:
watchMethod: WATCH
# search in configmap, secret or both
resource: both
# watchServerTimeout: request to the server, asking it to cleanly close the connection after that.
# defaults to 60sec; much higher values like 3600 seconds (1h) are feasible for non-Azure K8S
# watchServerTimeout: 3600
#
# watchClientTimeout: is a client-side timeout, configuring your local socket.
# If you have a network outage dropping all packets with no RST/FIN,
# this is how long your client waits before realizing & dropping the connection.
# defaults to 66sec (sic!)
# watchClientTimeout: 60
#
# Endpoint to send request to reload plugins
reloadURL: "http://localhost:3000/api/admin/provisioning/plugins/reload"
# Absolute path to shell script to execute after a plugin got reloaded
script: null
skipReload: false
# Deploy the datasource sidecar as an initContainer in addition to a container.
# This is needed if skipReload is true, to load any plugins defined at startup time.
@ -769,8 +900,16 @@ sidecar:
sizeLimit: {}
notifiers:
enabled: false
# Additional environment variables for the notifierssidecar
env: {}
# Do not reprocess already processed unchanged resources on k8s API reconnect.
# ignoreAlreadyProcessed: true
# label that the configmaps with notifiers are marked with
label: grafana_notifier
# value of label that the configmaps with notifiers are set to
labelValue: ""
# Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL.
# logLevel: INFO
# If specified, the sidecar will search for notifier config-maps inside this namespace.
# Otherwise the namespace in which the sidecar is running will be used.
# It's also possible to specify ALL to search in all namespaces
@ -824,6 +963,8 @@ imageRenderer:
# image-renderer service port used by both service and deployment
port: 8081
targetPort: 8081
# Adds the appProtocol field to the image-renderer service. This allows to work with istio protocol selection. Ex: "http" or "tcp"
appProtocol: ""
# If https is enabled in Grafana, this needs to be set as 'https' to correctly configure the callback used in Grafana
grafanaProtocol: http
# In case a sub_path is used this needs to be added to the image renderer callback

View File

@ -1,5 +1,5 @@
apiVersion: v2
appVersion: 2.5.0
appVersion: 2.6.0
description: Install kube-state-metrics to generate and expose cluster-level metrics
home: https://github.com/kubernetes/kube-state-metrics/
keywords:
@ -18,4 +18,4 @@ name: kube-state-metrics
sources:
- https://github.com/kubernetes/kube-state-metrics/
type: application
version: 4.15.0
version: 4.18.0

View File

@ -92,7 +92,9 @@ spec:
{{- if .Values.selfMonitor.telemetryHost }}
- --telemetry-host={{ .Values.selfMonitor.telemetryHost }}
{{- end }}
{{- if .Values.selfMonitor.telemetryPort }}
- --telemetry-port={{ .Values.selfMonitor.telemetryPort | default 8081 }}
{{- end }}
{{- if or (.Values.kubeconfig.enabled) (.Values.volumeMounts) }}
volumeMounts:
{{- if .Values.kubeconfig.enabled }}
@ -105,7 +107,11 @@ spec:
{{- end }}
{{- end }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
{{- if .Values.image.sha }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}@sha256:{{ .Values.image.sha }}"
{{- else }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
{{- end }}
ports:
- containerPort: {{ .Values.service.port | default 8080}}
name: "http"

View File

@ -2,7 +2,8 @@
prometheusScrape: true
image:
repository: registry.k8s.io/kube-state-metrics/kube-state-metrics
tag: v2.5.0
tag: v2.6.0
sha: ""
pullPolicy: IfNotPresent
imagePullSecrets: []

View File

@ -15,4 +15,4 @@ name: prometheus-node-exporter
sources:
- https://github.com/prometheus/node_exporter/
type: application
version: 3.3.1
version: 4.2.0

View File

@ -1,17 +1,17 @@
# Prometheus Node Exporter
# Prometheus `Node Exporter`
Prometheus exporter for hardware and OS metrics exposed by *NIX kernels, written in Go with pluggable metric collectors.
This chart bootstraps a prometheus [Node Exporter](http://github.com/prometheus/node_exporter) deployment on a [Kubernetes](http://kubernetes.io) cluster using the [Helm](https://helm.sh) package manager.
This chart bootstraps a prometheus [`Node Exporter`](http://github.com/prometheus/node_exporter) daemonset on a [Kubernetes](http://kubernetes.io) cluster using the [Helm](https://helm.sh) package manager.
## Get Repo Info
## Get Repository Info
```console
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
```
_See [helm repo](https://helm.sh/docs/helm/helm_repo/) for command documentation._
_See [`helm repo`](https://helm.sh/docs/helm/helm_repo/) for command documentation._
## Install Chart
@ -19,7 +19,7 @@ _See [helm repo](https://helm.sh/docs/helm/helm_repo/) for command documentation
helm install [RELEASE_NAME] prometheus-community/prometheus-node-exporter
```
_See [configuration](#configuration) below._
_See [configuration](#configuring) below._
_See [helm install](https://helm.sh/docs/helm/helm_install/) for command documentation._
@ -41,6 +41,17 @@ helm upgrade [RELEASE_NAME] [CHART] --install
_See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade/) for command documentation._
### 3.x to 4.x
Starting from version 4.0.0, the `node exporter` chart is using the [Kubernetes recommended labels](https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/). Therefore you have to delete the daemonset before you upgrade.
```console
kubectl delete daemonset -l app=prometheus-node-exporter
helm upgrade -i prometheus-node-exporter prometheus-community/prometheus-node-exporter
```
If you use your own custom [ServiceMonitor](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#servicemonitor) or [PodMonitor](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#podmonitor), please ensure to upgrade their `selector` fields accordingly to the new labels.
### From 2.x to 3.x
Change the following:

View File

@ -9,7 +9,7 @@
export SERVICE_IP=$(kubectl get svc --namespace {{ template "prometheus-node-exporter.namespace" . }} {{ template "prometheus-node-exporter.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
echo http://$SERVICE_IP:{{ .Values.service.port }}
{{- else if contains "ClusterIP" .Values.service.type }}
export POD_NAME=$(kubectl get pods --namespace {{ template "prometheus-node-exporter.namespace" . }} -l "app={{ template "prometheus-node-exporter.name" . }},release={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
export POD_NAME=$(kubectl get pods --namespace {{ template "prometheus-node-exporter.namespace" . }} -l "app.kubernetes.io/name={{ template "prometheus-node-exporter.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
echo "Visit http://127.0.0.1:9100 to use your application"
kubectl port-forward --namespace {{ template "prometheus-node-exporter.namespace" . }} $POD_NAME 9100
{{- end }}

View File

@ -26,13 +26,28 @@ If release name contains chart name it will be used as a full name.
{{/* Generate basic labels */}}
{{- define "prometheus-node-exporter.labels" }}
app: {{ template "prometheus-node-exporter.name" . }}
heritage: {{.Release.Service }}
release: {{.Release.Name }}
chart: {{ template "prometheus-node-exporter.chart" . }}
helm.sh/chart: {{ template "prometheus-node-exporter.chart" . }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: {{ template "prometheus-node-exporter.name" . }}
{{- include "prometheus-node-exporter.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
{{- if .Values.podLabels}}
{{ toYaml .Values.podLabels }}
{{- end }}
{{- if .Values.releaseLabel }}
release: {{ .Release.Name }}
{{- end }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "prometheus-node-exporter.selectorLabels" }}
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/name: {{ template "prometheus-node-exporter.name" . }}
{{- end }}
{{/*
@ -58,8 +73,12 @@ Create the name of the service account to use
The image to use
*/}}
{{- define "prometheus-node-exporter.image" -}}
{{- if .Values.image.sha -}}
{{- printf "%s:%s@%s" .Values.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.image.tag) .Values.image.sha }}
{{- else -}}
{{- printf "%s:%s" .Values.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.image.tag) }}
{{- end }}
{{- end }}
{{/*
Allow the release namespace to be overridden for multi-namespace deployments in combined charts

View File

@ -7,8 +7,7 @@ metadata:
spec:
selector:
matchLabels:
app: {{ template "prometheus-node-exporter.name" . }}
release: {{ .Release.Name }}
{{- include "prometheus-node-exporter.selectorLabels" . | indent 6 }}
{{- if .Values.updateStrategy }}
updateStrategy:
{{ toYaml .Values.updateStrategy | indent 4 }}
@ -147,13 +146,21 @@ spec:
{{- end }}
{{- if .Values.sidecars }}
{{ toYaml .Values.sidecars | indent 8 }}
{{- if .Values.sidecarVolumeMount }}
{{- if or .Values.sidecarVolumeMount .Values.sidecarHostVolumeMounts }}
volumeMounts:
{{- range $_, $mount := .Values.sidecarVolumeMount }}
- name: {{ $mount.name }}
mountPath: {{ $mount.mountPath }}
readOnly: {{ $mount.readOnly }}
{{- end }}
{{- range $_, $mount := .Values.sidecarHostVolumeMounts }}
- name: {{ $mount.name }}
mountPath: {{ $mount.mountPath }}
readOnly: {{ $mount.readOnly }}
{{- if $mount.mountPropagation }}
mountPropagation: {{ $mount.mountPropagation }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- if .Values.imagePullSecrets }}
@ -204,6 +211,13 @@ spec:
medium: Memory
{{- end }}
{{- end }}
{{- if .Values.sidecarHostVolumeMounts }}
{{- range $_, $mount := .Values.sidecarHostVolumeMounts }}
- name: {{ $mount.name }}
hostPath:
path: {{ $mount.hostPath }}
{{- end }}
{{- end }}
{{- if .Values.configmaps }}
{{- range $_, $mount := .Values.configmaps }}
- name: {{ $mount.name }}

View File

@ -4,8 +4,7 @@ kind: Endpoints
metadata:
name: {{ template "prometheus-node-exporter.fullname" . }}
namespace: {{ template "prometheus-node-exporter.namespace" . }}
labels:
{{ include "prometheus-node-exporter.labels" . | indent 4 }}
labels: {{ include "prometheus-node-exporter.labels" . | indent 4 }}
subsets:
- addresses:
{{- range .Values.endpoints }}

View File

@ -3,11 +3,11 @@ kind: Service
metadata:
name: {{ template "prometheus-node-exporter.fullname" . }}
namespace: {{ template "prometheus-node-exporter.namespace" . }}
labels: {{ include "prometheus-node-exporter.labels" . | indent 4 }}
{{- if .Values.service.annotations }}
annotations:
{{ toYaml .Values.service.annotations | indent 4 }}
{{- end }}
labels: {{ include "prometheus-node-exporter.labels" . | indent 4 }}
spec:
type: {{ .Values.service.type }}
ports:
@ -19,5 +19,4 @@ spec:
protocol: TCP
name: {{ .Values.service.portName }}
selector:
app: {{ template "prometheus-node-exporter.name" . }}
release: {{ .Release.Name }}
{{- include "prometheus-node-exporter.selectorLabels" . | indent 4 }}

View File

@ -5,11 +5,7 @@ kind: ServiceAccount
metadata:
name: {{ template "prometheus-node-exporter.serviceAccountName" . }}
namespace: {{ template "prometheus-node-exporter.namespace" . }}
labels:
app: {{ template "prometheus-node-exporter.name" . }}
chart: {{ template "prometheus-node-exporter.chart" . }}
release: "{{ .Release.Name }}"
heritage: "{{ .Release.Service }}"
labels: {{ include "prometheus-node-exporter.labels" . | indent 4 }}
annotations:
{{ toYaml .Values.serviceAccount.annotations | indent 4 }}
imagePullSecrets:

View File

@ -15,8 +15,7 @@ spec:
{{- if .Values.prometheus.monitor.selectorOverride }}
{{ toYaml .Values.prometheus.monitor.selectorOverride | indent 6 }}
{{ else }}
app: {{ template "prometheus-node-exporter.name" . }}
release: {{ .Release.Name }}
{{ include "prometheus-node-exporter.selectorLabels" . | indent 6 }}
{{- end }}
endpoints:
- port: {{ .Values.service.portName }}

View File

@ -6,6 +6,7 @@ image:
# Overrides the image tag whose default is {{ printf "v%s" .Chart.AppVersion }}
tag: ""
pullPolicy: IfNotPresent
sha: ""
imagePullSecrets: []
# - name: "image-pull-secret"
@ -139,6 +140,9 @@ podAnnotations:
# Extra labels to be added to node exporter pods
podLabels: {}
## set to true to add the release label so scraping of the servicemonitor with kube-prometheus-stack works out of the box
releaseLabel: false
# Custom DNS configuration to be added to prometheus-node-exporter pods
dnsConfig: {}
# nameservers:
@ -170,7 +174,7 @@ extraArgs: []
# - --collector.diskstats.ignored-devices=^(ram|loop|fd|(h|s|v)d[a-z]|nvme\\d+n\\d+p)\\d+$
# - --collector.textfile.directory=/run/prometheus
## Additional mounts from the host
## Additional mounts from the host to node-exporter container
##
extraHostVolumeMounts: []
# - name: <mountName>
@ -204,6 +208,15 @@ sidecarVolumeMount: []
## mountPath: /run/prometheus
## readOnly: false
## Additional mounts from the host to sidecar containers
##
sidecarHostVolumeMounts: []
# - name: <mountName>
# hostPath: <hostPath>
# mountPath: <mountPath>
# readOnly: true|false
# mountPropagation: None|HostToContainer|Bidirectional
## Additional InitContainers to initialize the pod
##
extraInitContainers: []

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.58.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.58.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
@ -981,15 +981,365 @@ spec:
x-kubernetes-map-type: atomic
alertmanagerConfiguration:
description: 'EXPERIMENTAL: alertmanagerConfiguration specifies the
global Alertmanager configuration. If defined, it takes precedence
over the `configSecret` field. This field may change in future releases.'
configuration of Alertmanager. If defined, it takes precedence over
the `configSecret` field. This field may change in future releases.'
properties:
global:
description: Defines the global parameters of the Alertmanager
configuration.
properties:
httpConfig:
description: HTTP client configuration.
properties:
authorization:
description: Authorization header configuration for the
client. This is mutually exclusive with BasicAuth and
is only available starting from Alertmanager v0.22+.
properties:
credentials:
description: The secret's key that contains the credentials
of the request
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type:
description: Set the authentication type. Defaults
to Bearer, Basic will cause an error
type: string
type: object
basicAuth:
description: BasicAuth for the client. This is mutually
exclusive with Authorization. If both are defined, BasicAuth
takes precedence.
properties:
password:
description: The secret in the service monitor namespace
that contains the password for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
username:
description: The secret in the service monitor namespace
that contains the username for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
bearerTokenSecret:
description: The secret's key that contains the bearer
token to be used by the client for authentication. The
secret needs to be in the same namespace as the Alertmanager
object and accessible by the Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
followRedirects:
description: FollowRedirects specifies whether the client
should follow HTTP 3xx redirects.
type: boolean
oauth2:
description: OAuth2 client credentials used to fetch a
token for the targets.
properties:
clientId:
description: The secret or configmap containing the
OAuth2 client id
properties:
configMap:
description: ConfigMap containing data to use
for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion,
kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap
or its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for
the targets.
properties:
key:
description: The key of the secret to select
from. Must be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion,
kind, uid?'
type: string
optional:
description: Specify whether the Secret or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
clientSecret:
description: The secret containing the OAuth2 client
secret
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
endpointParams:
additionalProperties:
type: string
description: Parameters to append to the token URL
type: object
scopes:
description: OAuth2 scopes used for the token request
items:
type: string
type: array
tokenUrl:
description: The URL to fetch the token from
minLength: 1
type: string
required:
- clientId
- clientSecret
- tokenUrl
type: object
proxyURL:
description: Optional proxy URL.
type: string
tlsConfig:
description: TLS configuration for the client.
properties:
ca:
description: Struct containing the CA cert to use
for the targets.
properties:
configMap:
description: ConfigMap containing data to use
for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion,
kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap
or its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for
the targets.
properties:
key:
description: The key of the secret to select
from. Must be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion,
kind, uid?'
type: string
optional:
description: Specify whether the Secret or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
cert:
description: Struct containing the client cert file
for the targets.
properties:
configMap:
description: ConfigMap containing data to use
for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion,
kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap
or its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for
the targets.
properties:
key:
description: The key of the secret to select
from. Must be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion,
kind, uid?'
type: string
optional:
description: Specify whether the Secret or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
insecureSkipVerify:
description: Disable target certificate validation.
type: boolean
keySecret:
description: Secret containing the client key file
for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
serverName:
description: Used to verify the hostname for the targets.
type: string
type: object
type: object
resolveTimeout:
description: ResolveTimeout is the default value used by alertmanager
if the alert does not include EndsAt, after this time passes
it can declare the alert as resolved if it has not been
updated. This has no impact on alerts from Prometheus, as
they always include EndsAt.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
type: object
name:
description: The name of the AlertmanagerConfig resource which
is used to generate the global configuration. It must be defined
in the same namespace as the Alertmanager object. The operator
will not enforce a `namespace` label for routes and inhibition
rules.
is used to generate the Alertmanager configuration. It must
be defined in the same namespace as the Alertmanager object.
The operator will not enforce a `namespace` label for routes
and inhibition rules.
minLength: 1
type: string
type: object
@ -1595,13 +1945,13 @@ spec:
Cannot be updated.
type: string
ports:
description: List of ports to expose from the container. Exposing
a port here gives the system additional information about
the network connections a container uses, but is primarily
informational. Not specifying a port here DOES NOT prevent
that port from being exposed. Any port which is listening
on the default "0.0.0.0" address inside a container will be
accessible from the network. Cannot be updated.
description: List of ports to expose from the container. Not
specifying a port here DOES NOT prevent that port from being
exposed. Any port which is listening on the default "0.0.0.0"
address inside a container will be accessible from the network.
Modifying this array with strategic merge patch may corrupt
the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255.
Cannot be updated.
items:
description: ContainerPort represents a network port in a
single container.
@ -2880,13 +3230,13 @@ spec:
Cannot be updated.
type: string
ports:
description: List of ports to expose from the container. Exposing
a port here gives the system additional information about
the network connections a container uses, but is primarily
informational. Not specifying a port here DOES NOT prevent
that port from being exposed. Any port which is listening
on the default "0.0.0.0" address inside a container will be
accessible from the network. Cannot be updated.
description: List of ports to expose from the container. Not
specifying a port here DOES NOT prevent that port from being
exposed. Any port which is listening on the default "0.0.0.0"
address inside a container will be accessible from the network.
Modifying this array with strategic merge patch may corrupt
the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255.
Cannot be updated.
items:
description: ContainerPort represents a network port in a
single container.
@ -4509,6 +4859,19 @@ spec:
type: object
type: object
x-kubernetes-map-type: atomic
matchLabelKeys:
description: MatchLabelKeys is a set of pod label keys to select
the pods over which spreading will be calculated. The keys
are used to lookup values from the incoming pod labels, those
key-value labels are ANDed with labelSelector to select the
group of existing pods over which spreading will be calculated
for the incoming pod. Keys that don't exist in the incoming
pod labels will be ignored. A null or empty list means only
match against labelSelector.
items:
type: string
type: array
x-kubernetes-list-type: atomic
maxSkew:
description: 'MaxSkew describes the degree to which pods may
be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`,
@ -4549,11 +4912,31 @@ spec:
minimum\" is treated as 0. In this situation, new pod with
the same labelSelector cannot be scheduled, because computed
skew will be 3(3 - 0) if new Pod is scheduled to any of the
three zones, it will violate MaxSkew. \n This is an alpha
field and requires enabling MinDomainsInPodTopologySpread
feature gate."
three zones, it will violate MaxSkew. \n This is a beta field
and requires the MinDomainsInPodTopologySpread feature gate
to be enabled (enabled by default)."
format: int32
type: integer
nodeAffinityPolicy:
description: "NodeAffinityPolicy indicates how we will treat
Pod's nodeAffinity/nodeSelector when calculating pod topology
spread skew. Options are: - Honor: only nodes matching nodeAffinity/nodeSelector
are included in the calculations. - Ignore: nodeAffinity/nodeSelector
are ignored. All nodes are included in the calculations. \n
If this value is nil, the behavior is equivalent to the Honor
policy. This is a alpha-level feature enabled by the NodeInclusionPolicyInPodTopologySpread
feature flag."
type: string
nodeTaintsPolicy:
description: "NodeTaintsPolicy indicates how we will treat node
taints when calculating pod topology spread skew. Options
are: - Honor: nodes without taints, along with tainted nodes
for which the incoming pod has a toleration, are included.
- Ignore: node taints are ignored. All nodes are included.
\n If this value is nil, the behavior is equivalent to the
Ignore policy. This is a alpha-level feature enabled by the
NodeInclusionPolicyInPodTopologySpread feature flag."
type: string
topologyKey:
description: TopologyKey is the key of node labels. Nodes that
have a label with this key and identical values are considered
@ -4561,10 +4944,11 @@ spec:
as a "bucket", and try to put balanced number of pods into
each bucket. We define a domain as a particular instance of
a topology. Also, we define an eligible domain as a domain
whose nodes match the node selector. e.g. If TopologyKey is
"kubernetes.io/hostname", each Node is a domain of that topology.
And, if TopologyKey is "topology.kubernetes.io/zone", each
zone is a domain of that topology. It's a required field.
whose nodes meet the requirements of nodeAffinityPolicy and
nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname",
each Node is a domain of that topology. And, if TopologyKey
is "topology.kubernetes.io/zone", each zone is a domain of
that topology. It's a required field.
type: string
whenUnsatisfiable:
description: 'WhenUnsatisfiable indicates how to deal with a
@ -6153,8 +6537,55 @@ spec:
web:
description: Defines the web command line flags when starting Alertmanager.
properties:
httpConfig:
description: Defines HTTP parameters for web server.
properties:
headers:
description: List of headers that can be added to HTTP responses.
properties:
contentSecurityPolicy:
description: Set the Content-Security-Policy header to
HTTP responses. Unset if blank.
type: string
strictTransportSecurity:
description: Set the Strict-Transport-Security header
to HTTP responses. Unset if blank. Please make sure
that you use this with care as this header might force
browsers to load Prometheus and the other applications
hosted on the same domain and subdomains over HTTPS.
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Strict-Transport-Security
type: string
xContentTypeOptions:
description: Set the X-Content-Type-Options header to
HTTP responses. Unset if blank. Accepted value is nosniff.
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Content-Type-Options
enum:
- ""
- NoSniff
type: string
xFrameOptions:
description: Set the X-Frame-Options header to HTTP responses.
Unset if blank. Accepted values are deny and sameorigin.
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Frame-Options
enum:
- ""
- Deny
- SameOrigin
type: string
xXSSProtection:
description: Set the X-XSS-Protection header to all responses.
Unset if blank. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-XSS-Protection
type: string
type: object
http2:
description: Enable HTTP/2 support. Note that HTTP/2 is only
supported with TLS. When TLSConfig is not configured, HTTP/2
will be disabled. Whenever the value of the field changes,
a rolling update will be triggered.
type: boolean
type: object
tlsConfig:
description: WebTLSConfig defines the TLS parameters for HTTPS.
description: Defines the TLS parameters for HTTPS.
properties:
cert:
description: Contains the TLS certificate for the server.

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.58.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.58.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.58.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
@ -110,6 +110,31 @@ spec:
- key
type: object
x-kubernetes-map-type: atomic
additionalArgs:
description: AdditionalArgs allows setting additional arguments for
the Prometheus container. It is intended for e.g. activating hidden
flags which are not supported by the dedicated configuration options
yet. The arguments are passed as-is to the Prometheus container
which may cause issues if they are invalid or not supporeted by
the given Prometheus version. In case of an argument conflict (e.g.
an argument which is already set by the operator itself) or when
providing an invalid argument the reconciliation will fail and an
error will be logged.
items:
description: Argument as part of the AdditionalArgs list.
properties:
name:
description: Name of the argument, e.g. "scrape.discovery-reload-interval".
minLength: 1
type: string
value:
description: Argument value, e.g. 30s. Can be empty for name-only
arguments (e.g. --storage.tsdb.no-lockfile)
type: string
required:
- name
type: object
type: array
additionalScrapeConfigs:
description: 'AdditionalScrapeConfigs allows specifying a key of a
Secret containing additional Prometheus scrape configurations. Scrape
@ -2004,13 +2029,13 @@ spec:
Cannot be updated.
type: string
ports:
description: List of ports to expose from the container. Exposing
a port here gives the system additional information about
the network connections a container uses, but is primarily
informational. Not specifying a port here DOES NOT prevent
that port from being exposed. Any port which is listening
on the default "0.0.0.0" address inside a container will be
accessible from the network. Cannot be updated.
description: List of ports to expose from the container. Not
specifying a port here DOES NOT prevent that port from being
exposed. Any port which is listening on the default "0.0.0.0"
address inside a container will be accessible from the network.
Modifying this array with strategic merge patch may corrupt
the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255.
Cannot be updated.
items:
description: ContainerPort represents a network port in a
single container.
@ -3440,13 +3465,13 @@ spec:
Cannot be updated.
type: string
ports:
description: List of ports to expose from the container. Exposing
a port here gives the system additional information about
the network connections a container uses, but is primarily
informational. Not specifying a port here DOES NOT prevent
that port from being exposed. Any port which is listening
on the default "0.0.0.0" address inside a container will be
accessible from the network. Cannot be updated.
description: List of ports to expose from the container. Not
specifying a port here DOES NOT prevent that port from being
exposed. Any port which is listening on the default "0.0.0.0"
address inside a container will be accessible from the network.
Modifying this array with strategic merge patch may corrupt
the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255.
Cannot be updated.
items:
description: ContainerPort represents a network port in a
single container.
@ -6347,6 +6372,29 @@ spec:
notice in any release. \n This is experimental and may change significantly
without backward compatibility in any release."
properties:
additionalArgs:
description: AdditionalArgs allows setting additional arguments
for the Thanos container. The arguments are passed as-is to
the Thanos container which may cause issues if they are invalid
or not supporeted the given Thanos version. In case of an argument
conflict (e.g. an argument which is already set by the operator
itself) or when providing an invalid argument the reconciliation
will fail and an error will be logged.
items:
description: Argument as part of the AdditionalArgs list.
properties:
name:
description: Name of the argument, e.g. "scrape.discovery-reload-interval".
minLength: 1
type: string
value:
description: Argument value, e.g. 30s. Can be empty for
name-only arguments (e.g. --storage.tsdb.no-lockfile)
type: string
required:
- name
type: object
type: array
baseImage:
description: 'Thanos base image if other than default. Deprecated:
use ''image'' instead'
@ -6756,6 +6804,19 @@ spec:
type: object
type: object
x-kubernetes-map-type: atomic
matchLabelKeys:
description: MatchLabelKeys is a set of pod label keys to select
the pods over which spreading will be calculated. The keys
are used to lookup values from the incoming pod labels, those
key-value labels are ANDed with labelSelector to select the
group of existing pods over which spreading will be calculated
for the incoming pod. Keys that don't exist in the incoming
pod labels will be ignored. A null or empty list means only
match against labelSelector.
items:
type: string
type: array
x-kubernetes-list-type: atomic
maxSkew:
description: 'MaxSkew describes the degree to which pods may
be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`,
@ -6796,11 +6857,31 @@ spec:
minimum\" is treated as 0. In this situation, new pod with
the same labelSelector cannot be scheduled, because computed
skew will be 3(3 - 0) if new Pod is scheduled to any of the
three zones, it will violate MaxSkew. \n This is an alpha
field and requires enabling MinDomainsInPodTopologySpread
feature gate."
three zones, it will violate MaxSkew. \n This is a beta field
and requires the MinDomainsInPodTopologySpread feature gate
to be enabled (enabled by default)."
format: int32
type: integer
nodeAffinityPolicy:
description: "NodeAffinityPolicy indicates how we will treat
Pod's nodeAffinity/nodeSelector when calculating pod topology
spread skew. Options are: - Honor: only nodes matching nodeAffinity/nodeSelector
are included in the calculations. - Ignore: nodeAffinity/nodeSelector
are ignored. All nodes are included in the calculations. \n
If this value is nil, the behavior is equivalent to the Honor
policy. This is a alpha-level feature enabled by the NodeInclusionPolicyInPodTopologySpread
feature flag."
type: string
nodeTaintsPolicy:
description: "NodeTaintsPolicy indicates how we will treat node
taints when calculating pod topology spread skew. Options
are: - Honor: nodes without taints, along with tainted nodes
for which the incoming pod has a toleration, are included.
- Ignore: node taints are ignored. All nodes are included.
\n If this value is nil, the behavior is equivalent to the
Ignore policy. This is a alpha-level feature enabled by the
NodeInclusionPolicyInPodTopologySpread feature flag."
type: string
topologyKey:
description: TopologyKey is the key of node labels. Nodes that
have a label with this key and identical values are considered
@ -6808,10 +6889,11 @@ spec:
as a "bucket", and try to put balanced number of pods into
each bucket. We define a domain as a particular instance of
a topology. Also, we define an eligible domain as a domain
whose nodes match the node selector. e.g. If TopologyKey is
"kubernetes.io/hostname", each Node is a domain of that topology.
And, if TopologyKey is "topology.kubernetes.io/zone", each
zone is a domain of that topology. It's a required field.
whose nodes meet the requirements of nodeAffinityPolicy and
nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname",
each Node is a domain of that topology. And, if TopologyKey
is "topology.kubernetes.io/zone", each zone is a domain of
that topology. It's a required field.
type: string
whenUnsatisfiable:
description: 'WhenUnsatisfiable indicates how to deal with a
@ -8404,11 +8486,58 @@ spec:
web:
description: Defines the web command line flags when starting Prometheus.
properties:
httpConfig:
description: Defines HTTP parameters for web server.
properties:
headers:
description: List of headers that can be added to HTTP responses.
properties:
contentSecurityPolicy:
description: Set the Content-Security-Policy header to
HTTP responses. Unset if blank.
type: string
strictTransportSecurity:
description: Set the Strict-Transport-Security header
to HTTP responses. Unset if blank. Please make sure
that you use this with care as this header might force
browsers to load Prometheus and the other applications
hosted on the same domain and subdomains over HTTPS.
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Strict-Transport-Security
type: string
xContentTypeOptions:
description: Set the X-Content-Type-Options header to
HTTP responses. Unset if blank. Accepted value is nosniff.
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Content-Type-Options
enum:
- ""
- NoSniff
type: string
xFrameOptions:
description: Set the X-Frame-Options header to HTTP responses.
Unset if blank. Accepted values are deny and sameorigin.
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Frame-Options
enum:
- ""
- Deny
- SameOrigin
type: string
xXSSProtection:
description: Set the X-XSS-Protection header to all responses.
Unset if blank. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-XSS-Protection
type: string
type: object
http2:
description: Enable HTTP/2 support. Note that HTTP/2 is only
supported with TLS. When TLSConfig is not configured, HTTP/2
will be disabled. Whenever the value of the field changes,
a rolling update will be triggered.
type: boolean
type: object
pageTitle:
description: The prometheus web page title
type: string
tlsConfig:
description: WebTLSConfig defines the TLS parameters for HTTPS.
description: Defines the TLS parameters for HTTPS.
properties:
cert:
description: Contains the TLS certificate for the server.

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.58.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.58.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.58.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.59.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
@ -1508,13 +1508,13 @@ spec:
Cannot be updated.
type: string
ports:
description: List of ports to expose from the container. Exposing
a port here gives the system additional information about
the network connections a container uses, but is primarily
informational. Not specifying a port here DOES NOT prevent
that port from being exposed. Any port which is listening
on the default "0.0.0.0" address inside a container will be
accessible from the network. Cannot be updated.
description: List of ports to expose from the container. Not
specifying a port here DOES NOT prevent that port from being
exposed. Any port which is listening on the default "0.0.0.0"
address inside a container will be accessible from the network.
Modifying this array with strategic merge patch may corrupt
the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255.
Cannot be updated.
items:
description: ContainerPort represents a network port in a
single container.
@ -2955,13 +2955,13 @@ spec:
Cannot be updated.
type: string
ports:
description: List of ports to expose from the container. Exposing
a port here gives the system additional information about
the network connections a container uses, but is primarily
informational. Not specifying a port here DOES NOT prevent
that port from being exposed. Any port which is listening
on the default "0.0.0.0" address inside a container will be
accessible from the network. Cannot be updated.
description: List of ports to expose from the container. Not
specifying a port here DOES NOT prevent that port from being
exposed. Any port which is listening on the default "0.0.0.0"
address inside a container will be accessible from the network.
Modifying this array with strategic merge patch may corrupt
the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255.
Cannot be updated.
items:
description: ContainerPort represents a network port in a
single container.
@ -4731,6 +4731,19 @@ spec:
type: object
type: object
x-kubernetes-map-type: atomic
matchLabelKeys:
description: MatchLabelKeys is a set of pod label keys to select
the pods over which spreading will be calculated. The keys
are used to lookup values from the incoming pod labels, those
key-value labels are ANDed with labelSelector to select the
group of existing pods over which spreading will be calculated
for the incoming pod. Keys that don't exist in the incoming
pod labels will be ignored. A null or empty list means only
match against labelSelector.
items:
type: string
type: array
x-kubernetes-list-type: atomic
maxSkew:
description: 'MaxSkew describes the degree to which pods may
be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`,
@ -4771,11 +4784,31 @@ spec:
minimum\" is treated as 0. In this situation, new pod with
the same labelSelector cannot be scheduled, because computed
skew will be 3(3 - 0) if new Pod is scheduled to any of the
three zones, it will violate MaxSkew. \n This is an alpha
field and requires enabling MinDomainsInPodTopologySpread
feature gate."
three zones, it will violate MaxSkew. \n This is a beta field
and requires the MinDomainsInPodTopologySpread feature gate
to be enabled (enabled by default)."
format: int32
type: integer
nodeAffinityPolicy:
description: "NodeAffinityPolicy indicates how we will treat
Pod's nodeAffinity/nodeSelector when calculating pod topology
spread skew. Options are: - Honor: only nodes matching nodeAffinity/nodeSelector
are included in the calculations. - Ignore: nodeAffinity/nodeSelector
are ignored. All nodes are included in the calculations. \n
If this value is nil, the behavior is equivalent to the Honor
policy. This is a alpha-level feature enabled by the NodeInclusionPolicyInPodTopologySpread
feature flag."
type: string
nodeTaintsPolicy:
description: "NodeTaintsPolicy indicates how we will treat node
taints when calculating pod topology spread skew. Options
are: - Honor: nodes without taints, along with tainted nodes
for which the incoming pod has a toleration, are included.
- Ignore: node taints are ignored. All nodes are included.
\n If this value is nil, the behavior is equivalent to the
Ignore policy. This is a alpha-level feature enabled by the
NodeInclusionPolicyInPodTopologySpread feature flag."
type: string
topologyKey:
description: TopologyKey is the key of node labels. Nodes that
have a label with this key and identical values are considered
@ -4783,10 +4816,11 @@ spec:
as a "bucket", and try to put balanced number of pods into
each bucket. We define a domain as a particular instance of
a topology. Also, we define an eligible domain as a domain
whose nodes match the node selector. e.g. If TopologyKey is
"kubernetes.io/hostname", each Node is a domain of that topology.
And, if TopologyKey is "topology.kubernetes.io/zone", each
zone is a domain of that topology. It's a required field.
whose nodes meet the requirements of nodeAffinityPolicy and
nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname",
each Node is a domain of that topology. And, if TopologyKey
is "topology.kubernetes.io/zone", each zone is a domain of
that topology. It's a required field.
type: string
whenUnsatisfiable:
description: 'WhenUnsatisfiable indicates how to deal with a
@ -4832,6 +4866,11 @@ spec:
- key
type: object
x-kubernetes-map-type: atomic
tracingConfigFile:
description: TracingConfig specifies the path of the tracing configuration
file. When used alongside with TracingConfig, TracingConfigFile
takes precedence.
type: string
volumes:
description: Volumes allows configuration of additional volumes on
the output StatefulSet definition. Volumes specified will be appended

View File

@ -42,6 +42,10 @@ spec:
- --host={{ template "kube-prometheus-stack.operator.fullname" . }},{{ template "kube-prometheus-stack.operator.fullname" . }}.{{ template "kube-prometheus-stack.namespace" . }}.svc
- --namespace={{ template "kube-prometheus-stack.namespace" . }}
- --secret-name={{ template "kube-prometheus-stack.fullname" . }}-admission
{{- with .Values.prometheusOperator.admissionWebhooks.createSecretJob }}
securityContext:
{{ toYaml .securityContext | nindent 12 }}
{{- end }}
resources:
{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.resources | indent 12 }}
restartPolicy: OnFailure

View File

@ -43,6 +43,10 @@ spec:
- --namespace={{ template "kube-prometheus-stack.namespace" . }}
- --secret-name={{ template "kube-prometheus-stack.fullname" . }}-admission
- --patch-failure-policy={{ .Values.prometheusOperator.admissionWebhooks.failurePolicy }}
{{- with .Values.prometheusOperator.admissionWebhooks.patchWebhookJob }}
securityContext:
{{ toYaml .securityContext | nindent 12 }}
{{- end }}
resources:
{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.resources | indent 12 }}
restartPolicy: OnFailure

View File

@ -12,5 +12,5 @@ metadata:
app: {{ template "kube-prometheus-stack.name" . }}-prometheus-am-confg
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
data:
additional-alertmanager-configs.yaml: {{ toYaml .Values.prometheus.prometheusSpec.additionalAlertManagerConfigs | b64enc | quote }}
additional-alertmanager-configs.yaml: {{ tpl (toYaml .Values.prometheus.prometheusSpec.additionalAlertManagerConfigs) . | b64enc | quote }}
{{- end }}

View File

@ -112,8 +112,10 @@ spec:
{{- if .Values.prometheus.prometheusSpec.retentionSize }}
retentionSize: {{ .Values.prometheus.prometheusSpec.retentionSize | quote }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.walCompression }}
walCompression: {{ .Values.prometheus.prometheusSpec.walCompression }}
{{- if eq .Values.prometheus.prometheusSpec.walCompression false }}
walCompression: false
{{ else }}
walCompression: true
{{- end }}
{{- if .Values.prometheus.prometheusSpec.routePrefix }}
routePrefix: {{ .Values.prometheus.prometheusSpec.routePrefix | quote }}
@ -354,7 +356,7 @@ spec:
name: "{{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) . | trunc 63 | trimSuffix "-" }}"
{{- end }}
{{- if .Values.prometheus.prometheusSpec.excludedFromEnforcement }}
{{ toYaml .Values.prometheus.prometheusSpec.excludedFromEnforcement | indent 4 }}
{{ tpl (toYaml .Values.prometheus.prometheusSpec.excludedFromEnforcement | indent 4) . }}
{{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.queryLogFile }}

View File

@ -1530,6 +1530,7 @@ prometheus-node-exporter:
## Add the 'node-exporter' label to be used by serviceMonitor to match standard common usage in rules and grafana dashboards
##
jobLabel: node-exporter
releaseLabel: true
extraArgs:
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)
- --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
@ -1610,7 +1611,7 @@ prometheusOperator:
enabled: true
image:
repository: k8s.gcr.io/ingress-nginx/kube-webhook-certgen
tag: v1.2.0
tag: v1.3.0
sha: ""
pullPolicy: IfNotPresent
resources: {}
@ -1631,6 +1632,14 @@ prometheusOperator:
runAsNonRoot: true
runAsUser: 2000
# Security context for create job container
createSecretJob:
securityContext: {}
# Security context for patch job container
patchWebhookJob:
securityContext: {}
# Use certmanager to generate webhook certs
certManager:
enabled: false
@ -1841,7 +1850,7 @@ prometheusOperator:
##
image:
repository: quay.io/prometheus-operator/prometheus-operator
tag: v0.58.0
tag: v0.59.1
sha: ""
pullPolicy: IfNotPresent
@ -1856,10 +1865,9 @@ prometheusOperator:
## Prometheus-config-reloader
##
prometheusConfigReloader:
# image to use for config and rule reloading
image:
repository: quay.io/prometheus-operator/prometheus-config-reloader
tag: v0.58.0
tag: v0.59.1
sha: ""
# resource config for prometheusConfigReloader
@ -1875,7 +1883,7 @@ prometheusOperator:
##
thanosImage:
repository: quay.io/thanos/thanos
tag: v0.27.0
tag: v0.28.0
sha: ""
## Set a Field Selector to filter watched secrets
@ -2308,7 +2316,7 @@ prometheus:
##
image:
repository: quay.io/prometheus/prometheus
tag: v2.37.0
tag: v2.38.0
sha: ""
## Tolerations for use with node taints
@ -2500,7 +2508,7 @@ prometheus:
## Enable compression of the write-ahead log using Snappy.
##
walCompression: false
walCompression: true
## If true, the Operator won't process any Prometheus configuration changes
##
@ -3139,7 +3147,7 @@ thanosRuler:
##
image:
repository: quay.io/thanos/thanos
tag: v0.27.0
tag: v0.28.0
sha: ""
## Namespaces to be selected for PrometheusRules discovery.

View File

@ -56,7 +56,11 @@
},
"gridPos": { },
"id": 3,
"interval": null,
"interval": "1m",
"legend": {
"alignAsTable": true,
"rightSide": true
},
"links": [ ],
"mappingType": 1,
"mappingTypes": [
@ -128,13 +132,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -236,7 +241,11 @@
},
"gridPos": { },
"id": 5,
"interval": null,
"interval": "1m",
"legend": {
"alignAsTable": true,
"rightSide": true
},
"links": [ ],
"mappingType": 1,
"mappingTypes": [
@ -307,13 +316,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 6,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -406,13 +416,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 7,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -488,13 +499,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 8,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -516,7 +528,7 @@
"steppedLine": false,
"targets": [
{
"expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"read\", cluster=\"$cluster\"}",
"expr": "cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile{verb=\"read\", cluster=\"$cluster\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ resource }}",
@ -594,7 +606,11 @@
},
"gridPos": { },
"id": 9,
"interval": null,
"interval": "1m",
"legend": {
"alignAsTable": true,
"rightSide": true
},
"links": [ ],
"mappingType": 1,
"mappingTypes": [
@ -665,13 +681,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -764,13 +781,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 11,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -846,13 +864,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 12,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -874,7 +893,7 @@
"steppedLine": false,
"targets": [
{
"expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"write\", cluster=\"$cluster\"}",
"expr": "cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile{verb=\"write\", cluster=\"$cluster\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ resource }}",
@ -940,13 +959,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 13,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": false,
"sideWidth": null,
"total": false,
@ -968,7 +988,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)",
"expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])) by (instance, name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{name}}",
@ -1021,13 +1041,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 14,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": false,
"sideWidth": null,
"total": false,
@ -1049,7 +1070,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)",
"expr": "sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])) by (instance, name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{name}}",
@ -1102,6 +1123,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 15,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -1130,7 +1152,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name, le))",
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])) by (instance, name, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{name}}",
@ -1196,13 +1218,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 16,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -1277,13 +1300,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 17,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -1305,7 +1329,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])",
"expr": "rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -1358,13 +1382,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 18,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -1452,7 +1477,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",
@ -1470,7 +1495,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(apiserver_request_total, cluster)",
"query": "label_values(up{job=\"apiserver\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
@ -1490,7 +1515,7 @@
"multi": false,
"name": "instance",
"options": [ ],
"query": "label_values(apiserver_request_total{job=\"apiserver\", cluster=\"$cluster\"}, instance)",
"query": "label_values(up{job=\"apiserver\", cluster=\"$cluster\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 1,

View File

@ -1619,7 +1619,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",

View File

@ -36,7 +36,11 @@
},
"gridPos": { },
"id": 2,
"interval": null,
"interval": "1m",
"legend": {
"alignAsTable": true,
"rightSide": true
},
"links": [ ],
"mappingType": 1,
"mappingTypes": [
@ -106,6 +110,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 3,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -134,7 +139,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
"expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} {{name}}",
@ -200,6 +205,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -228,7 +234,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
"expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} {{name}}",
@ -294,6 +300,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 5,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -322,7 +329,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))",
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, name, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} {{name}}",
@ -388,13 +395,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 6,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -416,28 +424,28 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"2..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "2xx",
"refId": "A"
},
{
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"3..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "3xx",
"refId": "B"
},
{
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"4..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "4xx",
"refId": "C"
},
{
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"5..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "5xx",
@ -490,13 +498,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 7,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -518,7 +527,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))",
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[$__rate_interval])) by (verb, url, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{verb}} {{url}}",
@ -584,6 +593,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 8,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -612,7 +622,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))",
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, url, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{verb}} {{url}}",
@ -678,13 +688,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 9,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -759,13 +770,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -787,7 +799,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])",
"expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -840,13 +852,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 11,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -934,7 +947,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",

View File

@ -24,10 +24,12 @@
"id": 1,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -47,7 +49,7 @@
"steppedLine": false,
"targets": [
{
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$__rate_interval]))",
"expr": "cluster:node_cpu:ratio_rate5m{cluster=\"$cluster\"}",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -99,11 +101,14 @@
"fill": 1,
"format": "percentunit",
"id": 2,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -123,7 +128,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
"expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\",cluster=\"$cluster\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -175,11 +180,14 @@
"fill": 1,
"format": "percentunit",
"id": 3,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -199,7 +207,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
"expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\",cluster=\"$cluster\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -251,11 +259,14 @@
"fill": 1,
"format": "percentunit",
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -275,7 +286,7 @@
"steppedLine": false,
"targets": [
{
"expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\"})",
"expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(node_memory_MemTotal_bytes{job=\"node-exporter\",cluster=\"$cluster\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -327,11 +338,14 @@
"fill": 1,
"format": "percentunit",
"id": 5,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -351,7 +365,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
"expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"memory\",cluster=\"$cluster\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -403,11 +417,14 @@
"fill": 1,
"format": "percentunit",
"id": 6,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -427,7 +444,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
"expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"memory\",cluster=\"$cluster\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -490,11 +507,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 7,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -578,11 +598,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 8,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -741,7 +764,7 @@
],
"targets": [
{
"expr": "sum(kube_pod_owner{cluster=\"$cluster\"}) by (namespace)",
"expr": "sum(kube_pod_owner{job=\"kube-state-metrics\", cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -861,11 +884,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 9,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -885,7 +911,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace)",
"expr": "sum(container_memory_rss{job=\"cadvisor\", cluster=\"$cluster\", container!=\"\"}) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -949,11 +975,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1112,7 +1141,7 @@
],
"targets": [
{
"expr": "sum(kube_pod_owner{cluster=\"$cluster\"}) by (namespace)",
"expr": "sum(kube_pod_owner{job=\"kube-state-metrics\", cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1130,7 +1159,7 @@
"step": 10
},
{
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace)",
"expr": "sum(container_memory_rss{job=\"cadvisor\", cluster=\"$cluster\", container!=\"\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1148,7 +1177,7 @@
"step": 10
},
{
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
"expr": "sum(container_memory_rss{job=\"cadvisor\", cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1166,7 +1195,7 @@
"step": 10
},
{
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"expr": "sum(container_memory_rss{job=\"cadvisor\", cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1234,10 +1263,12 @@
"id": 11,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1381,7 +1412,7 @@
],
"targets": [
{
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1390,7 +1421,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1399,7 +1430,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1408,7 +1439,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1417,7 +1448,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1426,7 +1457,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1492,11 +1523,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 12,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1516,7 +1550,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -1568,11 +1602,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 13,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1592,7 +1629,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -1656,11 +1693,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 14,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1680,7 +1720,7 @@
"steppedLine": false,
"targets": [
{
"expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "avg(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -1732,11 +1772,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 15,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1756,7 +1799,7 @@
"steppedLine": false,
"targets": [
{
"expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "avg(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -1820,11 +1863,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 16,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1844,7 +1890,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -1896,11 +1942,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 17,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1920,7 +1969,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -1984,11 +2033,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 18,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -2008,7 +2060,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2060,11 +2112,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 19,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -2084,7 +2139,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2149,11 +2204,14 @@
"decimals": -1,
"fill": 10,
"id": 20,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -2173,7 +2231,7 @@
"steppedLine": false,
"targets": [
{
"expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m])))",
"expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2225,11 +2283,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 21,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -2249,7 +2310,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2313,11 +2374,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 22,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -2465,7 +2529,7 @@
],
"targets": [
{
"expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2474,7 +2538,7 @@
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2483,7 +2547,7 @@
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2492,7 +2556,7 @@
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2501,7 +2565,7 @@
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2510,7 +2574,7 @@
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,

View File

@ -22,11 +22,14 @@
"fill": 1,
"format": "percentunit",
"id": 1,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -46,7 +49,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -98,11 +101,14 @@
"fill": 1,
"format": "percentunit",
"id": 2,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -122,7 +128,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -174,11 +180,14 @@
"fill": 1,
"format": "percentunit",
"id": 3,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -198,7 +207,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -250,11 +259,14 @@
"fill": 1,
"format": "percentunit",
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -274,7 +286,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -337,11 +349,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 5,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -464,11 +479,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 6,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -699,11 +717,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 7,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -746,7 +767,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}) by (pod)",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -826,11 +847,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 8,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1004,7 +1028,7 @@
],
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod)",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1022,7 +1046,7 @@
"step": 10
},
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1040,7 +1064,7 @@
"step": 10
},
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1049,7 +1073,7 @@
"step": 10
},
{
"expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)",
"expr": "sum(container_memory_rss{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1058,7 +1082,7 @@
"step": 10
},
{
"expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)",
"expr": "sum(container_memory_cache{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1067,7 +1091,7 @@
"step": 10
},
{
"expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)",
"expr": "sum(container_memory_swap{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1135,10 +1159,12 @@
"id": 9,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1282,7 +1308,7 @@
],
"targets": [
{
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1291,7 +1317,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1300,7 +1326,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1309,7 +1335,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1318,7 +1344,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1327,7 +1353,7 @@
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1393,11 +1419,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1469,11 +1498,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 11,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1557,11 +1589,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 12,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1633,11 +1668,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 13,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1721,11 +1759,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 14,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1797,11 +1838,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 15,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1886,11 +1930,14 @@
"decimals": -1,
"fill": 10,
"id": 16,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1910,7 +1957,7 @@
"steppedLine": false,
"targets": [
{
"expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m])))",
"expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1962,11 +2009,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 17,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1986,7 +2036,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]))",
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -2050,11 +2100,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 18,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -2202,7 +2255,7 @@
],
"targets": [
{
"expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]))",
"expr": "sum by(pod) (rate(container_fs_reads_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2211,7 +2264,7 @@
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]))",
"expr": "sum by(pod) (rate(container_fs_writes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2220,7 +2273,7 @@
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]))",
"expr": "sum by(pod) (rate(container_fs_reads_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2229,7 +2282,7 @@
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]))",
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2238,7 +2291,7 @@
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]))",
"expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2247,7 +2300,7 @@
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\"}[5m]))",
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2336,7 +2389,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(kube_pod_info, cluster)",
"query": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
@ -2359,7 +2412,7 @@
"multi": false,
"name": "namespace",
"options": [ ],
"query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)",
"query": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,

View File

@ -21,11 +21,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 1,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -38,12 +41,32 @@
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"seriesOverrides": [
{
"alias": "max capacity",
"color": "#F2495C",
"dashes": true,
"fill": 0,
"hiddenSeries": true,
"hideTooltip": true,
"legend": true,
"linewidth": 2,
"stack": false
}
],
"spaceLength": 10,
"span": 12,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(kube_node_status_capacity{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "max capacity",
"legendLink": null,
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "time_series",
@ -109,11 +132,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 2,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -344,11 +370,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 3,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -361,12 +390,32 @@
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"seriesOverrides": [
{
"alias": "max capacity",
"color": "#F2495C",
"dashes": true,
"fill": 0,
"hiddenSeries": true,
"hideTooltip": true,
"legend": true,
"linewidth": 2,
"stack": false
}
],
"spaceLength": 10,
"span": 12,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(kube_node_status_capacity{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "max capacity",
"legendLink": null,
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\", container!=\"\"}) by (pod)",
"format": "time_series",
@ -432,11 +481,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -762,7 +814,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(kube_pod_info, cluster)",
"query": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
@ -785,7 +837,7 @@
"multi": true,
"name": "node",
"options": [ ],
"query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)",
"query": "label_values(kube_node_info{cluster=\"$cluster\"}, node)",
"refresh": 2,
"regex": "",
"sort": 1,

View File

@ -21,11 +21,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 1,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -72,7 +75,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n",
"expr": "sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "requests",
@ -80,7 +83,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n",
"expr": "sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "limits",
@ -144,11 +147,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 2,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -168,7 +174,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)",
"expr": "sum(increase(container_cpu_cfs_throttled_periods_total{job=\"cadvisor\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container) /sum(increase(container_cpu_cfs_periods_total{job=\"cadvisor\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{container}}",
@ -241,11 +247,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 3,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -476,11 +485,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -521,7 +533,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{container}}",
@ -529,7 +541,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n",
"expr": "sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "requests",
@ -537,7 +549,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n",
"expr": "sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "limits",
@ -601,11 +613,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 5,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -779,7 +794,7 @@
],
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -797,7 +812,7 @@
"step": 10
},
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -815,7 +830,7 @@
"step": 10
},
{
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"expr": "sum(container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -824,7 +839,7 @@
"step": 10
},
{
"expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container != \"\", container != \"POD\"}) by (container)",
"expr": "sum(container_memory_rss{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container != \"\", container != \"POD\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -833,7 +848,7 @@
"step": 10
},
{
"expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container != \"\", container != \"POD\"}) by (container)",
"expr": "sum(container_memory_cache{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container != \"\", container != \"POD\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -842,7 +857,7 @@
"step": 10
},
{
"expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container != \"\", container != \"POD\"}) by (container)",
"expr": "sum(container_memory_swap{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container != \"\", container != \"POD\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -910,10 +925,12 @@
"id": 6,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -933,7 +950,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -987,10 +1004,12 @@
"id": 7,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1010,7 +1029,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1076,10 +1095,12 @@
"id": 8,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1099,7 +1120,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1153,10 +1174,12 @@
"id": 9,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1176,7 +1199,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1242,10 +1265,12 @@
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1265,7 +1290,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1319,10 +1344,12 @@
"id": 11,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1342,7 +1369,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1407,11 +1434,14 @@
"decimals": -1,
"fill": 10,
"id": 12,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1431,7 +1461,7 @@
"steppedLine": false,
"targets": [
{
"expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[5m])))",
"expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@ -1439,7 +1469,7 @@
"step": 10
},
{
"expr": "ceil(sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[5m])))",
"expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@ -1491,11 +1521,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 13,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1515,7 +1548,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[5m]))",
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@ -1523,7 +1556,7 @@
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[5m]))",
"expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@ -1588,11 +1621,14 @@
"decimals": -1,
"fill": 10,
"id": 14,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1612,7 +1648,7 @@
"steppedLine": false,
"targets": [
{
"expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m])))",
"expr": "ceil(sum by(container) (rate(container_fs_reads_total{job=\"cadvisor\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"cadvisor\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{container}}",
@ -1664,11 +1700,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 15,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1688,7 +1727,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]))",
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"cadvisor\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"cadvisor\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{container}}",
@ -1752,11 +1791,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 16,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1904,7 +1946,7 @@
],
"targets": [
{
"expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]))",
"expr": "sum by(container) (rate(container_fs_reads_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1913,7 +1955,7 @@
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]))",
"expr": "sum by(container) (rate(container_fs_writes_total{job=\"cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1922,7 +1964,7 @@
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]))",
"expr": "sum by(container) (rate(container_fs_reads_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1931,7 +1973,7 @@
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]))",
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1940,7 +1982,7 @@
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]))",
"expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1949,7 +1991,7 @@
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=\"$pod\"}[5m]))",
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2038,7 +2080,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(kube_pod_info, cluster)",
"query": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
@ -2061,7 +2103,7 @@
"multi": false,
"name": "namespace",
"options": [ ],
"query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)",
"query": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,
@ -2084,7 +2126,7 @@
"multi": false,
"name": "pod",
"options": [ ],
"query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)",
"query": "label_values(kube_pod_info{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, pod)",
"refresh": 2,
"regex": "",
"sort": 1,

View File

@ -21,11 +21,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 1,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -109,11 +112,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 2,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -251,7 +257,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -260,7 +266,7 @@
"step": 10
},
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -269,7 +275,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -278,7 +284,7 @@
"step": 10
},
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -344,11 +350,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 3,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -432,11 +441,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -574,7 +586,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -583,7 +595,7 @@
"step": 10
},
{
"expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -592,7 +604,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -601,7 +613,7 @@
"step": 10
},
{
"expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -669,10 +681,12 @@
"id": 5,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -816,7 +830,7 @@
],
"targets": [
{
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -825,7 +839,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -834,7 +848,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -843,7 +857,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -852,7 +866,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -861,7 +875,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -927,11 +941,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 6,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -951,7 +968,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1003,11 +1020,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 7,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1027,7 +1047,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1091,11 +1111,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 8,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1115,7 +1138,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(avg(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1167,11 +1190,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 9,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1191,7 +1217,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(avg(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1255,11 +1281,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1279,7 +1308,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1331,11 +1360,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 11,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1355,7 +1387,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1419,11 +1451,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 12,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1443,7 +1478,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1495,11 +1530,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 13,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1519,7 +1557,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1606,7 +1644,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(kube_pod_info, cluster)",
"query": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
@ -1629,30 +1667,7 @@
"multi": false,
"name": "namespace",
"options": [ ],
"query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"text": "",
"value": ""
},
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "workload",
"options": [ ],
"query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)",
"query": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,
@ -1675,7 +1690,30 @@
"multi": false,
"name": "type",
"options": [ ],
"query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)",
"query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\"}, workload_type)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"text": "",
"value": ""
},
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "workload",
"options": [ ],
"query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}, workload)",
"refresh": 2,
"regex": "",
"sort": 1,

View File

@ -21,11 +21,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 1,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -148,11 +151,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 2,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -329,7 +335,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -338,7 +344,7 @@
"step": 10
},
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -347,7 +353,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -356,7 +362,7 @@
"step": 10
},
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -422,11 +428,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 3,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -469,7 +478,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}} - {{workload_type}}",
@ -549,11 +558,14 @@
"datasource": "$datasource",
"fill": 1,
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -721,7 +733,7 @@
"step": 10
},
{
"expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -730,7 +742,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -739,7 +751,7 @@
"step": 10
},
{
"expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -748,7 +760,7 @@
"step": 10
},
{
"expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -757,7 +769,7 @@
"step": 10
},
{
"expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n container_memory_working_set_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -825,10 +837,12 @@
"id": 5,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -987,7 +1001,7 @@
],
"targets": [
{
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -996,7 +1010,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1005,7 +1019,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1014,7 +1028,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1023,7 +1037,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1032,7 +1046,7 @@
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1098,11 +1112,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 6,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1122,7 +1139,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -1174,11 +1191,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 7,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1198,7 +1218,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -1262,11 +1282,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 8,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1286,7 +1309,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(avg(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -1338,11 +1361,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 9,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1362,7 +1388,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(avg(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -1426,11 +1452,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1450,7 +1479,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -1502,11 +1531,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 11,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1526,7 +1558,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_packets_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -1590,11 +1622,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 12,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1614,7 +1649,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -1666,11 +1701,14 @@
"datasource": "$datasource",
"fill": 10,
"id": 13,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -1690,7 +1728,7 @@
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -1777,7 +1815,30 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(kube_pod_info, cluster)",
"query": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"text": "",
"value": ""
},
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "namespace",
"options": [ ],
"query": "label_values(kube_pod_info{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,
@ -1814,29 +1875,6 @@
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"text": "",
"value": ""
},
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "namespace",
"options": [ ],
"query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},

View File

@ -154,7 +154,7 @@
"refId": "A"
}
],
"title": "Running Container",
"title": "Running Containers",
"transparent": false,
"type": "stat"
},
@ -294,7 +294,7 @@
"pluginVersion": "7",
"targets": [
{
"expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m]))",
"expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -347,7 +347,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (operation_type, instance)",
"expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (operation_type, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{operation_type}}",
@ -432,7 +432,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_type)",
"expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_type)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{operation_type}}",
@ -517,7 +517,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_type, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{operation_type}}",
@ -602,14 +602,14 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} pod",
"refId": "A"
},
{
"expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} worker",
@ -694,14 +694,14 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} pod",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} worker",
@ -788,7 +788,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
"expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_name, volume_plugin)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}",
@ -875,7 +875,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
"expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_name, volume_plugin)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}",
@ -962,7 +962,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))",
"expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_name, volume_plugin, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}",
@ -1047,7 +1047,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, operation_type)",
"expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_type)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{operation_type}}",
@ -1132,7 +1132,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_type, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{operation_type}}",
@ -1218,7 +1218,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}[$__rate_interval])) by (instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -1303,7 +1303,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -1388,7 +1388,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])) by (instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -1473,28 +1473,28 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"2..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "2xx",
"refId": "A"
},
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"3..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "3xx",
"refId": "B"
},
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"4..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "4xx",
"refId": "C"
},
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\",code=~\"5..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "5xx",
@ -1579,7 +1579,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))",
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\"}[$__rate_interval])) by (instance, verb, url, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{verb}} {{url}}",
@ -1749,7 +1749,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[5m])",
"expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\",instance=~\"$instance\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -1893,7 +1893,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",
@ -1927,11 +1927,11 @@
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": null,
"label": "instance",
"multi": false,
"name": "instance",
"options": [ ],
"query": "label_values(kubelet_runtime_operations_total{cluster=\"$cluster\", job=\"kubelet\"}, instance)",
"query": "label_values(up{job=\"kubelet\",cluster=\"$cluster\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 1,

View File

@ -1137,7 +1137,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",

View File

@ -1349,7 +1349,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",

View File

@ -26,13 +26,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 2,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -124,7 +125,11 @@
},
"gridPos": { },
"id": 3,
"interval": null,
"interval": "1m",
"legend": {
"alignAsTable": true,
"rightSide": true
},
"links": [ ],
"mappingType": 1,
"mappingTypes": [
@ -207,13 +212,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -305,7 +311,11 @@
},
"gridPos": { },
"id": 5,
"interval": null,
"interval": "1m",
"legend": {
"alignAsTable": true,
"rightSide": true
},
"links": [ ],
"mappingType": 1,
"mappingTypes": [
@ -388,7 +398,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",
@ -406,7 +416,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(kubelet_volume_stats_capacity_bytes, cluster)",
"query": "label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,

View File

@ -903,7 +903,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",

View File

@ -36,7 +36,11 @@
},
"gridPos": { },
"id": 2,
"interval": null,
"interval": "1m",
"legend": {
"alignAsTable": true,
"rightSide": true
},
"links": [ ],
"mappingType": 1,
"mappingTypes": [
@ -106,13 +110,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 3,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -134,7 +139,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))",
"expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "rate",
@ -187,6 +192,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -215,7 +221,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))",
"expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -281,13 +287,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 5,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -309,7 +316,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))",
"expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "rate",
@ -362,6 +369,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 6,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -390,7 +398,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[$__rate_interval])) by (instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -456,13 +464,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 7,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -484,28 +493,28 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "2xx",
"refId": "A"
},
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "3xx",
"refId": "B"
},
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "4xx",
"refId": "C"
},
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "5xx",
@ -558,13 +567,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 8,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -586,7 +596,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))",
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[$__rate_interval])) by (verb, url, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{verb}} {{url}}",
@ -652,6 +662,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 9,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -680,7 +691,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))",
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, url, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{verb}} {{url}}",
@ -746,13 +757,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -827,13 +839,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 11,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -855,7 +868,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}[5m])",
"expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -908,13 +921,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 12,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -1002,7 +1016,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",
@ -1020,7 +1034,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(kube_pod_info, cluster)",
"query": "label_values(up{job=\"kube-proxy\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
@ -1040,7 +1054,7 @@
"multi": false,
"name": "instance",
"options": [ ],
"query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\"}, instance)",
"query": "label_values(up{job=\"kube-proxy\", cluster=\"$cluster\", job=\"kube-proxy\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 1,

View File

@ -36,7 +36,11 @@
},
"gridPos": { },
"id": 2,
"interval": null,
"interval": "1m",
"legend": {
"alignAsTable": true,
"rightSide": true
},
"links": [ ],
"mappingType": 1,
"mappingTypes": [
@ -106,6 +110,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 3,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -134,28 +139,28 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} e2e",
"refId": "A"
},
{
"expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} binding",
"refId": "B"
},
{
"expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} scheduling algorithm",
"refId": "C"
},
{
"expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} volume",
@ -208,6 +213,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 4,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -236,28 +242,28 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} e2e",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} binding",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} scheduling algorithm",
"refId": "C"
},
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cluster}} {{instance}} volume",
@ -323,13 +329,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 5,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -351,28 +358,28 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "2xx",
"refId": "A"
},
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "3xx",
"refId": "B"
},
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "4xx",
"refId": "C"
},
{
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
"expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "5xx",
@ -425,13 +432,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 6,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -453,7 +461,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))",
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[$__rate_interval])) by (verb, url, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{verb}} {{url}}",
@ -519,6 +527,7 @@
"fillGradient": 0,
"gridPos": { },
"id": 7,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -547,7 +556,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))",
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, url, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{verb}} {{url}}",
@ -613,13 +622,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 8,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -694,13 +704,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 9,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -722,7 +733,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])",
"expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
@ -775,13 +786,14 @@
"fillGradient": 0,
"gridPos": { },
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": false,
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
@ -869,7 +881,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",
@ -907,7 +919,7 @@
"multi": false,
"name": "instance",
"options": [ ],
"query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\"}, instance)",
"query": "label_values(up{job=\"kube-scheduler\", cluster=\"$cluster\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 1,

View File

@ -89,7 +89,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{ pod }}",
@ -184,7 +184,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{ pod }}",
@ -290,7 +290,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(avg(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{ pod }}",
@ -385,7 +385,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{ pod }}",
@ -506,7 +506,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(sum(irate(container_network_receive_bytes_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{pod}}",
@ -597,7 +597,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{pod}}",
@ -699,7 +699,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(sum(irate(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{pod}}",
@ -790,7 +790,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(sum(irate(container_network_transmit_packets_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{pod}}",
@ -901,7 +901,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{pod}}",
@ -992,7 +992,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{job=\"cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{pod}}",
@ -1061,7 +1061,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",
@ -1079,7 +1079,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(kube_pod_info, cluster)",
"query": "label_values(kube_pod_info{job=\"kube-state-metrics\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 0,
@ -1099,14 +1099,14 @@
"value": "kube-system"
},
"datasource": "$datasource",
"definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)",
"definition": "label_values(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\"}, namespace)",
"hide": 0,
"includeAll": true,
"label": null,
"multi": false,
"name": "namespace",
"options": [ ],
"query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)",
"query": "label_values(container_network_receive_packets_total{job=\"cadvisor\", cluster=\"$cluster\"}, namespace)",
"refresh": 2,
"regex": "",
"skipUrlSync": false,

View File

@ -8,7 +8,7 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "release-0.9"
"version": "release-0.11"
}
],
"legacyImports": true

View File

@ -8,8 +8,8 @@
"subdir": "grafana"
}
},
"version": "90f38916f1f8a310a715d18e36f787f84df4ddf5",
"sum": "0kZ1pnuIirDtbg6F9at5+NQOwKNONIGEPq0eECzvRkI="
"version": "d039275e4916aceae1c137120882e01d857787ac",
"sum": "515vMn4x4tP8vegL4HLW0nDO5+njGTgnDZB5OOhtsCI="
},
{
"source": {
@ -18,9 +18,19 @@
"subdir": "contrib/mixin"
}
},
"version": "74aa38ec10bc22d34ffd204f46df6e460b78d855",
"version": "19002cfc689fba2b8f56605e5797bf79f8b61fdd",
"sum": "W/Azptf1PoqjyMwJON96UY69MFugDA4IAYiKURscryc="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafana.git",
"subdir": "grafana-mixin"
}
},
"version": "3eed09056849ab873b867b561b7ce580ef2c75ba",
"sum": "MkjR7zCgq6MUZgjDzop574tFKoTX2OBr7DTwm1K+Ofs="
},
{
"source": {
"git": {
@ -38,7 +48,7 @@
"subdir": "grafana-builder"
}
},
"version": "c132c4afcf17491718539db4c2d94c0ea4346120",
"version": "dbf6fc14105c28b6fd0253005f7ca2da37d3d4e1",
"sum": "tDR6yT2GVfw0wTU12iZH+m01HrbIr6g/xN+/8nzNkU0="
},
{
@ -48,8 +58,8 @@
"subdir": ""
}
},
"version": "fb9d8ed4bc4a3d6efac525f72e8a0d2c583a0fe2",
"sum": "xjKkdp+5fkekCNBUIgZCHTRmVdUEmQNFKslrL2Ho8gs="
"version": "b8f44bb7be728423836bef0e904ec7166895a34b",
"sum": "LCgSosxceeYuoau5fYSPtE5eXOFe46DxexfkrctUv7c="
},
{
"source": {
@ -58,7 +68,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "eb98d4f74e8ac9c30b1f0e815b07bed31da76c8f",
"version": "5e44626d70c2bf2d35c37f3fee5a6261a5335cc6",
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
},
{
@ -68,8 +78,8 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "f170cc73f11c1580d7f38af746be0f2fa79c6a1e",
"sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE="
"version": "0567e1e1b981755e563d2244fa1659563f2cddbc",
"sum": "P0dCnbzyPScQGNXwXRcwiPkMLeTq0IPNbSTysDbySnM="
},
{
"source": {
@ -78,7 +88,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "f170cc73f11c1580d7f38af746be0f2fa79c6a1e",
"version": "0567e1e1b981755e563d2244fa1659563f2cddbc",
"sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk="
},
{
@ -88,8 +98,8 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "452aaed72e36acb31cae93cfa85a5d9c3d3d2ec7",
"sum": "pupXEvlRbhLdEO9b8LfFZB66+Z7fEqvRZ9m3MyEvsv4="
"version": "e3066575dc8be21f578f12887563bda3ee7a2eff",
"sum": "nNEMDrb5sQDOxJ20ITDvldyfIbbiGcVr8Bq46PH2ww8="
},
{
"source": {
@ -98,8 +108,8 @@
"subdir": "jsonnet/mixin"
}
},
"version": "83fe36566f4e0894eb5ffcd2638a0f039a17bdeb",
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=",
"version": "5db6996d3ca995e66301c53c33959fd64c3f6ae6",
"sum": "GQmaVFJwKMiD/P4n3N2LrAZVcwutriWrP8joclDtBYQ=",
"name": "prometheus-operator-mixin"
},
{
@ -109,8 +119,8 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "83fe36566f4e0894eb5ffcd2638a0f039a17bdeb",
"sum": "J1G++A8hrtr3+OZQMmcNeb1w/C30bXqqwpwHL/Xhsd4="
"version": "5db6996d3ca995e66301c53c33959fd64c3f6ae6",
"sum": "pUggCYwO/3Y/p6Vgryx8Y4KO3QkJ+GqimrZtn/luzzI="
},
{
"source": {
@ -119,8 +129,8 @@
"subdir": "doc/alertmanager-mixin"
}
},
"version": "b408b522bc653d014e53035e59fa394cc1edd762",
"sum": "pep+dHzfIjh2SU5pEkwilMCAT/NoL6YYflV4x8cr7vU=",
"version": "14b01e6a34dd3155768c7e9bd5c4376055de9419",
"sum": "f3iZDUXQ/YWB5yDCY7VLD5bs442+3CdJgXJhJyWhNf8=",
"name": "alertmanager"
},
{
@ -130,8 +140,8 @@
"subdir": "docs/node-mixin"
}
},
"version": "832909dd257eb368cf83363ffcae3ab84cb4bcb1",
"sum": "MmxGhE2PJ1a52mk2x7vDpMT2at4Jglbud/rK74CB5i0="
"version": "a2321e7b940ddcff26873612bccdf7cd4c42b6b6",
"sum": "MlWDAKGZ+JArozRKdKEvewHeWn8j2DNBzesJfLVd0dk="
},
{
"source": {
@ -140,10 +150,20 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "751ca03faddc9c64089c41d0da370a3a0b477742",
"sum": "AS8WYFi/z10BZSF6DFkKBscjB32XDMM7iIso7CO/FyI=",
"version": "d7e7b8e04b5ecdc1dd153534ba376a622b72741b",
"sum": "APXOIP3B3dZ3Tyh7L2UhyWR8Vbf5+9adTLz/ya7n6uU=",
"name": "prometheus"
},
{
"source": {
"git": {
"remote": "https://github.com/pyrra-dev/pyrra.git",
"subdir": "config/crd/bases"
}
},
"version": "3738a607a42a0c9566587a49cec7587cc92d61bd",
"sum": "GQ0GFKGdIWKx1b78VRs6jtC4SMqkBjT5jl65QUjPKK4="
},
{
"source": {
"git": {
@ -151,8 +171,8 @@
"subdir": "mixin"
}
},
"version": "ff363498fc95cfe17de894d7237bcf38bdd0bc36",
"sum": "cajthvLKDjYgYHCKQU2g/pTMRkxcbuJEvTnCyJOihl8=",
"version": "17c576472d80972bfd3705e1e0a08e6f8da8e04b",
"sum": "dBm9ML50quhu6dwTIgfNmVruMqfaUeQVCO/6EKtQLxE=",
"name": "thanos-mixin"
}
],

View File

@ -6,10 +6,10 @@ dashboards:
url: https://grafana.com/api/dashboards/12539/revisions/5/download
tags: ['kubernetes', 'DNS']
- name: etcd
url: https://grafana.com/api/dashboards/3070/revisions/3/download
url: https://grafana.com/api/dashboards/15308/revisions/1/download
tags: ['kubernetes', 'etcd']
- name: node
url: https://grafana.com/api/dashboards/1860/revisions/23/download
url: https://grafana.com/api/dashboards/1860/revisions/27/download
tags: ['kubernetes']
# cd dashboards; for f in *.json; do echo "- name: ${f%%.json}" >> ../dashboards.yaml; echo " url: file://dashboards/$f" >> ../dashboards.yaml; done; cd -
- name: apiserver

View File

@ -4,9 +4,10 @@
"metadata": {
"labels": {
"app.kubernetes.io/component": "alert-router",
"app.kubernetes.io/instance": "main",
"app.kubernetes.io/name": "alertmanager",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "0.22.2",
"app.kubernetes.io/version": "0.24.0",
"prometheus": "k8s",
"role": "alert-rules"
},

View File

@ -41,6 +41,18 @@
"labels": {
"severity": "none"
}
},
{
"alert": "InfoInhibitor",
"annotations": {
"description": "This is an alert that is used to inhibit info alerts.\nBy themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with\nother alerts.\nThis alert fires whenever there's a severity=\"info\" alert, and stops firing when another alert with a\nseverity of 'warning' or 'critical' starts firing on the same namespace.\nThis alert should be routed to a null receiver and configured to inhibit alerts with severity=\"info\".\n",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor",
"summary": "Info-level alert inhibition."
},
"expr": "ALERTS{severity = \"info\"} == 1 unless on(namespace) ALERTS{alertname != \"InfoInhibitor\", severity =~ \"warning|critical\", alertstate=\"firing\"} == 1",
"labels": {
"severity": "none"
}
}
]
},
@ -86,7 +98,7 @@
"record": "cluster:node_cpu:sum_rate5m"
},
{
"expr": "cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))",
"expr": "cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))",
"record": "cluster:node_cpu:ratio"
}
]

View File

@ -6,7 +6,7 @@
"app.kubernetes.io/component": "exporter",
"app.kubernetes.io/name": "kube-state-metrics",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "2.1.1",
"app.kubernetes.io/version": "2.5.0",
"prometheus": "k8s",
"role": "alert-rules"
},

View File

@ -36,7 +36,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready",
"summary": "Pod has been in a non-ready state for more than 15 minutes."
},
"expr": "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n",
"expr": "sum by (namespace, pod, cluster) (\n max by(namespace, pod, cluster) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}\n ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (\n 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -114,7 +114,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck",
"summary": "DaemonSet rollout is stuck."
},
"expr": "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n",
"expr": "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -123,11 +123,11 @@
{
"alert": "KubeContainerWaiting",
"annotations": {
"description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.",
"description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting",
"summary": "Pod container waiting longer than 1 hour"
},
"expr": "sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0\n",
"expr": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0\n",
"for": "1h",
"labels": {
"severity": "warning"
@ -160,14 +160,13 @@
}
},
{
"alert": "KubeJobCompletion",
"alert": "KubeJobNotCompleted",
"annotations": {
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobcompletion",
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ \"43200\" | humanizeDuration }} to complete.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted",
"summary": "Job did not complete in time"
},
"expr": "kube_job_spec_completions{job=\"kube-state-metrics\"} - kube_job_status_succeeded{job=\"kube-state-metrics\"} > 0\n",
"for": "12h",
"expr": "time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"}\n and\nkube_job_status_active{job=\"kube-state-metrics\"} > 0) > 43200\n",
"labels": {
"severity": "warning"
}
@ -232,7 +231,7 @@
{
"alert": "KubeMemoryOvercommit",
"annotations": {
"description": "Cluster has overcommitted memory resource requests for Pods by {{ $value }} bytes and cannot tolerate node failure.",
"description": "Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit",
"summary": "Cluster has overcommitted memory resource requests."
},
@ -249,7 +248,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit",
"summary": "Cluster has overcommitted CPU resource requests."
},
"expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\nsum(kube_node_status_allocatable{resource=\"cpu\"})\n > 1.5\n",
"expr": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(cpu|requests.cpu)\"}))\n /\nsum(kube_node_status_allocatable{resource=\"cpu\", job=\"kube-state-metrics\"})\n > 1.5\n",
"for": "5m",
"labels": {
"severity": "warning"
@ -262,7 +261,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit",
"summary": "Cluster has overcommitted memory resource requests."
},
"expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable{resource=\"memory\",job=\"kube-state-metrics\"})\n > 1.5\n",
"expr": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"}))\n /\nsum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"})\n > 1.5\n",
"for": "5m",
"labels": {
"severity": "warning"
@ -332,7 +331,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup",
"summary": "PersistentVolume is filling up."
},
"expr": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\n",
"expr": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n",
"for": "1m",
"labels": {
"severity": "critical"
@ -345,7 +344,33 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup",
"summary": "PersistentVolume is filling up."
},
"expr": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\n",
"expr": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n",
"for": "1h",
"labels": {
"severity": "warning"
}
},
{
"alert": "KubePersistentVolumeInodesFillingUp",
"annotations": {
"description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup",
"summary": "PersistentVolumeInodes are filling up."
},
"expr": "(\n kubelet_volume_stats_inodes_free{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_inodes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_inodes_used{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n",
"for": "1m",
"labels": {
"severity": "critical"
}
},
{
"alert": "KubePersistentVolumeInodesFillingUp",
"annotations": {
"description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup",
"summary": "PersistentVolumeInodes are filling up."
},
"expr": "(\n kubelet_volume_stats_inodes_free{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_inodes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_inodes_used{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_inodes_free{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n",
"for": "1h",
"labels": {
"severity": "warning"
@ -376,7 +401,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch",
"summary": "Different semantic versions of Kubernetes components running."
},
"expr": "count(count by (git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) > 1\n",
"expr": "count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) > 1\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -389,7 +414,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors",
"summary": "Kubernetes API server client is experiencing errors."
},
"expr": "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance, job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))\n> 0.01\n",
"expr": "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (cluster, instance, job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace))\n> 0.01\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -468,7 +493,7 @@
{
"alert": "KubeClientCertificateExpiration",
"annotations": {
"description": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.",
"description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration",
"summary": "Client certificate is about to expire."
},
@ -480,7 +505,7 @@
{
"alert": "KubeClientCertificateExpiration",
"annotations": {
"description": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.",
"description": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration",
"summary": "Client certificate is about to expire."
},
@ -490,25 +515,25 @@
}
},
{
"alert": "AggregatedAPIErrors",
"alert": "KubeAggregatedAPIErrors",
"annotations": {
"description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors",
"summary": "An aggregated API has reported errors."
"description": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors",
"summary": "Kubernetes aggregated API has reported errors."
},
"expr": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4\n",
"expr": "sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4\n",
"labels": {
"severity": "warning"
}
},
{
"alert": "AggregatedAPIDown",
"alert": "KubeAggregatedAPIDown",
"annotations": {
"description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown",
"summary": "An aggregated API is down."
"description": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown",
"summary": "Kubernetes aggregated API is down."
},
"expr": "(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85\n",
"expr": "(1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85\n",
"for": "5m",
"labels": {
"severity": "warning"
@ -530,9 +555,9 @@
{
"alert": "KubeAPITerminatedRequests",
"annotations": {
"description": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.",
"description": "The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests",
"summary": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests."
"summary": "The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests."
},
"expr": "sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) / ( sum(rate(apiserver_request_total{job=\"apiserver\"}[10m])) + sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) ) > 0.20\n",
"for": "5m",
@ -578,10 +603,10 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods",
"summary": "Kubelet is running at capacity."
},
"expr": "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n",
"expr": "count by(cluster, node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(cluster, node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n",
"for": "15m",
"labels": {
"severity": "warning"
"severity": "info"
}
},
{
@ -591,7 +616,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping",
"summary": "Node readiness status is flapping."
},
"expr": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (node) > 2\n",
"expr": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) > 2\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -617,7 +642,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh",
"summary": "Kubelet Pod startup latency is too high."
},
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} > 60\n",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} > 60\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -752,98 +777,98 @@
"name": "kube-apiserver-burnrate.rules",
"rules": [
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate1d"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate1h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate2h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate30m"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate3d"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate5m"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:burnrate6h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate1d"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate1h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate2h"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate30m"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate3d"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n",
"labels": {
"verb": "write"
},
"record": "apiserver_request:burnrate5m"
},
{
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n",
"expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n",
"labels": {
"verb": "write"
},
@ -855,41 +880,20 @@
"name": "kube-apiserver-histogram.rules",
"rules": [
{
"expr": "histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) > 0\n",
"expr": "histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))) > 0\n",
"labels": {
"quantile": "0.99",
"verb": "read"
},
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
"record": "cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) > 0\n",
"expr": "histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))) > 0\n",
"labels": {
"quantile": "0.99",
"verb": "write"
},
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"labels": {
"quantile": "0.99"
},
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"labels": {
"quantile": "0.9"
},
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"labels": {
"quantile": "0.5"
},
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
"record": "cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile"
}
]
},
@ -916,21 +920,37 @@
"record": "code:apiserver_request_total:increase30d"
},
{
"expr": "1 - (\n (\n # write too slow\n sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))\n -\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30d]))\n ) +\n (\n # read too slow\n sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"LIST|GET\"}[30d]))\n -\n (\n (\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30d]))\n +\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30d]))\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n",
"expr": "sum by (cluster, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h]))\n",
"record": "cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h"
},
{
"expr": "sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[30d]) * 24 * 30)\n",
"record": "cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d"
},
{
"expr": "sum by (cluster, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h]))\n",
"record": "cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h"
},
{
"expr": "sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[30d]) * 24 * 30)\n",
"record": "cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d"
},
{
"expr": "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"30\"})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n",
"labels": {
"verb": "all"
},
"record": "apiserver_request:availability30d"
},
{
"expr": "1 - (\n sum by (cluster) (increase(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30d]))\n -\n (\n # too slow\n (\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30d]))\n +\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30d]))\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n",
"expr": "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"30\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n",
"labels": {
"verb": "read"
},
"record": "apiserver_request:availability30d"
},
{
"expr": "1 - (\n (\n # too slow\n sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))\n -\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30d]))\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n",
"expr": "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n",
"labels": {
"verb": "write"
},
@ -992,7 +1012,7 @@
"record": "node_namespace_pod_container:container_memory_swap"
},
{
"expr": "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n",
"expr": "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n",
"record": "cluster:namespace:pod_memory:active:kube_pod_container_resource_requests"
},
{
@ -1000,7 +1020,7 @@
"record": "namespace_memory:kube_pod_container_resource_requests:sum"
},
{
"expr": "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n",
"expr": "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n",
"record": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests"
},
{
@ -1008,7 +1028,7 @@
"record": "namespace_cpu:kube_pod_container_resource_requests:sum"
},
{
"expr": "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n",
"expr": "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n",
"record": "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits"
},
{
@ -1016,7 +1036,7 @@
"record": "namespace_memory:kube_pod_container_resource_limits:sum"
},
{
"expr": "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n )\n",
"expr": "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n )\n",
"record": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits"
},
{
@ -1043,6 +1063,13 @@
"workload_type": "statefulset"
},
"record": "namespace_workload_pod:kube_pod_owner:relabel"
},
{
"expr": "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"Job\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n",
"labels": {
"workload_type": "job"
},
"record": "namespace_workload_pod:kube_pod_owner:relabel"
}
]
},
@ -1118,7 +1145,7 @@
"name": "node.rules",
"rules": [
{
"expr": "topk by(namespace, pod) (1,\n max by (node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n",
"expr": "topk by(cluster, namespace, pod) (1,\n max by (cluster, node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n",
"record": "node_namespace_pod:kube_pod_info:"
},
{
@ -1128,6 +1155,10 @@
{
"expr": "sum(\n node_memory_MemAvailable_bytes{job=\"node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"} +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n) by (cluster)\n",
"record": ":node_memory_MemAvailable_bytes:sum"
},
{
"expr": "sum(rate(node_cpu_seconds_total{job=\"node-exporter\",mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) /\ncount(sum(node_cpu_seconds_total{job=\"node-exporter\"}) by (cluster, instance, cpu))\n",
"record": "cluster:node_cpu:ratio_rate5m"
}
]
},
@ -1135,21 +1166,21 @@
"name": "kubelet.rules",
"rules": [
{
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n",
"labels": {
"quantile": "0.99"
},
"record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n",
"expr": "histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n",
"labels": {
"quantile": "0.9"
},
"record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n",
"expr": "histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n",
"labels": {
"quantile": "0.5"
},

View File

@ -6,7 +6,7 @@
"app.kubernetes.io/component": "exporter",
"app.kubernetes.io/name": "node-exporter",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "1.2.2",
"app.kubernetes.io/version": "1.3.1",
"prometheus": "k8s",
"role": "alert-rules"
},
@ -25,7 +25,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup",
"summary": "Filesystem is predicted to run out of space within the next 24 hours."
},
"expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"for": "1h",
"labels": {
"severity": "warning"
@ -38,7 +38,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup",
"summary": "Filesystem is predicted to run out of space within the next 4 hours."
},
"expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 10\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"for": "1h",
"labels": {
"severity": "critical"
@ -255,11 +255,11 @@
"name": "node-exporter.rules",
"rules": [
{
"expr": "count without (cpu) (\n count without (mode) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n )\n)\n",
"expr": "count without (cpu, mode) (\n node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}\n)\n",
"record": "instance:node_num_cpu:sum"
},
{
"expr": "1 - avg without (cpu, mode) (\n rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[5m])\n)\n",
"expr": "1 - avg without (cpu) (\n sum without (mode) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode=~\"idle|iowait|steal\"}[5m]))\n)\n",
"record": "instance:node_cpu_utilisation:rate5m"
},
{
@ -267,7 +267,7 @@
"record": "instance:node_load1_per_cpu:ratio"
},
{
"expr": "1 - (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n",
"expr": "1 - (\n (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"}\n +\n node_memory_Cached_bytes{job=\"node-exporter\"}\n +\n node_memory_MemFree_bytes{job=\"node-exporter\"}\n +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n )\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n",
"record": "instance:node_memory_utilisation:ratio"
},
{

View File

@ -6,7 +6,7 @@
"app.kubernetes.io/component": "controller",
"app.kubernetes.io/name": "prometheus-operator",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "0.49.0",
"app.kubernetes.io/version": "0.57.0",
"prometheus": "k8s",
"role": "alert-rules"
},
@ -38,7 +38,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors",
"summary": "Errors while performing watch operations in controller."
},
"expr": "(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m]))) > 0.4\n",
"expr": "(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) > 0.4\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -90,7 +90,7 @@
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready",
"summary": "Prometheus operator not ready"
},
"expr": "min by(namespace, controller) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) == 0)\n",
"expr": "min by (controller,namespace) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) == 0)\n",
"for": "5m",
"labels": {
"severity": "warning"
@ -110,6 +110,24 @@
}
}
]
},
{
"name": "config-reloaders",
"rules": [
{
"alert": "ConfigReloaderSidecarErrors",
"annotations": {
"description": "Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.\nAs a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors",
"summary": "config-reloader sidecar has not had a successful reload for 10m"
},
"expr": "max_over_time(reloader_last_reload_successful{namespace=~\".+\"}[5m]) == 0\n",
"for": "10m",
"labels": {
"severity": "warning"
}
}
]
}
]
}

View File

@ -4,9 +4,10 @@
"metadata": {
"labels": {
"app.kubernetes.io/component": "prometheus",
"app.kubernetes.io/instance": "k8s",
"app.kubernetes.io/name": "prometheus",
"app.kubernetes.io/part-of": "kube-prometheus",
"app.kubernetes.io/version": "2.29.1",
"app.kubernetes.io/version": "2.36.1",
"prometheus": "k8s",
"role": "alert-rules"
},
@ -226,6 +227,32 @@
"severity": "warning"
}
},
{
"alert": "PrometheusScrapeBodySizeLimitHit",
"annotations": {
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf \"%.0f\" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit",
"summary": "Prometheus has dropped some targets that exceeded body size limit."
},
"expr": "increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n",
"for": "15m",
"labels": {
"severity": "warning"
}
},
{
"alert": "PrometheusScrapeSampleLimitHit",
"annotations": {
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf \"%.0f\" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit",
"summary": "Prometheus has failed scrapes that have exceeded the configured sample limit."
},
"expr": "increase(prometheus_target_scrapes_exceeded_sample_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n",
"for": "15m",
"labels": {
"severity": "warning"
}
},
{
"alert": "PrometheusTargetSyncFailure",
"annotations": {

File diff suppressed because one or more lines are too long

View File

@ -36,6 +36,26 @@ spec:
expr: vector(1)
labels:
severity: none
- alert: InfoInhibitor
annotations:
description: 'This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
other alerts.
This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a
severity of ''warning'' or ''critical'' starts firing on the same namespace.
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none
- name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
@ -61,7 +81,7 @@ spec:
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: kube-prometheus-general.rules
rules:

View File

@ -25,7 +25,7 @@ spec:
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n"
expr: "sum by (namespace, pod, cluster) (\n max by(namespace, pod, cluster) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}\n ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (\n 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n"
for: 15m
labels:
severity: warning
@ -79,16 +79,16 @@ spec:
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
expr: "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n"
for: 15m
labels:
severity: warning
- alert: KubeContainerWaiting
annotations:
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.
description: pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: 'sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
expr: 'sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
'
for: 1h
@ -114,15 +114,12 @@ spec:
for: 15m
labels:
severity: warning
- alert: KubeJobCompletion
- alert: KubeJobNotCompleted
annotations:
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than 12 hours to complete.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobcompletion
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: 'kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
'
for: 12h
expr: "time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"}\n and\nkube_job_status_active{job=\"kube-state-metrics\"} > 0) > 43200\n"
labels:
severity: warning
- alert: KubeJobFailed
@ -173,7 +170,7 @@ spec:
severity: warning
- alert: KubeMemoryOvercommit
annotations:
description: Cluster has overcommitted memory resource requests for Pods by {{`{{`}} $value {{`}}`}} bytes and cannot tolerate node failure.
description: Cluster has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: 'sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
@ -191,7 +188,7 @@ spec:
description: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\nsum(kube_node_status_allocatable{resource=\"cpu\"})\n > 1.5\n"
expr: "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(cpu|requests.cpu)\"}))\n /\nsum(kube_node_status_allocatable{resource=\"cpu\", job=\"kube-state-metrics\"})\n > 1.5\n"
for: 5m
labels:
severity: warning
@ -200,7 +197,7 @@ spec:
description: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
expr: "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable{resource=\"memory\",job=\"kube-state-metrics\"})\n > 1.5\n"
expr: "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"}))\n /\nsum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"})\n > 1.5\n"
for: 5m
labels:
severity: warning
@ -247,7 +244,7 @@ spec:
description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\n"
expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n"
for: 1m
labels:
severity: critical
@ -256,7 +253,25 @@ spec:
description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\n"
expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n"
for: 1h
labels:
severity: warning
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
summary: PersistentVolumeInodes are filling up.
expr: "(\n kubelet_volume_stats_inodes_free{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_inodes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_inodes_used{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n"
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
summary: PersistentVolumeInodes are filling up.
expr: "(\n kubelet_volume_stats_inodes_free{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_inodes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_inodes_used{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_inodes_free{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n"
for: 1h
labels:
severity: warning
@ -278,7 +293,7 @@ spec:
description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: 'count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
expr: 'count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
'
for: 15m
@ -289,7 +304,7 @@ spec:
description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance, job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))\n> 0.01\n"
expr: "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (cluster, instance, job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace))\n> 0.01\n"
for: 15m
labels:
severity: warning
@ -367,7 +382,7 @@ spec:
rules:
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@ -377,7 +392,7 @@ spec:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@ -385,22 +400,22 @@ spec:
'
labels:
severity: critical
- alert: AggregatedAPIErrors
- alert: KubeAggregatedAPIErrors
annotations:
description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors
summary: An aggregated API has reported errors.
expr: 'sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
expr: 'sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
'
labels:
severity: warning
- alert: AggregatedAPIDown
- alert: KubeAggregatedAPIDown
annotations:
description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown
summary: An aggregated API is down.
expr: '(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
expr: '(1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
'
for: 5m
@ -419,9 +434,9 @@ spec:
severity: critical
- alert: KubeAPITerminatedRequests
annotations:
description: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
description: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
summary: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
summary: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
expr: 'sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
'
@ -457,16 +472,16 @@ spec:
description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
summary: Kubelet is running at capacity.
expr: "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n"
expr: "count by(cluster, node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(cluster, node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n"
for: 15m
labels:
severity: warning
severity: info
- alert: KubeNodeReadinessFlapping
annotations:
description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: 'sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
expr: 'sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2
'
for: 15m
@ -488,7 +503,7 @@ spec:
description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: 'histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
expr: 'histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
'
for: 15m
@ -595,96 +610,85 @@ spec:
severity: critical
- name: kube-apiserver-burnrate.rules
rules:
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n"
labels:
verb: read
record: apiserver_request:burnrate1d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n"
labels:
verb: read
record: apiserver_request:burnrate1h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n"
labels:
verb: read
record: apiserver_request:burnrate2h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"\
,code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n"
labels:
verb: read
record: apiserver_request:burnrate30m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n"
labels:
verb: read
record: apiserver_request:burnrate3d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n"
labels:
verb: read
record: apiserver_request:burnrate5m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"\
5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n"
labels:
verb: read
record: apiserver_request:burnrate6h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n"
labels:
verb: write
record: apiserver_request:burnrate1d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n"
labels:
verb: write
record: apiserver_request:burnrate1h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n"
labels:
verb: write
record: apiserver_request:burnrate2h
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n"
labels:
verb: write
record: apiserver_request:burnrate30m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n"
labels:
verb: write
record: apiserver_request:burnrate3d
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n"
labels:
verb: write
record: apiserver_request:burnrate5m
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n"
- expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n"
labels:
verb: write
record: apiserver_request:burnrate6h
- name: kube-apiserver-histogram.rules
rules:
- expr: 'histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
- expr: 'histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
'
labels:
quantile: '0.99'
verb: read
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
'
labels:
quantile: '0.99'
verb: write
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
'
labels:
quantile: '0.99'
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
'
labels:
quantile: '0.9'
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
'
labels:
quantile: '0.5'
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile
- interval: 3m
name: kube-apiserver-availability.rules
rules:
@ -704,16 +708,32 @@ spec:
labels:
verb: write
record: code:apiserver_request_total:increase30d
- expr: "1 - (\n (\n # write too slow\n sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))\n -\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30d]))\n ) +\n (\n # read too slow\n sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"LIST|GET\"}[30d]))\n -\n (\n (\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30d]))\n +\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30d]))\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"}\
\ or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n"
- expr: 'sum by (cluster, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h]))
'
record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h
- expr: 'sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[30d]) * 24 * 30)
'
record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d
- expr: 'sum by (cluster, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h]))
'
record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h
- expr: 'sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
'
record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d
- expr: "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"\
cluster\",le=\"30\"})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n"
labels:
verb: all
record: apiserver_request:availability30d
- expr: "1 - (\n sum by (cluster) (increase(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30d]))\n -\n (\n # too slow\n (\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30d]))\n +\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30d]))\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n"
- expr: "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"30\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n"
labels:
verb: read
record: apiserver_request:availability30d
- expr: "1 - (\n (\n # too slow\n sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))\n -\n sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30d]))\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n"
- expr: "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n"
labels:
verb: write
record: apiserver_request:availability30d
@ -757,19 +777,19 @@ spec:
record: node_namespace_pod_container:container_memory_cache
- expr: "container_memory_swap{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n"
record: node_namespace_pod_container:container_memory_swap
- expr: "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n"
- expr: "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n"
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
- expr: "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n"
record: namespace_memory:kube_pod_container_resource_requests:sum
- expr: "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n"
- expr: "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n"
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
- expr: "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n"
record: namespace_cpu:kube_pod_container_resource_requests:sum
- expr: "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n"
- expr: "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n"
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
- expr: "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n"
record: namespace_memory:kube_pod_container_resource_limits:sum
- expr: "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n )\n"
- expr: "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n )\n"
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
- expr: "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n"
record: namespace_cpu:kube_pod_container_resource_limits:sum
@ -785,6 +805,10 @@ spec:
labels:
workload_type: statefulset
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"Job\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n"
labels:
workload_type: job
record: namespace_workload_pod:kube_pod_owner:relabel
- name: kube-scheduler.rules
rules:
- expr: 'histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
@ -843,27 +867,33 @@ spec:
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- name: node.rules
rules:
- expr: "topk by(namespace, pod) (1,\n max by (node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n"
- expr: "topk by(cluster, namespace, pod) (1,\n max by (cluster, node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n"
record: 'node_namespace_pod:kube_pod_info:'
- expr: "count by (cluster, node) (sum by (node, cpu) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n* on (namespace, pod) group_left(node)\n topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)\n))\n"
record: node:node_num_cpu:sum
- expr: "sum(\n node_memory_MemAvailable_bytes{job=\"node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"} +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n) by (cluster)\n"
record: :node_memory_MemAvailable_bytes:sum
- expr: 'sum(rate(node_cpu_seconds_total{job="node-exporter",mode!="idle",mode!="iowait",mode!="steal"}[5m])) /
count(sum(node_cpu_seconds_total{job="node-exporter"}) by (cluster, instance, cpu))
'
record: cluster:node_cpu:ratio_rate5m
- name: kubelet.rules
rules:
- expr: 'histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
- expr: 'histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
'
labels:
quantile: '0.99'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
- expr: 'histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
'
labels:
quantile: '0.9'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: 'histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
- expr: 'histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
'
labels:

View File

@ -14,7 +14,7 @@ spec:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 1h
labels:
severity: warning
@ -23,7 +23,7 @@ spec:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 10\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
for: 1h
labels:
severity: critical
@ -188,13 +188,13 @@ spec:
severity: critical
- name: node-exporter.rules
rules:
- expr: "count without (cpu) (\n count without (mode) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n )\n)\n"
- expr: "count without (cpu, mode) (\n node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}\n)\n"
record: instance:node_num_cpu:sum
- expr: "1 - avg without (cpu, mode) (\n rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[5m])\n)\n"
- expr: "1 - avg without (cpu) (\n sum without (mode) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode=~\"idle|iowait|steal\"}[5m]))\n)\n"
record: instance:node_cpu_utilisation:rate5m
- expr: "(\n node_load1{job=\"node-exporter\"}\n/\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n"
record: instance:node_load1_per_cpu:ratio
- expr: "1 - (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n"
- expr: "1 - (\n (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"}\n +\n node_memory_Cached_bytes{job=\"node-exporter\"}\n +\n node_memory_MemFree_bytes{job=\"node-exporter\"}\n +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n )\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n"
record: instance:node_memory_utilisation:ratio
- expr: 'rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])

View File

@ -25,7 +25,7 @@ spec:
description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller.
expr: '(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
expr: '(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.4
'
for: 15m
@ -69,7 +69,7 @@ spec:
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
expr: 'min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
expr: 'min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
'
for: 5m
@ -86,4 +86,19 @@ spec:
for: 5m
labels:
severity: warning
- name: config-reloaders
rules:
- alert: ConfigReloaderSidecarErrors
annotations:
description: 'Errors encountered while the {{`{{`}}$labels.pod{{`}}`}} config-reloader sidecar attempts to sync config in {{`{{`}}$labels.namespace{{`}}`}} namespace.
As a result, configuration for service running in {{`{{`}}$labels.pod{{`}}`}} may be stale and cannot be updated anymore.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors
summary: config-reloader sidecar has not had a successful reload for 10m
expr: 'max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
'
for: 10m
labels:
severity: warning

View File

@ -181,6 +181,28 @@ spec:
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeBodySizeLimitHit
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed {{`{{`}} printf "%.0f" $value {{`}}`}} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit
summary: Prometheus has dropped some targets that exceeded body size limit.
expr: 'increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
'
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeSampleLimitHit
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed {{`{{`}} printf "%.0f" $value {{`}}`}} scrapes in the last 5m because some targets exceeded the configured sample_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit
summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
'
for: 15m
labels:
severity: warning
- alert: PrometheusTargetSyncFailure
annotations:
description: '{{`{{`}} printf "%.0f" $value {{`}}`}} targets in Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} have failed to sync because invalid configuration was supplied.'

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero-storage
description: KubeZero umbrella chart for all things storage incl. AWS EBS/EFS, openEBS-lvm, gemini
type: application
version: 0.7.0
version: 0.7.1
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:
@ -28,7 +28,7 @@ dependencies:
condition: gemini.enabled
# repository: https://charts.fairwinds.com/stable
- name: aws-ebs-csi-driver
version: 2.10.1
version: 2.11.0
condition: aws-ebs-csi-driver.enabled
# repository: https://kubernetes-sigs.github.io/aws-ebs-csi-driver
- name: aws-efs-csi-driver

View File

@ -1,9 +1,15 @@
# Helm chart
# v2.11.0
* Bump app/driver to version `v1.11.3`
* Add support for leader election tuning for `csi-provisioner` and `csi-attacher` ([#1371](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/pull/1371), [@moogzy](https://github.com/moogzy))
* Change `fsGroupPolicy` to `File` ([#1377](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/pull/1377), [@ConnorJC3](https://github.com/ConnorJC3))
* Allow all taint for `csi-node` by default ([#1381](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/pull/1381), [@gtxu](https://github.com/gtxu))
## v2.10.1
* Bump app/driver to version `v1.11.2`
## 2.10.0
## v2.10.0
* Implement securityContext for containers
* Add securityContext for node pod
* Utilize more secure defaults for securityContext

View File

@ -3,7 +3,7 @@ annotations:
- kind: added
description: Custom controller.updateStrategy to set controller deployment strategy.
apiVersion: v2
appVersion: 1.11.2
appVersion: 1.11.3
description: A Helm chart for AWS EBS CSI Driver
home: https://github.com/kubernetes-sigs/aws-ebs-csi-driver
keywords:
@ -19,4 +19,4 @@ maintainers:
name: aws-ebs-csi-driver
sources:
- https://github.com/kubernetes-sigs/aws-ebs-csi-driver
version: 2.10.1
version: 2.11.0

View File

@ -170,7 +170,18 @@ spec:
{{- if .Values.controller.extraCreateMetadata }}
- --extra-create-metadata
{{- end}}
- --leader-election=true
- --leader-election={{ .Values.sidecars.provisioner.leaderElection.enabled | required "leader election state for csi-provisioner is required, must be set to true || false." }}
{{- if .Values.sidecars.provisioner.leaderElection.enabled }}
{{- if .Values.sidecars.provisioner.leaderElection.leaseDuration }}
- --leader-election-lease-duration={{ .Values.sidecars.provisioner.leaderElection.leaseDuration }}
{{- end }}
{{- if .Values.sidecars.provisioner.leaderElection.renewDeadline}}
- --leader-election-renew-deadline={{ .Values.sidecars.provisioner.leaderElection.renewDeadline }}
{{- end }}
{{- if .Values.sidecars.provisioner.leaderElection.retryPeriod }}
- --leader-election-retry-period={{ .Values.sidecars.provisioner.leaderElection.retryPeriod }}
{{- end }}
{{- end }}
- --default-fstype={{ .Values.controller.defaultFsType }}
env:
- name: ADDRESS
@ -202,7 +213,18 @@ spec:
args:
- --csi-address=$(ADDRESS)
- --v={{ .Values.sidecars.attacher.logLevel }}
- --leader-election=true
- --leader-election={{ .Values.sidecars.attacher.leaderElection.enabled | required "leader election state for csi-attacher is required, must be set to true || false." }}
{{- if .Values.sidecars.attacher.leaderElection.enabled }}
{{- if .Values.sidecars.attacher.leaderElection.leaseDuration }}
- --leader-election-lease-duration={{ .Values.sidecars.attacher.leaderElection.leaseDuration }}
{{- end }}
{{- if .Values.sidecars.attacher.leaderElection.renewDeadline}}
- --leader-election-renew-deadline={{ .Values.sidecars.attacher.leaderElection.renewDeadline }}
{{- end }}
{{- if .Values.sidecars.attacher.leaderElection.retryPeriod }}
- --leader-election-retry-period={{ .Values.sidecars.attacher.leaderElection.retryPeriod }}
{{- end }}
{{- end }}
env:
- name: ADDRESS
value: /var/lib/csi/sockets/pluginproxy/csi.sock

View File

@ -7,3 +7,4 @@ metadata:
spec:
attachRequired: true
podInfoOnMount: false
fsGroupPolicy: File

View File

@ -40,8 +40,6 @@ spec:
{{- if .Values.node.tolerateAllTaints }}
- operator: Exists
{{- else }}
- key: CriticalAddonsOnly
operator: Exists
- operator: Exists
effect: NoExecute
tolerationSeconds: 300

View File

@ -40,8 +40,6 @@ spec:
{{- if .Values.node.tolerateAllTaints }}
- operator: Exists
{{- else }}
- key: CriticalAddonsOnly
operator: Exists
- operator: Exists
effect: NoExecute
tolerationSeconds: 300

View File

@ -22,6 +22,16 @@ sidecars:
tag: "v3.1.0"
logLevel: 2
resources: {}
# Tune leader lease election for csi-provisioner.
# Leader election is on by default.
leaderElection:
enabled: true
# Optional values to tune lease behavior.
# The arguments provided must be in an acceptable time.ParseDuration format.
# Ref: https://pkg.go.dev/flag#Duration
# leaseDuration: "15s"
# renewDeadline: "10s"
# retryPeriod: "5s"
securityContext:
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
@ -31,6 +41,16 @@ sidecars:
pullPolicy: IfNotPresent
repository: k8s.gcr.io/sig-storage/csi-attacher
tag: "v3.4.0"
# Tune leader lease election for csi-attacher.
# Leader election is on by default.
leaderElection:
enabled: true
# Optional values to tune lease behavior.
# The arguments provided must be in an acceptable time.ParseDuration format.
# Ref: https://pkg.go.dev/flag#Duration
# leaseDuration: "15s"
# renewDeadline: "10s"
# retryPeriod: "5s"
logLevel: 2
resources: {}
securityContext:
@ -183,7 +203,7 @@ node:
nodeSelector: {}
podAnnotations: {}
podLabels: {}
tolerateAllTaints: false
tolerateAllTaints: true
tolerations: []
resources: {}
serviceAccount:

View File

@ -18,7 +18,7 @@
"subdir": "contrib/mixin"
}
},
"version": "74aa38ec10bc22d34ffd204f46df6e460b78d855",
"version": "19002cfc689fba2b8f56605e5797bf79f8b61fdd",
"sum": "W/Azptf1PoqjyMwJON96UY69MFugDA4IAYiKURscryc="
},
{
@ -38,7 +38,7 @@
"subdir": "grafana-builder"
}
},
"version": "c132c4afcf17491718539db4c2d94c0ea4346120",
"version": "dbf6fc14105c28b6fd0253005f7ca2da37d3d4e1",
"sum": "tDR6yT2GVfw0wTU12iZH+m01HrbIr6g/xN+/8nzNkU0="
},
{
@ -58,7 +58,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "eb98d4f74e8ac9c30b1f0e815b07bed31da76c8f",
"version": "5e44626d70c2bf2d35c37f3fee5a6261a5335cc6",
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
},
{

View File

@ -63,11 +63,6 @@ gemini:
aws-ebs-csi-driver:
enabled: false
# starting with 1.6 the ebs-plugin panics with "could not get number of attached ENIs"
# somewhere related to metadata / volumeattach limits and nitro instances ... AWS as usual
#image:
# tag: v1.5.3
controller:
replicaCount: 1
logLevel: 2

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero
description: KubeZero - Root App of Apps chart
type: application
version: 1.23.10-3
version: 1.23.11
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:

View File

@ -68,4 +68,4 @@ Kubernetes: `>= 1.20.0`
| storage.targetRevision | string | `"0.7.0"` | |
----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.9.1](https://github.com/norwoodj/helm-docs/releases/v1.9.1)
Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)

View File

@ -22,7 +22,7 @@ cert-manager:
storage:
enabled: false
targetRevision: 0.7.0
targetRevision: 0.7.1
aws-ebs-csi-driver:
enabled: false
aws-efs-csi-driver:
@ -52,7 +52,7 @@ istio-private-ingress:
metrics:
enabled: false
namespace: monitoring
targetRevision: 0.8.1
targetRevision: 0.8.4
istio:
grafana: {}
prometheus: {}

View File

@ -29,4 +29,4 @@ Kubernetes: `>= 1.20.0`
| manticoresearch.worker.volume.size | string | `"4Gi"` | |
----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.9.1](https://github.com/norwoodj/helm-docs/releases/v1.9.1)
Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)