feat: metrics version bump

This commit is contained in:
Stefan Reimer 2021-06-30 18:26:06 +02:00
parent 5e9d2bc168
commit 563cf4add7
117 changed files with 5304 additions and 4791 deletions

View File

@ -16,11 +16,11 @@ dependencies:
version: ">= 0.1.3"
repository: https://zero-down-time.github.io/kubezero/
- name: kube-prometheus-stack
version: 16.1.2
version: 16.12.0
# Switch back to upstream once all alerts are fixed eg. etcd gpcr
# repository: https://prometheus-community.github.io/helm-charts
- name: prometheus-adapter
version: 2.13
version: 2.14.2
repository: https://prometheus-community.github.io/helm-charts
condition: prometheus-adapter.enabled
kubeVersion: ">= 1.18.0"

View File

@ -6,12 +6,12 @@ annotations:
url: https://github.com/prometheus-operator/kube-prometheus
artifacthub.io/operator: "true"
apiVersion: v2
appVersion: 0.48.0
appVersion: 0.48.1
dependencies:
- condition: kubeStateMetrics.enabled
name: kube-state-metrics
repository: https://prometheus-community.github.io/helm-charts
version: 3.1.*
version: 3.3.*
- condition: nodeExporter.enabled
name: prometheus-node-exporter
repository: https://prometheus-community.github.io/helm-charts
@ -19,7 +19,7 @@ dependencies:
- condition: grafana.enabled
name: grafana
repository: https://grafana.github.io/helm-charts
version: 6.9.*
version: 6.13.*
description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
and Prometheus rules combined with documentation and scripts to provide easy to
operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus
@ -47,4 +47,4 @@ sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
type: application
version: 16.1.2
version: 16.12.0

View File

@ -1,5 +1,5 @@
apiVersion: v2
appVersion: 7.5.5
appVersion: 8.0.3
description: The leading tool for querying and visualizing time series and metrics.
home: https://grafana.net
icon: https://raw.githubusercontent.com/grafana/grafana/master/public/img/logo_transparent_400x.png
@ -19,4 +19,4 @@ name: grafana
sources:
- https://github.com/grafana/grafana
type: application
version: 6.9.1
version: 6.13.5

View File

@ -59,8 +59,8 @@ This version requires Helm >= 3.1.0.
| `securityContext` | Deployment securityContext | `{"runAsUser": 472, "runAsGroup": 472, "fsGroup": 472}` |
| `priorityClassName` | Name of Priority Class to assign pods | `nil` |
| `image.repository` | Image repository | `grafana/grafana` |
| `image.tag` | Image tag (`Must be >= 5.0.0`) | `7.4.5` |
| `image.sha` | Image sha (optional) | `2b56f6106ddc376bb46d974230d530754bf65a640dfbc5245191d72d3b49efc6` |
| `image.tag` | Image tag (`Must be >= 5.0.0`) | `8.0.3` |
| `image.sha` | Image sha (optional) | `80c6d6ac633ba5ab3f722976fb1d9a138f87ca6a9934fcd26a5fc28cbde7dbfa` |
| `image.pullPolicy` | Image pull policy | `IfNotPresent` |
| `image.pullSecrets` | Image pull secrets | `{}` |
| `service.enabled` | Enable grafana service | `true` |
@ -83,7 +83,7 @@ This version requires Helm >= 3.1.0.
| `ingress.path` | Ingress accepted path | `/` |
| `ingress.pathType` | Ingress type of path | `Prefix` |
| `ingress.hosts` | Ingress accepted hostnames | `["chart-example.local"]` |
| `ingress.extraPaths` | Ingress extra paths to prepend to every host configuration. Useful when configuring [custom actions with AWS ALB Ingress Controller](https://kubernetes-sigs.github.io/aws-alb-ingress-controller/guide/ingress/annotation/#actions). | `[]` |
| `ingress.extraPaths` | Ingress extra paths to prepend to every host configuration. Useful when configuring [custom actions with AWS ALB Ingress Controller](https://kubernetes-sigs.github.io/aws-alb-ingress-controller/guide/ingress/annotation/#actions). Requires `ingress.hosts` to have one or more host entries. | `[]` |
| `ingress.tls` | Ingress TLS configuration | `[]` |
| `resources` | CPU/Memory resource requests/limits | `{}` |
| `nodeSelector` | Node labels for pod assignment | `{}` |
@ -116,6 +116,7 @@ This version requires Helm >= 3.1.0.
| `envValueFrom` | Environment variables from alternate sources. See the API docs on [EnvVarSource](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#envvarsource-v1-core) for format details. | `{}` |
| `envFromSecret` | Name of a Kubernetes secret (must be manually created in the same namespace) containing values to be added to the environment. Can be templated | `""` |
| `envRenderSecret` | Sensible environment variables passed to pods and stored as secret | `{}` |
| `enableServiceLinks` | Inject Kubernetes services as environment variables. | `true` |
| `extraSecretMounts` | Additional grafana server secret mounts | `[]` |
| `extraVolumeMounts` | Additional grafana server volume mounts | `[]` |
| `extraConfigmapMounts` | Additional grafana server configMap volume mounts | `[]` |
@ -136,7 +137,7 @@ This version requires Helm >= 3.1.0.
| `podLabels` | Pod labels | `{}` |
| `podPortName` | Name of the grafana port on the pod | `grafana` |
| `sidecar.image.repository` | Sidecar image repository | `quay.io/kiwigrid/k8s-sidecar` |
| `sidecar.image.tag` | Sidecar image tag | `1.10.7` |
| `sidecar.image.tag` | Sidecar image tag | `1.12.2` |
| `sidecar.image.sha` | Sidecar image sha (optional) | `""` |
| `sidecar.imagePullPolicy` | Sidecar image pull policy | `IfNotPresent` |
| `sidecar.resources` | Sidecar resources | `{}` |

View File

@ -104,13 +104,67 @@ app.kubernetes.io/name: {{ include "grafana.name" . }}-image-renderer
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end -}}
{{/*
Looks if there's an existing secret and reuse its password. If not it generates
new password and use it.
*/}}
{{- define "grafana.password" -}}
{{- $secret := (lookup "v1" "Secret" (include "grafana.namespace" .) (include "grafana.fullname" .) ) -}}
{{- if $secret -}}
{{- index $secret "data" "admin-password" -}}
{{- else -}}
{{- (randAlphaNum 40) | b64enc | quote -}}
{{- end -}}
{{- end -}}
{{/*
Get KubeVersion removing pre-release information.
*/}}
{{- define "grafana.kubeVersion" -}}
{{- default .Capabilities.KubeVersion.Version (regexFind "v[0-9]+\\.[0-9]+\\.[0-9]+" .Capabilities.KubeVersion.Version) -}}
{{- end -}}
{{/*
Return the appropriate apiVersion for rbac.
*/}}
{{- define "rbac.apiVersion" -}}
{{- define "grafana.rbac.apiVersion" -}}
{{- if .Capabilities.APIVersions.Has "rbac.authorization.k8s.io/v1" }}
{{- print "rbac.authorization.k8s.io/v1" -}}
{{- else -}}
{{- print "rbac.authorization.k8s.io/v1beta1" -}}
{{- end -}}
{{- end -}}
{{/*
Return the appropriate apiVersion for ingress.
*/}}
{{- define "grafana.ingress.apiVersion" -}}
{{- if and (.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19.x" (include "grafana.kubeVersion" .)) -}}
{{- print "networking.k8s.io/v1" -}}
{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" -}}
{{- print "networking.k8s.io/v1beta1" -}}
{{- else -}}
{{- print "extensions/v1beta1" -}}
{{- end -}}
{{- end -}}
{{/*
Return if ingress is stable.
*/}}
{{- define "grafana.ingress.isStable" -}}
{{- eq (include "grafana.ingress.apiVersion" .) "networking.k8s.io/v1" -}}
{{- end -}}
{{/*
Return if ingress supports ingressClassName.
*/}}
{{- define "grafana.ingress.supportsIngressClassName" -}}
{{- or (eq (include "grafana.ingress.isStable" .) "true") (and (eq (include "grafana.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18.x" (include "grafana.kubeVersion" .))) -}}
{{- end -}}
{{/*
Return if ingress supports pathType.
*/}}
{{- define "grafana.ingress.supportsPathType" -}}
{{- or (eq (include "grafana.ingress.isStable" .) "true") (and (eq (include "grafana.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18.x" (include "grafana.kubeVersion" .))) -}}
{{- end -}}

View File

@ -159,6 +159,7 @@ imagePullSecrets:
- name: {{ . }}
{{- end}}
{{- end }}
enableServiceLinks: {{ .Values.enableServiceLinks }}
containers:
{{- if .Values.sidecar.dashboards.enabled }}
- name: {{ template "grafana.name" . }}-sc-dashboard
@ -357,6 +358,14 @@ containers:
- name: GF_RENDERING_CALLBACK_URL
value: http://{{ template "grafana.fullname" . }}.{{ template "grafana.namespace" . }}:{{ .Values.service.port }}/{{ .Values.imageRenderer.grafanaSubPath }}
{{ end }}
- name: GF_PATHS_DATA
value: {{ (get .Values "grafana.ini").paths.data }}
- name: GF_PATHS_LOGS
value: {{ (get .Values "grafana.ini").paths.logs }}
- name: GF_PATHS_PLUGINS
value: {{ (get .Values "grafana.ini").paths.plugins }}
- name: GF_PATHS_PROVISIONING
value: {{ (get .Values "grafana.ini").paths.provisioning }}
{{- range $key, $value := .Values.envValueFrom }}
- name: {{ $key | quote }}
valueFrom:

View File

@ -1,17 +1,13 @@
{{- if .Values.ingress.enabled -}}
{{- $ingressApiIsStable := eq (include "grafana.ingress.isStable" .) "true" -}}
{{- $ingressSupportsIngressClassName := eq (include "grafana.ingress.supportsIngressClassName" .) "true" -}}
{{- $ingressSupportsPathType := eq (include "grafana.ingress.supportsPathType" .) "true" -}}
{{- $fullName := include "grafana.fullname" . -}}
{{- $servicePort := .Values.service.port -}}
{{- $ingressPath := .Values.ingress.path -}}
{{- $ingressPathType := .Values.ingress.pathType -}}
{{- $extraPaths := .Values.ingress.extraPaths -}}
{{- $newAPI := .Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" -}}
{{- if $newAPI -}}
apiVersion: networking.k8s.io/v1
{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }}
apiVersion: networking.k8s.io/v1beta1
{{- else }}
apiVersion: extensions/v1beta1
{{- end }}
apiVersion: {{ include "grafana.ingress.apiVersion" . }}
kind: Ingress
metadata:
name: {{ $fullName }}
@ -28,7 +24,7 @@ metadata:
{{- end }}
{{- end }}
spec:
{{- if .Values.ingress.ingressClassName }}
{{- if and $ingressSupportsIngressClassName .Values.ingress.ingressClassName }}
ingressClassName: {{ .Values.ingress.ingressClassName }}
{{- end -}}
{{- if .Values.ingress.tls }}
@ -41,15 +37,15 @@ spec:
- host: {{ tpl . $}}
http:
paths:
{{ if $extraPaths }}
{{- if $extraPaths }}
{{ toYaml $extraPaths | indent 10 }}
{{- end }}
- path: {{ $ingressPath }}
{{- if $newAPI }}
{{- if $ingressSupportsPathType }}
pathType: {{ $ingressPathType }}
{{- end }}
backend:
{{- if $newAPI }}
{{- if $ingressApiIsStable }}
service:
name: {{ $fullName }}
port:
@ -63,12 +59,11 @@ spec:
- http:
paths:
- backend:
{{- if $newAPI }}
{{- if $ingressApiIsStable }}
service:
name: {{ $fullName }}
port:
number: {{ $servicePort }}
pathType: {{ $ingressPathType }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $servicePort }}
@ -76,5 +71,8 @@ spec:
{{- if $ingressPath }}
path: {{ $ingressPath }}
{{- end }}
{{- if $ingressSupportsPathType }}
pathType: {{ $ingressPathType }}
{{- end }}
{{- end -}}
{{- end }}

View File

@ -1,5 +1,5 @@
{{- if and .Values.rbac.create (not .Values.rbac.useExistingRole) -}}
apiVersion: {{ template "rbac.apiVersion" . }}
apiVersion: {{ template "grafana.rbac.apiVersion" . }}
kind: Role
metadata:
name: {{ template "grafana.fullname" . }}

View File

@ -1,5 +1,5 @@
{{- if .Values.rbac.create -}}
apiVersion: {{ template "rbac.apiVersion" . }}
apiVersion: {{ template "grafana.rbac.apiVersion" . }}
kind: RoleBinding
metadata:
name: {{ template "grafana.fullname" . }}

View File

@ -17,7 +17,7 @@ data:
{{- if .Values.adminPassword }}
admin-password: {{ .Values.adminPassword | b64enc | quote }}
{{- else }}
admin-password: {{ randAlphaNum 40 | b64enc | quote }}
admin-password: {{ template "grafana.password" . }}
{{- end }}
{{- end }}
{{- if not .Values.ldap.existingSecret }}

View File

@ -69,7 +69,7 @@ livenessProbe:
image:
repository: grafana/grafana
tag: 7.5.5
tag: 8.0.3
sha: ""
pullPolicy: IfNotPresent
@ -188,7 +188,7 @@ ingress:
labels: {}
path: /
# pathType is only for k8s > 1.19
# pathType is only for k8s >= 1.1=
pathType: Prefix
hosts:
@ -382,6 +382,10 @@ envFromSecret: ""
## This can be useful for auth tokens, etc
envRenderSecret: {}
# Inject Kubernetes services as environment variables.
# See https://kubernetes.io/docs/concepts/services-networking/connect-applications-service/#environment-variables
enableServiceLinks: true
## Additional grafana server secret mounts
# Defines additional mounts with secrets. Secrets must be manually created in the namespace.
extraSecretMounts: []
@ -451,7 +455,7 @@ datasources: {}
# uid: cloudwatch
# editable: false
# jsonData:
# authType: credentials
# authType: default
# defaultRegion: us-east-1
## Configure notifiers
@ -532,7 +536,7 @@ dashboardsConfigMaps: {}
##
grafana.ini:
paths:
data: /var/lib/grafana/data
data: /var/lib/grafana/
logs: /var/log/grafana
plugins: /var/lib/grafana/plugins
provisioning: /etc/grafana/provisioning
@ -603,7 +607,7 @@ smtp:
sidecar:
image:
repository: quay.io/kiwigrid/k8s-sidecar
tag: 1.10.7
tag: 1.12.2
sha: ""
imagePullPolicy: IfNotPresent
resources: {}

View File

@ -16,4 +16,4 @@ name: kube-state-metrics
sources:
- https://github.com/kubernetes/kube-state-metrics/
type: application
version: 3.1.0
version: 3.3.1

View File

@ -62,7 +62,7 @@ The upgraded chart now the following changes:
See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments:
```console
helm show values kube-state-metrics/kube-state-metrics
helm show values prometheus-community/kube-state-metrics
```
You may also run `helm show values` on this chart's [dependencies](#dependencies) for additional options.

View File

@ -18,6 +18,6 @@ roleRef:
{{- end }}
subjects:
- kind: ServiceAccount
name: {{ template "kube-state-metrics.fullname" . }}
name: {{ template "kube-state-metrics.serviceAccountName" . }}
namespace: {{ template "kube-state-metrics.namespace" . }}
{{- end -}}

View File

@ -63,109 +63,40 @@ spec:
fieldPath: metadata.namespace
{{- end }}
args:
{{ if .Values.extraArgs }}
{{- if .Values.extraArgs }}
{{- range .Values.extraArgs }}
- {{ . }}
{{- end }}
{{ end }}
{{ if .Values.collectors.certificatesigningrequests }}
- --resources=certificatesigningrequests
{{ end }}
{{ if .Values.collectors.configmaps }}
- --resources=configmaps
{{ end }}
{{ if .Values.collectors.cronjobs }}
- --resources=cronjobs
{{ end }}
{{ if .Values.collectors.daemonsets }}
- --resources=daemonsets
{{ end }}
{{ if .Values.collectors.deployments }}
- --resources=deployments
{{ end }}
{{ if .Values.collectors.endpoints }}
- --resources=endpoints
{{ end }}
{{ if .Values.collectors.horizontalpodautoscalers }}
- --resources=horizontalpodautoscalers
{{ end }}
{{ if .Values.collectors.ingresses }}
- --resources=ingresses
{{ end }}
{{ if .Values.collectors.jobs }}
- --resources=jobs
{{ end }}
{{ if .Values.collectors.limitranges }}
- --resources=limitranges
{{ end }}
{{ if .Values.collectors.mutatingwebhookconfigurations }}
- --resources=mutatingwebhookconfigurations
{{ end }}
{{ if .Values.collectors.namespaces }}
- --resources=namespaces
{{ end }}
{{ if .Values.collectors.networkpolicies }}
- --resources=networkpolicies
{{ end }}
{{ if .Values.collectors.nodes }}
- --resources=nodes
{{ end }}
{{ if .Values.collectors.persistentvolumeclaims }}
- --resources=persistentvolumeclaims
{{ end }}
{{ if .Values.collectors.persistentvolumes }}
- --resources=persistentvolumes
{{ end }}
{{ if .Values.collectors.poddisruptionbudgets }}
- --resources=poddisruptionbudgets
{{ end }}
{{ if .Values.collectors.pods }}
- --resources=pods
{{ end }}
{{ if .Values.collectors.replicasets }}
- --resources=replicasets
{{ end }}
{{ if .Values.collectors.replicationcontrollers }}
- --resources=replicationcontrollers
{{ end }}
{{ if .Values.collectors.resourcequotas }}
- --resources=resourcequotas
{{ end }}
{{ if .Values.collectors.secrets }}
- --resources=secrets
{{ end }}
{{ if .Values.collectors.services }}
- --resources=services
{{ end }}
{{ if .Values.collectors.statefulsets }}
- --resources=statefulsets
{{ end }}
{{ if .Values.collectors.storageclasses }}
- --resources=storageclasses
{{ end }}
{{ if .Values.collectors.validatingwebhookconfigurations }}
- --resources=validatingwebhookconfigurations
{{ end }}
{{ if .Values.collectors.verticalpodautoscalers }}
- --resources=verticalpodautoscalers
{{ end }}
{{ if .Values.collectors.volumeattachments }}
- --resources=volumeattachments
{{ end }}
{{ if .Values.namespaces }}
- --namespaces={{ tpl .Values.namespaces $ | join "," }}
{{ end }}
{{ if .Values.autosharding.enabled }}
{{- end }}
{{- if .Values.service.port }}
- --port={{ .Values.service.port | default 8080}}
{{- end }}
{{- if .Values.collectors }}
- --resources={{ .Values.collectors | join "," }}
{{- end }}
{{- if .Values.metricLabelsAllowlist }}
- --metric-labels-allowlist={{ .Values.metricLabelsAllowlist | join "," }}
{{- end }}
{{- if .Values.metricAllowlist }}
- --metric-allowlist={{ .Values.metricAllowlist | join "," }}
{{- end }}
{{- if .Values.metricDenylist }}
- --metric-denylist={{ .Values.metricDenylist | join "," }}
{{- end }}
{{- if .Values.namespaces }}
- --namespaces={{ tpl (.Values.namespaces | join ",") $ }}
{{- end }}
{{- if .Values.autosharding.enabled }}
- --pod=$(POD_NAME)
- --pod-namespace=$(POD_NAMESPACE)
{{ end }}
{{ if .Values.kubeconfig.enabled }}
{{- end }}
{{- if .Values.kubeconfig.enabled }}
- --kubeconfig=/opt/k8s/.kube/config
{{ end }}
{{ if .Values.selfMonitor.telemetryHost }}
{{- end }}
{{- if .Values.selfMonitor.telemetryHost }}
- --telemetry-host={{ .Values.selfMonitor.telemetryHost }}
{{ end }}
- --telemetry-port=8081
{{- end }}
- --telemetry-port={{ .Values.selfMonitor.telemetryPort | default 8081 }}
{{- if .Values.kubeconfig.enabled }}
volumeMounts:
- name: kubeconfig
@ -175,26 +106,30 @@ spec:
imagePullPolicy: {{ .Values.image.pullPolicy }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
ports:
- containerPort: 8080
- containerPort: {{ .Values.service.port | default 8080}}
{{- if .Values.selfMonitor.enabled }}
- containerPort: 8081
- containerPort: {{ .Values.selfMonitor.telemetryPort | default 8081 }}
{{- end }}
livenessProbe:
httpGet:
path: /healthz
port: 8080
port: {{ .Values.service.port | default 8080}}
initialDelaySeconds: 5
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /
port: 8080
port: {{ .Values.service.port | default 8080}}
initialDelaySeconds: 5
timeoutSeconds: 5
{{- if .Values.resources }}
resources:
{{ toYaml .Values.resources | indent 10 }}
{{- end }}
{{- if .Values.containerSecurityContext }}
securityContext:
{{ toYaml .Values.containerSecurityContext | indent 10 }}
{{- end }}
{{- if .Values.imagePullSecrets }}
imagePullSecrets:
{{ toYaml .Values.imagePullSecrets | indent 8 }}

View File

@ -14,6 +14,6 @@ roleRef:
name: psp-{{ template "kube-state-metrics.fullname" . }}
subjects:
- kind: ServiceAccount
name: {{ template "kube-state-metrics.fullname" . }}
name: {{ template "kube-state-metrics.serviceAccountName" . }}
namespace: {{ template "kube-state-metrics.namespace" . }}
{{- end }}

View File

@ -18,169 +18,169 @@ metadata:
namespace: {{ . }}
{{- end }}
rules:
{{ if $.Values.collectors.certificatesigningrequests }}
{{ if has "certificatesigningrequests" $.Values.collectors }}
- apiGroups: ["certificates.k8s.io"]
resources:
- certificatesigningrequests
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.configmaps }}
{{ if has "configmaps" $.Values.collectors }}
- apiGroups: [""]
resources:
- configmaps
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.cronjobs }}
{{ if has "cronjobs" $.Values.collectors }}
- apiGroups: ["batch"]
resources:
- cronjobs
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.daemonsets }}
{{ if has "daemonsets" $.Values.collectors }}
- apiGroups: ["extensions", "apps"]
resources:
- daemonsets
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.deployments }}
{{ if has "deployments" $.Values.collectors }}
- apiGroups: ["extensions", "apps"]
resources:
- deployments
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.endpoints }}
{{ if has "endpoints" $.Values.collectors }}
- apiGroups: [""]
resources:
- endpoints
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.horizontalpodautoscalers }}
{{ if has "horizontalpodautoscalers" $.Values.collectors }}
- apiGroups: ["autoscaling"]
resources:
- horizontalpodautoscalers
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.ingresses }}
{{ if has "ingresses" $.Values.collectors }}
- apiGroups: ["extensions", "networking.k8s.io"]
resources:
- ingresses
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.jobs }}
{{ if has "jobs" $.Values.collectors }}
- apiGroups: ["batch"]
resources:
- jobs
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.limitranges }}
{{ if has "limitranges" $.Values.collectors }}
- apiGroups: [""]
resources:
- limitranges
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.mutatingwebhookconfigurations }}
{{ if has "mutatingwebhookconfigurations" $.Values.collectors }}
- apiGroups: ["admissionregistration.k8s.io"]
resources:
- mutatingwebhookconfigurations
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.namespaces }}
{{ if has "namespaces" $.Values.collectors }}
- apiGroups: [""]
resources:
- namespaces
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.networkpolicies }}
{{ if has "networkpolicies" $.Values.collectors }}
- apiGroups: ["networking.k8s.io"]
resources:
- networkpolicies
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.nodes }}
{{ if has "nodes" $.Values.collectors }}
- apiGroups: [""]
resources:
- nodes
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.persistentvolumeclaims }}
{{ if has "persistentvolumeclaims" $.Values.collectors }}
- apiGroups: [""]
resources:
- persistentvolumeclaims
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.persistentvolumes }}
{{ if has "persistentvolumes" $.Values.collectors }}
- apiGroups: [""]
resources:
- persistentvolumes
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.poddisruptionbudgets }}
{{ if has "poddisruptionbudgets" $.Values.collectors }}
- apiGroups: ["policy"]
resources:
- poddisruptionbudgets
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.pods }}
{{ if has "pods" $.Values.collectors }}
- apiGroups: [""]
resources:
- pods
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.replicasets }}
{{ if has "replicasets" $.Values.collectors }}
- apiGroups: ["extensions", "apps"]
resources:
- replicasets
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.replicationcontrollers }}
{{ if has "replicationcontrollers" $.Values.collectors }}
- apiGroups: [""]
resources:
- replicationcontrollers
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.resourcequotas }}
{{ if has "resourcequotas" $.Values.collectors }}
- apiGroups: [""]
resources:
- resourcequotas
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.secrets }}
{{ if has "secrets" $.Values.collectors }}
- apiGroups: [""]
resources:
- secrets
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.services }}
{{ if has "services" $.Values.collectors }}
- apiGroups: [""]
resources:
- services
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.statefulsets }}
{{ if has "statefulsets" $.Values.collectors }}
- apiGroups: ["apps"]
resources:
- statefulsets
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.storageclasses }}
{{ if has "storageclasses" $.Values.collectors }}
- apiGroups: ["storage.k8s.io"]
resources:
- storageclasses
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.validatingwebhookconfigurations }}
{{ if has "validatingwebhookconfigurations" $.Values.collectors }}
- apiGroups: ["admissionregistration.k8s.io"]
resources:
- validatingwebhookconfigurations
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.volumeattachments }}
{{ if has "volumeattachments" $.Values.collectors }}
- apiGroups: ["storage.k8s.io"]
resources:
- volumeattachments
verbs: ["list", "watch"]
{{ end -}}
{{ if $.Values.collectors.verticalpodautoscalers }}
{{ if has "verticalpodautoscalers" $.Values.collectors }}
- apiGroups: ["autoscaling.k8s.io"]
resources:
- verticalpodautoscalers

View File

@ -21,7 +21,7 @@ roleRef:
{{- end }}
subjects:
- kind: ServiceAccount
name: {{ template "kube-state-metrics.fullname" $ }}
name: {{ template "kube-state-metrics.serviceAccountName" $ }}
namespace: {{ template "kube-state-metrics.namespace" $ }}
{{- end -}}
{{- end -}}

View File

@ -23,16 +23,16 @@ spec:
ports:
- name: "http"
protocol: TCP
port: {{ .Values.service.port }}
port: {{ .Values.service.port | default 8080}}
{{- if .Values.service.nodePort }}
nodePort: {{ .Values.service.nodePort }}
{{- end }}
targetPort: 8080
targetPort: {{ .Values.service.port | default 8080}}
{{ if .Values.selfMonitor.enabled }}
- name: "metrics"
protocol: TCP
port: {{ .Values.selfMonitor.telemetryPort | default 8081 }}
targetPort: 8081
targetPort: {{ .Values.selfMonitor.telemetryPort | default 8081 }}
{{ end }}
{{- if .Values.service.loadBalancerIP }}
loadBalancerIP: "{{ .Values.service.loadBalancerIP }}"

View File

@ -7,7 +7,7 @@ metadata:
helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
app.kubernetes.io/instance: {{ .Release.Name }}
name: {{ template "kube-state-metrics.fullname" . }}
name: {{ template "kube-state-metrics.serviceAccountName" . }}
namespace: {{ template "kube-state-metrics.namespace" . }}
{{- if .Values.serviceAccount.annotations }}
annotations:

View File

@ -15,6 +15,6 @@ roleRef:
name: stsdiscovery-{{ template "kube-state-metrics.fullname" . }}
subjects:
- kind: ServiceAccount
name: {{ template "kube-state-metrics.fullname" . }}
name: {{ template "kube-state-metrics.serviceAccountName" . }}
namespace: {{ template "kube-state-metrics.namespace" . }}
{{- end }}

View File

@ -88,6 +88,11 @@ securityContext:
runAsUser: 65534
fsGroup: 65534
## Specify security settings for a Container
## Allows overrides and additional options compared to (Pod) securityContext
## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container
containerSecurityContext: {}
## Node labels for pod assignment
## Ref: https://kubernetes.io/docs/user-guide/node-selection/
nodeSelector: {}
@ -109,37 +114,56 @@ podAnnotations: {}
# Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
podDisruptionBudget: {}
# Available collectors for kube-state-metrics. By default all available
# resources are enabled.
# Comma-separated list of metrics to be exposed.
# This list comprises of exact metric names and/or regex patterns.
# The allowlist and denylist are mutually exclusive.
metricAllowlist: []
# Comma-separated list of metrics not to be enabled.
# This list comprises of exact metric names and/or regex patterns.
# The allowlist and denylist are mutually exclusive.
metricDenylist: []
# Comma-separated list of additional Kubernetes label keys that will be used in the resource's
# labels metric. By default the metric contains only name and namespace labels.
# To include additional labels, provide a list of resource names in their plural form and Kubernetes
# label keys you would like to allow for them (Example: '=namespaces=[k8s-label-1,k8s-label-n,...],pods=[app],...)'.
# A single '*' can be provided per resource instead to allow any labels, but that has
# severe performance implications (Example: '=pods=[*]').
metricLabelsAllowlist: []
# - namespaces=[k8s-label-1,k8s-label-n]
# Available collectors for kube-state-metrics.
# By default, all available resources are enabled, comment out to disable.
collectors:
certificatesigningrequests: true
configmaps: true
cronjobs: true
daemonsets: true
deployments: true
endpoints: true
horizontalpodautoscalers: true
ingresses: true
jobs: true
limitranges: true
mutatingwebhookconfigurations: true
namespaces: true
networkpolicies: true
nodes: true
persistentvolumeclaims: true
persistentvolumes: true
poddisruptionbudgets: true
pods: true
replicasets: true
replicationcontrollers: true
resourcequotas: true
secrets: true
services: true
statefulsets: true
storageclasses: true
validatingwebhookconfigurations: true
verticalpodautoscalers: false
volumeattachments: true
- certificatesigningrequests
- configmaps
- cronjobs
- daemonsets
- deployments
- endpoints
- horizontalpodautoscalers
- ingresses
- jobs
- limitranges
- mutatingwebhookconfigurations
- namespaces
- networkpolicies
- nodes
- persistentvolumeclaims
- persistentvolumes
- poddisruptionbudgets
- pods
- replicasets
- replicationcontrollers
- resourcequotas
- secrets
- services
- statefulsets
- storageclasses
- validatingwebhookconfigurations
- verticalpodautoscalers
- volumeattachments
# Enabling kubeconfig will pass the --kubeconfig argument to the container
kubeconfig:

View File

@ -14,4 +14,4 @@ maintainers:
name: prometheus-node-exporter
sources:
- https://github.com/prometheus/node_exporter/
version: 1.18.0
version: 1.18.2

View File

@ -22,6 +22,9 @@ spec:
{{- if $.Values.prometheus.monitor.tlsConfig }}
tlsConfig: {{ toYaml $.Values.prometheus.monitor.tlsConfig | nindent 8 }}
{{- end }}
{{- if .Values.prometheus.monitor.proxyUrl }}
proxyUrl: {{ .Values.prometheus.monitor.proxyUrl}}
{{- end }}
{{- if .Values.prometheus.monitor.scrapeTimeout }}
scrapeTimeout: {{ .Values.prometheus.monitor.scrapeTimeout }}
{{- end }}

View File

@ -24,6 +24,10 @@ prometheus:
bearerTokenFile:
tlsConfig: {}
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
relabelings: []
scrapeTimeout: 10s

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
---
apiVersion: apiextensions.k8s.io/v1

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
---
apiVersion: apiextensions.k8s.io/v1

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
---
apiVersion: apiextensions.k8s.io/v1

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
---
apiVersion: apiextensions.k8s.io/v1

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
---
apiVersion: apiextensions.k8s.io/v1

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
---
apiVersion: apiextensions.k8s.io/v1

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
---
apiVersion: apiextensions.k8s.io/v1

View File

@ -1,4 +1,4 @@
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
---
apiVersion: apiextensions.k8s.io/v1

View File

@ -98,7 +98,12 @@ Allow the release namespace to be overridden for multi-namespace deployments in
{{/* Allow KubeVersion to be overridden. */}}
{{- define "kube-prometheus-stack.ingress.kubeVersion" -}}
{{- default .Capabilities.KubeVersion.Version .Values.kubeVersionOverride -}}
{{- $kubeVersion := default .Capabilities.KubeVersion.Version .Values.kubeVersionOverride -}}
{{/* Special use case for Amazon EKS, Google GKE */}}
{{- if and (regexMatch "\\d+\\.\\d+\\.\\d+-(?:eks|gke).+" $kubeVersion) (not .Values.kubeVersionOverride) -}}
{{- $kubeVersion = regexFind "\\d+\\.\\d+\\.\\d+" $kubeVersion -}}
{{- end -}}
{{- $kubeVersion -}}
{{- end -}}
{{/* Get Ingress API Version */}}

View File

@ -22,9 +22,9 @@ spec:
metricRelabelings:
{{ tpl (toYaml .Values.kubeApiServer.serviceMonitor.metricRelabelings | indent 6) . }}
{{- end }}
{{- if .Values.kubeApiServer.relabelings }}
{{- if .Values.kubeApiServer.serviceMonitor.relabelings }}
relabelings:
{{ toYaml .Values.kubeApiServer.relabelings | indent 6 }}
{{ toYaml .Values.kubeApiServer.serviceMonitor.relabelings | indent 6 }}
{{- end }}
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt

View File

@ -26,6 +26,24 @@ spec:
relabelings:
{{ toYaml .Values.kubeStateMetrics.serviceMonitor.relabelings | indent 4 }}
{{- end }}
{{- if .Values.kubeStateMetrics.serviceMonitor.selfMonitor.enabled }}
- port: metrics
{{- if .Values.kubeStateMetrics.serviceMonitor.interval }}
interval: {{ .Values.kubeStateMetrics.serviceMonitor.interval }}
{{- end }}
{{- if .Values.kubeStateMetrics.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeStateMetrics.serviceMonitor.proxyUrl}}
{{- end }}
honorLabels: true
{{- if .Values.kubeStateMetrics.serviceMonitor.metricRelabelings }}
metricRelabelings:
{{ tpl (toYaml .Values.kubeStateMetrics.serviceMonitor.metricRelabelings | indent 4) . }}
{{- end }}
{{- if .Values.kubeStateMetrics.serviceMonitor.relabelings }}
relabelings:
{{ toYaml .Values.kubeStateMetrics.serviceMonitor.relabelings | indent 4 }}
{{- end }}
{{- end }}
{{- if .Values.kubeStateMetrics.serviceMonitor.namespaceOverride }}
namespaceSelector:
matchNames:

View File

@ -168,6 +168,6 @@ spec:
- {{ .Values.kubelet.namespace }}
selector:
matchLabels:
app.kubernetes.io/managed-by: prometheus-operator
app.kubernetes.io/name: kubelet
k8s-app: kubelet
{{- end}}

View File

@ -13,6 +13,11 @@ spec:
matchLabels:
app: prometheus-node-exporter
release: {{ $.Release.Name }}
{{- if (index .Values "prometheus-node-exporter" "namespaceOverride") }}
namespaceSelector:
matchNames:
- {{ index .Values "prometheus-node-exporter" "namespaceOverride" }}
{{- end }}
endpoints:
- port: metrics
{{- if .Values.nodeExporter.serviceMonitor.interval }}

View File

@ -1,4 +1,4 @@
{{- if and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if or (and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled) .Values.grafana.forceDeployDashboards }}
{{- $files := .Files.Glob "dashboards-1.14/*.json" }}
{{- if $files }}
apiVersion: v1

View File

@ -1,4 +1,4 @@
{{- if and .Values.grafana.enabled .Values.grafana.sidecar.datasources.enabled }}
{{- if or (and .Values.grafana.enabled .Values.grafana.sidecar.datasources.enabled) .Values.grafana.forceDeployDatasources }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -20,7 +20,11 @@ data:
{{- if .Values.grafana.sidecar.datasources.defaultDatasourceEnabled }}
- name: Prometheus
type: prometheus
{{- if .Values.grafana.sidecar.datasources.url }}
url: {{ .Values.grafana.sidecar.datasources.url }}
{{- else }}
url: http://{{ template "kube-prometheus-stack.fullname" . }}-prometheus:{{ .Values.prometheus.service.port }}/{{ trimPrefix "/" .Values.prometheus.prometheusSpec.routePrefix }}
{{- end }}
access: proxy
isDefault: true
jsonData:

View File

@ -0,0 +1,610 @@
{{- /*
Generated from 'alertmanager-overview' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
namespace: {{ template "kube-prometheus-stack.namespace" . }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "alertmanager-overview" | trunc 63 | trimSuffix "-" }}
annotations:
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
labels:
{{- if $.Values.grafana.sidecar.dashboards.label }}
{{ $.Values.grafana.sidecar.dashboards.label }}: "1"
{{- end }}
app: {{ template "kube-prometheus-stack.name" $ }}-grafana
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
data:
alertmanager-overview.json: |-
{
"__inputs": [
],
"__requires": [
],
"annotations": {
"list": [
]
},
"editable": false,
"gnetId": null,
"graphTooltip": 1,
"hideControls": false,
"id": null,
"links": [
],
"refresh": "30s",
"rows": [
{
"collapse": false,
"collapsed": false,
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
"id": 2,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"sideWidth": null,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}}",
"refId": "A"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Alerts",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
"id": 3,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"sideWidth": null,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Received",
"refId": "A"
},
{
"expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Invalid",
"refId": "B"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Alerts receive rate",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Alerts",
"titleSize": "h6",
"type": "row"
},
{
"collapse": false,
"collapsed": false,
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
"id": 4,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"sideWidth": null,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "integration",
"seriesOverrides": [
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Total",
"refId": "A"
},
{
"expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Failed",
"refId": "B"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "$integration: Notifications Send Rate",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
"id": 5,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"sideWidth": null,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "integration",
"seriesOverrides": [
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} 99th Percentile",
"refId": "A"
},
{
"expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Median",
"refId": "B"
},
{
"expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} Average",
"refId": "C"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "$integration: Notification Duration",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Notifications",
"titleSize": "h6",
"type": "row"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [
"alertmanager-mixin"
],
"templating": {
"list": [
{
"current": {
"text": "Prometheus",
"value": "Prometheus"
},
"hide": 0,
"label": null,
"name": "datasource",
"options": [
],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"allValue": null,
"current": {
"text": "",
"value": ""
},
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "namespace",
"options": [
],
"query": "label_values(alertmanager_alerts, namespace)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"text": "",
"value": ""
},
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "service",
"options": [
],
"query": "label_values(alertmanager_alerts, service)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"text": "all",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 2,
"includeAll": true,
"label": null,
"multi": false,
"name": "integration",
"options": [
],
"query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "utc",
"title": "Alertmanager / Overview",
"uid": "alertmanager-overview",
"version": 0
}
{{- end }}

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'apiserver' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'apiserver' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeApiServer.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeApiServer.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'cluster-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'cluster-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'controller-manager' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'controller-manager' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeControllerManager.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeControllerManager.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -176,10 +176,10 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
"expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
"refId": "A"
}
],
@ -282,10 +282,10 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
"expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
"refId": "A"
}
],
@ -388,10 +388,10 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))",
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
"refId": "A"
}
],

View File

@ -4,7 +4,7 @@ Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeEtcd.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeEtcd.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,6 +1,6 @@
{{- /* Added manually, can be changed in-place. */ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.coreDns.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.coreDns.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'k8s-resources-cluster' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'k8s-resources-cluster' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -247,7 +247,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
"expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -499,7 +499,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
"expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -594,7 +594,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}namespace{{`}}`}}",
@ -885,7 +885,7 @@ data:
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -903,7 +903,7 @@ data:
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -912,7 +912,7 @@ data:
"step": 10
},
{
"expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
"expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -921,7 +921,7 @@ data:
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1321,7 +1321,7 @@ data:
"step": 10
},
{
"expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
"expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1330,7 +1330,7 @@ data:
"step": 10
},
{
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1339,7 +1339,7 @@ data:
"step": 10
},
{
"expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
"expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1348,7 +1348,7 @@ data:
"step": 10
},
{
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2443,6 +2443,496 @@ data:
"showTitle": true,
"title": "Rate of Packets Dropped",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"decimals": -1,
"fill": 10,
"id": 20,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}namespace{{`}}`}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "IOPS(Reads+Writes)",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 21,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}namespace{{`}}`}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "ThroughPut(Read+Write)",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 22,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"sort": {
"col": 4,
"desc": true
},
"spaceLength": 10,
"span": 12,
"stack": false,
"steppedLine": false,
"styles": [
{
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
},
{
"alias": "IOPS(Reads)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #A",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "IOPS(Writes)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #B",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "IOPS(Reads + Writes)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #C",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "Throughput(Read)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #D",
"thresholds": [
],
"type": "number",
"unit": "Bps"
},
{
"alias": "Throughput(Write)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #E",
"thresholds": [
],
"type": "number",
"unit": "Bps"
},
{
"alias": "Throughput(Read + Write)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #F",
"thresholds": [
],
"type": "number",
"unit": "Bps"
},
{
"alias": "Namespace",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
"pattern": "namespace",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"pattern": "/.*/",
"thresholds": [
],
"type": "string",
"unit": "short"
}
],
"targets": [
{
"expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "C",
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "D",
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "E",
"step": 10
},
{
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "F",
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Current Storage IO",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"transform": "table",
"type": "table",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO - Distribution",
"titleSize": "h6"
}
],
"schemaVersion": 14,

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'k8s-resources-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'k8s-resources-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -78,7 +78,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -162,7 +162,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -256,7 +256,7 @@ data:
"thresholds": "70,80",
"timeFrom": null,
"timeShift": null,
"title": "Memory Utilization (from requests)",
"title": "Memory Utilisation (from requests)",
"tooltip": {
"shared": false,
"sort": 0,
@ -446,7 +446,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
@ -697,7 +697,7 @@ data:
],
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -715,7 +715,7 @@ data:
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -733,7 +733,7 @@ data:
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2136,6 +2136,496 @@ data:
"showTitle": true,
"title": "Rate of Packets Dropped",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"decimals": -1,
"fill": 10,
"id": 16,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "IOPS(Reads+Writes)",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 17,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "ThroughPut(Read+Write)",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 18,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"sort": {
"col": 4,
"desc": true
},
"spaceLength": 10,
"span": 12,
"stack": false,
"steppedLine": false,
"styles": [
{
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
},
{
"alias": "IOPS(Reads)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #A",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "IOPS(Writes)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #B",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "IOPS(Reads + Writes)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #C",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "Throughput(Read)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #D",
"thresholds": [
],
"type": "number",
"unit": "Bps"
},
{
"alias": "Throughput(Write)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #E",
"thresholds": [
],
"type": "number",
"unit": "Bps"
},
{
"alias": "Throughput(Read + Write)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #F",
"thresholds": [
],
"type": "number",
"unit": "Bps"
},
{
"alias": "Pod",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": true,
"linkTargetBlank": false,
"linkTooltip": "Drill down to pods",
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
"pattern": "pod",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"pattern": "/.*/",
"thresholds": [
],
"type": "string",
"unit": "short"
}
],
"targets": [
{
"expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "C",
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "D",
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "E",
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "F",
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Current Storage IO",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"transform": "table",
"type": "table",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO - Distribution",
"titleSize": "h6"
}
],
"schemaVersion": 14,

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'k8s-resources-node' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'k8s-resources-node' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -77,7 +77,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
@ -312,7 +312,7 @@ data:
],
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -330,7 +330,7 @@ data:
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -348,7 +348,7 @@ data:
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'k8s-resources-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'k8s-resources-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -94,7 +94,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}container{{`}}`}}",
@ -450,7 +450,7 @@ data:
],
"targets": [
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -468,7 +468,7 @@ data:
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -486,7 +486,7 @@ data:
"step": 10
},
{
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -634,7 +634,7 @@ data:
],
"timeFrom": null,
"timeShift": null,
"title": "Memory Usage",
"title": "Memory Usage (WSS)",
"tooltip": {
"shared": false,
"sort": 0,
@ -725,7 +725,7 @@ data:
"type": "hidden"
},
{
"alias": "Memory Usage",
"alias": "Memory Usage (WSS)",
"colorMode": null,
"colors": [
@ -1591,6 +1591,697 @@ data:
"showTitle": true,
"title": "Rate of Packets Dropped",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"decimals": -1,
"fill": 10,
"id": 12,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
"legendLink": null,
"step": 10
},
{
"expr": "ceil(sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
"legendLink": null,
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "IOPS",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 13,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
"legendLink": null,
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
"legendLink": null,
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "ThroughPut",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO - Distribution(Pod - Read & Writes)",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"decimals": -1,
"fill": 10,
"id": 14,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}container{{`}}`}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "IOPS(Reads+Writes)",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 15,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}container{{`}}`}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "ThroughPut(Read+Write)",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO - Distribution(Containers)",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 16,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
],
"sort": {
"col": 4,
"desc": true
},
"spaceLength": 10,
"span": 12,
"stack": false,
"steppedLine": false,
"styles": [
{
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
},
{
"alias": "IOPS(Reads)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #A",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "IOPS(Writes)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #B",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "IOPS(Reads + Writes)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #C",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "Throughput(Read)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #D",
"thresholds": [
],
"type": "number",
"unit": "Bps"
},
{
"alias": "Throughput(Write)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #E",
"thresholds": [
],
"type": "number",
"unit": "Bps"
},
{
"alias": "Throughput(Read + Write)",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #F",
"thresholds": [
],
"type": "number",
"unit": "Bps"
},
{
"alias": "Container",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "container",
"thresholds": [
],
"type": "number",
"unit": "short"
},
{
"alias": "",
"colorMode": null,
"colors": [
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"pattern": "/.*/",
"thresholds": [
],
"type": "string",
"unit": "short"
}
],
"targets": [
{
"expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "C",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "D",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "E",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "F",
"step": 10
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Current Storage IO",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"transform": "table",
"type": "table",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO - Distribution",
"titleSize": "h6"
}
],
"schemaVersion": 14,

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'k8s-resources-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'k8s-resources-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -77,7 +77,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
@ -312,7 +312,7 @@ data:
],
"targets": [
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -330,7 +330,7 @@ data:
"step": 10
},
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -348,7 +348,7 @@ data:
"step": 10
},
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'k8s-resources-workloads-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'k8s-resources-workloads-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -98,7 +98,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}workload{{`}}`}} - {{`{{`}}workload_type{{`}}`}}",
@ -396,7 +396,7 @@ data:
"step": 10
},
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -414,7 +414,7 @@ data:
"step": 10
},
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -432,7 +432,7 @@ data:
"step": 10
},
{
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'namespace-by-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'namespace-by-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'namespace-by-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'namespace-by-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'node-cluster-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'node-cluster-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'node-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'node-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'nodes' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'nodes' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -92,9 +92,8 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n",
"expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__rate_interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 5,
"legendFormat": "{{`{{`}}cpu{{`}}`}}",
"refId": "A"
@ -528,25 +527,22 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}} read",
"refId": "A"
},
{
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}} written",
"refId": "B"
},
{
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}} io time",
"refId": "C"
@ -758,9 +754,8 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])",
"expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}}",
"refId": "A"
@ -852,9 +847,8 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])",
"expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}}",
"refId": "A"

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -92,14 +92,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(\n sum without(instance, node) (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n",
"expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Used Space",
"refId": "A"
},
{
"expr": "sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n",
"expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Free Space",
@ -207,7 +207,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "max without(instance,node) (\n(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n",
"expr": "max without(instance,node) (\n(\n topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
@ -289,14 +289,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n",
"expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Used inodes",
"refId": "A"
},
{
"expr": "(\n sum without(instance, node) (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n",
"expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": " Free inodes",
@ -404,7 +404,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "max without(instance,node) (\nkubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n",
"expr": "max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'pod-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'pod-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'prometheus-remote-write' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'prometheus-remote-write' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.prometheus.prometheusSpec.remoteWriteDashboards }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.prometheus.prometheusSpec.remoteWriteDashboards }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'prometheus' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'prometheus' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'proxy' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'proxy' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeProxy.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeProxy.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'scheduler' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'scheduler' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeScheduler.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeScheduler.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -176,31 +176,31 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} e2e",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e",
"refId": "A"
},
{
"expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} binding",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding",
"refId": "B"
},
{
"expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm",
"refId": "C"
},
{
"expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} volume",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume",
"refId": "D"
}
],
@ -290,31 +290,31 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} e2e",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} binding",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm",
"refId": "C"
},
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} volume",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume",
"refId": "D"
}
],

View File

@ -4,7 +4,7 @@ Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -1,10 +1,10 @@
{{- /*
Generated from 'workload-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'workload-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

View File

@ -58,8 +58,8 @@ spec:
tolerations:
{{ toYaml . | indent 8 }}
{{- end }}
{{- if .Values.prometheusOperator.admissionWebhooks.patch.securityContext }}
securityContext:
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 2000
{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.securityContext | indent 8 }}
{{- end }}
{{- end }}

View File

@ -59,8 +59,8 @@ spec:
tolerations:
{{ toYaml . | indent 8 }}
{{- end }}
{{- if .Values.prometheusOperator.admissionWebhooks.patch.securityContext }}
securityContext:
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 2000
{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.securityContext | indent 8 }}
{{- end }}
{{- end }}

View File

@ -5,8 +5,8 @@ metadata:
name: {{ template "kube-prometheus-stack.fullname" . }}-admission
{{- if .Values.prometheusOperator.admissionWebhooks.certManager.enabled }}
annotations:
certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
cert-manager.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
cert-manager.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
{{- end }}
labels:
app: {{ template "kube-prometheus-stack.name" $ }}-admission

View File

@ -5,8 +5,8 @@ metadata:
name: {{ template "kube-prometheus-stack.fullname" . }}-admission
{{- if .Values.prometheusOperator.admissionWebhooks.certManager.enabled }}
annotations:
certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
cert-manager.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
cert-manager.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
{{- end }}
labels:
app: {{ template "kube-prometheus-stack.name" $ }}-admission

View File

@ -78,6 +78,11 @@ spec:
{{- if .Values.prometheusOperator.prometheusInstanceNamespaces }}
- --prometheus-instance-namespaces={{ .Values.prometheusOperator.prometheusInstanceNamespaces | join "," }}
{{- end }}
{{- if .Values.prometheusOperator.thanosImage.sha }}
- --thanos-default-base-image={{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }}@sha256:{{ .Values.prometheusOperator.thanosImage.sha }}
{{- else }}
- --thanos-default-base-image={{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }}
{{- end }}
{{- if .Values.prometheusOperator.thanosRulerInstanceNamespaces }}
- --thanos-ruler-instance-namespaces={{ .Values.prometheusOperator.thanosRulerInstanceNamespaces | join "," }}
{{- end }}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -26,41 +26,146 @@ spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
message: 'The configuration of the instances of the Alertmanager cluster `{{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.service {{`}}`}}` are out of sync.
{{`{{`}} range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query {{`}}`}}
Configuration hash for pod {{`{{`}} .Labels.pod {{`}}`}} is "{{`{{`}} printf "%.f" .Value {{`}}`}}"
{{`{{`}} end {{`}}`}}
'
expr: count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})) != 1
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerFailedReload
annotations:
message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0
description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
for: 10m
labels:
severity: warning
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerMembersInconsistent
annotations:
message: Alertmanager has not found all other members of the cluster.
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster members.
expr: |-
alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
!= on (service) GROUP_LEFT()
count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
< on (namespace,service) group_left
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
for: 10m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |-
(
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
/
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |-
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |-
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |-
count by (namespace,service) (
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
)
!= 1
for: 20m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerClusterDown
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster are down.
expr: |-
(
count by (namespace,service) (
avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5
)
/
count by (namespace,service) (
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
expr: |-
(
count by (namespace,service) (
changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4
)
/
count by (namespace,service) (
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
)
)
>= 0.5
for: 5m
labels:
severity: critical

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -26,7 +26,9 @@ spec:
rules:
- alert: TargetDown
annotations:
message: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
for: 10m
labels:
@ -36,7 +38,7 @@ spec:
{{- end }}
- alert: Watchdog
annotations:
message: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
@ -47,6 +49,8 @@ spec:
"DeadMansSnitch" integration in PagerDuty.
'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-watchdog
summary: An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
severity: none

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -26,11 +26,11 @@ spec:
rules:
- expr: |-
sum by (cluster, namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
- expr: |-
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
@ -77,6 +77,28 @@ spec:
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_limits:sum
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_limits:sum
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -29,56 +29,56 @@ spec:
1 - (
(
# write too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
) +
(
# read too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
-
(
(
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
or
vector(0)
)
+
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
+
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
) +
# errors
sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d)
sum by (cluster) (code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- expr: |-
1 - (
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
-
(
# too slow
(
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
or
vector(0)
)
+
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
+
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="read"})
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
@ -86,74 +86,74 @@ spec:
1 - (
(
# too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="write"})
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
record: code_verb:apiserver_request_total:increase30d
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -28,26 +28,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
labels:
verb: read
record: apiserver_request:burnrate1d
@ -55,26 +55,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
labels:
verb: read
record: apiserver_request:burnrate1h
@ -82,26 +82,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
labels:
verb: read
record: apiserver_request:burnrate2h
@ -109,26 +109,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
labels:
verb: read
record: apiserver_request:burnrate30m
@ -136,26 +136,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
labels:
verb: read
record: apiserver_request:burnrate3d
@ -163,26 +163,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: apiserver_request:burnrate5m
@ -190,26 +190,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
labels:
verb: read
record: apiserver_request:burnrate6h
@ -217,15 +217,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
labels:
verb: write
record: apiserver_request:burnrate1d
@ -233,15 +233,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
labels:
verb: write
record: apiserver_request:burnrate1h
@ -249,15 +249,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
labels:
verb: write
record: apiserver_request:burnrate2h
@ -265,15 +265,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
labels:
verb: write
record: apiserver_request:burnrate30m
@ -281,15 +281,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
labels:
verb: write
record: apiserver_request:burnrate3d
@ -297,15 +297,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: apiserver_request:burnrate5m
@ -313,32 +313,32 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
labels:
verb: write
record: apiserver_request:burnrate6h
- expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: code_resource:apiserver_request_total:rate5m
- expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: code_resource:apiserver_request_total:rate5m
- expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
labels:
quantile: '0.99'
verb: read
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
labels:
quantile: '0.99'
verb: write

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-state-metrics-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -55,5 +55,33 @@ spec:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
for: 15m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
-
sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) )
!= 0
for: 15m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -30,7 +30,10 @@ spec:
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping
summary: Pod is crash looping.
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) * 60 * 5 > 0
expr: |-
increase(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) > 0
and
sum without (phase) (kube_pod_status_phase{phase!="Running",job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} == 1)
for: 15m
labels:
severity: warning
@ -261,19 +264,19 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch
summary: HPA has not matched descired number of replicas.
expr: |-
(kube_hpa_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
!=
kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
and
(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
>
kube_hpa_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
and
(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
<
kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
and
changes(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
for: 15m
labels:
severity: warning
@ -286,9 +289,9 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout
summary: HPA is running at max replicas
expr: |-
kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
==
kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
for: 15m
labels:
severity: warning

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/node-exporter-prometheusRule.yaml
Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -33,9 +33,9 @@ spec:
record: instance:node_num_cpu:sum
- expr: |-
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m])
)
record: instance:node_cpu_utilisation:rate1m
record: instance:node_cpu_utilisation:rate5m
- expr: |-
(
node_load1{job="node-exporter"}
@ -50,30 +50,30 @@ spec:
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m
- expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate1m
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- expr: |-
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- expr: |-
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_drop_excluding_lo:rate1m
record: instance:node_network_receive_drop_excluding_lo:rate5m
- expr: |-
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate1m
record: instance:node_network_transmit_drop_excluding_lo:rate5m
{{- end }}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/node-exporter-prometheusRule.yaml
Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -27,6 +27,7 @@ spec:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |-
(
@ -45,6 +46,7 @@ spec:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |-
(
@ -63,6 +65,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |-
(
@ -79,6 +82,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |-
(
@ -95,6 +99,7 @@ spec:
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |-
(
@ -113,6 +118,7 @@ spec:
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |-
(
@ -131,6 +137,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |-
(
@ -147,6 +154,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |-
(
@ -163,6 +171,7 @@ spec:
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
@ -174,6 +183,7 @@ spec:
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
@ -185,6 +195,7 @@ spec:
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
@ -195,6 +206,7 @@ spec:
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: node_textfile_scrape_error{job="node-exporter"} == 1
labels:
@ -204,7 +216,8 @@ spec:
{{- end }}
- alert: NodeClockSkewDetected
annotations:
message: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected
summary: Clock skew detected.
expr: |-
(
@ -226,7 +239,8 @@ spec:
{{- end }}
- alert: NodeClockNotSynchronising
annotations:
message: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
min_over_time(node_timex_sync_status[5m]) == 0
@ -241,6 +255,7 @@ spec:
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddegraded
summary: RAID Array is degraded
expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m
@ -252,8 +267,9 @@ spec:
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddiskfailure
summary: Failed device in RAID array
expr: node_md_disks{state="fail"} > 0
expr: node_md_disks{state="failed"} > 0
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -26,7 +26,8 @@ spec:
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}"
message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkinterfaceflapping
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-operator-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

View File

@ -1,5 +1,5 @@
{{- /*
Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -29,6 +29,7 @@ spec:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusbadconfig
summary: Failed Prometheus configuration reload.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
@ -43,6 +44,7 @@ spec:
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotificationqueuerunningfull
summary: Prometheus alert notification queue predicted to run full in less than 30m.
expr: |-
# Without min_over_time, failed scrapes could create false negatives, see
@ -61,6 +63,7 @@ spec:
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstosomealertmanagers
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
expr: |-
(
@ -75,28 +78,11 @@ spec:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |-
min without(alertmanager) (
rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
/
rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotconnectedtoalertmanagers
summary: Prometheus is not connected to any Alertmanagers.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
@ -111,6 +97,7 @@ spec:
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbreloadsfailing
summary: Prometheus has issues reloading blocks from disk.
expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
for: 4h
@ -122,6 +109,7 @@ spec:
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbcompactionsfailing
summary: Prometheus has issues compacting blocks.
expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
for: 4h
@ -133,8 +121,18 @@ spec:
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotingestingsamples
summary: Prometheus is not ingesting samples.
expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
expr: |-
(
rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
and
(
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
or
sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
)
)
for: 10m
labels:
severity: warning
@ -144,6 +142,7 @@ spec:
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusduplicatetimestamps
summary: Prometheus is dropping samples with duplicate timestamps.
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
for: 10m
@ -155,6 +154,7 @@ spec:
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoutofordertimestamps
summary: Prometheus drops samples with out-of-order timestamps.
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
for: 10m
@ -166,15 +166,16 @@ spec:
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotestoragefailures
summary: Prometheus fails to send samples to remote storage.
expr: |-
(
rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
(rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
/
(
rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
(rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
+
rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
(rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
)
)
* 100
@ -188,13 +189,14 @@ spec:
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritebehind
summary: Prometheus remote write is behind.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
- on(job, instance) group_right
- ignoring(remote_name, url) group_right
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
)
> 120
@ -207,6 +209,7 @@ spec:
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritedesiredshards
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
@ -225,6 +228,7 @@ spec:
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusrulefailures
summary: Prometheus is failing rule evaluations.
expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
for: 15m
@ -236,6 +240,7 @@ spec:
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusmissingruleevaluations
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
for: 15m
@ -247,6 +252,7 @@ spec:
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustargetlimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
for: 15m
@ -254,5 +260,36 @@ spec:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: PrometheusLabelLimitHit
annotations:
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuslabellimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
for: 15m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstoanyalertmanager
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |-
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
/
rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}

View File

@ -1,63 +0,0 @@
{{- /*
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }}
{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }}
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
message: The configuration of the instances of the Alertmanager cluster `{{`{{`}}$labels.service{{`}}`}}` are out of sync.
expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}) by (name, job, namespace, controller), "service", "$1", "name", "(.*)") != 1
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerFailedReload
annotations:
message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerMembersInconsistent
annotations:
message: Alertmanager has not found all other members of the cluster.
expr: |-
alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
!= on (service) GROUP_LEFT()
count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}

View File

@ -1,179 +0,0 @@
{{- /*
Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/etcd3_alert.rules.yml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: etcd
rules:
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
for: 3m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdNoLeader
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfLeaderChanges
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.'
expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
for: 15m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
> 1
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
> 5
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdGRPCRequestsSlow
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: |-
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdMemberCommunicationSlow
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedProposals
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighFsyncDurations
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighCommitDurations
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}'
expr: |-
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
BY (method) > 0.01
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: |-
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
BY (method) > 0.05
for: 10m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHTTPRequestsSlow
annotations:
message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow.
expr: |-
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}

View File

@ -1,56 +0,0 @@
{{- /*
Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: general.rules
rules:
- alert: TargetDown
annotations:
message: '{{`{{`}} $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}} targets are down.'
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: Watchdog
annotations:
message: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
'
expr: vector(1)
labels:
severity: none
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}

View File

@ -1,83 +0,0 @@
{{- /*
Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules
rules:
- expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
record: namespace:container_cpu_usage_seconds_total:sum_rate
- expr: sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
record: namespace:container_memory_usage_bytes:sum
- expr: |-
sum by (namespace, pod_name, container_name) (
rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
)
record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
- expr: |-
sum by(namespace) (
kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
* on (endpoint, instance, job, namespace, pod, service)
group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
)
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
- expr: |-
sum by (namespace) (
kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
* on (endpoint, instance, job, namespace, pod, service)
group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
)
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
- expr: |-
sum(
label_replace(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
labels:
workload_type: deployment
record: mixin_pod_workload
- expr: |-
sum(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
labels:
workload_type: daemonset
record: mixin_pod_workload
- expr: |-
sum(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
labels:
workload_type: statefulset
record: mixin_pod_workload
{{- end }}

Some files were not shown because too many files have changed in this diff Show More