feat: metrics version bump
parent 5e9d2bc168
commit 563cf4add7
@@ -16,11 +16,11 @@ dependencies:
     version: ">= 0.1.3"
     repository: https://zero-down-time.github.io/kubezero/
   - name: kube-prometheus-stack
-    version: 16.1.2
+    version: 16.12.0
     # Switch back to upstream once all alerts are fixed, e.g. etcd grpc
     # repository: https://prometheus-community.github.io/helm-charts
   - name: prometheus-adapter
-    version: 2.13
+    version: 2.14.2
     repository: https://prometheus-community.github.io/helm-charts
     condition: prometheus-adapter.enabled
 kubeVersion: ">= 1.18.0"
@@ -6,12 +6,12 @@ annotations:
     url: https://github.com/prometheus-operator/kube-prometheus
   artifacthub.io/operator: "true"
 apiVersion: v2
-appVersion: 0.48.0
+appVersion: 0.48.1
 dependencies:
 - condition: kubeStateMetrics.enabled
   name: kube-state-metrics
   repository: https://prometheus-community.github.io/helm-charts
-  version: 3.1.*
+  version: 3.3.*
 - condition: nodeExporter.enabled
   name: prometheus-node-exporter
   repository: https://prometheus-community.github.io/helm-charts
@@ -19,7 +19,7 @@ dependencies:
 - condition: grafana.enabled
   name: grafana
   repository: https://grafana.github.io/helm-charts
-  version: 6.9.*
+  version: 6.13.*
 description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
   and Prometheus rules combined with documentation and scripts to provide easy to
   operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus
@@ -47,4 +47,4 @@ sources:
 - https://github.com/prometheus-community/helm-charts
 - https://github.com/prometheus-operator/kube-prometheus
 type: application
-version: 16.1.2
+version: 16.12.0
@@ -1,5 +1,5 @@
 apiVersion: v2
-appVersion: 7.5.5
+appVersion: 8.0.3
 description: The leading tool for querying and visualizing time series and metrics.
 home: https://grafana.net
 icon: https://raw.githubusercontent.com/grafana/grafana/master/public/img/logo_transparent_400x.png
@@ -19,4 +19,4 @@ name: grafana
 sources:
 - https://github.com/grafana/grafana
 type: application
-version: 6.9.1
+version: 6.13.5
@@ -59,8 +59,8 @@ This version requires Helm >= 3.1.0.
 | `securityContext` | Deployment securityContext | `{"runAsUser": 472, "runAsGroup": 472, "fsGroup": 472}` |
 | `priorityClassName` | Name of Priority Class to assign pods | `nil` |
 | `image.repository` | Image repository | `grafana/grafana` |
-| `image.tag` | Image tag (`Must be >= 5.0.0`) | `7.4.5` |
-| `image.sha` | Image sha (optional) | `2b56f6106ddc376bb46d974230d530754bf65a640dfbc5245191d72d3b49efc6` |
+| `image.tag` | Image tag (`Must be >= 5.0.0`) | `8.0.3` |
+| `image.sha` | Image sha (optional) | `80c6d6ac633ba5ab3f722976fb1d9a138f87ca6a9934fcd26a5fc28cbde7dbfa` |
 | `image.pullPolicy` | Image pull policy | `IfNotPresent` |
 | `image.pullSecrets` | Image pull secrets | `{}` |
 | `service.enabled` | Enable grafana service | `true` |
@@ -83,7 +83,7 @@ This version requires Helm >= 3.1.0.
 | `ingress.path` | Ingress accepted path | `/` |
 | `ingress.pathType` | Ingress type of path | `Prefix` |
 | `ingress.hosts` | Ingress accepted hostnames | `["chart-example.local"]` |
-| `ingress.extraPaths` | Ingress extra paths to prepend to every host configuration. Useful when configuring [custom actions with AWS ALB Ingress Controller](https://kubernetes-sigs.github.io/aws-alb-ingress-controller/guide/ingress/annotation/#actions). | `[]` |
+| `ingress.extraPaths` | Ingress extra paths to prepend to every host configuration. Useful when configuring [custom actions with AWS ALB Ingress Controller](https://kubernetes-sigs.github.io/aws-alb-ingress-controller/guide/ingress/annotation/#actions). Requires `ingress.hosts` to have one or more host entries. | `[]` |
 | `ingress.tls` | Ingress TLS configuration | `[]` |
 | `resources` | CPU/Memory resource requests/limits | `{}` |
 | `nodeSelector` | Node labels for pod assignment | `{}` |
@@ -116,6 +116,7 @@ This version requires Helm >= 3.1.0.
 | `envValueFrom` | Environment variables from alternate sources. See the API docs on [EnvVarSource](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#envvarsource-v1-core) for format details. | `{}` |
 | `envFromSecret` | Name of a Kubernetes secret (must be manually created in the same namespace) containing values to be added to the environment. Can be templated | `""` |
 | `envRenderSecret` | Sensitive environment variables passed to pods and stored as secret | `{}` |
+| `enableServiceLinks` | Inject Kubernetes services as environment variables. | `true` |
 | `extraSecretMounts` | Additional grafana server secret mounts | `[]` |
 | `extraVolumeMounts` | Additional grafana server volume mounts | `[]` |
 | `extraConfigmapMounts` | Additional grafana server configMap volume mounts | `[]` |
@@ -136,7 +137,7 @@ This version requires Helm >= 3.1.0.
 | `podLabels` | Pod labels | `{}` |
 | `podPortName` | Name of the grafana port on the pod | `grafana` |
 | `sidecar.image.repository` | Sidecar image repository | `quay.io/kiwigrid/k8s-sidecar` |
-| `sidecar.image.tag` | Sidecar image tag | `1.10.7` |
+| `sidecar.image.tag` | Sidecar image tag | `1.12.2` |
 | `sidecar.image.sha` | Sidecar image sha (optional) | `""` |
 | `sidecar.imagePullPolicy` | Sidecar image pull policy | `IfNotPresent` |
 | `sidecar.resources` | Sidecar resources | `{}` |
@@ -104,13 +104,67 @@ app.kubernetes.io/name: {{ include "grafana.name" . }}-image-renderer
 app.kubernetes.io/instance: {{ .Release.Name }}
 {{- end -}}
 
+{{/*
+Look up an existing secret and reuse its password; if none exists, generate a
+new password and use that.
+*/}}
+{{- define "grafana.password" -}}
+{{- $secret := (lookup "v1" "Secret" (include "grafana.namespace" .) (include "grafana.fullname" .) ) -}}
+{{- if $secret -}}
+{{- index $secret "data" "admin-password" -}}
+{{- else -}}
+{{- (randAlphaNum 40) | b64enc | quote -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Get KubeVersion removing pre-release information.
+*/}}
+{{- define "grafana.kubeVersion" -}}
+{{- default .Capabilities.KubeVersion.Version (regexFind "v[0-9]+\\.[0-9]+\\.[0-9]+" .Capabilities.KubeVersion.Version) -}}
+{{- end -}}
+
 {{/*
 Return the appropriate apiVersion for rbac.
 */}}
-{{- define "rbac.apiVersion" -}}
+{{- define "grafana.rbac.apiVersion" -}}
 {{- if .Capabilities.APIVersions.Has "rbac.authorization.k8s.io/v1" }}
 {{- print "rbac.authorization.k8s.io/v1" -}}
 {{- else -}}
 {{- print "rbac.authorization.k8s.io/v1beta1" -}}
 {{- end -}}
 {{- end -}}
 
+{{/*
+Return the appropriate apiVersion for ingress.
+*/}}
+{{- define "grafana.ingress.apiVersion" -}}
+{{- if and (.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19.x" (include "grafana.kubeVersion" .)) -}}
+{{- print "networking.k8s.io/v1" -}}
+{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" -}}
+{{- print "networking.k8s.io/v1beta1" -}}
+{{- else -}}
+{{- print "extensions/v1beta1" -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Return if ingress is stable.
+*/}}
+{{- define "grafana.ingress.isStable" -}}
+{{- eq (include "grafana.ingress.apiVersion" .) "networking.k8s.io/v1" -}}
+{{- end -}}
+
+{{/*
+Return if ingress supports ingressClassName.
+*/}}
+{{- define "grafana.ingress.supportsIngressClassName" -}}
+{{- or (eq (include "grafana.ingress.isStable" .) "true") (and (eq (include "grafana.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18.x" (include "grafana.kubeVersion" .))) -}}
+{{- end -}}
+
+{{/*
+Return if ingress supports pathType.
+*/}}
+{{- define "grafana.ingress.supportsPathType" -}}
+{{- or (eq (include "grafana.ingress.isStable" .) "true") (and (eq (include "grafana.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18.x" (include "grafana.kubeVersion" .))) -}}
+{{- end -}}
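The new `grafana.kubeVersion` helper matters for the `semverCompare` guards above: managed distributions report pre-release suffixes that a plain semver comparison treats as lower than the release. A small illustration (version strings are examples, not taken from this diff):

```yaml
# How "grafana.kubeVersion" normalizes .Capabilities.KubeVersion.Version
# before semverCompare ">= 1.19.x" runs (illustrative inputs):
examples:
  - raw: v1.19.9-gke.1900     # GKE appends a pre-release suffix
    normalized: v1.19.9       # via regexFind "v[0-9]+\.[0-9]+\.[0-9]+"
  - raw: v1.20.4-eks-6b7464   # EKS does the same
    normalized: v1.20.4
```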
@@ -159,6 +159,7 @@ imagePullSecrets:
     - name: {{ . }}
   {{- end}}
 {{- end }}
+enableServiceLinks: {{ .Values.enableServiceLinks }}
 containers:
 {{- if .Values.sidecar.dashboards.enabled }}
   - name: {{ template "grafana.name" . }}-sc-dashboard
@@ -357,6 +358,14 @@ containers:
       - name: GF_RENDERING_CALLBACK_URL
        value: http://{{ template "grafana.fullname" . }}.{{ template "grafana.namespace" . }}:{{ .Values.service.port }}/{{ .Values.imageRenderer.grafanaSubPath }}
     {{ end }}
+    - name: GF_PATHS_DATA
+      value: {{ (get .Values "grafana.ini").paths.data }}
+    - name: GF_PATHS_LOGS
+      value: {{ (get .Values "grafana.ini").paths.logs }}
+    - name: GF_PATHS_PLUGINS
+      value: {{ (get .Values "grafana.ini").paths.plugins }}
+    - name: GF_PATHS_PROVISIONING
+      value: {{ (get .Values "grafana.ini").paths.provisioning }}
    {{- range $key, $value := .Values.envValueFrom }}
     - name: {{ $key | quote }}
       valueFrom:
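Because the GF_PATHS_* variables are now derived from `grafana.ini`, a path overridden in values.yaml automatically propagates into the container environment. A values-side sketch, mirroring the defaults changed later in this diff:

```yaml
grafana.ini:
  paths:
    data: /var/lib/grafana/
    logs: /var/log/grafana
    plugins: /var/lib/grafana/plugins
    provisioning: /etc/grafana/provisioning
```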
@@ -1,17 +1,13 @@
 {{- if .Values.ingress.enabled -}}
+{{- $ingressApiIsStable := eq (include "grafana.ingress.isStable" .) "true" -}}
+{{- $ingressSupportsIngressClassName := eq (include "grafana.ingress.supportsIngressClassName" .) "true" -}}
+{{- $ingressSupportsPathType := eq (include "grafana.ingress.supportsPathType" .) "true" -}}
 {{- $fullName := include "grafana.fullname" . -}}
 {{- $servicePort := .Values.service.port -}}
 {{- $ingressPath := .Values.ingress.path -}}
 {{- $ingressPathType := .Values.ingress.pathType -}}
 {{- $extraPaths := .Values.ingress.extraPaths -}}
-{{- $newAPI := .Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" -}}
-{{- if $newAPI -}}
-apiVersion: networking.k8s.io/v1
-{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }}
-apiVersion: networking.k8s.io/v1beta1
-{{- else }}
-apiVersion: extensions/v1beta1
-{{- end }}
+apiVersion: {{ include "grafana.ingress.apiVersion" . }}
 kind: Ingress
 metadata:
   name: {{ $fullName }}
@@ -28,7 +24,7 @@ metadata:
   {{- end }}
 {{- end }}
 spec:
-  {{- if .Values.ingress.ingressClassName }}
+  {{- if and $ingressSupportsIngressClassName .Values.ingress.ingressClassName }}
   ingressClassName: {{ .Values.ingress.ingressClassName }}
   {{- end -}}
   {{- if .Values.ingress.tls }}
@@ -41,15 +37,15 @@ spec:
     - host: {{ tpl . $}}
       http:
         paths:
-{{ if $extraPaths }}
+{{- if $extraPaths }}
{{ toYaml $extraPaths | indent 10 }}
 {{- end }}
           - path: {{ $ingressPath }}
-            {{- if $newAPI }}
+            {{- if $ingressSupportsPathType }}
             pathType: {{ $ingressPathType }}
             {{- end }}
             backend:
-              {{- if $newAPI }}
+              {{- if $ingressApiIsStable }}
               service:
                 name: {{ $fullName }}
                 port:
@@ -63,12 +59,11 @@ spec:
     - http:
         paths:
           - backend:
-              {{- if $newAPI }}
+              {{- if $ingressApiIsStable }}
               service:
                 name: {{ $fullName }}
                 port:
                   number: {{ $servicePort }}
-            pathType: {{ $ingressPathType }}
             {{- else }}
               serviceName: {{ $fullName }}
               servicePort: {{ $servicePort }}
@@ -76,5 +71,8 @@ spec:
             {{- if $ingressPath }}
             path: {{ $ingressPath }}
             {{- end }}
+            {{- if $ingressSupportsPathType }}
+            pathType: {{ $ingressPathType }}
+            {{- end }}
 {{- end -}}
 {{- end }}
@@ -1,5 +1,5 @@
 {{- if and .Values.rbac.create (not .Values.rbac.useExistingRole) -}}
-apiVersion: {{ template "rbac.apiVersion" . }}
+apiVersion: {{ template "grafana.rbac.apiVersion" . }}
 kind: Role
 metadata:
   name: {{ template "grafana.fullname" . }}
@@ -1,5 +1,5 @@
 {{- if .Values.rbac.create -}}
-apiVersion: {{ template "rbac.apiVersion" . }}
+apiVersion: {{ template "grafana.rbac.apiVersion" . }}
 kind: RoleBinding
 metadata:
   name: {{ template "grafana.fullname" . }}
@@ -17,7 +17,7 @@ data:
   {{- if .Values.adminPassword }}
   admin-password: {{ .Values.adminPassword | b64enc | quote }}
   {{- else }}
-  admin-password: {{ randAlphaNum 40 | b64enc | quote }}
+  admin-password: {{ template "grafana.password" . }}
   {{- end }}
 {{- end }}
 {{- if not .Values.ldap.existingSecret }}
@@ -69,7 +69,7 @@ livenessProbe:
 
 image:
   repository: grafana/grafana
-  tag: 7.5.5
+  tag: 8.0.3
   sha: ""
   pullPolicy: IfNotPresent
 
@@ -188,7 +188,7 @@ ingress:
   labels: {}
   path: /
 
-  # pathType is only for k8s > 1.19
+  # pathType is only for k8s >= 1.19
   pathType: Prefix
 
   hosts:
@@ -382,6 +382,10 @@ envFromSecret: ""
 ## This can be useful for auth tokens, etc
 envRenderSecret: {}
 
+# Inject Kubernetes services as environment variables.
+# See https://kubernetes.io/docs/concepts/services-networking/connect-applications-service/#environment-variables
+enableServiceLinks: true
+
 ## Additional grafana server secret mounts
 # Defines additional mounts with secrets. Secrets must be manually created in the namespace.
 extraSecretMounts: []
@@ -451,7 +455,7 @@ datasources: {}
 #        uid: cloudwatch
 #        editable: false
 #        jsonData:
-#          authType: credentials
+#          authType: default
 #          defaultRegion: us-east-1
 
 ## Configure notifiers
@@ -532,7 +536,7 @@ dashboardsConfigMaps: {}
 ##
 grafana.ini:
   paths:
-    data: /var/lib/grafana/data
+    data: /var/lib/grafana/
     logs: /var/log/grafana
     plugins: /var/lib/grafana/plugins
     provisioning: /etc/grafana/provisioning
@@ -603,7 +607,7 @@ smtp:
 sidecar:
   image:
     repository: quay.io/kiwigrid/k8s-sidecar
-    tag: 1.10.7
+    tag: 1.12.2
    sha: ""
   imagePullPolicy: IfNotPresent
   resources: {}
@@ -16,4 +16,4 @@ name: kube-state-metrics
 sources:
 - https://github.com/kubernetes/kube-state-metrics/
 type: application
-version: 3.1.0
+version: 3.3.1
@@ -62,7 +62,7 @@ The upgraded chart now has the following changes:
 See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments:
 
 ```console
-helm show values kube-state-metrics/kube-state-metrics
+helm show values prometheus-community/kube-state-metrics
 ```
 
 You may also run `helm show values` on this chart's [dependencies](#dependencies) for additional options.
@@ -18,6 +18,6 @@ roleRef:
 {{- end }}
 subjects:
 - kind: ServiceAccount
-  name: {{ template "kube-state-metrics.fullname" . }}
+  name: {{ template "kube-state-metrics.serviceAccountName" . }}
   namespace: {{ template "kube-state-metrics.namespace" . }}
 {{- end -}}
@@ -63,109 +63,40 @@ spec:
               fieldPath: metadata.namespace
         {{- end }}
         args:
-{{ if .Values.extraArgs }}
+        {{- if .Values.extraArgs }}
         {{- range .Values.extraArgs }}
         - {{ . }}
         {{- end }}
-{{ end }}
-{{ if .Values.collectors.certificatesigningrequests }}
-        - --resources=certificatesigningrequests
-{{ end }}
-{{ if .Values.collectors.configmaps }}
-        - --resources=configmaps
-{{ end }}
-{{ if .Values.collectors.cronjobs }}
-        - --resources=cronjobs
-{{ end }}
-{{ if .Values.collectors.daemonsets }}
-        - --resources=daemonsets
-{{ end }}
-{{ if .Values.collectors.deployments }}
-        - --resources=deployments
-{{ end }}
-{{ if .Values.collectors.endpoints }}
-        - --resources=endpoints
-{{ end }}
-{{ if .Values.collectors.horizontalpodautoscalers }}
-        - --resources=horizontalpodautoscalers
-{{ end }}
-{{ if .Values.collectors.ingresses }}
-        - --resources=ingresses
-{{ end }}
-{{ if .Values.collectors.jobs }}
-        - --resources=jobs
-{{ end }}
-{{ if .Values.collectors.limitranges }}
-        - --resources=limitranges
-{{ end }}
-{{ if .Values.collectors.mutatingwebhookconfigurations }}
-        - --resources=mutatingwebhookconfigurations
-{{ end }}
-{{ if .Values.collectors.namespaces }}
-        - --resources=namespaces
-{{ end }}
-{{ if .Values.collectors.networkpolicies }}
-        - --resources=networkpolicies
-{{ end }}
-{{ if .Values.collectors.nodes }}
-        - --resources=nodes
-{{ end }}
-{{ if .Values.collectors.persistentvolumeclaims }}
-        - --resources=persistentvolumeclaims
-{{ end }}
-{{ if .Values.collectors.persistentvolumes }}
-        - --resources=persistentvolumes
-{{ end }}
-{{ if .Values.collectors.poddisruptionbudgets }}
-        - --resources=poddisruptionbudgets
-{{ end }}
-{{ if .Values.collectors.pods }}
-        - --resources=pods
-{{ end }}
-{{ if .Values.collectors.replicasets }}
-        - --resources=replicasets
-{{ end }}
-{{ if .Values.collectors.replicationcontrollers }}
-        - --resources=replicationcontrollers
-{{ end }}
-{{ if .Values.collectors.resourcequotas }}
-        - --resources=resourcequotas
-{{ end }}
-{{ if .Values.collectors.secrets }}
-        - --resources=secrets
-{{ end }}
-{{ if .Values.collectors.services }}
-        - --resources=services
-{{ end }}
-{{ if .Values.collectors.statefulsets }}
-        - --resources=statefulsets
-{{ end }}
-{{ if .Values.collectors.storageclasses }}
-        - --resources=storageclasses
-{{ end }}
-{{ if .Values.collectors.validatingwebhookconfigurations }}
-        - --resources=validatingwebhookconfigurations
-{{ end }}
-{{ if .Values.collectors.verticalpodautoscalers }}
-        - --resources=verticalpodautoscalers
-{{ end }}
-{{ if .Values.collectors.volumeattachments }}
-        - --resources=volumeattachments
-{{ end }}
-{{ if .Values.namespaces }}
-        - --namespaces={{ tpl .Values.namespaces $ | join "," }}
-{{ end }}
-{{ if .Values.autosharding.enabled }}
+        {{- end }}
+        {{- if .Values.service.port }}
+        - --port={{ .Values.service.port | default 8080}}
+        {{- end }}
+        {{- if .Values.collectors }}
+        - --resources={{ .Values.collectors | join "," }}
+        {{- end }}
+        {{- if .Values.metricLabelsAllowlist }}
+        - --metric-labels-allowlist={{ .Values.metricLabelsAllowlist | join "," }}
+        {{- end }}
+        {{- if .Values.metricAllowlist }}
+        - --metric-allowlist={{ .Values.metricAllowlist | join "," }}
+        {{- end }}
+        {{- if .Values.metricDenylist }}
+        - --metric-denylist={{ .Values.metricDenylist | join "," }}
+        {{- end }}
+        {{- if .Values.namespaces }}
+        - --namespaces={{ tpl (.Values.namespaces | join ",") $ }}
+        {{- end }}
+        {{- if .Values.autosharding.enabled }}
         - --pod=$(POD_NAME)
         - --pod-namespace=$(POD_NAMESPACE)
-{{ end }}
-{{ if .Values.kubeconfig.enabled }}
+        {{- end }}
+        {{- if .Values.kubeconfig.enabled }}
         - --kubeconfig=/opt/k8s/.kube/config
-{{ end }}
-{{ if .Values.selfMonitor.telemetryHost }}
+        {{- end }}
+        {{- if .Values.selfMonitor.telemetryHost }}
        - --telemetry-host={{ .Values.selfMonitor.telemetryHost }}
-{{ end }}
-        - --telemetry-port=8081
+        {{- end }}
+        - --telemetry-port={{ .Values.selfMonitor.telemetryPort | default 8081 }}
         {{- if .Values.kubeconfig.enabled }}
         volumeMounts:
         - name: kubeconfig
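Net effect of this rewrite: twenty-eight hard-coded `--resources=<kind>` blocks collapse into a single flag derived from the now list-typed `collectors` value, and the ports become configurable. A sketch of what the new template renders (values illustrative):

```yaml
# values.yaml (excerpt)
collectors:
  - deployments
  - pods
  - nodes
# rendered container args:
#   - --port=8080
#   - --resources=deployments,pods,nodes
#   - --telemetry-port=8081
```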
@@ -175,26 +106,30 @@ spec:
           imagePullPolicy: {{ .Values.image.pullPolicy }}
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
           ports:
-          - containerPort: 8080
+          - containerPort: {{ .Values.service.port | default 8080}}
           {{- if .Values.selfMonitor.enabled }}
-          - containerPort: 8081
+          - containerPort: {{ .Values.selfMonitor.telemetryPort | default 8081 }}
           {{- end }}
           livenessProbe:
             httpGet:
               path: /healthz
-              port: 8080
+              port: {{ .Values.service.port | default 8080}}
             initialDelaySeconds: 5
             timeoutSeconds: 5
           readinessProbe:
             httpGet:
               path: /
-              port: 8080
+              port: {{ .Values.service.port | default 8080}}
             initialDelaySeconds: 5
             timeoutSeconds: 5
           {{- if .Values.resources }}
           resources:
{{ toYaml .Values.resources | indent 10 }}
           {{- end }}
+          {{- if .Values.containerSecurityContext }}
+          securityContext:
+{{ toYaml .Values.containerSecurityContext | indent 10 }}
+          {{- end }}
       {{- if .Values.imagePullSecrets }}
       imagePullSecrets:
{{ toYaml .Values.imagePullSecrets | indent 8 }}
@@ -14,6 +14,6 @@ roleRef:
   name: psp-{{ template "kube-state-metrics.fullname" . }}
 subjects:
 - kind: ServiceAccount
-  name: {{ template "kube-state-metrics.fullname" . }}
+  name: {{ template "kube-state-metrics.serviceAccountName" . }}
   namespace: {{ template "kube-state-metrics.namespace" . }}
 {{- end }}
@@ -18,169 +18,169 @@ metadata:
   namespace: {{ . }}
 {{- end }}
 rules:
-{{ if $.Values.collectors.certificatesigningrequests }}
+{{ if has "certificatesigningrequests" $.Values.collectors }}
 - apiGroups: ["certificates.k8s.io"]
   resources:
   - certificatesigningrequests
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.configmaps }}
+{{ if has "configmaps" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - configmaps
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.cronjobs }}
+{{ if has "cronjobs" $.Values.collectors }}
 - apiGroups: ["batch"]
   resources:
   - cronjobs
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.daemonsets }}
+{{ if has "daemonsets" $.Values.collectors }}
 - apiGroups: ["extensions", "apps"]
   resources:
   - daemonsets
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.deployments }}
+{{ if has "deployments" $.Values.collectors }}
 - apiGroups: ["extensions", "apps"]
   resources:
   - deployments
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.endpoints }}
+{{ if has "endpoints" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - endpoints
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.horizontalpodautoscalers }}
+{{ if has "horizontalpodautoscalers" $.Values.collectors }}
 - apiGroups: ["autoscaling"]
   resources:
   - horizontalpodautoscalers
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.ingresses }}
+{{ if has "ingresses" $.Values.collectors }}
 - apiGroups: ["extensions", "networking.k8s.io"]
   resources:
   - ingresses
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.jobs }}
+{{ if has "jobs" $.Values.collectors }}
 - apiGroups: ["batch"]
   resources:
   - jobs
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.limitranges }}
+{{ if has "limitranges" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - limitranges
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.mutatingwebhookconfigurations }}
+{{ if has "mutatingwebhookconfigurations" $.Values.collectors }}
 - apiGroups: ["admissionregistration.k8s.io"]
   resources:
   - mutatingwebhookconfigurations
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.namespaces }}
+{{ if has "namespaces" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - namespaces
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.networkpolicies }}
+{{ if has "networkpolicies" $.Values.collectors }}
 - apiGroups: ["networking.k8s.io"]
   resources:
   - networkpolicies
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.nodes }}
+{{ if has "nodes" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - nodes
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.persistentvolumeclaims }}
+{{ if has "persistentvolumeclaims" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - persistentvolumeclaims
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.persistentvolumes }}
+{{ if has "persistentvolumes" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - persistentvolumes
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.poddisruptionbudgets }}
+{{ if has "poddisruptionbudgets" $.Values.collectors }}
 - apiGroups: ["policy"]
   resources:
   - poddisruptionbudgets
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.pods }}
+{{ if has "pods" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - pods
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.replicasets }}
+{{ if has "replicasets" $.Values.collectors }}
 - apiGroups: ["extensions", "apps"]
   resources:
   - replicasets
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.replicationcontrollers }}
+{{ if has "replicationcontrollers" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - replicationcontrollers
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.resourcequotas }}
+{{ if has "resourcequotas" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - resourcequotas
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.secrets }}
+{{ if has "secrets" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - secrets
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.services }}
+{{ if has "services" $.Values.collectors }}
 - apiGroups: [""]
   resources:
   - services
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.statefulsets }}
+{{ if has "statefulsets" $.Values.collectors }}
 - apiGroups: ["apps"]
   resources:
   - statefulsets
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.storageclasses }}
+{{ if has "storageclasses" $.Values.collectors }}
 - apiGroups: ["storage.k8s.io"]
   resources:
   - storageclasses
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.validatingwebhookconfigurations }}
+{{ if has "validatingwebhookconfigurations" $.Values.collectors }}
 - apiGroups: ["admissionregistration.k8s.io"]
   resources:
   - validatingwebhookconfigurations
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.volumeattachments }}
+{{ if has "volumeattachments" $.Values.collectors }}
 - apiGroups: ["storage.k8s.io"]
   resources:
   - volumeattachments
   verbs: ["list", "watch"]
 {{ end -}}
-{{ if $.Values.collectors.verticalpodautoscalers }}
+{{ if has "verticalpodautoscalers" $.Values.collectors }}
 - apiGroups: ["autoscaling.k8s.io"]
   resources:
   - verticalpodautoscalers
@@ -21,7 +21,7 @@ roleRef:
 {{- end }}
 subjects:
 - kind: ServiceAccount
-  name: {{ template "kube-state-metrics.fullname" $ }}
+  name: {{ template "kube-state-metrics.serviceAccountName" $ }}
   namespace: {{ template "kube-state-metrics.namespace" $ }}
 {{- end -}}
 {{- end -}}
@@ -23,16 +23,16 @@ spec:
   ports:
   - name: "http"
     protocol: TCP
-    port: {{ .Values.service.port }}
+    port: {{ .Values.service.port | default 8080}}
    {{- if .Values.service.nodePort }}
     nodePort: {{ .Values.service.nodePort }}
     {{- end }}
-    targetPort: 8080
+    targetPort: {{ .Values.service.port | default 8080}}
  {{ if .Values.selfMonitor.enabled }}
   - name: "metrics"
     protocol: TCP
     port: {{ .Values.selfMonitor.telemetryPort | default 8081 }}
-    targetPort: 8081
+    targetPort: {{ .Values.selfMonitor.telemetryPort | default 8081 }}
  {{ end }}
   {{- if .Values.service.loadBalancerIP }}
   loadBalancerIP: "{{ .Values.service.loadBalancerIP }}"
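The HTTP and telemetry ports are now threaded from values through args, container ports, probes and the Service alike, so a single override stays consistent everywhere. Illustrative values:

```yaml
service:
  port: 9090            # flows into --port, containerPort, probes, targetPort
selfMonitor:
  enabled: true
  telemetryPort: 9091   # flows into --telemetry-port and the metrics port
```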
@@ -7,7 +7,7 @@ metadata:
     helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version }}
     app.kubernetes.io/managed-by: {{ .Release.Service }}
     app.kubernetes.io/instance: {{ .Release.Name }}
-  name: {{ template "kube-state-metrics.fullname" . }}
+  name: {{ template "kube-state-metrics.serviceAccountName" . }}
   namespace: {{ template "kube-state-metrics.namespace" . }}
 {{- if .Values.serviceAccount.annotations }}
   annotations:
@@ -15,6 +15,6 @@ roleRef:
   name: stsdiscovery-{{ template "kube-state-metrics.fullname" . }}
 subjects:
 - kind: ServiceAccount
-  name: {{ template "kube-state-metrics.fullname" . }}
+  name: {{ template "kube-state-metrics.serviceAccountName" . }}
   namespace: {{ template "kube-state-metrics.namespace" . }}
 {{- end }}
@@ -88,6 +88,11 @@ securityContext:
   runAsUser: 65534
   fsGroup: 65534
 
+## Specify security settings for a Container
+## Allows overrides and additional options compared to (Pod) securityContext
+## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container
+containerSecurityContext: {}
+
 ## Node labels for pod assignment
 ## Ref: https://kubernetes.io/docs/user-guide/node-selection/
 nodeSelector: {}
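This hook feeds the container-level `securityContext` added to the deployment earlier in this diff. A plausible hardening override (illustrative, not a chart default):

```yaml
containerSecurityContext:
  readOnlyRootFilesystem: true
  allowPrivilegeEscalation: false
  capabilities:
    drop: ["ALL"]
```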
@@ -109,37 +114,56 @@ podAnnotations: {}
 # Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
 podDisruptionBudget: {}
 
-# Available collectors for kube-state-metrics. By default all available
-# resources are enabled.
+# Comma-separated list of metrics to be exposed.
+# This list comprises of exact metric names and/or regex patterns.
+# The allowlist and denylist are mutually exclusive.
+metricAllowlist: []
+
+# Comma-separated list of metrics not to be enabled.
+# This list comprises of exact metric names and/or regex patterns.
+# The allowlist and denylist are mutually exclusive.
+metricDenylist: []
+
+# Comma-separated list of additional Kubernetes label keys that will be used in the resource's
+# labels metric. By default the metric contains only name and namespace labels.
+# To include additional labels, provide a list of resource names in their plural form and Kubernetes
+# label keys you would like to allow for them (Example: '=namespaces=[k8s-label-1,k8s-label-n,...],pods=[app],...)'.
+# A single '*' can be provided per resource instead to allow any labels, but that has
+# severe performance implications (Example: '=pods=[*]').
+metricLabelsAllowlist: []
+# - namespaces=[k8s-label-1,k8s-label-n]
+
+# Available collectors for kube-state-metrics.
+# By default, all available resources are enabled, comment out to disable.
 collectors:
-  certificatesigningrequests: true
-  configmaps: true
-  cronjobs: true
-  daemonsets: true
-  deployments: true
-  endpoints: true
-  horizontalpodautoscalers: true
-  ingresses: true
-  jobs: true
-  limitranges: true
-  mutatingwebhookconfigurations: true
-  namespaces: true
-  networkpolicies: true
-  nodes: true
-  persistentvolumeclaims: true
-  persistentvolumes: true
-  poddisruptionbudgets: true
-  pods: true
-  replicasets: true
-  replicationcontrollers: true
-  resourcequotas: true
-  secrets: true
-  services: true
-  statefulsets: true
-  storageclasses: true
-  validatingwebhookconfigurations: true
-  verticalpodautoscalers: false
-  volumeattachments: true
+  - certificatesigningrequests
+  - configmaps
+  - cronjobs
+  - daemonsets
+  - deployments
+  - endpoints
+  - horizontalpodautoscalers
+  - ingresses
+  - jobs
+  - limitranges
+  - mutatingwebhookconfigurations
+  - namespaces
+  - networkpolicies
+  - nodes
+  - persistentvolumeclaims
+  - persistentvolumes
+  - poddisruptionbudgets
+  - pods
+  - replicasets
+  - replicationcontrollers
+  - resourcequotas
+  - secrets
+  - services
+  - statefulsets
+  - storageclasses
+  - validatingwebhookconfigurations
+  - verticalpodautoscalers
+  - volumeattachments
 
 # Enabling kubeconfig will pass the --kubeconfig argument to the container
 kubeconfig:
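Note the coupling: this same `collectors` list drives both the `--resources` flag in the deployment and the `has "<kind>" $.Values.collectors` guards in the Role rules above, so trimming the list narrows the exporter's RBAC footprint along with its output. Illustrative override:

```yaml
collectors:       # only these kinds get RBAC rules and metrics
  - deployments
  - pods
metricDenylist:
  - kube_pod_container_status_.*   # exact names and/or regex patterns
```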
@@ -14,4 +14,4 @@ maintainers:
 name: prometheus-node-exporter
 sources:
 - https://github.com/prometheus/node_exporter/
-version: 1.18.0
+version: 1.18.2
@@ -22,6 +22,9 @@ spec:
     {{- if $.Values.prometheus.monitor.tlsConfig }}
     tlsConfig: {{ toYaml $.Values.prometheus.monitor.tlsConfig | nindent 8 }}
     {{- end }}
+    {{- if .Values.prometheus.monitor.proxyUrl }}
+    proxyUrl: {{ .Values.prometheus.monitor.proxyUrl}}
+    {{- end }}
     {{- if .Values.prometheus.monitor.scrapeTimeout }}
     scrapeTimeout: {{ .Values.prometheus.monitor.scrapeTimeout }}
     {{- end }}
@@ -24,6 +24,10 @@ prometheus:
     bearerTokenFile:
     tlsConfig: {}
 
+    ## proxyUrl: URL of a proxy that should be used for scraping.
+    ##
+    proxyUrl: ""
+
     relabelings: []
     scrapeTimeout: 10s
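The new value maps directly onto the ServiceMonitor's `proxyUrl` field rendered above, for clusters where Prometheus reaches node-exporter through an egress proxy (address hypothetical):

```yaml
prometheus:
  monitor:
    enabled: true
    proxyUrl: "http://egress-proxy.monitoring.svc:3128"
```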
@@ -1,4 +1,4 @@
-# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
+# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
 
 ---
 apiVersion: apiextensions.k8s.io/v1
@@ -1,4 +1,4 @@
-# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
+# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
 
 ---
 apiVersion: apiextensions.k8s.io/v1
@@ -1,4 +1,4 @@
-# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
+# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
 
 ---
 apiVersion: apiextensions.k8s.io/v1
@@ -1,4 +1,4 @@
-# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
+# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
 
 ---
 apiVersion: apiextensions.k8s.io/v1
@@ -1,4 +1,4 @@
-# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
+# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
 
 ---
 apiVersion: apiextensions.k8s.io/v1
@@ -1,4 +1,4 @@
-# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
+# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
 
 ---
 apiVersion: apiextensions.k8s.io/v1
@@ -1,4 +1,4 @@
-# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
+# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
 
 ---
 apiVersion: apiextensions.k8s.io/v1
@@ -1,4 +1,4 @@
-# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
+# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
 
 ---
 apiVersion: apiextensions.k8s.io/v1
@@ -98,7 +98,12 @@ Allow the release namespace to be overridden for multi-namespace deployments in
 
 {{/* Allow KubeVersion to be overridden. */}}
 {{- define "kube-prometheus-stack.ingress.kubeVersion" -}}
-{{- default .Capabilities.KubeVersion.Version .Values.kubeVersionOverride -}}
+{{- $kubeVersion := default .Capabilities.KubeVersion.Version .Values.kubeVersionOverride -}}
+{{/* Special use case for Amazon EKS, Google GKE */}}
+{{- if and (regexMatch "\\d+\\.\\d+\\.\\d+-(?:eks|gke).+" $kubeVersion) (not .Values.kubeVersionOverride) -}}
+{{- $kubeVersion = regexFind "\\d+\\.\\d+\\.\\d+" $kubeVersion -}}
+{{- end -}}
+{{- $kubeVersion -}}
 {{- end -}}
 
 {{/* Get Ingress API Version */}}
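Same motivation as the Grafana chart's `grafana.kubeVersion` helper earlier in this diff: EKS and GKE report versions such as `1.20.4-eks-6b7464`, which the regex strips unless the user pins the version explicitly (illustrative value):

```yaml
# Setting kubeVersionOverride bypasses the EKS/GKE normalization entirely.
kubeVersionOverride: "1.20.4"
```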
@@ -22,9 +22,9 @@ spec:
     metricRelabelings:
{{ tpl (toYaml .Values.kubeApiServer.serviceMonitor.metricRelabelings | indent 6) . }}
     {{- end }}
-    {{- if .Values.kubeApiServer.relabelings }}
+    {{- if .Values.kubeApiServer.serviceMonitor.relabelings }}
     relabelings:
-{{ toYaml .Values.kubeApiServer.relabelings | indent 6 }}
+{{ toYaml .Values.kubeApiServer.serviceMonitor.relabelings | indent 6 }}
     {{- end }}
     tlsConfig:
       caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
@@ -26,6 +26,24 @@ spec:
     relabelings:
{{ toYaml .Values.kubeStateMetrics.serviceMonitor.relabelings | indent 4 }}
     {{- end }}
+  {{- if .Values.kubeStateMetrics.serviceMonitor.selfMonitor.enabled }}
+  - port: metrics
+    {{- if .Values.kubeStateMetrics.serviceMonitor.interval }}
+    interval: {{ .Values.kubeStateMetrics.serviceMonitor.interval }}
+    {{- end }}
+    {{- if .Values.kubeStateMetrics.serviceMonitor.proxyUrl }}
+    proxyUrl: {{ .Values.kubeStateMetrics.serviceMonitor.proxyUrl}}
+    {{- end }}
+    honorLabels: true
+    {{- if .Values.kubeStateMetrics.serviceMonitor.metricRelabelings }}
+    metricRelabelings:
+{{ tpl (toYaml .Values.kubeStateMetrics.serviceMonitor.metricRelabelings | indent 4) . }}
+    {{- end }}
+    {{- if .Values.kubeStateMetrics.serviceMonitor.relabelings }}
+    relabelings:
+{{ toYaml .Values.kubeStateMetrics.serviceMonitor.relabelings | indent 4 }}
+    {{- end }}
+  {{- end }}
 {{- if .Values.kubeStateMetrics.serviceMonitor.namespaceOverride }}
   namespaceSelector:
     matchNames:
@@ -168,6 +168,6 @@ spec:
     - {{ .Values.kubelet.namespace }}
   selector:
     matchLabels:
       app.kubernetes.io/managed-by: prometheus-operator
-      app.kubernetes.io/name: kubelet
+      k8s-app: kubelet
 {{- end}}
@@ -13,6 +13,11 @@ spec:
     matchLabels:
       app: prometheus-node-exporter
       release: {{ $.Release.Name }}
+  {{- if (index .Values "prometheus-node-exporter" "namespaceOverride") }}
+  namespaceSelector:
+    matchNames:
+    - {{ index .Values "prometheus-node-exporter" "namespaceOverride" }}
+  {{- end }}
   endpoints:
   - port: metrics
     {{- if .Values.nodeExporter.serviceMonitor.interval }}
@@ -1,4 +1,4 @@
-{{- if and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
+{{- if or (and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled) .Values.grafana.forceDeployDashboards }}
 {{- $files := .Files.Glob "dashboards-1.14/*.json" }}
 {{- if $files }}
 apiVersion: v1
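The `forceDeploy*` escape hatches let this release ship dashboard and datasource ConfigMaps even with its bundled Grafana disabled, e.g. when a Grafana in another namespace collects them via sidecar. Illustrative values:

```yaml
grafana:
  enabled: false               # Grafana itself runs elsewhere
  forceDeployDashboards: true
  forceDeployDatasources: true
```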
@@ -1,4 +1,4 @@
-{{- if and .Values.grafana.enabled .Values.grafana.sidecar.datasources.enabled }}
+{{- if or (and .Values.grafana.enabled .Values.grafana.sidecar.datasources.enabled) .Values.grafana.forceDeployDatasources }}
 apiVersion: v1
 kind: ConfigMap
 metadata:
@@ -20,7 +20,11 @@ data:
     {{- if .Values.grafana.sidecar.datasources.defaultDatasourceEnabled }}
     - name: Prometheus
       type: prometheus
+      {{- if .Values.grafana.sidecar.datasources.url }}
+      url: {{ .Values.grafana.sidecar.datasources.url }}
+      {{- else }}
       url: http://{{ template "kube-prometheus-stack.fullname" . }}-prometheus:{{ .Values.prometheus.service.port }}/{{ trimPrefix "/" .Values.prometheus.prometheusSpec.routePrefix }}
+      {{- end }}
       access: proxy
       isDefault: true
       jsonData:
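With the new `url` branch, the default datasource can target an external Prometheus-compatible endpoint instead of the in-cluster service (URL hypothetical):

```yaml
grafana:
  sidecar:
    datasources:
      defaultDatasourceEnabled: true
      url: "http://thanos-query.monitoring.svc:9090"
```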
@ -0,0 +1,610 @@
|
||||
{{- /*
|
||||
Generated from 'alertmanager-overview' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "alertmanager-overview" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
labels:
|
||||
{{- if $.Values.grafana.sidecar.dashboards.label }}
|
||||
{{ $.Values.grafana.sidecar.dashboards.label }}: "1"
|
||||
{{- end }}
|
||||
app: {{ template "kube-prometheus-stack.name" $ }}-grafana
|
||||
{{ include "kube-prometheus-stack.labels" $ | indent 4 }}
|
||||
data:
|
||||
alertmanager-overview.json: |-
|
||||
{
|
||||
"__inputs": [
|
||||
|
||||
],
|
||||
"__requires": [
|
||||
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
|
||||
]
|
||||
},
|
||||
"editable": false,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 1,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"refresh": "30s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": false,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Alerts",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "none",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "none",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": false,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Received",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Invalid",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Alerts receive rate",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Alerts",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": false,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "integration",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Total",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Failed",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$integration: Notifications Send Rate",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
|
||||
},
|
||||
"id": 5,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": false,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "integration",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} 99th Percentile",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Median",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} Average",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$integration: Notification Duration",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Notifications",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"alertmanager-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "Prometheus",
|
||||
"value": "Prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "",
|
||||
"value": ""
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "namespace",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "label_values(alertmanager_alerts, namespace)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "",
|
||||
"value": ""
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "service",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "label_values(alertmanager_alerts, service)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "all",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 2,
|
||||
"includeAll": true,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "integration",
|
||||
"options": [
|
||||
|
||||
],
|
||||
"query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [
|
||||
|
||||
],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Alertmanager / Overview",
|
||||
"uid": "alertmanager-overview",
|
||||
"version": 0
|
||||
}
|
||||
{{- end }}
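
The two targets above compute the same latency signal in two ways: the median comes from histogram_quantile(0.50) over the `_bucket` series, while the average divides the rate of `_sum` by the rate of `_count`. A minimal sketch of the same pair of queries as Prometheus recording rules (the rule names and the simplified label set are illustrative, not part of this chart):

    groups:
      - name: alertmanager-notification-latency   # illustrative group name
        rules:
          # p50 notification latency per integration, from histogram buckets
          - record: integration:alertmanager_notification_latency_seconds:p50
            expr: |
              histogram_quantile(0.50,
                sum(rate(alertmanager_notification_latency_seconds_bucket[5m])) by (le, integration))
          # mean notification latency per integration, from the _sum/_count pair
          - record: integration:alertmanager_notification_latency_seconds:mean
            expr: |
              sum(rate(alertmanager_notification_latency_seconds_sum[5m])) by (integration)
              /
              sum(rate(alertmanager_notification_latency_seconds_count[5m])) by (integration)

The median is robust to outliers but limited by bucket resolution; the mean is exact but can be skewed by a single slow receiver, which is why the dashboard plots both.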
|
@@ -1,10 +1,10 @@
|
||||
{{- /*
|
||||
Generated from 'apiserver' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
|
||||
Generated from 'apiserver' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeApiServer.enabled }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeApiServer.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
|
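
The condition change repeated across the dashboard ConfigMaps below swaps the bare `.Values.grafana.enabled` gate for `(or .Values.grafana.enabled .Values.grafana.forceDeployDashboards)`, so the dashboard ConfigMaps can still be rendered when the bundled Grafana is disabled, e.g. for an external Grafana that picks the dashboards up via its sidecar. A sketch of that use case in values.yaml:

    grafana:
      enabled: false               # do not deploy the bundled Grafana
      forceDeployDashboards: true  # still render the dashboard ConfigMaps
      defaultDashboardsEnabled: true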
@@ -1,10 +1,10 @@
|
||||
{{- /*
|
||||
Generated from 'cluster-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
|
||||
Generated from 'cluster-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
|
@@ -1,10 +1,10 @@
|
||||
{{- /*
|
||||
Generated from 'controller-manager' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
|
||||
Generated from 'controller-manager' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeControllerManager.enabled }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeControllerManager.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
@@ -176,10 +176,10 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
|
||||
"expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@@ -282,10 +282,10 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
|
||||
"expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@@ -388,10 +388,10 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
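
The three controller-manager hunks above add `cluster` to the `by (...)` aggregation and to `legendFormat`, so the workqueue panels remain distinguishable when a single Prometheus (or a federated/Thanos setup) carries series from several clusters. The pattern, shown as a dashboard target outside of Helm templating so the legend braces are unescaped (illustrative, assuming `cluster` is attached as an external label):

    targets:
      # one series per (cluster, instance, queue) instead of collapsing clusters
      - expr: sum(rate(workqueue_adds_total{job="kube-controller-manager"}[5m])) by (cluster, instance, name)
        legendFormat: "{{cluster}} {{instance}} {{name}}"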
@@ -4,7 +4,7 @@ Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeEtcd.enabled }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeEtcd.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
|
@@ -1,6 +1,6 @@
|
||||
{{- /* Added manually, can be changed in-place. */ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.coreDns.enabled }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.coreDns.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
|
@@ -1,10 +1,10 @@
|
||||
{{- /*
|
||||
Generated from 'k8s-resources-cluster' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
|
||||
Generated from 'k8s-resources-cluster' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
@@ -247,7 +247,7 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
|
||||
"expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
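
Here and in the hunks that follow, raw sums over `kube_pod_container_resource_limits` / `..._requests` are replaced by the `namespace_cpu:...` and `namespace_memory:...` recording rules that the kubernetes-mixin precomputes, so the headline panels evaluate a cheap pre-aggregated series instead of scanning every container series at query time. A simplified sketch of such a rule (the shipped mixin rule additionally joins on kube_pod_status_phase so that only Pending/Running pods are counted):

    - record: namespace_cpu:kube_pod_container_resource_limits:sum
      expr: |
        sum by (namespace, cluster) (
          kube_pod_container_resource_limits{resource="cpu", job="kube-state-metrics"})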
@@ -499,7 +499,7 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
|
||||
"expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -594,7 +594,7 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}namespace{{`}}`}}",
|
||||
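
The `:sum_rate` to `:sum_irate` renames in this and the following hunks track the upstream kubernetes-mixin, which switched the CPU usage recording rule from rate() to irate(): the panels react faster to short load spikes at the cost of less smoothing. A simplified sketch of the renamed rule, shown only to illustrate the rate-to-irate change (the shipped rule also joins pod-to-node metadata via kube_pod_info, omitted here):

    - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
      expr: |
        sum by (cluster, namespace, pod, container) (
          irate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m]))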
@@ -885,7 +885,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -903,7 +903,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -912,7 +912,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
|
||||
"expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -921,7 +921,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -1321,7 +1321,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
|
||||
"expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -1330,7 +1330,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
|
||||
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -1339,7 +1339,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
|
||||
"expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -1348,7 +1348,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
|
||||
"expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -2443,6 +2443,496 @@ data:
|
||||
"showTitle": true,
|
||||
"title": "Rate of Packets Dropped",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"decimals": -1,
|
||||
"fill": 10,
|
||||
"id": 20,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m])))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}namespace{{`}}`}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "IOPS(Reads+Writes)",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 21,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}namespace{{`}}`}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "ThroughPut(Read+Write)",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "Bps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Storage IO",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 22,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"sort": {
|
||||
"col": 4,
|
||||
"desc": true
|
||||
},
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"styles": [
|
||||
{
|
||||
"alias": "Time",
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"pattern": "Time",
|
||||
"type": "hidden"
|
||||
},
|
||||
{
|
||||
"alias": "IOPS(Reads)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": -1,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #A",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "IOPS(Writes)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": -1,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #B",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "IOPS(Reads + Writes)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": -1,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #C",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "Throughput(Read)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #D",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "Bps"
|
||||
},
|
||||
{
|
||||
"alias": "Throughput(Write)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #E",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "Bps"
|
||||
},
|
||||
{
|
||||
"alias": "Throughput(Read + Write)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #F",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "Bps"
|
||||
},
|
||||
{
|
||||
"alias": "Namespace",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": true,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down to pods",
|
||||
"linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell",
|
||||
"pattern": "namespace",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"pattern": "/.*/",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "string",
|
||||
"unit": "short"
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "C",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "D",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "E",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "F",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Current Storage IO",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"transform": "table",
|
||||
"type": "table",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Storage IO - Distribution",
|
||||
"titleSize": "h6"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
|
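
The block added above appends two Storage IO rows to the cluster dashboard: stacked IOPS and throughput graphs per namespace, plus a "Current Storage IO" table that issues six instant queries (refIds A through F) and joins them into one row per namespace via "transform": "table", with the namespace cell linking down to the per-namespace dashboard. ceil() is applied on the IOPS graph so fractional per-second rates render as whole operations. One of those table queries, runnable on its own as a dashboard-style target (illustrative):

    targets:
      # refId C above: combined read+write IOPS per namespace, evaluated instantly
      - expr: |
          sum by (namespace) (
              rate(container_fs_reads_total{container!=""}[5m])
            + rate(container_fs_writes_total{container!=""}[5m]))
        format: table
        instant: true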
@@ -1,10 +1,10 @@
|
||||
{{- /*
|
||||
Generated from 'k8s-resources-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
|
||||
Generated from 'k8s-resources-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
@@ -78,7 +78,7 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -162,7 +162,7 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -256,7 +256,7 @@ data:
|
||||
"thresholds": "70,80",
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory Utilization (from requests)",
|
||||
"title": "Memory Utilisation (from requests)",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
@@ -446,7 +446,7 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}pod{{`}}`}}",
|
||||
@@ -697,7 +697,7 @@ data:
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -715,7 +715,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -733,7 +733,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -2136,6 +2136,496 @@ data:
|
||||
"showTitle": true,
|
||||
"title": "Rate of Packets Dropped",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"decimals": -1,
|
||||
"fill": 10,
|
||||
"id": 16,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m])))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}pod{{`}}`}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "IOPS(Reads+Writes)",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 17,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}pod{{`}}`}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "ThroughPut(Read+Write)",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "Bps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Storage IO",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 18,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"sort": {
|
||||
"col": 4,
|
||||
"desc": true
|
||||
},
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"styles": [
|
||||
{
|
||||
"alias": "Time",
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"pattern": "Time",
|
||||
"type": "hidden"
|
||||
},
|
||||
{
|
||||
"alias": "IOPS(Reads)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": -1,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #A",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "IOPS(Writes)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": -1,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #B",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "IOPS(Reads + Writes)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": -1,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #C",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "Throughput(Read)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #D",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "Bps"
|
||||
},
|
||||
{
|
||||
"alias": "Throughput(Write)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #E",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "Bps"
|
||||
},
|
||||
{
|
||||
"alias": "Throughput(Read + Write)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #F",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "Bps"
|
||||
},
|
||||
{
|
||||
"alias": "Pod",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": true,
|
||||
"linkTargetBlank": false,
|
||||
"linkTooltip": "Drill down to pods",
|
||||
"linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell",
|
||||
"pattern": "pod",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"pattern": "/.*/",
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"type": "string",
|
||||
"unit": "short"
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "C",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "D",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "E",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "F",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Current Storage IO",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"transform": "table",
|
||||
"type": "table",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Storage IO - Distribution",
|
||||
"titleSize": "h6"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
|
@@ -1,10 +1,10 @@
|
||||
{{- /*
|
||||
Generated from 'k8s-resources-node' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
|
||||
Generated from 'k8s-resources-node' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
@@ -77,7 +77,7 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}pod{{`}}`}}",
|
||||
@@ -312,7 +312,7 @@ data:
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -330,7 +330,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -348,7 +348,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
|
@@ -1,10 +1,10 @@
|
||||
{{- /*
|
||||
Generated from 'k8s-resources-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
|
||||
Generated from 'k8s-resources-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
@@ -94,7 +94,7 @@ data:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}container{{`}}`}}",
|
||||
@@ -450,7 +450,7 @@ data:
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -468,7 +468,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -486,7 +486,7 @@ data:
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
@@ -634,7 +634,7 @@ data:
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory Usage",
|
||||
"title": "Memory Usage (WSS)",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
@@ -725,7 +725,7 @@ data:
|
||||
"type": "hidden"
|
||||
},
|
||||
{
|
||||
"alias": "Memory Usage",
|
||||
"alias": "Memory Usage (WSS)",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
|
||||
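
The retitle from "Memory Usage" to "Memory Usage (WSS)" in the two hunks above makes explicit that the pod dashboard plots working-set size, the value the kubelet compares against limits for OOM eviction, rather than RSS. A sketch of targets comparing the two per container (both metrics are standard cAdvisor series; the simplified label set is an assumption):

    targets:
      - expr: sum by (container) (container_memory_working_set_bytes{pod="$pod", container!=""})
        legendFormat: "{{container}} (WSS)"
      - expr: sum by (container) (container_memory_rss{pod="$pod", container!=""})
        legendFormat: "{{container}} (RSS)"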
@@ -1591,6 +1591,697 @@ data:
|
||||
"showTitle": true,
|
||||
"title": "Rate of Packets Dropped",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"decimals": -1,
|
||||
"fill": 10,
|
||||
"id": 12,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [
|
||||
|
||||
],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Reads",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "ceil(sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Writes",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [
|
||||
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "IOPS",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [
|
||||
|
||||
]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 13,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [

],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [

],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
"legendLink": null,
"step": 10
},
{
"expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
"legendLink": null,
"step": 10
}
],
"thresholds": [

],
"timeFrom": null,
"timeShift": null,
"title": "ThroughPut",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [

]
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO - Distribution(Pod - Read & Writes)",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {

},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"decimals": -1,
"fill": 10,
"id": 14,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [

],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [

],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}container{{`}}`}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [

],
"timeFrom": null,
"timeShift": null,
"title": "IOPS(Reads+Writes)",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [

]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {

},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 15,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [

],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [

],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}container{{`}}`}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [

],
"timeFrom": null,
"timeShift": null,
"title": "ThroughPut(Read+Write)",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [

]
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO - Distribution(Containers)",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {

},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 16,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [

],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [

],
"sort": {
"col": 4,
"desc": true
},
"spaceLength": 10,
"span": 12,
"stack": false,
"steppedLine": false,
"styles": [
{
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
},
{
"alias": "IOPS(Reads)",
"colorMode": null,
"colors": [

],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #A",
"thresholds": [

],
"type": "number",
"unit": "short"
},
{
"alias": "IOPS(Writes)",
"colorMode": null,
"colors": [

],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #B",
"thresholds": [

],
"type": "number",
"unit": "short"
},
{
"alias": "IOPS(Reads + Writes)",
"colorMode": null,
"colors": [

],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": -1,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #C",
"thresholds": [

],
"type": "number",
"unit": "short"
},
{
"alias": "Throughput(Read)",
"colorMode": null,
"colors": [

],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #D",
"thresholds": [

],
"type": "number",
"unit": "Bps"
},
{
"alias": "Throughput(Write)",
"colorMode": null,
"colors": [

],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #E",
"thresholds": [

],
"type": "number",
"unit": "Bps"
},
{
"alias": "Throughput(Read + Write)",
"colorMode": null,
"colors": [

],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #F",
"thresholds": [

],
"type": "number",
"unit": "Bps"
},
{
"alias": "Container",
"colorMode": null,
"colors": [

],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "container",
"thresholds": [

],
"type": "number",
"unit": "short"
},
{
"alias": "",
"colorMode": null,
"colors": [

],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"pattern": "/.*/",
"thresholds": [

],
"type": "string",
"unit": "short"
}
],
"targets": [
{
"expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "C",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "D",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "E",
"step": 10
},
{
"expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "F",
"step": 10
}
],
"thresholds": [

],
"timeFrom": null,
"timeShift": null,
"title": "Current Storage IO",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"transform": "table",
"type": "table",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [

]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Storage IO - Distribution",
"titleSize": "h6"
}
],
"schemaVersion": 14,
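Note: every Storage IO panel above follows the same pattern: take a 5m rate() over the cAdvisor filesystem counters and aggregate by pod or container. A minimal PromQL sketch of the two query shapes (the namespace/pod selector values below are placeholders, not values from this chart):

    # Per-pod write throughput in bytes/s over a 5m window
    sum by (pod) (
      rate(container_fs_writes_bytes_total{container!="", namespace="default", pod=~"web-.*"}[5m])
    )

    # Combined read+write IOPS per container, rounded up to whole operations
    ceil(sum by (container) (
        rate(container_fs_reads_total{container!="", pod="web-0"}[5m])
      + rate(container_fs_writes_total{container!="", pod="web-0"}[5m])
    ))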
@ -1,10 +1,10 @@
{{- /*
Generated from 'k8s-resources-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'k8s-resources-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -77,7 +77,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
@ -312,7 +312,7 @@ data:
],
"targets": [
{
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -330,7 +330,7 @@ data:
"step": 10
},
{
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n  kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n  kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -348,7 +348,7 @@ data:
"step": 10
},
{
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n  kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n  kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n  * on(namespace,pod)\n    group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
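Note: the workload CPU panels switch from the `:sum_rate` recording rule to its `:sum_irate` counterpart; the suffix names the PromQL function the rule is built on, and the dashboards simply read whichever precomputed series upstream kube-prometheus now emits. The practical difference, sketched directly on the raw counter:

    # rate() averages the per-second increase over the whole 5m window (smooth, but laggy)
    rate(container_cpu_usage_seconds_total{container!=""}[5m])

    # irate() uses only the last two samples in the window (responsive, but noisier)
    irate(container_cpu_usage_seconds_total{container!=""}[5m])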
@ -1,10 +1,10 @@
{{- /*
Generated from 'k8s-resources-workloads-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'k8s-resources-workloads-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -98,7 +98,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}workload{{`}}`}} - {{`{{`}}workload_type{{`}}`}}",
@ -396,7 +396,7 @@ data:
"step": 10
},
{
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -414,7 +414,7 @@ data:
"step": 10
},
{
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n  kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n  kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -432,7 +432,7 @@ data:
"step": 10
},
{
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n  kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"expr": "sum(\n  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n  kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n  group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
File diff suppressed because it is too large
@ -1,10 +1,10 @@
{{- /*
Generated from 'namespace-by-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'namespace-by-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

@ -1,10 +1,10 @@
{{- /*
Generated from 'namespace-by-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'namespace-by-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

@ -1,10 +1,10 @@
{{- /*
Generated from 'node-cluster-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'node-cluster-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:

@ -1,10 +1,10 @@
{{- /*
Generated from 'node-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'node-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -1,10 +1,10 @@
{{- /*
Generated from 'nodes' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'nodes' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -92,9 +92,8 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(\n  (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n  count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n",
"expr": "(\n  (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__rate_interval]))\n/ ignoring(cpu) group_left\n  count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 5,
"legendFormat": "{{`{{`}}cpu{{`}}`}}",
"refId": "A"
@ -528,25 +527,22 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}} read",
"refId": "A"
},
{
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}} written",
"refId": "B"
},
{
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}} io time",
"refId": "C"
@ -758,9 +754,8 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])",
"expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}}",
"refId": "A"
@ -852,9 +847,8 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])",
"expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])",
"format": "time_series",
"interval": "$__rate_interval",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}device{{`}}`}}",
"refId": "A"
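Note: the 'nodes' dashboard drops `$__interval` in favour of Grafana's `$__rate_interval` inside `rate()`. On a zoomed-in time range `$__interval` can shrink below two scrape intervals, leaving `rate()` with fewer than two samples and an empty graph; `$__rate_interval` (Grafana 7.2+) is documented to resolve to at least four scrape intervals. Illustrated with an assumed 30s scrape interval:

    # $__interval may render as e.g. [20s]: fewer than two samples, so rate() returns nothing
    rate(node_network_receive_bytes_total{device!="lo"}[20s])

    # $__rate_interval resolves to max(4 * scrape_interval, $__interval + scrape_interval),
    # here [2m]: the window always covers enough samples for a rate
    rate(node_network_receive_bytes_total{device!="lo"}[2m])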
@ -1,10 +1,10 @@
{{- /*
Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -92,14 +92,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(\n  sum without(instance, node) (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n  -\n  sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n",
"expr": "(\n  sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n  -\n  sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Used Space",
"refId": "A"
},
{
"expr": "sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n",
"expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Free Space",
@ -207,7 +207,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "max without(instance,node) (\n(\n  kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n  -\n  kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n",
"expr": "max without(instance,node) (\n(\n  topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n  -\n  topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
@ -289,14 +289,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n",
"expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Used inodes",
"refId": "A"
},
{
"expr": "(\n  sum without(instance, node) (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n  -\n  sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n",
"expr": "(\n  sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n  -\n  sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": " Free inodes",
@ -404,7 +404,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "max without(instance,node) (\nkubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n",
"expr": "max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
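Note: the persistentvolumesusage queries now wrap each `kubelet_volume_stats_*` gauge in `topk(1, ...)`. The dashboard pins a single PVC via `$namespace`/`$volume`, but during node drains or kubelet restarts the same claim can briefly be scraped from two endpoints, and a plain `sum without(instance, node)` would then report double the capacity; `topk(1, ...)` keeps one series before aggregating. The shape of the fix (selector values below are placeholders):

    # Can double-count while two kubelets report the same PVC:
    sum without (instance, node) (
      kubelet_volume_stats_capacity_bytes{namespace="default", persistentvolumeclaim="data-0"}
    )

    # Deduplicated: at most one series survives the topk before the sum
    sum without (instance, node) (
      topk(1, kubelet_volume_stats_capacity_bytes{namespace="default", persistentvolumeclaim="data-0"})
    )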
@ -1,10 +1,10 @@
{{- /*
Generated from 'pod-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'pod-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

@ -1,10 +1,10 @@
{{- /*
Generated from 'prometheus-remote-write' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'prometheus-remote-write' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.prometheus.prometheusSpec.remoteWriteDashboards }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.prometheus.prometheusSpec.remoteWriteDashboards }}
apiVersion: v1
kind: ConfigMap
metadata:

@ -1,10 +1,10 @@
{{- /*
Generated from 'prometheus' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'prometheus' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

@ -1,10 +1,10 @@
{{- /*
Generated from 'proxy' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'proxy' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeProxy.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeProxy.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -1,10 +1,10 @@
{{- /*
Generated from 'scheduler' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'scheduler' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeScheduler.enabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeScheduler.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -176,31 +176,31 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} e2e",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e",
"refId": "A"
},
{
"expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} binding",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding",
"refId": "B"
},
{
"expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm",
"refId": "C"
},
{
"expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
"expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} volume",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume",
"refId": "D"
}
],
@ -290,31 +290,31 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} e2e",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} binding",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm",
"refId": "C"
},
{
"expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}instance{{`}}`}} volume",
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume",
"refId": "D"
}
],
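Note: the scheduler panels add `cluster` to every aggregation and legend. With `by (instance)` alone, a datasource that fronts several clusters (Thanos, federation) would merge series from schedulers that happen to share an instance address; `by (cluster, instance)` keeps them apart. A minimal sketch of the pattern:

    # One series per (cluster, instance) pair, unambiguous on a multi-cluster datasource
    sum by (cluster, instance) (
      rate(scheduler_binding_duration_seconds_count{job="kube-scheduler"}[5m])
    )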
@ -4,7 +4,7 @@ Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:

@ -1,10 +1,10 @@
{{- /*
Generated from 'workload-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
Generated from 'workload-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
@ -58,8 +58,8 @@ spec:
tolerations:
{{ toYaml . | indent 8 }}
{{- end }}
{{- if .Values.prometheusOperator.admissionWebhooks.patch.securityContext }}
securityContext:
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 2000
{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.securityContext | indent 8 }}
{{- end }}
{{- end }}

@ -59,8 +59,8 @@ spec:
tolerations:
{{ toYaml . | indent 8 }}
{{- end }}
{{- if .Values.prometheusOperator.admissionWebhooks.patch.securityContext }}
securityContext:
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 2000
{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.securityContext | indent 8 }}
{{- end }}
{{- end }}
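Note: the admission-webhook patch jobs stop hardcoding the 2000/2000 securityContext and render whatever `prometheusOperator.admissionWebhooks.patch.securityContext` contains. A values sketch that would reproduce the old behaviour (the fields are ordinary Pod securityContext keys; the numbers simply mirror the previous hardcoded defaults):

    prometheusOperator:
      admissionWebhooks:
        patch:
          securityContext:
            runAsGroup: 2000
            runAsNonRoot: true
            runAsUser: 2000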
@ -5,8 +5,8 @@ metadata:
name: {{ template "kube-prometheus-stack.fullname" . }}-admission
{{- if .Values.prometheusOperator.admissionWebhooks.certManager.enabled }}
annotations:
certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
cert-manager.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
cert-manager.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
{{- end }}
labels:
app: {{ template "kube-prometheus-stack.name" $ }}-admission

@ -5,8 +5,8 @@ metadata:
name: {{ template "kube-prometheus-stack.fullname" . }}-admission
{{- if .Values.prometheusOperator.admissionWebhooks.certManager.enabled }}
annotations:
certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
cert-manager.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
cert-manager.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }}
{{- end }}
labels:
app: {{ template "kube-prometheus-stack.name" $ }}-admission
@ -78,6 +78,11 @@ spec:
{{- if .Values.prometheusOperator.prometheusInstanceNamespaces }}
- --prometheus-instance-namespaces={{ .Values.prometheusOperator.prometheusInstanceNamespaces | join "," }}
{{- end }}
{{- if .Values.prometheusOperator.thanosImage.sha }}
- --thanos-default-base-image={{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }}@sha256:{{ .Values.prometheusOperator.thanosImage.sha }}
{{- else }}
- --thanos-default-base-image={{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }}
{{- end }}
{{- if .Values.prometheusOperator.thanosRulerInstanceNamespaces }}
- --thanos-ruler-instance-namespaces={{ .Values.prometheusOperator.thanosRulerInstanceNamespaces | join "," }}
{{- end }}
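Note: when `prometheusOperator.thanosImage.sha` is set, the operator's Thanos base image is pinned by digest (`repo:tag@sha256:<sha>`), which stays immutable even if the tag is re-pushed. A hedged values sketch (repository, tag, and digest below are illustrative, not necessarily the chart defaults):

    prometheusOperator:
      thanosImage:
        repository: quay.io/thanos/thanos
        tag: v0.17.2
        # optional: renders the flag as ...thanos:v0.17.2@sha256:<sha>
        sha: "1111111111111111111111111111111111111111111111111111111111111111"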
@ -1,5 +1,5 @@
{{- /*
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -26,41 +26,146 @@ spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
message: 'The configuration of the instances of the Alertmanager cluster `{{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.service {{`}}`}}` are out of sync.

{{`{{`}} range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query {{`}}`}}

Configuration hash for pod {{`{{`}} .Labels.pod {{`}}`}} is "{{`{{`}} printf "%.f" .Value {{`}}`}}"

{{`{{`}} end {{`}}`}}

'
expr: count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})) != 1
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerFailedReload
annotations:
message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0
description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
for: 10m
labels:
severity: warning
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
||||
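Note: the rewritten expression follows the robustperception.io pattern linked in the comment. When a scrape fails, a raw gauge series goes stale and the == 0 comparison simply stops matching, so the alert could resolve while reloads are still failing; max_over_time(...[5m]) keeps the most recent samples visible across short scrape gaps, and the alert fires only if no successful reload was seen in the whole window. A minimal standalone sketch (job label illustrative):

- alert: ExampleFailedReload
  expr: max_over_time(alertmanager_config_last_reload_successful{job="alertmanager"}[5m]) == 0
  for: 10m
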
- alert: AlertmanagerMembersInconsistent
annotations:
message: Alertmanager has not found all other members of the cluster.
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster members.
expr: |-
alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
!= on (service) GROUP_LEFT()
count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
< on (namespace,service) group_left
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
for: 10m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |-
(
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
/
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |-
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |-
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
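Note: the critical/warning pair splits notification failures by integration via the regex matchers, which here appear to be rendered defaults of an upstream "critical integrations" parameter. PromQL regex matchers are fully anchored, so integration=~`.*` selects every integration while integration!~`.*` selects none; with these literal defaults the warning variant matches no series and only becomes useful once the regex is narrowed, e.g. (integration names illustrative):

expr: rate(alertmanager_notifications_failed_total{integration=~"pagerduty|opsgenie"}[5m])
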
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |-
count by (namespace,service) (
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
)
!= 1
for: 20m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerClusterDown
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster are down.
expr: |-
(
count by (namespace,service) (
avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5
)
/
count by (namespace,service) (
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
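Note: the ClusterDown expression is the fraction of instances whose average uptime over 5m is below one half. Worked example: in a 3-replica cluster, two flapping instances give 2/3 ~ 0.67 >= 0.5 and the alert fires after 5m; a single flapping replica (1/3 ~ 0.33) does not.
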
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
expr: |-
(
count by (namespace,service) (
changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4
)
/
count by (namespace,service) (
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
)
)
>= 0.5
for: 5m
labels:
severity: critical

@ -1,5 +1,5 @@
{{- /*
Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -26,7 +26,9 @@ spec:
rules:
- alert: TargetDown
annotations:
message: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
for: 10m
labels:
@ -36,7 +38,7 @@ spec:
{{- end }}
- alert: Watchdog
annotations:
message: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.

This alert is always firing, therefore it should always be firing in Alertmanager

@ -47,6 +49,8 @@ spec:
"DeadMansSnitch" integration in PagerDuty.

'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-watchdog
summary: An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
severity: none

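Note: expr: vector(1) is a constant, so Watchdog fires forever by design and acts as a dead man's switch: the paging service alerts when the notification stops arriving. A minimal Alertmanager routing sketch (receiver name illustrative):

route:
  routes:
    - match:
        alertname: Watchdog
      receiver: deadmanssnitch
      repeat_interval: 5m
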
@ -1,5 +1,5 @@
{{- /*
Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -26,11 +26,11 @@ spec:
rules:
- expr: |-
sum by (cluster, namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
- expr: |-
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
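Note: two changes travel together here: rate() becomes irate(), which extrapolates from the last two samples instead of averaging the whole 5m window and therefore tracks CPU spikes more sharply, and the record is renamed from ...:sum_rate to ...:sum_irate. Anything still querying the old record returns nothing after the upgrade; a hedged example of the follow-up query (namespace illustrative):

sum by (namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="monitoring"})
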
@ -77,6 +77,28 @@ spec:
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_limits:sum
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_limits:sum
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -29,56 +29,56 @@ spec:
1 - (
(
# write too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
) +
(
# read too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
-
(
(
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
or
vector(0)
)
+
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
+
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
) +
# errors
sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d)
sum by (cluster) (code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- expr: |-
1 - (
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
-
(
# too slow
(
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
or
vector(0)
)
+
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
+
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="read"})
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
@ -86,74 +86,74 @@ spec:
1 - (
(
# too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="write"})
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
record: code_verb:apiserver_request_total:increase30d
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h]))
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d

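Note: the availability records implement availability = 1 - (too-slow + 5xx) / total over 30 days, with "too slow" defined per scope by the latency buckets (0.1s for resource-scoped reads, 0.5s namespace-scoped, 5s cluster-scoped, 1s for writes). Worked example: if 0.8% of reads exceed their bucket and 0.2% return 5xx, the recorded read availability is 1 - (0.008 + 0.002) = 0.99. The functional change in this hunk is only the aggregation: sum by (cluster) keeps one series per cluster instead of blending every cluster visible to the rule evaluator into a single number.
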
@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -28,26 +28,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
labels:
verb: read
record: apiserver_request:burnrate1d
@ -55,26 +55,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
labels:
verb: read
record: apiserver_request:burnrate1h
@ -82,26 +82,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
labels:
verb: read
record: apiserver_request:burnrate2h
@ -109,26 +109,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
labels:
verb: read
record: apiserver_request:burnrate30m
@ -136,26 +136,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
labels:
verb: read
record: apiserver_request:burnrate3d
@ -163,26 +163,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: apiserver_request:burnrate5m
@ -190,26 +190,26 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
-
(
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
labels:
verb: read
record: apiserver_request:burnrate6h
@ -217,15 +217,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
labels:
verb: write
record: apiserver_request:burnrate1d
@ -233,15 +233,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
labels:
verb: write
record: apiserver_request:burnrate1h
@ -249,15 +249,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
labels:
verb: write
record: apiserver_request:burnrate2h
@ -265,15 +265,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
labels:
verb: write
record: apiserver_request:burnrate30m
@ -281,15 +281,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
labels:
verb: write
record: apiserver_request:burnrate3d
@ -297,15 +297,15 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: apiserver_request:burnrate5m
@ -313,32 +313,32 @@ spec:
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
labels:
verb: write
record: apiserver_request:burnrate6h
- expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: code_resource:apiserver_request_total:rate5m
- expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: code_resource:apiserver_request_total:rate5m
- expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
labels:
quantile: '0.99'
verb: read
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
labels:
quantile: '0.99'
verb: write

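Note: as in the availability rules, every plain sum() in these burn-rate records gains by (cluster). sum() drops all labels, so on a global or Thanos-style Prometheus that sees several clusters the old records mixed the error budgets of unrelated clusters together; the new form keeps one series per cluster. A hedged before/after sketch (label value illustrative):

# before: one blended series across all visible clusters
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
# after: one series per cluster, e.g. {cluster="prod-eu"}
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
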
@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

@ -1,5 +1,5 @@
{{- /*
Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-state-metrics-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -55,5 +55,33 @@ spec:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
for: 15m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
-
sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) )
!= 0
for: 15m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}

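Note: the ShardsMissing expression is a bitmask check: shard ordinal i contributes 2^i, and a complete set of n shards sums to 2^n - 1. Worked example with 4 shards: 2^4 - 1 = 15 and 2^0 + 2^1 + 2^2 + 2^3 = 15, so the difference is 0 and nothing fires; if shard 2 stops reporting, the sum drops to 11 and 15 - 11 = 4 != 0, so the alert fires after 15m.
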
@ -1,5 +1,5 @@
{{- /*
Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}

@ -1,5 +1,5 @@
{{- /*
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@ -30,7 +30,10 @@ spec:
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping
summary: Pod is crash looping.
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) * 60 * 5 > 0
expr: |-
increase(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) > 0
and
sum without (phase) (kube_pod_status_phase{phase!="Running",job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} == 1)
for: 15m
labels:
severity: warning
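Note: the old form scaled a 10m restart rate (rate(...[10m]) * 60 * 5 approximates restarts per 5 minutes) and fired on any restart at all. The new expression intersects two conditions with and on their shared labels: the restart counter actually increased over the last 10m, and the same pod is currently reporting a non-Running phase; a one-off restart that recovers immediately therefore no longer pages.
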
@ -261,19 +264,19 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch
summary: HPA has not matched desired number of replicas.
expr: |-
(kube_hpa_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
!=
kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
and
(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
>
kube_hpa_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
and
(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
<
kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
and
changes(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
for: 15m
labels:
severity: warning
@ -286,9 +289,9 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout
summary: HPA is running at max replicas
expr: |-
kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
==
kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
for: 15m
labels:
severity: warning

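Note: kube-state-metrics v2 (pulled in by the 3.3.* chart bump) renamed the autoscaler series from the kube_hpa_* prefix to kube_horizontalpodautoscaler_*; custom rules and dashboards still selecting the old names silently return nothing. A hedged migration sketch (selector illustrative):

# old, kube-state-metrics v1.x:
kube_hpa_status_current_replicas{namespace="monitoring"}
# new, kube-state-metrics v2.x:
kube_horizontalpodautoscaler_status_current_replicas{namespace="monitoring"}
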
@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/node-exporter-prometheusRule.yaml
+Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}
@@ -33,9 +33,9 @@ spec:
   record: instance:node_num_cpu:sum
 - expr: |-
     1 - avg without (cpu, mode) (
-      rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
+      rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m])
     )
-  record: instance:node_cpu_utilisation:rate1m
+  record: instance:node_cpu_utilisation:rate5m
 - expr: |-
     (
       node_load1{job="node-exporter"}
@@ -50,30 +50,30 @@ spec:
       node_memory_MemTotal_bytes{job="node-exporter"}
     )
   record: instance:node_memory_utilisation:ratio
-- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
-  record: instance:node_vmstat_pgmajfault:rate1m
-- expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
-  record: instance_device:node_disk_io_time_seconds:rate1m
-- expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
-  record: instance_device:node_disk_io_time_weighted_seconds:rate1m
+- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
+  record: instance:node_vmstat_pgmajfault:rate5m
+- expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+  record: instance_device:node_disk_io_time_seconds:rate5m
+- expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+  record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
     sum without (device) (
-      rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
+      rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
     )
-  record: instance:node_network_receive_bytes_excluding_lo:rate1m
+  record: instance:node_network_receive_bytes_excluding_lo:rate5m
 - expr: |-
     sum without (device) (
-      rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
+      rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
     )
-  record: instance:node_network_transmit_bytes_excluding_lo:rate1m
+  record: instance:node_network_transmit_bytes_excluding_lo:rate5m
 - expr: |-
     sum without (device) (
-      rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
+      rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
     )
-  record: instance:node_network_receive_drop_excluding_lo:rate1m
+  record: instance:node_network_receive_drop_excluding_lo:rate5m
 - expr: |-
     sum without (device) (
-      rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
+      rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
     )
-  record: instance:node_network_transmit_drop_excluding_lo:rate1m
+  record: instance:node_network_transmit_drop_excluding_lo:rate5m
 {{- end }}
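Note: the node-exporter recording rules move from 1m to 5m rate windows, and the recorded series names change with them (…:rate1m becomes …:rate5m). Anything consuming the old names, such as cluster dashboards, has to switch to the new series, because the old ones stop being written after the upgrade. A minimal sketch, using "node-a" as a placeholder instance:

    # written before this change
    instance:node_cpu_utilisation:rate1m{instance="node-a"}

    # written after this change; same expression, 5m window
    instance:node_cpu_utilisation:rate5m{instance="node-a"}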
@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/node-exporter-prometheusRule.yaml
+Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}
@@ -27,6 +27,7 @@ spec:
 - alert: NodeFilesystemSpaceFillingUp
   annotations:
     description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
     summary: Filesystem is predicted to run out of space within the next 24 hours.
   expr: |-
     (
@@ -45,6 +46,7 @@ spec:
 - alert: NodeFilesystemSpaceFillingUp
   annotations:
     description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
     summary: Filesystem is predicted to run out of space within the next 4 hours.
   expr: |-
     (
@@ -63,6 +65,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfSpace
   annotations:
     description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
     summary: Filesystem has less than 5% space left.
   expr: |-
     (
@@ -79,6 +82,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfSpace
   annotations:
     description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
     summary: Filesystem has less than 3% space left.
   expr: |-
     (
@@ -95,6 +99,7 @@ spec:
 - alert: NodeFilesystemFilesFillingUp
   annotations:
     description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
     summary: Filesystem is predicted to run out of inodes within the next 24 hours.
   expr: |-
     (
@@ -113,6 +118,7 @@ spec:
 - alert: NodeFilesystemFilesFillingUp
   annotations:
     description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
     summary: Filesystem is predicted to run out of inodes within the next 4 hours.
   expr: |-
     (
@@ -131,6 +137,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfFiles
   annotations:
     description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
     summary: Filesystem has less than 5% inodes left.
   expr: |-
     (
@@ -147,6 +154,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfFiles
   annotations:
     description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
     summary: Filesystem has less than 3% inodes left.
   expr: |-
     (
@@ -163,6 +171,7 @@ spec:
 - alert: NodeNetworkReceiveErrs
   annotations:
     description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs
     summary: Network interface is reporting many receive errors.
   expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
   for: 1h
@@ -174,6 +183,7 @@ spec:
 - alert: NodeNetworkTransmitErrs
   annotations:
     description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs
     summary: Network interface is reporting many transmit errors.
   expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
   for: 1h
@@ -185,6 +195,7 @@ spec:
 - alert: NodeHighNumberConntrackEntriesUsed
   annotations:
     description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused
     summary: Number of conntrack entries are getting close to the limit.
   expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
   labels:
@@ -195,6 +206,7 @@ spec:
 - alert: NodeTextFileCollectorScrapeError
   annotations:
     description: Node Exporter text file collector failed to scrape.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodetextfilecollectorscrapeerror
     summary: Node Exporter text file collector failed to scrape.
   expr: node_textfile_scrape_error{job="node-exporter"} == 1
   labels:
@@ -204,7 +216,8 @@ spec:
 {{- end }}
 - alert: NodeClockSkewDetected
   annotations:
-    message: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
+    description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected
     summary: Clock skew detected.
   expr: |-
     (
@@ -226,7 +239,8 @@ spec:
 {{- end }}
 - alert: NodeClockNotSynchronising
   annotations:
-    message: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
+    description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising
     summary: Clock not synchronising.
   expr: |-
     min_over_time(node_timex_sync_status[5m]) == 0
@@ -241,6 +255,7 @@ spec:
 - alert: NodeRAIDDegraded
   annotations:
     description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddegraded
     summary: RAID Array is degraded
   expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
   for: 15m
@@ -252,8 +267,9 @@ spec:
 - alert: NodeRAIDDiskFailure
   annotations:
     description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddiskfailure
     summary: Failed device in RAID array
-  expr: node_md_disks{state="fail"} > 0
+  expr: node_md_disks{state="failed"} > 0
   labels:
     severity: warning
 {{- if .Values.defaultRules.additionalRuleLabels }}
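Note: every default alert now carries a runbook_url annotation built from .Values.defaultRules.runbookUrl plus an alert-specific slug. To point the links at an internal wiki instead, override the base URL in values.yaml; a minimal sketch (the host below is a placeholder, not a chart default):

    defaultRules:
      # the templates append a slug such as "alert-name-noderaiddegraded"
      runbookUrl: https://wiki.example.org/runbooks/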
@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
+Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}
@@ -26,7 +26,8 @@ spec:
 rules:
 - alert: NodeNetworkInterfaceFlapping
   annotations:
-    message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}"
+    message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkinterfaceflapping
   expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
   for: 2m
   labels:

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
+Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-operator-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}

@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml
+Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}
@@ -29,6 +29,7 @@ spec:
 - alert: PrometheusBadConfig
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusbadconfig
     summary: Failed Prometheus configuration reload.
   expr: |-
     # Without max_over_time, failed scrapes could create false negatives, see
@@ -43,6 +44,7 @@ spec:
 - alert: PrometheusNotificationQueueRunningFull
   annotations:
     description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotificationqueuerunningfull
     summary: Prometheus alert notification queue predicted to run full in less than 30m.
   expr: |-
     # Without min_over_time, failed scrapes could create false negatives, see
@@ -61,6 +63,7 @@ spec:
 - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
   annotations:
     description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstosomealertmanagers
     summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
   expr: |-
     (
@@ -75,28 +78,11 @@ spec:
     severity: warning
 {{- if .Values.defaultRules.additionalRuleLabels }}
 {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
 {{- end }}
-- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
-  annotations:
-    description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
-    summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
-  expr: |-
-    min without(alertmanager) (
-      rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
-      /
-      rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
-    )
-    * 100
-    > 3
-  for: 15m
-  labels:
-    severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
 - alert: PrometheusNotConnectedToAlertmanagers
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotconnectedtoalertmanagers
     summary: Prometheus is not connected to any Alertmanagers.
   expr: |-
     # Without max_over_time, failed scrapes could create false negatives, see
@@ -111,6 +97,7 @@ spec:
 - alert: PrometheusTSDBReloadsFailing
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbreloadsfailing
     summary: Prometheus has issues reloading blocks from disk.
   expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
   for: 4h
@@ -122,6 +109,7 @@ spec:
 - alert: PrometheusTSDBCompactionsFailing
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbcompactionsfailing
     summary: Prometheus has issues compacting blocks.
   expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
   for: 4h
@@ -133,8 +121,18 @@ spec:
 - alert: PrometheusNotIngestingSamples
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotingestingsamples
     summary: Prometheus is not ingesting samples.
-  expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
+  expr: |-
+    (
+      rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
+    and
+      (
+        sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
+      or
+        sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
+      )
+    )
   for: 10m
   labels:
     severity: warning
@@ -144,6 +142,7 @@ spec:
 - alert: PrometheusDuplicateTimestamps
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusduplicatetimestamps
     summary: Prometheus is dropping samples with duplicate timestamps.
   expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
   for: 10m
@@ -155,6 +154,7 @@ spec:
 - alert: PrometheusOutOfOrderTimestamps
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoutofordertimestamps
     summary: Prometheus drops samples with out-of-order timestamps.
   expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
   for: 10m
@@ -166,15 +166,16 @@ spec:
 - alert: PrometheusRemoteStorageFailures
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotestoragefailures
     summary: Prometheus fails to send samples to remote storage.
   expr: |-
     (
-      rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+      (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
     /
     (
-      rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+      (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
     +
-      rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+      (rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
     )
     )
     * 100
@@ -188,13 +189,14 @@ spec:
 - alert: PrometheusRemoteWriteBehind
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritebehind
     summary: Prometheus remote write is behind.
   expr: |-
     # Without max_over_time, failed scrapes could create false negatives, see
     # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
     (
       max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
-    - on(job, instance) group_right
+    - ignoring(remote_name, url) group_right
       max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
     )
     > 120
@@ -207,6 +209,7 @@ spec:
 - alert: PrometheusRemoteWriteDesiredShards
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritedesiredshards
     summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
   expr: |-
     # Without max_over_time, failed scrapes could create false negatives, see
@@ -225,6 +228,7 @@ spec:
 - alert: PrometheusRuleFailures
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusrulefailures
     summary: Prometheus is failing rule evaluations.
   expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
   for: 15m
@@ -236,6 +240,7 @@ spec:
 - alert: PrometheusMissingRuleEvaluations
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusmissingruleevaluations
     summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
   expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
   for: 15m
@@ -247,6 +252,7 @@ spec:
 - alert: PrometheusTargetLimitHit
   annotations:
     description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustargetlimithit
     summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
   expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
   for: 15m
@@ -254,5 +260,36 @@ spec:
     severity: warning
 {{- if .Values.defaultRules.additionalRuleLabels }}
 {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
 {{- end }}
+- alert: PrometheusLabelLimitHit
+  annotations:
+    description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuslabellimithit
+    summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
+  expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
+  for: 15m
+  labels:
+    severity: warning
+{{- if .Values.defaultRules.additionalRuleLabels }}
+{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
+{{- end }}
+- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
+  annotations:
+    description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
+    runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstoanyalertmanager
+    summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
+  expr: |-
+    min without (alertmanager) (
+      rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
+      /
+      rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
+    )
+    * 100
+    > 3
+  for: 15m
+  labels:
+    severity: critical
+{{- if .Values.defaultRules.additionalRuleLabels }}
+{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
+{{- end }}
 {{- end }}
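Note: the PrometheusRemoteStorageFailures expression now wraps each series in an `or` between the old and the new counter names (prometheus_remote_storage_failed_samples_total vs. prometheus_remote_storage_samples_failed_total, and likewise for the succeeded/total pair), because newer Prometheus releases renamed these remote-write counters. The `or` keeps the ratio working whichever name the running Prometheus exposes; a reduced sketch:

    # exactly one of the two vectors exists, depending on the Prometheus version;
    # `or` selects whichever is present
    rate(prometheus_remote_storage_failed_samples_total[5m])
      or
    rate(prometheus_remote_storage_samples_failed_total[5m])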
@@ -1,63 +0,0 @@
-{{- /*
-Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
-Do not change in-place! In order to change this file first read following link:
-https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
-*/ -}}
-{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
-{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
-{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }}
-{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }}
-{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
-  namespace: {{ template "kube-prometheus-stack.namespace" . }}
-  labels:
-    app: {{ template "kube-prometheus-stack.name" . }}
-{{ include "kube-prometheus-stack.labels" . | indent 4 }}
-{{- if .Values.defaultRules.labels }}
-{{ toYaml .Values.defaultRules.labels | indent 4 }}
-{{- end }}
-{{- if .Values.defaultRules.annotations }}
-  annotations:
-{{ toYaml .Values.defaultRules.annotations | indent 4 }}
-{{- end }}
-spec:
-  groups:
-  - name: alertmanager.rules
-    rules:
-    - alert: AlertmanagerConfigInconsistent
-      annotations:
-        message: The configuration of the instances of the Alertmanager cluster `{{`{{`}}$labels.service{{`}}`}}` are out of sync.
-      expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}) by (name, job, namespace, controller), "service", "$1", "name", "(.*)") != 1
-      for: 5m
-      labels:
-        severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: AlertmanagerFailedReload
-      annotations:
-        message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
-      expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0
-      for: 10m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: AlertmanagerMembersInconsistent
-      annotations:
-        message: Alertmanager has not found all other members of the cluster.
-      expr: |-
-        alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
-        != on (service) GROUP_LEFT()
-        count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
-      for: 5m
-      labels:
-        severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-{{- end }}
@@ -1,179 +0,0 @@
-{{- /*
-Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/etcd3_alert.rules.yml
-Do not change in-place! In order to change this file first read following link:
-https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
-*/ -}}
-{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
-{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
-  namespace: {{ template "kube-prometheus-stack.namespace" . }}
-  labels:
-    app: {{ template "kube-prometheus-stack.name" . }}
-{{ include "kube-prometheus-stack.labels" . | indent 4 }}
-{{- if .Values.defaultRules.labels }}
-{{ toYaml .Values.defaultRules.labels | indent 4 }}
-{{- end }}
-{{- if .Values.defaultRules.annotations }}
-  annotations:
-{{ toYaml .Values.defaultRules.annotations | indent 4 }}
-{{- end }}
-spec:
-  groups:
-  - name: etcd
-    rules:
-    - alert: etcdInsufficientMembers
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
-      expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
-      for: 3m
-      labels:
-        severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdNoLeader
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
-      expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
-      for: 1m
-      labels:
-        severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdHighNumberOfLeaderChanges
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.'
-      expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
-      for: 15m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdHighNumberOfFailedGRPCRequests
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
-      expr: |-
-        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
-        /
-        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
-        > 1
-      for: 10m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdHighNumberOfFailedGRPCRequests
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
-      expr: |-
-        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
-        /
-        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
-        > 5
-      for: 5m
-      labels:
-        severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdGRPCRequestsSlow
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
-      expr: |-
-        histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
-        > 0.15
-      for: 10m
-      labels:
-        severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdMemberCommunicationSlow
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
-      expr: |-
-        histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
-        > 0.15
-      for: 10m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdHighNumberOfFailedProposals
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
-      expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
-      for: 15m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdHighFsyncDurations
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
-      expr: |-
-        histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
-        > 0.5
-      for: 10m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdHighCommitDurations
-      annotations:
-        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
-      expr: |-
-        histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
-        > 0.25
-      for: 10m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdHighNumberOfFailedHTTPRequests
-      annotations:
-        message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}'
-      expr: |-
-        sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
-        BY (method) > 0.01
-      for: 10m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdHighNumberOfFailedHTTPRequests
-      annotations:
-        message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
-      expr: |-
-        sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
-        BY (method) > 0.05
-      for: 10m
-      labels:
-        severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: etcdHTTPRequestsSlow
-      annotations:
-        message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow.
-      expr: |-
-        histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
-        > 0.15
-      for: 10m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-{{- end }}
@@ -1,56 +0,0 @@
-{{- /*
-Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
-Do not change in-place! In order to change this file first read following link:
-https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
-*/ -}}
-{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
-{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }}
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}
-  namespace: {{ template "kube-prometheus-stack.namespace" . }}
-  labels:
-    app: {{ template "kube-prometheus-stack.name" . }}
-{{ include "kube-prometheus-stack.labels" . | indent 4 }}
-{{- if .Values.defaultRules.labels }}
-{{ toYaml .Values.defaultRules.labels | indent 4 }}
-{{- end }}
-{{- if .Values.defaultRules.annotations }}
-  annotations:
-{{ toYaml .Values.defaultRules.annotations | indent 4 }}
-{{- end }}
-spec:
-  groups:
-  - name: general.rules
-    rules:
-    - alert: TargetDown
-      annotations:
-        message: '{{`{{`}} $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}} targets are down.'
-      expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
-      for: 10m
-      labels:
-        severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-    - alert: Watchdog
-      annotations:
-        message: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
-
-          This alert is always firing, therefore it should always be firing in Alertmanager
-
-          and always fire against a receiver. There are integrations with various notification
-
-          mechanisms that send a notification when this alert is not firing. For example the
-
-          "DeadMansSnitch" integration in PagerDuty.
-
-          '
-      expr: vector(1)
-      labels:
-        severity: none
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
-{{- end }}
@@ -1,83 +0,0 @@
-{{- /*
-Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
-Do not change in-place! In order to change this file first read following link:
-https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
-*/ -}}
-{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
-{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }}
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }}
-  namespace: {{ template "kube-prometheus-stack.namespace" . }}
-  labels:
-    app: {{ template "kube-prometheus-stack.name" . }}
-{{ include "kube-prometheus-stack.labels" . | indent 4 }}
-{{- if .Values.defaultRules.labels }}
-{{ toYaml .Values.defaultRules.labels | indent 4 }}
-{{- end }}
-{{- if .Values.defaultRules.annotations }}
-  annotations:
-{{ toYaml .Values.defaultRules.annotations | indent 4 }}
-{{- end }}
-spec:
-  groups:
-  - name: k8s.rules
-    rules:
-    - expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
-      record: namespace:container_cpu_usage_seconds_total:sum_rate
-    - expr: sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
-      record: namespace:container_memory_usage_bytes:sum
-    - expr: |-
-        sum by (namespace, pod_name, container_name) (
-          rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
-        )
-      record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
-    - expr: |-
-        sum by(namespace) (
-          kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
-          * on (endpoint, instance, job, namespace, pod, service)
-          group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
-        )
-      record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
-    - expr: |-
-        sum by (namespace) (
-          kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
-          * on (endpoint, instance, job, namespace, pod, service)
-          group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
-        )
-      record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
-    - expr: |-
-        sum(
-          label_replace(
-            label_replace(
-              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
-              "replicaset", "$1", "owner_name", "(.*)"
-            ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
-            "workload", "$1", "owner_name", "(.*)"
-          )
-        ) by (namespace, workload, pod)
-      labels:
-        workload_type: deployment
-      record: mixin_pod_workload
-    - expr: |-
-        sum(
-          label_replace(
-            kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
-            "workload", "$1", "owner_name", "(.*)"
-          )
-        ) by (namespace, workload, pod)
-      labels:
-        workload_type: daemonset
-      record: mixin_pod_workload
-    - expr: |-
-        sum(
-          label_replace(
-            kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
-            "workload", "$1", "owner_name", "(.*)"
-          )
-        ) by (namespace, workload, pod)
-      labels:
-        workload_type: statefulset
-      record: mixin_pod_workload
-{{- end }}
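Note: the four deleted templates above (alertmanager.rules, etcd, general.rules, k8s.rules) are the legacy copies that could only ever render on old clusters, gated by the version check they all share:

    {{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }}

Dropping them outright is consistent with the chart having moved past Kubernetes releases below 1.14; presumably current clusters continue to receive the equivalent rule groups from the non-legacy templates.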