diff --git a/charts/kubezero-metrics/Chart.yaml b/charts/kubezero-metrics/Chart.yaml index 78c5595..0c2a44c 100644 --- a/charts/kubezero-metrics/Chart.yaml +++ b/charts/kubezero-metrics/Chart.yaml @@ -16,11 +16,11 @@ dependencies: version: ">= 0.1.3" repository: https://zero-down-time.github.io/kubezero/ - name: kube-prometheus-stack - version: 16.1.2 + version: 16.12.0 # Switch back to upstream once all alerts are fixed eg. etcd gpcr # repository: https://prometheus-community.github.io/helm-charts - name: prometheus-adapter - version: 2.13 + version: 2.14.2 repository: https://prometheus-community.github.io/helm-charts condition: prometheus-adapter.enabled kubeVersion: ">= 1.18.0" diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/Chart.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/Chart.yaml index ba03554..fdad40a 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/Chart.yaml @@ -6,12 +6,12 @@ annotations: url: https://github.com/prometheus-operator/kube-prometheus artifacthub.io/operator: "true" apiVersion: v2 -appVersion: 0.48.0 +appVersion: 0.48.1 dependencies: - condition: kubeStateMetrics.enabled name: kube-state-metrics repository: https://prometheus-community.github.io/helm-charts - version: 3.1.* + version: 3.3.* - condition: nodeExporter.enabled name: prometheus-node-exporter repository: https://prometheus-community.github.io/helm-charts @@ -19,7 +19,7 @@ dependencies: - condition: grafana.enabled name: grafana repository: https://grafana.github.io/helm-charts - version: 6.9.* + version: 6.13.* description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus @@ -47,4 +47,4 @@ sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus type: application -version: 16.1.2 +version: 16.12.0 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/Chart.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/Chart.yaml index 899ba61..5f0c749 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/Chart.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -appVersion: 7.5.5 +appVersion: 8.0.3 description: The leading tool for querying and visualizing time series and metrics. home: https://grafana.net icon: https://raw.githubusercontent.com/grafana/grafana/master/public/img/logo_transparent_400x.png @@ -19,4 +19,4 @@ name: grafana sources: - https://github.com/grafana/grafana type: application -version: 6.9.1 +version: 6.13.5 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/README.md b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/README.md index 8719100..7bbf61e 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/README.md +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/README.md @@ -59,8 +59,8 @@ This version requires Helm >= 3.1.0. | `securityContext` | Deployment securityContext | `{"runAsUser": 472, "runAsGroup": 472, "fsGroup": 472}` | | `priorityClassName` | Name of Priority Class to assign pods | `nil` | | `image.repository` | Image repository | `grafana/grafana` | -| `image.tag` | Image tag (`Must be >= 5.0.0`) | `7.4.5` | -| `image.sha` | Image sha (optional) | `2b56f6106ddc376bb46d974230d530754bf65a640dfbc5245191d72d3b49efc6` | +| `image.tag` | Image tag (`Must be >= 5.0.0`) | `8.0.3` | +| `image.sha` | Image sha (optional) | `80c6d6ac633ba5ab3f722976fb1d9a138f87ca6a9934fcd26a5fc28cbde7dbfa` | | `image.pullPolicy` | Image pull policy | `IfNotPresent` | | `image.pullSecrets` | Image pull secrets | `{}` | | `service.enabled` | Enable grafana service | `true` | @@ -83,7 +83,7 @@ This version requires Helm >= 3.1.0. | `ingress.path` | Ingress accepted path | `/` | | `ingress.pathType` | Ingress type of path | `Prefix` | | `ingress.hosts` | Ingress accepted hostnames | `["chart-example.local"]` | -| `ingress.extraPaths` | Ingress extra paths to prepend to every host configuration. Useful when configuring [custom actions with AWS ALB Ingress Controller](https://kubernetes-sigs.github.io/aws-alb-ingress-controller/guide/ingress/annotation/#actions). | `[]` | +| `ingress.extraPaths` | Ingress extra paths to prepend to every host configuration. Useful when configuring [custom actions with AWS ALB Ingress Controller](https://kubernetes-sigs.github.io/aws-alb-ingress-controller/guide/ingress/annotation/#actions). Requires `ingress.hosts` to have one or more host entries. | `[]` | | `ingress.tls` | Ingress TLS configuration | `[]` | | `resources` | CPU/Memory resource requests/limits | `{}` | | `nodeSelector` | Node labels for pod assignment | `{}` | @@ -116,6 +116,7 @@ This version requires Helm >= 3.1.0. | `envValueFrom` | Environment variables from alternate sources. See the API docs on [EnvVarSource](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#envvarsource-v1-core) for format details. | `{}` | | `envFromSecret` | Name of a Kubernetes secret (must be manually created in the same namespace) containing values to be added to the environment. Can be templated | `""` | | `envRenderSecret` | Sensible environment variables passed to pods and stored as secret | `{}` | +| `enableServiceLinks` | Inject Kubernetes services as environment variables. | `true` | | `extraSecretMounts` | Additional grafana server secret mounts | `[]` | | `extraVolumeMounts` | Additional grafana server volume mounts | `[]` | | `extraConfigmapMounts` | Additional grafana server configMap volume mounts | `[]` | @@ -136,7 +137,7 @@ This version requires Helm >= 3.1.0. | `podLabels` | Pod labels | `{}` | | `podPortName` | Name of the grafana port on the pod | `grafana` | | `sidecar.image.repository` | Sidecar image repository | `quay.io/kiwigrid/k8s-sidecar` | -| `sidecar.image.tag` | Sidecar image tag | `1.10.7` | +| `sidecar.image.tag` | Sidecar image tag | `1.12.2` | | `sidecar.image.sha` | Sidecar image sha (optional) | `""` | | `sidecar.imagePullPolicy` | Sidecar image pull policy | `IfNotPresent` | | `sidecar.resources` | Sidecar resources | `{}` | diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/_helpers.tpl b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/_helpers.tpl index 70e0558..868aa07 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/_helpers.tpl +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/_helpers.tpl @@ -104,13 +104,67 @@ app.kubernetes.io/name: {{ include "grafana.name" . }}-image-renderer app.kubernetes.io/instance: {{ .Release.Name }} {{- end -}} +{{/* +Looks if there's an existing secret and reuse its password. If not it generates +new password and use it. +*/}} +{{- define "grafana.password" -}} +{{- $secret := (lookup "v1" "Secret" (include "grafana.namespace" .) (include "grafana.fullname" .) ) -}} + {{- if $secret -}} + {{- index $secret "data" "admin-password" -}} + {{- else -}} + {{- (randAlphaNum 40) | b64enc | quote -}} + {{- end -}} +{{- end -}} + +{{/* +Get KubeVersion removing pre-release information. +*/}} +{{- define "grafana.kubeVersion" -}} + {{- default .Capabilities.KubeVersion.Version (regexFind "v[0-9]+\\.[0-9]+\\.[0-9]+" .Capabilities.KubeVersion.Version) -}} +{{- end -}} + {{/* Return the appropriate apiVersion for rbac. */}} -{{- define "rbac.apiVersion" -}} -{{- if .Capabilities.APIVersions.Has "rbac.authorization.k8s.io/v1" }} -{{- print "rbac.authorization.k8s.io/v1" -}} -{{- else -}} -{{- print "rbac.authorization.k8s.io/v1beta1" -}} +{{- define "grafana.rbac.apiVersion" -}} + {{- if .Capabilities.APIVersions.Has "rbac.authorization.k8s.io/v1" }} + {{- print "rbac.authorization.k8s.io/v1" -}} + {{- else -}} + {{- print "rbac.authorization.k8s.io/v1beta1" -}} + {{- end -}} {{- end -}} + +{{/* +Return the appropriate apiVersion for ingress. +*/}} +{{- define "grafana.ingress.apiVersion" -}} + {{- if and (.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19.x" (include "grafana.kubeVersion" .)) -}} + {{- print "networking.k8s.io/v1" -}} + {{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" -}} + {{- print "networking.k8s.io/v1beta1" -}} + {{- else -}} + {{- print "extensions/v1beta1" -}} + {{- end -}} +{{- end -}} + +{{/* +Return if ingress is stable. +*/}} +{{- define "grafana.ingress.isStable" -}} + {{- eq (include "grafana.ingress.apiVersion" .) "networking.k8s.io/v1" -}} +{{- end -}} + +{{/* +Return if ingress supports ingressClassName. +*/}} +{{- define "grafana.ingress.supportsIngressClassName" -}} + {{- or (eq (include "grafana.ingress.isStable" .) "true") (and (eq (include "grafana.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18.x" (include "grafana.kubeVersion" .))) -}} +{{- end -}} + +{{/* +Return if ingress supports pathType. +*/}} +{{- define "grafana.ingress.supportsPathType" -}} + {{- or (eq (include "grafana.ingress.isStable" .) "true") (and (eq (include "grafana.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18.x" (include "grafana.kubeVersion" .))) -}} {{- end -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl index d74ee4d..3a74e9e 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl @@ -159,6 +159,7 @@ imagePullSecrets: - name: {{ . }} {{- end}} {{- end }} +enableServiceLinks: {{ .Values.enableServiceLinks }} containers: {{- if .Values.sidecar.dashboards.enabled }} - name: {{ template "grafana.name" . }}-sc-dashboard @@ -357,6 +358,14 @@ containers: - name: GF_RENDERING_CALLBACK_URL value: http://{{ template "grafana.fullname" . }}.{{ template "grafana.namespace" . }}:{{ .Values.service.port }}/{{ .Values.imageRenderer.grafanaSubPath }} {{ end }} + - name: GF_PATHS_DATA + value: {{ (get .Values "grafana.ini").paths.data }} + - name: GF_PATHS_LOGS + value: {{ (get .Values "grafana.ini").paths.logs }} + - name: GF_PATHS_PLUGINS + value: {{ (get .Values "grafana.ini").paths.plugins }} + - name: GF_PATHS_PROVISIONING + value: {{ (get .Values "grafana.ini").paths.provisioning }} {{- range $key, $value := .Values.envValueFrom }} - name: {{ $key | quote }} valueFrom: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/ingress.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/ingress.yaml index 44ebfc9..7699cec 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/ingress.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/ingress.yaml @@ -1,17 +1,13 @@ {{- if .Values.ingress.enabled -}} +{{- $ingressApiIsStable := eq (include "grafana.ingress.isStable" .) "true" -}} +{{- $ingressSupportsIngressClassName := eq (include "grafana.ingress.supportsIngressClassName" .) "true" -}} +{{- $ingressSupportsPathType := eq (include "grafana.ingress.supportsPathType" .) "true" -}} {{- $fullName := include "grafana.fullname" . -}} {{- $servicePort := .Values.service.port -}} {{- $ingressPath := .Values.ingress.path -}} {{- $ingressPathType := .Values.ingress.pathType -}} {{- $extraPaths := .Values.ingress.extraPaths -}} -{{- $newAPI := .Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" -}} -{{- if $newAPI -}} -apiVersion: networking.k8s.io/v1 -{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }} -apiVersion: networking.k8s.io/v1beta1 -{{- else }} -apiVersion: extensions/v1beta1 -{{- end }} +apiVersion: {{ include "grafana.ingress.apiVersion" . }} kind: Ingress metadata: name: {{ $fullName }} @@ -28,7 +24,7 @@ metadata: {{- end }} {{- end }} spec: - {{- if .Values.ingress.ingressClassName }} + {{- if and $ingressSupportsIngressClassName .Values.ingress.ingressClassName }} ingressClassName: {{ .Values.ingress.ingressClassName }} {{- end -}} {{- if .Values.ingress.tls }} @@ -41,15 +37,15 @@ spec: - host: {{ tpl . $}} http: paths: -{{ if $extraPaths }} +{{- if $extraPaths }} {{ toYaml $extraPaths | indent 10 }} {{- end }} - path: {{ $ingressPath }} - {{- if $newAPI }} + {{- if $ingressSupportsPathType }} pathType: {{ $ingressPathType }} {{- end }} backend: - {{- if $newAPI }} + {{- if $ingressApiIsStable }} service: name: {{ $fullName }} port: @@ -63,18 +59,20 @@ spec: - http: paths: - backend: - {{- if $newAPI }} + {{- if $ingressApiIsStable }} service: name: {{ $fullName }} port: number: {{ $servicePort }} - pathType: {{ $ingressPathType }} {{- else }} serviceName: {{ $fullName }} servicePort: {{ $servicePort }} {{- end }} - {{- if $ingressPath }} + {{- if $ingressPath }} path: {{ $ingressPath }} {{- end }} + {{- if $ingressSupportsPathType }} + pathType: {{ $ingressPathType }} + {{- end }} {{- end -}} {{- end }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/role.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/role.yaml index 54c3fb0..6a1890f 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/role.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/role.yaml @@ -1,5 +1,5 @@ {{- if and .Values.rbac.create (not .Values.rbac.useExistingRole) -}} -apiVersion: {{ template "rbac.apiVersion" . }} +apiVersion: {{ template "grafana.rbac.apiVersion" . }} kind: Role metadata: name: {{ template "grafana.fullname" . }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/rolebinding.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/rolebinding.yaml index 34f1ad6..e010725 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/rolebinding.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/rolebinding.yaml @@ -1,5 +1,5 @@ {{- if .Values.rbac.create -}} -apiVersion: {{ template "rbac.apiVersion" . }} +apiVersion: {{ template "grafana.rbac.apiVersion" . }} kind: RoleBinding metadata: name: {{ template "grafana.fullname" . }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/secret.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/secret.yaml index ac804e2..6d06cf5 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/secret.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/templates/secret.yaml @@ -17,7 +17,7 @@ data: {{- if .Values.adminPassword }} admin-password: {{ .Values.adminPassword | b64enc | quote }} {{- else }} - admin-password: {{ randAlphaNum 40 | b64enc | quote }} + admin-password: {{ template "grafana.password" . }} {{- end }} {{- end }} {{- if not .Values.ldap.existingSecret }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/values.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/values.yaml index a81db06..f4bc7c4 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/values.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/grafana/values.yaml @@ -69,7 +69,7 @@ livenessProbe: image: repository: grafana/grafana - tag: 7.5.5 + tag: 8.0.3 sha: "" pullPolicy: IfNotPresent @@ -188,7 +188,7 @@ ingress: labels: {} path: / - # pathType is only for k8s > 1.19 + # pathType is only for k8s >= 1.1= pathType: Prefix hosts: @@ -382,6 +382,10 @@ envFromSecret: "" ## This can be useful for auth tokens, etc envRenderSecret: {} +# Inject Kubernetes services as environment variables. +# See https://kubernetes.io/docs/concepts/services-networking/connect-applications-service/#environment-variables +enableServiceLinks: true + ## Additional grafana server secret mounts # Defines additional mounts with secrets. Secrets must be manually created in the namespace. extraSecretMounts: [] @@ -446,13 +450,13 @@ datasources: {} # access: proxy # isDefault: true # - name: CloudWatch -# type: cloudwatch -# access: proxy -# uid: cloudwatch -# editable: false -# jsonData: -# authType: credentials -# defaultRegion: us-east-1 +# type: cloudwatch +# access: proxy +# uid: cloudwatch +# editable: false +# jsonData: +# authType: default +# defaultRegion: us-east-1 ## Configure notifiers ## ref: http://docs.grafana.org/administration/provisioning/#alert-notification-channels @@ -532,7 +536,7 @@ dashboardsConfigMaps: {} ## grafana.ini: paths: - data: /var/lib/grafana/data + data: /var/lib/grafana/ logs: /var/log/grafana plugins: /var/lib/grafana/plugins provisioning: /etc/grafana/provisioning @@ -603,7 +607,7 @@ smtp: sidecar: image: repository: quay.io/kiwigrid/k8s-sidecar - tag: 1.10.7 + tag: 1.12.2 sha: "" imagePullPolicy: IfNotPresent resources: {} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/Chart.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/Chart.yaml index 02394f6..759f5af 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/Chart.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/Chart.yaml @@ -16,4 +16,4 @@ name: kube-state-metrics sources: - https://github.com/kubernetes/kube-state-metrics/ type: application -version: 3.1.0 +version: 3.3.1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/README.md b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/README.md index 531ac17..7c2e169 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/README.md +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/README.md @@ -62,7 +62,7 @@ The upgraded chart now the following changes: See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments: ```console -helm show values kube-state-metrics/kube-state-metrics +helm show values prometheus-community/kube-state-metrics ``` You may also run `helm show values` on this chart's [dependencies](#dependencies) for additional options. diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/clusterrolebinding.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/clusterrolebinding.yaml index af158c5..223c285 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/clusterrolebinding.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/clusterrolebinding.yaml @@ -18,6 +18,6 @@ roleRef: {{- end }} subjects: - kind: ServiceAccount - name: {{ template "kube-state-metrics.fullname" . }} + name: {{ template "kube-state-metrics.serviceAccountName" . }} namespace: {{ template "kube-state-metrics.namespace" . }} {{- end -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml index 090d880..a4c4c0b 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml @@ -13,30 +13,30 @@ metadata: app.kubernetes.io/instance: "{{ .Release.Name }}" app.kubernetes.io/managed-by: "{{ .Release.Service }}" app.kubernetes.io/version: "{{ .Chart.AppVersion }}" -{{- if .Values.customLabels }} + {{- if .Values.customLabels }} {{ toYaml .Values.customLabels | indent 4 }} -{{- end }} + {{- end }} spec: selector: matchLabels: app.kubernetes.io/name: {{ template "kube-state-metrics.name" . }} replicas: {{ .Values.replicas }} -{{- if .Values.autosharding.enabled }} + {{- if .Values.autosharding.enabled }} serviceName: {{ template "kube-state-metrics.fullname" . }} volumeClaimTemplates: [] -{{- end }} + {{- end }} template: metadata: labels: app.kubernetes.io/name: {{ template "kube-state-metrics.name" . }} app.kubernetes.io/instance: "{{ .Release.Name }}" -{{- if .Values.customLabels }} + {{- if .Values.customLabels }} {{ toYaml .Values.customLabels | indent 8 }} -{{- end }} -{{- if .Values.podAnnotations }} + {{- end }} + {{- if .Values.podAnnotations }} annotations: {{ toYaml .Values.podAnnotations | indent 8 }} -{{- end }} + {{- end }} spec: hostNetwork: {{ .Values.hostNetwork }} serviceAccountName: {{ template "kube-state-metrics.serviceAccountName" . }} @@ -51,7 +51,7 @@ spec: {{- end }} containers: - name: {{ .Chart.Name }} -{{- if .Values.autosharding.enabled }} + {{- if .Values.autosharding.enabled }} env: - name: POD_NAME valueFrom: @@ -61,159 +61,94 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace -{{- end }} + {{- end }} args: -{{ if .Values.extraArgs }} + {{- if .Values.extraArgs }} {{- range .Values.extraArgs }} - {{ . }} {{- end }} -{{ end }} -{{ if .Values.collectors.certificatesigningrequests }} - - --resources=certificatesigningrequests -{{ end }} -{{ if .Values.collectors.configmaps }} - - --resources=configmaps -{{ end }} -{{ if .Values.collectors.cronjobs }} - - --resources=cronjobs -{{ end }} -{{ if .Values.collectors.daemonsets }} - - --resources=daemonsets -{{ end }} -{{ if .Values.collectors.deployments }} - - --resources=deployments -{{ end }} -{{ if .Values.collectors.endpoints }} - - --resources=endpoints -{{ end }} -{{ if .Values.collectors.horizontalpodautoscalers }} - - --resources=horizontalpodautoscalers -{{ end }} -{{ if .Values.collectors.ingresses }} - - --resources=ingresses -{{ end }} -{{ if .Values.collectors.jobs }} - - --resources=jobs -{{ end }} -{{ if .Values.collectors.limitranges }} - - --resources=limitranges -{{ end }} -{{ if .Values.collectors.mutatingwebhookconfigurations }} - - --resources=mutatingwebhookconfigurations -{{ end }} -{{ if .Values.collectors.namespaces }} - - --resources=namespaces -{{ end }} -{{ if .Values.collectors.networkpolicies }} - - --resources=networkpolicies -{{ end }} -{{ if .Values.collectors.nodes }} - - --resources=nodes -{{ end }} -{{ if .Values.collectors.persistentvolumeclaims }} - - --resources=persistentvolumeclaims -{{ end }} -{{ if .Values.collectors.persistentvolumes }} - - --resources=persistentvolumes -{{ end }} -{{ if .Values.collectors.poddisruptionbudgets }} - - --resources=poddisruptionbudgets -{{ end }} -{{ if .Values.collectors.pods }} - - --resources=pods -{{ end }} -{{ if .Values.collectors.replicasets }} - - --resources=replicasets -{{ end }} -{{ if .Values.collectors.replicationcontrollers }} - - --resources=replicationcontrollers -{{ end }} -{{ if .Values.collectors.resourcequotas }} - - --resources=resourcequotas -{{ end }} -{{ if .Values.collectors.secrets }} - - --resources=secrets -{{ end }} -{{ if .Values.collectors.services }} - - --resources=services -{{ end }} -{{ if .Values.collectors.statefulsets }} - - --resources=statefulsets -{{ end }} -{{ if .Values.collectors.storageclasses }} - - --resources=storageclasses -{{ end }} -{{ if .Values.collectors.validatingwebhookconfigurations }} - - --resources=validatingwebhookconfigurations -{{ end }} -{{ if .Values.collectors.verticalpodautoscalers }} - - --resources=verticalpodautoscalers -{{ end }} -{{ if .Values.collectors.volumeattachments }} - - --resources=volumeattachments -{{ end }} -{{ if .Values.namespaces }} - - --namespaces={{ tpl .Values.namespaces $ | join "," }} -{{ end }} -{{ if .Values.autosharding.enabled }} + {{- end }} + {{- if .Values.service.port }} + - --port={{ .Values.service.port | default 8080}} + {{- end }} + {{- if .Values.collectors }} + - --resources={{ .Values.collectors | join "," }} + {{- end }} + {{- if .Values.metricLabelsAllowlist }} + - --metric-labels-allowlist={{ .Values.metricLabelsAllowlist | join "," }} + {{- end }} + {{- if .Values.metricAllowlist }} + - --metric-allowlist={{ .Values.metricAllowlist | join "," }} + {{- end }} + {{- if .Values.metricDenylist }} + - --metric-denylist={{ .Values.metricDenylist | join "," }} + {{- end }} + {{- if .Values.namespaces }} + - --namespaces={{ tpl (.Values.namespaces | join ",") $ }} + {{- end }} + {{- if .Values.autosharding.enabled }} - --pod=$(POD_NAME) - --pod-namespace=$(POD_NAMESPACE) -{{ end }} -{{ if .Values.kubeconfig.enabled }} + {{- end }} + {{- if .Values.kubeconfig.enabled }} - --kubeconfig=/opt/k8s/.kube/config -{{ end }} -{{ if .Values.selfMonitor.telemetryHost }} + {{- end }} + {{- if .Values.selfMonitor.telemetryHost }} - --telemetry-host={{ .Values.selfMonitor.telemetryHost }} -{{ end }} - - --telemetry-port=8081 -{{- if .Values.kubeconfig.enabled }} + {{- end }} + - --telemetry-port={{ .Values.selfMonitor.telemetryPort | default 8081 }} + {{- if .Values.kubeconfig.enabled }} volumeMounts: - name: kubeconfig mountPath: /opt/k8s/.kube/ readOnly: true -{{- end }} + {{- end }} imagePullPolicy: {{ .Values.image.pullPolicy }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" ports: - - containerPort: 8080 -{{- if .Values.selfMonitor.enabled }} - - containerPort: 8081 -{{- end }} + - containerPort: {{ .Values.service.port | default 8080}} + {{- if .Values.selfMonitor.enabled }} + - containerPort: {{ .Values.selfMonitor.telemetryPort | default 8081 }} + {{- end }} livenessProbe: httpGet: path: /healthz - port: 8080 + port: {{ .Values.service.port | default 8080}} initialDelaySeconds: 5 timeoutSeconds: 5 readinessProbe: httpGet: path: / - port: 8080 + port: {{ .Values.service.port | default 8080}} initialDelaySeconds: 5 timeoutSeconds: 5 -{{- if .Values.resources }} + {{- if .Values.resources }} resources: {{ toYaml .Values.resources | indent 10 }} {{- end }} +{{- if .Values.containerSecurityContext }} + securityContext: +{{ toYaml .Values.containerSecurityContext | indent 10 }} +{{- end }} {{- if .Values.imagePullSecrets }} imagePullSecrets: {{ toYaml .Values.imagePullSecrets | indent 8 }} -{{- end }} -{{- if .Values.affinity }} + {{- end }} + {{- if .Values.affinity }} affinity: {{ toYaml .Values.affinity | indent 8 }} -{{- end }} -{{- if .Values.nodeSelector }} + {{- end }} + {{- if .Values.nodeSelector }} nodeSelector: {{ toYaml .Values.nodeSelector | indent 8 }} -{{- end }} -{{- if .Values.tolerations }} + {{- end }} + {{- if .Values.tolerations }} tolerations: {{ toYaml .Values.tolerations | indent 8 }} -{{- end }} -{{- if .Values.kubeconfig.enabled}} + {{- end }} + {{- if .Values.kubeconfig.enabled}} volumes: - name: kubeconfig secret: secretName: {{ template "kube-state-metrics.fullname" . }}-kubeconfig -{{- end }} + {{- end }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/psp-clusterrolebinding.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/psp-clusterrolebinding.yaml index feb97f2..b96503f 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/psp-clusterrolebinding.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/psp-clusterrolebinding.yaml @@ -14,6 +14,6 @@ roleRef: name: psp-{{ template "kube-state-metrics.fullname" . }} subjects: - kind: ServiceAccount - name: {{ template "kube-state-metrics.fullname" . }} + name: {{ template "kube-state-metrics.serviceAccountName" . }} namespace: {{ template "kube-state-metrics.namespace" . }} {{- end }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml index 25c8bc8..8643652 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml @@ -18,169 +18,169 @@ metadata: namespace: {{ . }} {{- end }} rules: -{{ if $.Values.collectors.certificatesigningrequests }} +{{ if has "certificatesigningrequests" $.Values.collectors }} - apiGroups: ["certificates.k8s.io"] resources: - certificatesigningrequests verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.configmaps }} +{{ if has "configmaps" $.Values.collectors }} - apiGroups: [""] resources: - configmaps verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.cronjobs }} +{{ if has "cronjobs" $.Values.collectors }} - apiGroups: ["batch"] resources: - cronjobs verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.daemonsets }} +{{ if has "daemonsets" $.Values.collectors }} - apiGroups: ["extensions", "apps"] resources: - daemonsets verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.deployments }} +{{ if has "deployments" $.Values.collectors }} - apiGroups: ["extensions", "apps"] resources: - deployments verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.endpoints }} +{{ if has "endpoints" $.Values.collectors }} - apiGroups: [""] resources: - endpoints verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.horizontalpodautoscalers }} +{{ if has "horizontalpodautoscalers" $.Values.collectors }} - apiGroups: ["autoscaling"] resources: - horizontalpodautoscalers verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.ingresses }} +{{ if has "ingresses" $.Values.collectors }} - apiGroups: ["extensions", "networking.k8s.io"] resources: - ingresses verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.jobs }} +{{ if has "jobs" $.Values.collectors }} - apiGroups: ["batch"] resources: - jobs verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.limitranges }} +{{ if has "limitranges" $.Values.collectors }} - apiGroups: [""] resources: - limitranges verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.mutatingwebhookconfigurations }} +{{ if has "mutatingwebhookconfigurations" $.Values.collectors }} - apiGroups: ["admissionregistration.k8s.io"] resources: - mutatingwebhookconfigurations verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.namespaces }} +{{ if has "namespaces" $.Values.collectors }} - apiGroups: [""] resources: - namespaces verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.networkpolicies }} +{{ if has "networkpolicies" $.Values.collectors }} - apiGroups: ["networking.k8s.io"] resources: - networkpolicies verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.nodes }} +{{ if has "nodes" $.Values.collectors }} - apiGroups: [""] resources: - nodes verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.persistentvolumeclaims }} +{{ if has "persistentvolumeclaims" $.Values.collectors }} - apiGroups: [""] resources: - persistentvolumeclaims verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.persistentvolumes }} +{{ if has "persistentvolumes" $.Values.collectors }} - apiGroups: [""] resources: - persistentvolumes verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.poddisruptionbudgets }} +{{ if has "poddisruptionbudgets" $.Values.collectors }} - apiGroups: ["policy"] resources: - poddisruptionbudgets verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.pods }} +{{ if has "pods" $.Values.collectors }} - apiGroups: [""] resources: - pods verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.replicasets }} +{{ if has "replicasets" $.Values.collectors }} - apiGroups: ["extensions", "apps"] resources: - replicasets verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.replicationcontrollers }} +{{ if has "replicationcontrollers" $.Values.collectors }} - apiGroups: [""] resources: - replicationcontrollers verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.resourcequotas }} +{{ if has "resourcequotas" $.Values.collectors }} - apiGroups: [""] resources: - resourcequotas verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.secrets }} +{{ if has "secrets" $.Values.collectors }} - apiGroups: [""] resources: - secrets verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.services }} +{{ if has "services" $.Values.collectors }} - apiGroups: [""] resources: - services verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.statefulsets }} +{{ if has "statefulsets" $.Values.collectors }} - apiGroups: ["apps"] resources: - statefulsets verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.storageclasses }} +{{ if has "storageclasses" $.Values.collectors }} - apiGroups: ["storage.k8s.io"] resources: - storageclasses verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.validatingwebhookconfigurations }} +{{ if has "validatingwebhookconfigurations" $.Values.collectors }} - apiGroups: ["admissionregistration.k8s.io"] resources: - validatingwebhookconfigurations verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.volumeattachments }} +{{ if has "volumeattachments" $.Values.collectors }} - apiGroups: ["storage.k8s.io"] resources: - volumeattachments verbs: ["list", "watch"] {{ end -}} -{{ if $.Values.collectors.verticalpodautoscalers }} +{{ if has "verticalpodautoscalers" $.Values.collectors }} - apiGroups: ["autoscaling.k8s.io"] resources: - verticalpodautoscalers diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/rolebinding.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/rolebinding.yaml index 72a1a2e..6809e2b 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/rolebinding.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/rolebinding.yaml @@ -21,7 +21,7 @@ roleRef: {{- end }} subjects: - kind: ServiceAccount - name: {{ template "kube-state-metrics.fullname" $ }} + name: {{ template "kube-state-metrics.serviceAccountName" $ }} namespace: {{ template "kube-state-metrics.namespace" $ }} {{- end -}} {{- end -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/service.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/service.yaml index 4f8e4a4..f03e6d6 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/service.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/service.yaml @@ -23,16 +23,16 @@ spec: ports: - name: "http" protocol: TCP - port: {{ .Values.service.port }} + port: {{ .Values.service.port | default 8080}} {{- if .Values.service.nodePort }} nodePort: {{ .Values.service.nodePort }} {{- end }} - targetPort: 8080 + targetPort: {{ .Values.service.port | default 8080}} {{ if .Values.selfMonitor.enabled }} - name: "metrics" protocol: TCP port: {{ .Values.selfMonitor.telemetryPort | default 8081 }} - targetPort: 8081 + targetPort: {{ .Values.selfMonitor.telemetryPort | default 8081 }} {{ end }} {{- if .Values.service.loadBalancerIP }} loadBalancerIP: "{{ .Values.service.loadBalancerIP }}" diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/serviceaccount.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/serviceaccount.yaml index 2e8a1ee..365ccec 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/serviceaccount.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/serviceaccount.yaml @@ -7,7 +7,7 @@ metadata: helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version }} app.kubernetes.io/managed-by: {{ .Release.Service }} app.kubernetes.io/instance: {{ .Release.Name }} - name: {{ template "kube-state-metrics.fullname" . }} + name: {{ template "kube-state-metrics.serviceAccountName" . }} namespace: {{ template "kube-state-metrics.namespace" . }} {{- if .Values.serviceAccount.annotations }} annotations: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/stsdiscovery-rolebinding.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/stsdiscovery-rolebinding.yaml index 6a2e5bf..2e43277 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/stsdiscovery-rolebinding.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/stsdiscovery-rolebinding.yaml @@ -15,6 +15,6 @@ roleRef: name: stsdiscovery-{{ template "kube-state-metrics.fullname" . }} subjects: - kind: ServiceAccount - name: {{ template "kube-state-metrics.fullname" . }} + name: {{ template "kube-state-metrics.serviceAccountName" . }} namespace: {{ template "kube-state-metrics.namespace" . }} {{- end }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/values.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/values.yaml index 62e178d..a9853ba 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/values.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/kube-state-metrics/values.yaml @@ -88,6 +88,11 @@ securityContext: runAsUser: 65534 fsGroup: 65534 +## Specify security settings for a Container +## Allows overrides and additional options compared to (Pod) securityContext +## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container +containerSecurityContext: {} + ## Node labels for pod assignment ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ nodeSelector: {} @@ -109,37 +114,56 @@ podAnnotations: {} # Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/ podDisruptionBudget: {} -# Available collectors for kube-state-metrics. By default all available -# resources are enabled. +# Comma-separated list of metrics to be exposed. +# This list comprises of exact metric names and/or regex patterns. +# The allowlist and denylist are mutually exclusive. +metricAllowlist: [] + +# Comma-separated list of metrics not to be enabled. +# This list comprises of exact metric names and/or regex patterns. +# The allowlist and denylist are mutually exclusive. +metricDenylist: [] + +# Comma-separated list of additional Kubernetes label keys that will be used in the resource's +# labels metric. By default the metric contains only name and namespace labels. +# To include additional labels, provide a list of resource names in their plural form and Kubernetes +# label keys you would like to allow for them (Example: '=namespaces=[k8s-label-1,k8s-label-n,...],pods=[app],...)'. +# A single '*' can be provided per resource instead to allow any labels, but that has +# severe performance implications (Example: '=pods=[*]'). +metricLabelsAllowlist: [] + # - namespaces=[k8s-label-1,k8s-label-n] + +# Available collectors for kube-state-metrics. +# By default, all available resources are enabled, comment out to disable. collectors: - certificatesigningrequests: true - configmaps: true - cronjobs: true - daemonsets: true - deployments: true - endpoints: true - horizontalpodautoscalers: true - ingresses: true - jobs: true - limitranges: true - mutatingwebhookconfigurations: true - namespaces: true - networkpolicies: true - nodes: true - persistentvolumeclaims: true - persistentvolumes: true - poddisruptionbudgets: true - pods: true - replicasets: true - replicationcontrollers: true - resourcequotas: true - secrets: true - services: true - statefulsets: true - storageclasses: true - validatingwebhookconfigurations: true - verticalpodautoscalers: false - volumeattachments: true + - certificatesigningrequests + - configmaps + - cronjobs + - daemonsets + - deployments + - endpoints + - horizontalpodautoscalers + - ingresses + - jobs + - limitranges + - mutatingwebhookconfigurations + - namespaces + - networkpolicies + - nodes + - persistentvolumeclaims + - persistentvolumes + - poddisruptionbudgets + - pods + - replicasets + - replicationcontrollers + - resourcequotas + - secrets + - services + - statefulsets + - storageclasses + - validatingwebhookconfigurations + - verticalpodautoscalers + - volumeattachments # Enabling kubeconfig will pass the --kubeconfig argument to the container kubeconfig: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/Chart.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/Chart.yaml index a9c26d7..11152a7 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/Chart.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/Chart.yaml @@ -14,4 +14,4 @@ maintainers: name: prometheus-node-exporter sources: - https://github.com/prometheus/node_exporter/ -version: 1.18.0 +version: 1.18.2 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/monitor.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/monitor.yaml index 2f7b6ae..88f9015 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/monitor.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/monitor.yaml @@ -22,6 +22,9 @@ spec: {{- if $.Values.prometheus.monitor.tlsConfig }} tlsConfig: {{ toYaml $.Values.prometheus.monitor.tlsConfig | nindent 8 }} {{- end }} + {{- if .Values.prometheus.monitor.proxyUrl }} + proxyUrl: {{ .Values.prometheus.monitor.proxyUrl}} + {{- end }} {{- if .Values.prometheus.monitor.scrapeTimeout }} scrapeTimeout: {{ .Values.prometheus.monitor.scrapeTimeout }} {{- end }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/psp.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/psp.yaml index 34afb14..ec1259e 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/psp.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/psp.yaml @@ -8,7 +8,7 @@ metadata: labels: {{ include "prometheus-node-exporter.labels" . | indent 4 }} {{- if .Values.rbac.pspAnnotations }} annotations: - {{ toYaml .Values.rbac.pspAnnotations | indent 4 }} +{{ toYaml .Values.rbac.pspAnnotations | indent 4 }} {{- end}} spec: privileged: false diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/values.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/values.yaml index 400ce11..f4e927f 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/values.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/charts/prometheus-node-exporter/values.yaml @@ -24,6 +24,10 @@ prometheus: bearerTokenFile: tlsConfig: {} + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + relabelings: [] scrapeTimeout: 10s diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-alertmanagerconfigs.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-alertmanagerconfigs.yaml index a6988a7..e84eb6b 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-alertmanagerconfigs.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-alertmanagerconfigs.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-alertmanagers.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-alertmanagers.yaml index 486ee7e..76f90f7 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-alertmanagers.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-alertmanagers.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-podmonitors.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-podmonitors.yaml index c16e955..745b866 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-podmonitors.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-podmonitors.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-probes.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-probes.yaml index 4b3f92c..ca2ef5b 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-probes.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-probes.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-prometheuses.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-prometheuses.yaml index cea3585..ce033a6 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-prometheuses.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-prometheuses.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-prometheusrules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-prometheusrules.yaml index 1c33519..8e625d6 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-prometheusrules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-prometheusrules.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-servicemonitors.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-servicemonitors.yaml index 1c8a4a3..3bf6572 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-servicemonitors.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-servicemonitors.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-thanosrulers.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-thanosrulers.yaml index 9112650..faf36d2 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-thanosrulers.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/crds/crd-thanosrulers.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/_helpers.tpl b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/_helpers.tpl index 9ee060b..61989b5 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/_helpers.tpl +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/_helpers.tpl @@ -98,7 +98,12 @@ Allow the release namespace to be overridden for multi-namespace deployments in {{/* Allow KubeVersion to be overridden. */}} {{- define "kube-prometheus-stack.ingress.kubeVersion" -}} - {{- default .Capabilities.KubeVersion.Version .Values.kubeVersionOverride -}} + {{- $kubeVersion := default .Capabilities.KubeVersion.Version .Values.kubeVersionOverride -}} + {{/* Special use case for Amazon EKS, Google GKE */}} + {{- if and (regexMatch "\\d+\\.\\d+\\.\\d+-(?:eks|gke).+" $kubeVersion) (not .Values.kubeVersionOverride) -}} + {{- $kubeVersion = regexFind "\\d+\\.\\d+\\.\\d+" $kubeVersion -}} + {{- end -}} + {{- $kubeVersion -}} {{- end -}} {{/* Get Ingress API Version */}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml index f34c187..33a57b8 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml @@ -22,9 +22,9 @@ spec: metricRelabelings: {{ tpl (toYaml .Values.kubeApiServer.serviceMonitor.metricRelabelings | indent 6) . }} {{- end }} -{{- if .Values.kubeApiServer.relabelings }} +{{- if .Values.kubeApiServer.serviceMonitor.relabelings }} relabelings: -{{ toYaml .Values.kubeApiServer.relabelings | indent 6 }} +{{ toYaml .Values.kubeApiServer.serviceMonitor.relabelings | indent 6 }} {{- end }} tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kube-state-metrics/serviceMonitor.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kube-state-metrics/serviceMonitor.yaml index caeaa1e..8dead5f 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kube-state-metrics/serviceMonitor.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kube-state-metrics/serviceMonitor.yaml @@ -26,6 +26,24 @@ spec: relabelings: {{ toYaml .Values.kubeStateMetrics.serviceMonitor.relabelings | indent 4 }} {{- end }} +{{- if .Values.kubeStateMetrics.serviceMonitor.selfMonitor.enabled }} + - port: metrics + {{- if .Values.kubeStateMetrics.serviceMonitor.interval }} + interval: {{ .Values.kubeStateMetrics.serviceMonitor.interval }} + {{- end }} + {{- if .Values.kubeStateMetrics.serviceMonitor.proxyUrl }} + proxyUrl: {{ .Values.kubeStateMetrics.serviceMonitor.proxyUrl}} + {{- end }} + honorLabels: true +{{- if .Values.kubeStateMetrics.serviceMonitor.metricRelabelings }} + metricRelabelings: +{{ tpl (toYaml .Values.kubeStateMetrics.serviceMonitor.metricRelabelings | indent 4) . }} +{{- end }} +{{- if .Values.kubeStateMetrics.serviceMonitor.relabelings }} + relabelings: +{{ toYaml .Values.kubeStateMetrics.serviceMonitor.relabelings | indent 4 }} +{{- end }} +{{- end }} {{- if .Values.kubeStateMetrics.serviceMonitor.namespaceOverride }} namespaceSelector: matchNames: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml index 549b3ff..9933ecb 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml @@ -168,6 +168,6 @@ spec: - {{ .Values.kubelet.namespace }} selector: matchLabels: - app.kubernetes.io/managed-by: prometheus-operator + app.kubernetes.io/name: kubelet k8s-app: kubelet {{- end}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/node-exporter/servicemonitor.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/node-exporter/servicemonitor.yaml index 2061b98..5615ba7 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/node-exporter/servicemonitor.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/exporters/node-exporter/servicemonitor.yaml @@ -13,6 +13,11 @@ spec: matchLabels: app: prometheus-node-exporter release: {{ $.Release.Name }} + {{- if (index .Values "prometheus-node-exporter" "namespaceOverride") }} + namespaceSelector: + matchNames: + - {{ index .Values "prometheus-node-exporter" "namespaceOverride" }} + {{- end }} endpoints: - port: metrics {{- if .Values.nodeExporter.serviceMonitor.interval }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/configmap-dashboards.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/configmap-dashboards.yaml index 7ce2a7f..ce5dd06 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/configmap-dashboards.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/configmap-dashboards.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if or (and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled) .Values.grafana.forceDeployDashboards }} {{- $files := .Files.Glob "dashboards-1.14/*.json" }} {{- if $files }} apiVersion: v1 diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/configmaps-datasources.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/configmaps-datasources.yaml index db62d53..c7c82ef 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/configmaps-datasources.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/configmaps-datasources.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.grafana.enabled .Values.grafana.sidecar.datasources.enabled }} +{{- if or (and .Values.grafana.enabled .Values.grafana.sidecar.datasources.enabled) .Values.grafana.forceDeployDatasources }} apiVersion: v1 kind: ConfigMap metadata: @@ -20,7 +20,11 @@ data: {{- if .Values.grafana.sidecar.datasources.defaultDatasourceEnabled }} - name: Prometheus type: prometheus + {{- if .Values.grafana.sidecar.datasources.url }} + url: {{ .Values.grafana.sidecar.datasources.url }} + {{- else }} url: http://{{ template "kube-prometheus-stack.fullname" . }}-prometheus:{{ .Values.prometheus.service.port }}/{{ trimPrefix "/" .Values.prometheus.prometheusSpec.routePrefix }} + {{- end }} access: proxy isDefault: true jsonData: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/alertmanager-overview.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/alertmanager-overview.yaml new file mode 100644 index 0000000..2135644 --- /dev/null +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/alertmanager-overview.yaml @@ -0,0 +1,610 @@ +{{- /* +Generated from 'alertmanager-overview' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + namespace: {{ template "kube-prometheus-stack.namespace" . }} + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "alertmanager-overview" | trunc 63 | trimSuffix "-" }} + annotations: +{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} + labels: + {{- if $.Values.grafana.sidecar.dashboards.label }} + {{ $.Values.grafana.sidecar.dashboards.label }}: "1" + {{- end }} + app: {{ template "kube-prometheus-stack.name" $ }}-grafana +{{ include "kube-prometheus-stack.labels" $ | indent 4 }} +data: + alertmanager-overview.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Received", + "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Invalid", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts receive rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alerts", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "integration", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Total", + "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Failed", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "$integration: Notifications Send Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "integration", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} 99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Median", + "refId": "B" + }, + { + "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Average", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "$integration: Notification Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Notifications", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "alertmanager-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(alertmanager_alerts, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "service", + "options": [ + + ], + "query": "label_values(alertmanager_alerts, service)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": null, + "multi": false, + "name": "integration", + "options": [ + + ], + "query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Alertmanager / Overview", + "uid": "alertmanager-overview", + "version": 0 + } +{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/apiserver.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/apiserver.yaml index 1fa3a19..5ad5529 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/apiserver.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/apiserver.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'apiserver' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'apiserver' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeApiServer.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeApiServer.enabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/cluster-total.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/cluster-total.yaml index 2fab7dc..a10f752 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/cluster-total.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/cluster-total.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'cluster-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'cluster-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/controller-manager.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/controller-manager.yaml index 139b1a5..3717c8e 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/controller-manager.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/controller-manager.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'controller-manager' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'controller-manager' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeControllerManager.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeControllerManager.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -176,10 +176,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", "refId": "A" } ], @@ -282,10 +282,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", "refId": "A" } ], @@ -388,10 +388,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))", + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", "refId": "A" } ], diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/etcd.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/etcd.yaml index f8217d9..2185b7d 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/etcd.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/etcd.yaml @@ -4,7 +4,7 @@ Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeEtcd.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeEtcd.enabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-coredns.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-coredns.yaml index 3665b57..a8cfd81 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-coredns.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-coredns.yaml @@ -1,6 +1,6 @@ {{- /* Added manually, can be changed in-place. */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.coreDns.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.coreDns.enabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-cluster.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-cluster.yaml index a3c678c..ef43b4d 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-cluster.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-cluster.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-cluster' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-cluster' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -247,7 +247,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", + "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -499,7 +499,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", + "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -594,7 +594,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}namespace{{`}}`}}", @@ -885,7 +885,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -903,7 +903,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -912,7 +912,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -921,7 +921,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1321,7 +1321,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", + "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1330,7 +1330,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1339,7 +1339,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", + "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1348,7 +1348,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2443,6 +2443,496 @@ data: "showTitle": true, "title": "Rate of Packets Dropped", "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": -1, + "fill": 10, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}namespace{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}namespace{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", + "titleSize": "h6" } ], "schemaVersion": 14, diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-namespace.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-namespace.yaml index e20f344..f14b36d 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-namespace.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-namespace.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -78,7 +78,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -162,7 +162,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -256,7 +256,7 @@ data: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "Memory Utilization (from requests)", + "title": "Memory Utilisation (from requests)", "tooltip": { "shared": false, "sort": 0, @@ -446,7 +446,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -697,7 +697,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -715,7 +715,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -733,7 +733,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2136,6 +2136,496 @@ data: "showTitle": true, "title": "Rate of Packets Dropped", "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": -1, + "fill": 10, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}pod{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}pod{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", + "titleSize": "h6" } ], "schemaVersion": 14, diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-node.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-node.yaml index 73a3e88..cfdf0ac 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-node.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-node.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-node' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-node' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -77,7 +77,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -312,7 +312,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -330,7 +330,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -348,7 +348,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-pod.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-pod.yaml index 014b165..b813ee2 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-pod.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-pod.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -94,7 +94,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}container{{`}}`}}", @@ -450,7 +450,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -468,7 +468,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -486,7 +486,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -634,7 +634,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Memory Usage (WSS)", "tooltip": { "shared": false, "sort": 0, @@ -725,7 +725,7 @@ data: "type": "hidden" }, { - "alias": "Memory Usage", + "alias": "Memory Usage (WSS)", "colorMode": null, "colors": [ @@ -1591,6 +1591,697 @@ data: "showTitle": true, "title": "Rate of Packets Dropped", "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": -1, + "fill": 10, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution(Pod - Read & Writes)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": -1, + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}container{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}container{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution(Containers)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", + "titleSize": "h6" } ], "schemaVersion": 14, diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workload.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workload.yaml index 38c2ead..57b4088 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workload.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workload.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -77,7 +77,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -312,7 +312,7 @@ data: ], "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -330,7 +330,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -348,7 +348,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workloads-namespace.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workloads-namespace.yaml index 0c53de7..a485147 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workloads-namespace.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workloads-namespace.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-workloads-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-workloads-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -98,7 +98,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}workload{{`}}`}} - {{`{{`}}workload_type{{`}}`}}", @@ -396,7 +396,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -414,7 +414,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -432,7 +432,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/kubelet.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/kubelet.yaml index 7238299..613a6c2 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/kubelet.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/kubelet.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'kubelet' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'kubelet' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubelet.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubelet.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -39,2385 +39,2100 @@ data: "id": null, "links": [ + ], + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Running Kubelets", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Running Pods", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 4, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Running Container", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 5, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Actual Volume Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 6, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Desired Volume Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ], + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 7, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Config Error Count", + "transparent": false, + "type": "stat" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} pod", + "refId": "A" + }, + { + "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} worker", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} pod", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} worker", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager operation rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Pod lifecycle event generator", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist interval", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 21, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "RPC Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Request duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 77 + }, + "id": 23, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 77 + }, + "id": 24, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 77 + }, + "id": 25, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } ], "refresh": "10s", "rows": [ - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Up", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Pods", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Container", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Actual Volume Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Desired Volume Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Config Error Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} pod", - "refId": "A" - }, - { - "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} worker", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} pod", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} worker", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager operation rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "Pod lifecycle event generator", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist interval", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 21, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "2xx", - "refId": "A" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "3xx", - "refId": "B" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "4xx", - "refId": "C" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "5xx", - "refId": "D" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 22, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Request duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 23, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 24, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 25, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } ], "schemaVersion": 14, "style": "dark", diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-pod.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-pod.yaml index eb53ba0..288aeed 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-pod.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-pod.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'namespace-by-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'namespace-by-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-workload.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-workload.yaml index 6a01666..f533068 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-workload.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-workload.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'namespace-by-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'namespace-by-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-cluster-rsrc-use.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-cluster-rsrc-use.yaml index f486bcb..22f5aea 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-cluster-rsrc-use.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-cluster-rsrc-use.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'node-cluster-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'node-cluster-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-rsrc-use.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-rsrc-use.yaml index bfbe258..52d24bf 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-rsrc-use.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-rsrc-use.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'node-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'node-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes.yaml index e3b3bd8..9da7444 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'nodes' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'nodes' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -92,9 +92,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", + "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__rate_interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", "format": "time_series", - "interval": "$__rate_interval", "intervalFactor": 5, "legendFormat": "{{`{{`}}cpu{{`}}`}}", "refId": "A" @@ -528,25 +527,22 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "$__rate_interval", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}} read", "refId": "A" }, { - "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "$__rate_interval", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}} written", "refId": "B" }, { - "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "$__rate_interval", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}} io time", "refId": "C" @@ -758,9 +754,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "interval": "$__rate_interval", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}}", "refId": "A" @@ -852,9 +847,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "interval": "$__rate_interval", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}}", "refId": "A" diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/persistentvolumesusage.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/persistentvolumesusage.yaml index 06fb315..93065ff 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/persistentvolumesusage.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/persistentvolumesusage.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -92,14 +92,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used Space", "refId": "A" }, { - "expr": "sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Free Space", @@ -207,7 +207,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "max without(instance,node) (\n(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", + "expr": "max without(instance,node) (\n(\n topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -289,14 +289,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used inodes", "refId": "A" }, { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": " Free inodes", @@ -404,7 +404,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "max without(instance,node) (\nkubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", + "expr": "max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/pod-total.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/pod-total.yaml index 7145287..d2842c3 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/pod-total.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/pod-total.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'pod-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'pod-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus-remote-write.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus-remote-write.yaml index b33b738..a98581d 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus-remote-write.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus-remote-write.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'prometheus-remote-write' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'prometheus-remote-write' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.prometheus.prometheusSpec.remoteWriteDashboards }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.prometheus.prometheusSpec.remoteWriteDashboards }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus.yaml index 0cd3e7b..71d3a62 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'prometheus' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'prometheus' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/proxy.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/proxy.yaml index 8f8559e..e6b9b98 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/proxy.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/proxy.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'proxy' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'proxy' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeProxy.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeProxy.enabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/scheduler.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/scheduler.yaml index 3dbf4ab..82edd36 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/scheduler.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/scheduler.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'scheduler' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'scheduler' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeScheduler.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeScheduler.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -176,31 +176,31 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} e2e", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e", "refId": "A" }, { - "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} binding", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding", "refId": "B" }, { - "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm", "refId": "C" }, { - "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} volume", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume", "refId": "D" } ], @@ -290,31 +290,31 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} e2e", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} binding", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm", "refId": "C" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} volume", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume", "refId": "D" } ], diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/statefulset.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/statefulset.yaml index 9b3ce7d..f1ffea5 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/statefulset.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/statefulset.yaml @@ -4,7 +4,7 @@ Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -925,4 +925,4 @@ data: "uid": "a31c1f46e6f727cb37c0d731a7245005", "version": 0 } -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/workload-total.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/workload-total.yaml index 75f95bd..dc1e4ed 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/workload-total.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/workload-total.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'workload-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'workload-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-createSecret.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-createSecret.yaml index f8afcb8..de81974 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-createSecret.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-createSecret.yaml @@ -58,8 +58,8 @@ spec: tolerations: {{ toYaml . | indent 8 }} {{- end }} +{{- if .Values.prometheusOperator.admissionWebhooks.patch.securityContext }} securityContext: - runAsGroup: 2000 - runAsNonRoot: true - runAsUser: 2000 +{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.securityContext | indent 8 }} +{{- end }} {{- end }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-patchWebhook.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-patchWebhook.yaml index b2d8912..23dc9f4 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-patchWebhook.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-patchWebhook.yaml @@ -59,8 +59,8 @@ spec: tolerations: {{ toYaml . | indent 8 }} {{- end }} +{{- if .Values.prometheusOperator.admissionWebhooks.patch.securityContext }} securityContext: - runAsGroup: 2000 - runAsNonRoot: true - runAsUser: 2000 +{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.securityContext | indent 8 }} +{{- end }} {{- end }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml index b67df54..f42e33e 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml @@ -5,8 +5,8 @@ metadata: name: {{ template "kube-prometheus-stack.fullname" . }}-admission {{- if .Values.prometheusOperator.admissionWebhooks.certManager.enabled }} annotations: - certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} - cert-manager.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} + certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} + cert-manager.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} {{- end }} labels: app: {{ template "kube-prometheus-stack.name" $ }}-admission diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml index 249488e..1439ed5 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml @@ -5,8 +5,8 @@ metadata: name: {{ template "kube-prometheus-stack.fullname" . }}-admission {{- if .Values.prometheusOperator.admissionWebhooks.certManager.enabled }} annotations: - certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} - cert-manager.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} + certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} + cert-manager.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} {{- end }} labels: app: {{ template "kube-prometheus-stack.name" $ }}-admission diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/deployment.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/deployment.yaml index 15b3684..00144ab 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/deployment.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus-operator/deployment.yaml @@ -78,6 +78,11 @@ spec: {{- if .Values.prometheusOperator.prometheusInstanceNamespaces }} - --prometheus-instance-namespaces={{ .Values.prometheusOperator.prometheusInstanceNamespaces | join "," }} {{- end }} + {{- if .Values.prometheusOperator.thanosImage.sha }} + - --thanos-default-base-image={{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }}@sha256:{{ .Values.prometheusOperator.thanosImage.sha }} + {{- else }} + - --thanos-default-base-image={{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }} + {{- end }} {{- if .Values.prometheusOperator.thanosRulerInstanceNamespaces }} - --thanos-ruler-instance-namespaces={{ .Values.prometheusOperator.thanosRulerInstanceNamespaces | join "," }} {{- end }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml index 387a677..49e78b5 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -26,41 +26,146 @@ spec: groups: - name: alertmanager.rules rules: - - alert: AlertmanagerConfigInconsistent + - alert: AlertmanagerFailedReload annotations: - message: 'The configuration of the instances of the Alertmanager cluster `{{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.service {{`}}`}}` are out of sync. - - {{`{{`}} range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query {{`}}`}} - - Configuration hash for pod {{`{{`}} .Labels.pod {{`}}`}} is "{{`{{`}} printf "%.f" .Value {{`}}`}}" - - {{`{{`}} end {{`}}`}} - - ' - expr: count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})) != 1 + description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0 + for: 10m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])) + for: 10m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. + expr: |- + ( + rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) + / + rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. + expr: |- + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) + ) + > 0.01 for: 5m labels: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - - alert: AlertmanagerFailedReload + - alert: AlertmanagerClusterFailedToSendAlerts annotations: - message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. - expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0 - for: 10m + description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. + expr: |- + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m labels: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - - alert: AlertmanagerMembersInconsistent + - alert: AlertmanagerConfigInconsistent annotations: - message: Alertmanager has not found all other members of the cluster. + description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. expr: |- - alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) + ) + != 1 + for: 20m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerClusterDown + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: |- + ( + count by (namespace,service) ( + avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. + expr: |- + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} + ) + ) + >= 0.5 for: 5m labels: severity: critical diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml index 80771f4..e41175f 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -26,7 +26,9 @@ spec: rules: - alert: TargetDown annotations: - message: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' + description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-targetdown + summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 for: 10m labels: @@ -36,7 +38,7 @@ spec: {{- end }} - alert: Watchdog annotations: - message: 'This is an alert meant to ensure that the entire alerting pipeline is functional. + description: 'This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager @@ -47,6 +49,8 @@ spec: "DeadMansSnitch" integration in PagerDuty. ' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-watchdog + summary: An alert that should always be firing to certify that Alertmanager is working properly. expr: vector(1) labels: severity: none diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml index 011a4a7..8f3314b 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -26,11 +26,11 @@ spec: rules: - expr: |- sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - expr: |- container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, @@ -77,6 +77,28 @@ spec: ) ) record: namespace_cpu:kube_pod_container_resource_requests:sum + - expr: |- + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_limits:sum + - expr: |- + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_limits:sum - expr: |- max by (cluster, namespace, workload, pod) ( label_replace( diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml index 7b00b54..8dda6b7 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -29,56 +29,56 @@ spec: 1 - ( ( # write too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) ) + ( # read too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) - ( ( - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) or vector(0) ) + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) ) ) + # errors - sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) ) / - sum(code:apiserver_request_total:increase30d) + sum by (cluster) (code:apiserver_request_total:increase30d) labels: verb: all record: apiserver_request:availability30d - expr: |- 1 - ( - sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) - ( # too slow ( - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) or vector(0) ) + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) ) + # errors - sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) ) / - sum(code:apiserver_request_total:increase30d{verb="read"}) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) labels: verb: read record: apiserver_request:availability30d @@ -86,74 +86,74 @@ spec: 1 - ( ( # too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) ) + # errors - sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) ) / - sum(code:apiserver_request_total:increase30d{verb="write"}) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) labels: verb: write record: apiserver_request:availability30d - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 record: code_verb:apiserver_request_total:increase30d - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: verb: read record: code:apiserver_request_total:increase30d - - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: verb: write record: code:apiserver_request_total:increase30d diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml index 0f44ccc..22ede05 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml index eddc1e4..4bb373d 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -28,26 +28,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) labels: verb: read record: apiserver_request:burnrate1d @@ -55,26 +55,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) labels: verb: read record: apiserver_request:burnrate1h @@ -82,26 +82,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) labels: verb: read record: apiserver_request:burnrate2h @@ -109,26 +109,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) labels: verb: read record: apiserver_request:burnrate30m @@ -136,26 +136,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) labels: verb: read record: apiserver_request:burnrate3d @@ -163,26 +163,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read record: apiserver_request:burnrate5m @@ -190,26 +190,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) labels: verb: read record: apiserver_request:burnrate6h @@ -217,15 +217,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) labels: verb: write record: apiserver_request:burnrate1d @@ -233,15 +233,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) labels: verb: write record: apiserver_request:burnrate1h @@ -249,15 +249,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) labels: verb: write record: apiserver_request:burnrate2h @@ -265,15 +265,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) labels: verb: write record: apiserver_request:burnrate30m @@ -281,15 +281,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) labels: verb: write record: apiserver_request:burnrate3d @@ -297,15 +297,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: apiserver_request:burnrate5m @@ -313,32 +313,32 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: verb: write record: apiserver_request:burnrate6h - - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read record: code_resource:apiserver_request_total:rate5m - - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: code_resource:apiserver_request_total:rate5m - - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 labels: quantile: '0.99' verb: read record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 labels: quantile: '0.99' verb: write diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml index e54bee5..1b95d21 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml index 27271f1..2d38ef8 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml index 24df268..4e44196 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml index 0fa5032..f7e0c43 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-state-metrics-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -55,5 +55,33 @@ spec: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardingmismatch + summary: kube-state-metrics sharding is misconfigured. + expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + for: 15m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardsmissing + summary: kube-state-metrics shards are missing. + expr: |- + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 + - + sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) + != 0 + for: 15m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} {{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml index 8712b9f..a073214 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml index 77bb40a..94c16cd 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -30,7 +30,10 @@ spec: description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes. runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping summary: Pod is crash looping. - expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) * 60 * 5 > 0 + expr: |- + increase(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) > 0 + and + sum without (phase) (kube_pod_status_phase{phase!="Running",job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} == 1) for: 15m labels: severity: warning @@ -261,19 +264,19 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch summary: HPA has not matched descired number of replicas. expr: |- - (kube_hpa_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} != - kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) and - (kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > - kube_hpa_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) and - (kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} < - kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) and - changes(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 for: 15m labels: severity: warning @@ -286,9 +289,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout summary: HPA is running at max replicas expr: |- - kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} == - kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} for: 15m labels: severity: warning diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml index 27babbd..7a2ecbc 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml index 527e6e3..3e06766 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml index c3110cf..96945b8 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml index 3d1ace1..07ff37a 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml index 5671b1c..1ea531f 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml index 098f6fb..47376e3 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml index ea2f258..657e5c2 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml index ddb7376..f734e8d 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/node-exporter-prometheusRule.yaml +Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -33,9 +33,9 @@ spec: record: instance:node_num_cpu:sum - expr: |- 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) + rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) ) - record: instance:node_cpu_utilisation:rate1m + record: instance:node_cpu_utilisation:rate5m - expr: |- ( node_load1{job="node-exporter"} @@ -50,30 +50,30 @@ spec: node_memory_MemTotal_bytes{job="node-exporter"} ) record: instance:node_memory_utilisation:ratio - - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) - record: instance:node_vmstat_pgmajfault:rate1m - - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_seconds:rate1m - - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate1m + - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m + - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m - expr: |- sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_receive_bytes_excluding_lo:rate1m + record: instance:node_network_receive_bytes_excluding_lo:rate5m - expr: |- sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_transmit_bytes_excluding_lo:rate1m + record: instance:node_network_transmit_bytes_excluding_lo:rate5m - expr: |- sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_receive_drop_excluding_lo:rate1m + record: instance:node_network_receive_drop_excluding_lo:rate5m - expr: |- sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_transmit_drop_excluding_lo:rate1m + record: instance:node_network_transmit_drop_excluding_lo:rate5m {{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml index 763c4a2..468e1db 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/node-exporter-prometheusRule.yaml +Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -27,6 +27,7 @@ spec: - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 24 hours. expr: |- ( @@ -45,6 +46,7 @@ spec: - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 4 hours. expr: |- ( @@ -63,6 +65,7 @@ spec: - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace summary: Filesystem has less than 5% space left. expr: |- ( @@ -79,6 +82,7 @@ spec: - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace summary: Filesystem has less than 3% space left. expr: |- ( @@ -95,6 +99,7 @@ spec: - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 24 hours. expr: |- ( @@ -113,6 +118,7 @@ spec: - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 4 hours. expr: |- ( @@ -131,6 +137,7 @@ spec: - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles summary: Filesystem has less than 5% inodes left. expr: |- ( @@ -147,6 +154,7 @@ spec: - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles summary: Filesystem has less than 3% inodes left. expr: |- ( @@ -163,6 +171,7 @@ spec: - alert: NodeNetworkReceiveErrs annotations: description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs summary: Network interface is reporting many receive errors. expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 for: 1h @@ -174,6 +183,7 @@ spec: - alert: NodeNetworkTransmitErrs annotations: description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs summary: Network interface is reporting many transmit errors. expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 for: 1h @@ -185,6 +195,7 @@ spec: - alert: NodeHighNumberConntrackEntriesUsed annotations: description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused summary: Number of conntrack are getting close to the limit. expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 labels: @@ -195,6 +206,7 @@ spec: - alert: NodeTextFileCollectorScrapeError annotations: description: Node Exporter text file collector failed to scrape. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodetextfilecollectorscrapeerror summary: Node Exporter text file collector failed to scrape. expr: node_textfile_scrape_error{job="node-exporter"} == 1 labels: @@ -204,7 +216,8 @@ spec: {{- end }} - alert: NodeClockSkewDetected annotations: - message: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected summary: Clock skew detected. expr: |- ( @@ -226,7 +239,8 @@ spec: {{- end }} - alert: NodeClockNotSynchronising annotations: - message: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host. + description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising summary: Clock not synchronising. expr: |- min_over_time(node_timex_sync_status[5m]) == 0 @@ -241,6 +255,7 @@ spec: - alert: NodeRAIDDegraded annotations: description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddegraded summary: RAID Array is degraded expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 for: 15m @@ -252,8 +267,9 @@ spec: - alert: NodeRAIDDiskFailure annotations: description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddiskfailure summary: Failed device in RAID array - expr: node_md_disks{state="fail"} > 0 + expr: node_md_disks{state="failed"} > 0 labels: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml index 9a6955a..8964334 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -26,7 +26,8 @@ spec: rules: - alert: NodeNetworkInterfaceFlapping annotations: - message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}" + message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkinterfaceflapping expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m labels: diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml index f24c555..c6dd9e1 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml index d1c1f65..d3010bc 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-operator-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml index c9c805e..52cc0d8 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -29,6 +29,7 @@ spec: - alert: PrometheusBadConfig annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusbadconfig summary: Failed Prometheus configuration reload. expr: |- # Without max_over_time, failed scrapes could create false negatives, see @@ -43,6 +44,7 @@ spec: - alert: PrometheusNotificationQueueRunningFull annotations: description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotificationqueuerunningfull summary: Prometheus alert notification queue predicted to run full in less than 30m. expr: |- # Without min_over_time, failed scrapes could create false negatives, see @@ -61,6 +63,7 @@ spec: - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstosomealertmanagers summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. expr: |- ( @@ -75,28 +78,11 @@ spec: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: |- - min without(alertmanager) ( - rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - / - rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - alert: PrometheusNotConnectedToAlertmanagers annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotconnectedtoalertmanagers summary: Prometheus is not connected to any Alertmanagers. expr: |- # Without max_over_time, failed scrapes could create false negatives, see @@ -111,6 +97,7 @@ spec: - alert: PrometheusTSDBReloadsFailing annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbreloadsfailing summary: Prometheus has issues reloading blocks from disk. expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 for: 4h @@ -122,6 +109,7 @@ spec: - alert: PrometheusTSDBCompactionsFailing annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbcompactionsfailing summary: Prometheus has issues compacting blocks. expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 for: 4h @@ -133,8 +121,18 @@ spec: - alert: PrometheusNotIngestingSamples annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotingestingsamples summary: Prometheus is not ingesting samples. - expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 + expr: |- + ( + rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0 + ) + ) for: 10m labels: severity: warning @@ -144,6 +142,7 @@ spec: - alert: PrometheusDuplicateTimestamps annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusduplicatetimestamps summary: Prometheus is dropping samples with duplicate timestamps. expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 10m @@ -155,6 +154,7 @@ spec: - alert: PrometheusOutOfOrderTimestamps annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoutofordertimestamps summary: Prometheus drops samples with out-of-order timestamps. expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 10m @@ -166,15 +166,16 @@ spec: - alert: PrometheusRemoteStorageFailures annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}} + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotestoragefailures summary: Prometheus fails to send samples to remote storage. expr: |- ( - rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) / ( - rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) + - rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + (rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) ) ) * 100 @@ -188,13 +189,14 @@ spec: - alert: PrometheusRemoteWriteBehind annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritebehind summary: Prometheus remote write is behind. expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - - on(job, instance) group_right + - ignoring(remote_name, url) group_right max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) ) > 120 @@ -207,6 +209,7 @@ spec: - alert: PrometheusRemoteWriteDesiredShards annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritedesiredshards summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. expr: |- # Without max_over_time, failed scrapes could create false negatives, see @@ -225,6 +228,7 @@ spec: - alert: PrometheusRuleFailures annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusrulefailures summary: Prometheus is failing rule evaluations. expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m @@ -236,6 +240,7 @@ spec: - alert: PrometheusMissingRuleEvaluations annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusmissingruleevaluations summary: Prometheus is missing rule evaluations due to slow rule group evaluation. expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m @@ -247,6 +252,7 @@ spec: - alert: PrometheusTargetLimitHit annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustargetlimithit summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m @@ -254,5 +260,36 @@ spec: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuslabellimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. + expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 + for: 15m + labels: + severity: warning +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstoanyalertmanager + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: |- + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m]) + / + rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} {{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/alertmanager.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/alertmanager.rules.yaml deleted file mode 100644 index 7115984..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/alertmanager.rules.yaml +++ /dev/null @@ -1,63 +0,0 @@ -{{- /* -Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }} -{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }} -{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - annotations: - message: The configuration of the instances of the Alertmanager cluster `{{`{{`}}$labels.service{{`}}`}}` are out of sync. - expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}) by (name, job, namespace, controller), "service", "$1", "name", "(.*)") != 1 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerFailedReload - annotations: - message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. - expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerMembersInconsistent - annotations: - message: Alertmanager has not found all other members of the cluster. - expr: |- - alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/etcd.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/etcd.yaml deleted file mode 100644 index ce4e87b..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/etcd.yaml +++ /dev/null @@ -1,179 +0,0 @@ -{{- /* -Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/etcd3_alert.rules.yml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: etcd - rules: - - alert: etcdInsufficientMembers - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' - expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) - for: 3m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdNoLeader - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.' - expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 - for: 1m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfLeaderChanges - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.' - expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedGRPCRequests - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) - / - sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) - > 1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedGRPCRequests - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) - / - sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) - > 5 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdGRPCRequestsSlow - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le)) - > 0.15 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdMemberCommunicationSlow - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.15 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedProposals - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighFsyncDurations - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.5 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighCommitDurations - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.25 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedHTTPRequests - annotations: - message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}' - expr: |- - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - BY (method) > 0.01 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedHTTPRequests - annotations: - message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - BY (method) > 0.05 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHTTPRequestsSlow - annotations: - message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow. - expr: |- - histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/general.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/general.rules.yaml deleted file mode 100644 index cde6feb..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/general.rules.yaml +++ /dev/null @@ -1,56 +0,0 @@ -{{- /* -Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: general.rules - rules: - - alert: TargetDown - annotations: - message: '{{`{{`}} $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}} targets are down.' - expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: Watchdog - annotations: - message: 'This is an alert meant to ensure that the entire alerting pipeline is functional. - - This alert is always firing, therefore it should always be firing in Alertmanager - - and always fire against a receiver. There are integrations with various notification - - mechanisms that send a notification when this alert is not firing. For example the - - "DeadMansSnitch" integration in PagerDuty. - - ' - expr: vector(1) - labels: - severity: none -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/k8s.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/k8s.rules.yaml deleted file mode 100644 index 08aa7fe..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/k8s.rules.yaml +++ /dev/null @@ -1,83 +0,0 @@ -{{- /* -Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: k8s.rules - rules: - - expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace) - record: namespace:container_cpu_usage_seconds_total:sum_rate - - expr: sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace) - record: namespace:container_memory_usage_bytes:sum - - expr: |- - sum by (namespace, pod_name, container_name) ( - rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m]) - ) - record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate - - expr: |- - sum by(namespace) ( - kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} - * on (endpoint, instance, job, namespace, pod, service) - group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1) - ) - record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum - - expr: |- - sum by (namespace) ( - kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} - * on (endpoint, instance, job, namespace, pod, service) - group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1) - ) - record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum - - expr: |- - sum( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) by (namespace, workload, pod) - labels: - workload_type: deployment - record: mixin_pod_workload - - expr: |- - sum( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) by (namespace, workload, pod) - labels: - workload_type: daemonset - record: mixin_pod_workload - - expr: |- - sum( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) by (namespace, workload, pod) - labels: - workload_type: statefulset - record: mixin_pod_workload -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-apiserver.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-apiserver.rules.yaml deleted file mode 100644 index e3a9296..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-apiserver.rules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -{{- /* -Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-apiserver.rules - rules: - - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.99' - record: cluster_quantile:apiserver_request_latencies:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.9' - record: cluster_quantile:apiserver_request_latencies:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.5' - record: cluster_quantile:apiserver_request_latencies:histogram_quantile -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml deleted file mode 100644 index a8d5400..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml +++ /dev/null @@ -1,47 +0,0 @@ -{{- /* -Generated from 'kube-prometheus-node-alerting.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeAlerting }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-alerting.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-prometheus-node-alerting.rules - rules: - - alert: NodeDiskRunningFull - annotations: - message: Device {{`{{`}} $labels.device {{`}}`}} of node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} will be full within the next 24 hours. - expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)' - for: 30m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeDiskRunningFull - annotations: - message: Device {{`{{`}} $labels.device {{`}}`}} of node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} will be full within the next 2 hours. - expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)' - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml deleted file mode 100644 index 87f072f..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml +++ /dev/null @@ -1,41 +0,0 @@ -{{- /* -Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance) - record: instance:node_cpu:rate:sum - - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance) - record: instance:node_filesystem_usage:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - record: cluster:node_cpu:ratio -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-scheduler.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-scheduler.rules.yaml deleted file mode 100644 index 46c8d1d..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kube-scheduler.rules.yaml +++ /dev/null @@ -1,63 +0,0 @@ -{{- /* -Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-scheduler.rules - rules: - - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_binding_latency:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_binding_latency:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_binding_latency:histogram_quantile -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-absent.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-absent.yaml deleted file mode 100644 index 5c1ebce..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-absent.yaml +++ /dev/null @@ -1,159 +0,0 @@ -{{- /* -Generated from 'kubernetes-absent' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesAbsent }} -{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }} -{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }} -{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-absent" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-absent - rules: -{{- if .Values.alertmanager.enabled }} - - alert: AlertmanagerDown - annotations: - message: Alertmanager has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerdown - expr: absent(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeDns.enabled }} - - alert: CoreDNSDown - annotations: - message: CoreDNS has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-corednsdown - expr: absent(up{job="kube-dns"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeApiServer.enabled }} - - alert: KubeAPIDown - annotations: - message: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown - expr: absent(up{job="apiserver"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeControllerManager.enabled }} - - alert: KubeControllerManagerDown - annotations: - message: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown - expr: absent(up{job="kube-controller-manager"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeScheduler.enabled }} - - alert: KubeSchedulerDown - annotations: - message: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown - expr: absent(up{job="kube-scheduler"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeStateMetrics.enabled }} - - alert: KubeStateMetricsDown - annotations: - message: KubeStateMetrics has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsdown - expr: absent(up{job="kube-state-metrics"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.prometheusOperator.kubeletService.enabled }} - - alert: KubeletDown - annotations: - message: Kubelet has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown - expr: absent(up{job="kubelet"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.nodeExporter.enabled }} - - alert: NodeExporterDown - annotations: - message: NodeExporter has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeexporterdown - expr: absent(up{job="node-exporter"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} - - alert: PrometheusDown - annotations: - message: Prometheus has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusdown - expr: absent(up{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- if .Values.prometheusOperator.enabled }} - - alert: PrometheusOperatorDown - annotations: - message: PrometheusOperator has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatordown - expr: absent(up{job="{{ $operatorJob }}",namespace="{{ $namespace }}"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-apps.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-apps.yaml deleted file mode 100644 index e7a41ca..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-apps.yaml +++ /dev/null @@ -1,200 +0,0 @@ -{{- /* -Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }} -{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 5 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping - expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) * 60 * 5 > 0 - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubePodNotReady - annotations: - message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than an hour. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready - expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown"}) > 0 - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDeploymentGenerationMismatch - annotations: - message: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch - expr: |- - kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDeploymentReplicasMismatch - annotations: - message: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than an hour. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch - expr: |- - kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStatefulSetReplicasMismatch - annotations: - message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch - expr: |- - kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStatefulSetGenerationMismatch - annotations: - message: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch - expr: |- - kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout - expr: |- - max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDaemonSetRolloutStuck - annotations: - message: Only {{`{{`}} $value {{`}}`}}% of the desired Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are scheduled and ready. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck - expr: |- - kube_daemonset_status_number_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - / - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} * 100 < 100 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDaemonSetNotScheduled - annotations: - message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled - expr: |- - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDaemonSetMisScheduled - annotations: - message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled - expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeCronJobRunning - annotations: - message: CronJob {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.cronjob {{`}}`}} is taking more than 1h to complete. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecronjobrunning - expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 3600 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeJobCompletion - annotations: - message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than one hour to complete. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion - expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeJobFailed - annotations: - message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed - expr: kube_job_status_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-resources.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-resources.yaml deleted file mode 100644 index b34b442..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-resources.yaml +++ /dev/null @@ -1,121 +0,0 @@ -{{- /* -Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit - expr: |- - sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) - / - sum(node:node_num_cpu:sum) - > - (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeMemOvercommit - annotations: - message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememovercommit - expr: |- - sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) - / - sum(node_memory_MemTotal_bytes) - > - (count(node:node_num_cpu:sum)-1) - / - count(node:node_num_cpu:sum) - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeCPUOvercommit - annotations: - message: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit - expr: |- - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) - / - sum(node:node_num_cpu:sum) - > 1.5 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeMemOvercommit - annotations: - message: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememovercommit - expr: |- - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) - / - sum(node_memory_MemTotal_bytes{job="node-exporter"}) - > 1.5 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeQuotaExceeded - annotations: - message: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} printf "%0.0f" $value {{`}}`}}% of its {{`{{`}} $labels.resource {{`}}`}} quota. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded - expr: |- - 100 * kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 90 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: CPUThrottlingHigh - annotations: - message: '{{`{{`}} printf "%0.0f" $value {{`}}`}}% throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container_name {{`}}`}} in pod {{`{{`}} $labels.pod_name {{`}}`}}.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh - expr: |- - 100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace) - > 25 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-storage.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-storage.yaml deleted file mode 100644 index 6469fff..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-storage.yaml +++ /dev/null @@ -1,72 +0,0 @@ -{{- /* -Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }} -{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-storage - rules: - - alert: KubePersistentVolumeUsageCritical - annotations: - message: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} printf "%0.2f" $value {{`}}`}}% free. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeusagecritical - expr: |- - 100 * kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"} - < 3 - for: 1m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubePersistentVolumeFullInFourDays - annotations: - message: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} printf "%0.2f" $value {{`}}`}}% is available. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefullinfourdays - expr: |- - 100 * ( - kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"} - ) < 15 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}[6h], 4 * 24 * 3600) < 0 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubePersistentVolumeErrors - annotations: - message: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors - expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-system.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-system.yaml deleted file mode 100644 index da23205..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-system.yaml +++ /dev/null @@ -1,184 +0,0 @@ -{{- /* -Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-system - rules: - - alert: KubeNodeNotReady - annotations: - message: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than an hour.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready - expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeVersionMismatch - annotations: - message: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch - expr: count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientErrors - annotations: - message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}}% errors.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors - expr: |- - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) - / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) - * 100 > 1 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientErrors - annotations: - message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}} errors / second. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors - expr: sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletTooManyPods - annotations: - message: Kubelet {{`{{`}} $labels.instance {{`}}`}} is running {{`{{`}} $value {{`}}`}} Pods, close to the limit of 110. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods - expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPILatencyHigh - annotations: - message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh - expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPILatencyHigh - annotations: - message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh - expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh - expr: |- - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) - / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh - expr: |- - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) - / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh - expr: |- - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) - / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh - expr: |- - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) - / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientCertificateExpiration - annotations: - message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientCertificateExpiration - annotations: - message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/node-network.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/node-network.yaml deleted file mode 100644 index c75f1ae..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/node-network.yaml +++ /dev/null @@ -1,57 +0,0 @@ -{{- /* -Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node-network - rules: - - alert: NetworkReceiveErrors - annotations: - message: Network interface "{{`{{`}} $labels.device {{`}}`}}" showing receive errors on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}" - expr: rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0 - for: 2m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NetworkTransmitErrors - annotations: - message: Network interface "{{`{{`}} $labels.device {{`}}`}}" showing transmit errors on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}" - expr: rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0 - for: 2m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeNetworkInterfaceFlapping - annotations: - message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}" - expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/node-time.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/node-time.yaml deleted file mode 100644 index b7a2fc9..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/node-time.yaml +++ /dev/null @@ -1,37 +0,0 @@ -{{- /* -Generated from 'node-time' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.time }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-time" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node-time - rules: - - alert: ClockSkewDetected - annotations: - message: Clock skew detected on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}. Ensure NTP is configured correctly on this host. - expr: abs(node_timex_offset_seconds{job="node-exporter"}) > 0.03 - for: 2m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/node.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/node.rules.yaml deleted file mode 100644 index 2bc7af3..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/node.rules.yaml +++ /dev/null @@ -1,202 +0,0 @@ -{{- /* -Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node.rules - rules: - - expr: sum(min(kube_pod_info) by (node)) - record: ':kube_pod_info_node_count:' - - expr: max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) - record: 'node_namespace_pod:kube_pod_info:' - - expr: |- - count by (node) (sum by (node, cpu) ( - node_cpu_seconds_total{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - )) - record: node:node_num_cpu:sum - - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) - record: :node_cpu_utilisation:avg1m - - expr: |- - 1 - avg by (node) ( - rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info:) - record: node:node_cpu_utilisation:avg1m - - expr: |- - node:node_cpu_utilisation:avg1m - * - node:node_num_cpu:sum - / - scalar(sum(node:node_num_cpu:sum)) - record: node:cluster_cpu_utilisation:ratio - - expr: |- - sum(node_load1{job="node-exporter"}) - / - sum(node:node_num_cpu:sum) - record: ':node_cpu_saturation_load1:' - - expr: |- - sum by (node) ( - node_load1{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - node:node_num_cpu:sum - record: 'node:node_cpu_saturation_load1:' - - expr: |- - 1 - - sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) - / - sum(node_memory_MemTotal_bytes{job="node-exporter"}) - record: ':node_memory_utilisation:' - - expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) - record: :node_memory_MemFreeCachedBuffers_bytes:sum - - expr: sum(node_memory_MemTotal_bytes{job="node-exporter"}) - record: :node_memory_MemTotal_bytes:sum - - expr: |- - sum by (node) ( - (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_available:sum - - expr: |- - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_total:sum - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - node:node_memory_bytes_total:sum - record: node:node_memory_utilisation:ratio - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - scalar(sum(node:node_memory_bytes_total:sum)) - record: node:cluster_memory_utilisation:ratio - - expr: |- - 1e3 * sum( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - ) - record: :node_memory_swap_io_bytes:sum_rate - - expr: |- - 1 - - sum by (node) ( - (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: 'node:node_memory_utilisation:' - - expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) - record: 'node:node_memory_utilisation_2:' - - expr: |- - 1e3 * sum by (node) ( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_swap_io_bytes:sum_rate - - expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_utilisation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_utilisation:avg_irate - - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_saturation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_saturation:avg_irate - - expr: |- - max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - record: 'node:node_filesystem_usage:' - - expr: max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - record: 'node:node_filesystem_avail:' - - expr: |- - sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) + - sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_utilisation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_utilisation:sum_irate - - expr: |- - sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) + - sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_saturation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_saturation:sum_irate - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: 'node:node_inodes_total:' - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: 'node:node_inodes_free:' -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus-operator.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus-operator.yaml deleted file mode 100644 index a8a8915..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus-operator.yaml +++ /dev/null @@ -1,49 +0,0 @@ -{{- /* -Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }} -{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: prometheus-operator - rules: - - alert: PrometheusOperatorReconcileErrors - annotations: - message: Errors while reconciling {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} Namespace. - expr: rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusOperatorNodeLookupErrors - annotations: - message: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace. - expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus.rules.yaml deleted file mode 100644 index 0480c83..0000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus.rules.yaml +++ /dev/null @@ -1,139 +0,0 @@ -{{- /* -Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }} -{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: prometheus.rules - rules: - - alert: PrometheusConfigReloadFailed - annotations: - description: Reloading Prometheus' configuration has failed for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} - summary: Reloading Prometheus' configuration failed - expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusNotificationQueueRunningFull - annotations: - description: Prometheus' alert notification queue is running full for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} - summary: Prometheus' alert notification queue is running full - expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusErrorSendingAlerts - annotations: - description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.Alertmanager{{`}}`}} - summary: Errors while sending alert from Prometheus - expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.01 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusErrorSendingAlerts - annotations: - description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.Alertmanager{{`}}`}} - summary: Errors while sending alerts from Prometheus - expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.03 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusNotConnectedToAlertmanagers - annotations: - description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} is not connected to any Alertmanagers - summary: Prometheus is not connected to any Alertmanagers - expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} < 1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTSDBReloadsFailing - annotations: - description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} reload failures over the last four hours.' - summary: Prometheus has issues reloading data blocks from disk - expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0 - for: 12h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTSDBCompactionsFailing - annotations: - description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last four hours.' - summary: Prometheus has issues compacting sample blocks - expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0 - for: 12h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTSDBWALCorruptions - annotations: - description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} has a corrupted write-ahead log (WAL).' - summary: Prometheus write-ahead log is corrupted - expr: prometheus_tsdb_wal_corruptions_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} > 0 - for: 4h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusNotIngestingSamples - annotations: - description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} isn't ingesting samples. - summary: Prometheus isn't ingesting samples - expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTargetScrapesDuplicate - annotations: - description: '{{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has many samples rejected due to duplicate timestamps but different values' - summary: Prometheus has many samples rejected - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/values.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/values.yaml index 109820a..c3a9ffd 100644 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/values.yaml +++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/values.yaml @@ -396,7 +396,7 @@ alertmanager: ## image: repository: quay.io/prometheus/alertmanager - tag: v0.22.0 + tag: v0.22.2 sha: "" ## If true then the user will be responsible to provide a secret with alertmanager configuration @@ -613,6 +613,14 @@ grafana: enabled: true namespaceOverride: "" + ## ForceDeployDatasources Create datasource configmap even if grafana deployment has been disabled + ## + forceDeployDatasources: false + + ## ForceDeployDashboard Create dashboard configmap even if grafana deployment has been disabled + ## + forceDeployDashboards: false + ## Deploy default dashboards. ## defaultDashboardsEnabled: true @@ -665,6 +673,10 @@ grafana: enabled: true defaultDatasourceEnabled: true + ## URL of prometheus datasource + ## + # url: http://prometheus-stack-prometheus:9090/ + # If not defined, will use prometheus.prometheusSpec.scrapeInterval or its default # defaultDatasourceScrapeInterval: 15s @@ -742,19 +754,6 @@ kubeApiServer: tlsConfig: serverName: kubernetes insecureSkipVerify: false - - ## If your API endpoint address is not reachable (as in AKS) you can replace it with the kubernetes service - ## - relabelings: [] - # - sourceLabels: - # - __meta_kubernetes_namespace - # - __meta_kubernetes_service_name - # - __meta_kubernetes_endpoint_port_name - # action: keep - # regex: default;kubernetes;https - # - targetLabel: __address__ - # replacement: kubernetes.default.svc:443 - serviceMonitor: ## Scrape interval. If not set, the Prometheus default scrape interval is used. ## @@ -775,6 +774,15 @@ kubeApiServer: # - action: keep # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' # sourceLabels: [__name__] + relabelings: [] + # - sourceLabels: + # - __meta_kubernetes_namespace + # - __meta_kubernetes_service_name + # - __meta_kubernetes_endpoint_port_name + # action: keep + # regex: default;kubernetes;https + # - targetLabel: __address__ + # replacement: kubernetes.default.svc:443 ## Component scraping the kubelet and kubelet-hosted cAdvisor ## @@ -1250,6 +1258,10 @@ kubeStateMetrics: # replacement: $1 # action: replace + # Enable self metrics configuration for Service Monitor + selfMonitor: + enabled: false + ## Configuration for kube-state-metrics subchart ## kube-state-metrics: @@ -1353,6 +1365,16 @@ prometheusOperator: nodeSelector: {} affinity: {} tolerations: [] + + ## SecurityContext holds pod-level security attributes and common container settings. + ## This defaults to non root user with uid 2000 and gid 2000. *v1.PodSecurityContext false + ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ + ## + securityContext: + runAsGroup: 2000 + runAsNonRoot: true + runAsUser: 2000 + # Use certmanager to generate webhook certs certManager: enabled: false @@ -1540,7 +1562,7 @@ prometheusOperator: ## image: repository: quay.io/prometheus-operator/prometheus-operator - tag: v0.48.0 + tag: v0.48.1 sha: "" pullPolicy: IfNotPresent @@ -1556,7 +1578,7 @@ prometheusOperator: ## prometheusConfigReloaderImage: repository: quay.io/prometheus-operator/prometheus-config-reloader - tag: v0.48.0 + tag: v0.48.1 sha: "" ## Set the prometheus config reloader side-car CPU limit @@ -1567,6 +1589,13 @@ prometheusOperator: ## configReloaderMemory: 50Mi + ## Thanos side-car image when configured + ## + thanosImage: + repository: quay.io/thanos/thanos + tag: v0.17.2 + sha: "" + ## Set a Field Selector to filter watched secrets ## secretFieldSelector: "" diff --git a/charts/kubezero-metrics/update.sh b/charts/kubezero-metrics/update.sh index 9f3d1cc..47c86c5 100755 --- a/charts/kubezero-metrics/update.sh +++ b/charts/kubezero-metrics/update.sh @@ -1,6 +1,6 @@ #!/bin/bash -VERSION=16.1.2 +VERSION=16.12.0 rm -rf charts/kube-prometheus-stack curl -L -s -o - https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-${VERSION}/kube-prometheus-stack-${VERSION}.tgz | tar xfz - -C charts