feat: kube-prometheus-stack version bump, adjust filesytem alerts

This commit is contained in:
Stefan Reimer 2021-04-15 15:03:52 +02:00
parent 0d2452c18f
commit bb53c8cf35
37 changed files with 259 additions and 93 deletions

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero-metrics
description: KubeZero Umbrella Chart for prometheus-operator
type: application
version: 0.3.4
version: 0.3.5
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:
@ -16,7 +16,7 @@ dependencies:
version: ">= 0.1.3"
repository: https://zero-down-time.github.io/kubezero/
- name: kube-prometheus-stack
version: 14.3.0
version: 14.9.0
# Switch back to upstream once all alerts are fixed eg. etcd gpcr
# repository: https://prometheus-community.github.io/helm-charts
- name: prometheus-adapter

View File

@ -0,0 +1,66 @@
diff -turN charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/etcd.yaml
--- charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml 2021-04-14 22:13:29.000000000 +0200
+++ charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/etcd.yaml 2021-04-15 14:43:03.074281889 +0200
@@ -54,34 +54,6 @@
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- - alert: etcdHighNumberOfFailedGRPCRequests
- annotations:
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
- expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
- /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
- > 1
- for: 10m
- labels:
- severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
- - alert: etcdHighNumberOfFailedGRPCRequests
- annotations:
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
- expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
- /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
- > 5
- for: 5m
- labels:
- severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
- alert: etcdGRPCRequestsSlow
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
diff -turN charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/node-exporter.yaml
--- charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml 2021-04-14 22:13:29.000000000 +0200
+++ charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/node-exporter.yaml 2021-04-15 14:49:41.614282790 +0200
@@ -30,7 +30,7 @@
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 25
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
@@ -48,7 +48,7 @@
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
@@ -259,4 +259,4 @@
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
-{{- end }}
\ No newline at end of file
+{{- end }}

View File

@ -15,11 +15,11 @@ dependencies:
- condition: nodeExporter.enabled
name: prometheus-node-exporter
repository: https://prometheus-community.github.io/helm-charts
version: 1.16.*
version: 1.17.*
- condition: grafana.enabled
name: grafana
repository: https://grafana.github.io/helm-charts
version: 6.6.*
version: 6.7.*
description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator.
home: https://github.com/prometheus-operator/kube-prometheus
icon: https://raw.githubusercontent.com/prometheus/prometheus.github.io/master/assets/prometheus_logo-cb55bb5c346.png
@ -44,4 +44,4 @@ sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
type: application
version: 14.3.0
version: 14.9.0

View File

@ -1,5 +1,5 @@
apiVersion: v2
appVersion: 7.4.3
appVersion: 7.5.3
description: The leading tool for querying and visualizing time series and metrics.
home: https://grafana.net
icon: https://raw.githubusercontent.com/grafana/grafana/master/public/img/logo_transparent_400x.png
@ -19,4 +19,4 @@ name: grafana
sources:
- https://github.com/grafana/grafana
type: application
version: 6.6.3
version: 6.7.4

View File

@ -59,8 +59,8 @@ This version requires Helm >= 3.1.0.
| `securityContext` | Deployment securityContext | `{"runAsUser": 472, "runAsGroup": 472, "fsGroup": 472}` |
| `priorityClassName` | Name of Priority Class to assign pods | `nil` |
| `image.repository` | Image repository | `grafana/grafana` |
| `image.tag` | Image tag (`Must be >= 5.0.0`) | `7.4.3` |
| `image.sha` | Image sha (optional) | `16dc29783ec7d4a23fa19207507586344c6797023604347eb3e8ea5ae431e181` |
| `image.tag` | Image tag (`Must be >= 5.0.0`) | `7.4.5` |
| `image.sha` | Image sha (optional) | `2b56f6106ddc376bb46d974230d530754bf65a640dfbc5245191d72d3b49efc6` |
| `image.pullPolicy` | Image pull policy | `IfNotPresent` |
| `image.pullSecrets` | Image pull secrets | `{}` |
| `service.type` | Kubernetes service type | `ClusterIP` |
@ -242,6 +242,9 @@ ingress:
### Example of extraVolumeMounts
Volume can be type persistentVolumeClaim or hostPath but not both at same time.
If none existingClaim or hostPath argument is givent then type is emptyDir.
```yaml
- extraVolumeMounts:
- name: plugins
@ -249,6 +252,10 @@ ingress:
subPath: configs/grafana/plugins
existingClaim: existing-grafana-claim
readOnly: false
- name: dashboards
mountPath: /var/lib/grafana/dashboards
hostPath: /usr/shared/grafana/dashboards
readOnly: false
```
## Import dashboards

View File

@ -479,8 +479,15 @@ volumes:
{{- end }}
{{- range .Values.extraVolumeMounts }}
- name: {{ .name }}
{{- if .existingClaim }}
persistentVolumeClaim:
claimName: {{ .existingClaim }}
{{- else if .hostPath }}
hostPath:
path: {{ .hostPath }}
{{- else }}
emptyDir: {}
{{- end }}
{{- end }}
{{- range .Values.extraEmptyDirMounts }}
- name: {{ .name }}

View File

@ -17,19 +17,8 @@ spec:
privileged: false
allowPrivilegeEscalation: false
requiredDropCapabilities:
# Default set from Docker, without DAC_OVERRIDE or CHOWN
- FOWNER
- FSETID
- KILL
- SETGID
- SETUID
- SETPCAP
- NET_BIND_SERVICE
- NET_RAW
- SYS_CHROOT
- MKNOD
- AUDIT_WRITE
- SETFCAP
# Default set from Docker, with DAC_OVERRIDE and CHOWN
- ALL
volumes:
- 'configMap'
- 'emptyDir'
@ -42,12 +31,20 @@ spec:
hostIPC: false
hostPID: false
runAsUser:
rule: 'RunAsAny'
rule: 'MustRunAsNonRoot'
seLinux:
rule: 'RunAsAny'
supplementalGroups:
rule: 'RunAsAny'
rule: 'MustRunAs'
ranges:
# Forbid adding the root group.
- min: 1
max: 65535
fsGroup:
rule: 'RunAsAny'
rule: 'MustRunAs'
ranges:
# Forbid adding the root group.
- min: 1
max: 65535
readOnlyRootFilesystem: false
{{- end }}

View File

@ -53,7 +53,7 @@ livenessProbe:
image:
repository: grafana/grafana
tag: 7.4.3
tag: 7.5.3
sha: ""
pullPolicy: IfNotPresent
@ -401,10 +401,14 @@ extraSecretMounts: []
## Additional grafana server volume mounts
# Defines additional volume mounts.
extraVolumeMounts: []
# - name: extra-volume
# mountPath: /mnt/volume
# - name: extra-volume-0
# mountPath: /mnt/volume0
# readOnly: true
# existingClaim: volume-claim
# - name: extra-volume-1
# mountPath: /mnt/volume1
# readOnly: true
# hostPath: /usr/shared/
## Pass the plugins you want installed as a list.
##

View File

@ -15,4 +15,4 @@ maintainers:
name: kube-state-metrics
sources:
- https://github.com/kubernetes/kube-state-metrics/
version: 2.13.0
version: 2.13.2

View File

@ -1,7 +1,7 @@
{{- if and (eq .Values.rbac.create true) (eq .Values.rbac.useClusterRole false) -}}
{{- range (split "," $.Values.namespace) }}
---
apiVersion: rbac.authorization.k8s.io/v1beta1
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:

View File

@ -14,4 +14,4 @@ maintainers:
name: prometheus-node-exporter
sources:
- https://github.com/prometheus/node_exporter/
version: 1.16.2
version: 1.17.0

View File

@ -28,6 +28,10 @@ spec:
{{- end }}
{{- if .Values.priorityClassName }}
priorityClassName: {{ .Values.priorityClassName }}
{{- end }}
{{- if .Values.extraInitContainers }}
initContainers:
{{ toYaml .Values.extraInitContainers | nindent 6 }}
{{- end }}
containers:
- name: node-exporter

View File

@ -168,3 +168,7 @@ sidecarVolumeMount: []
## - name: collector-textfiles
## mountPath: /run/prometheus
## readOnly: false
## Additional InitContainers to initialize the pod
##
extraInitContainers: []

View File

@ -7,6 +7,10 @@ metadata:
labels:
app: {{ template "kube-prometheus-stack.name" . }}-alertmanager
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.alertmanager.annotations }}
annotations:
{{ toYaml .Values.alertmanager.annotations | indent 4 }}
{{- end }}
spec:
{{- if .Values.alertmanager.alertmanagerSpec.image }}
image: {{ .Values.alertmanager.alertmanagerSpec.image.repository }}:{{ .Values.alertmanager.alertmanagerSpec.image.tag }}

View File

@ -21,6 +21,9 @@ spec:
{{- if .Values.alertmanager.serviceMonitor.interval }}
interval: {{ .Values.alertmanager.serviceMonitor.interval }}
{{- end }}
{{- if .Values.alertmanager.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.alertmanager.serviceMonitor.proxyUrl}}
{{- end }}
{{- if .Values.alertmanager.serviceMonitor.scheme }}
scheme: {{ .Values.alertmanager.serviceMonitor.scheme }}
{{- end }}

View File

@ -21,6 +21,9 @@ spec:
{{- if .Values.coreDns.serviceMonitor.interval}}
interval: {{ .Values.coreDns.serviceMonitor.interval }}
{{- end }}
{{- if .Values.coreDns.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.coreDns.serviceMonitor.proxyUrl}}
{{- end }}
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
{{- if .Values.coreDns.serviceMonitor.metricRelabelings }}
metricRelabelings:

View File

@ -13,6 +13,9 @@ spec:
{{- if .Values.kubeApiServer.serviceMonitor.interval }}
interval: {{ .Values.kubeApiServer.serviceMonitor.interval }}
{{- end }}
{{- if .Values.kubeApiServer.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeApiServer.serviceMonitor.proxyUrl}}
{{- end }}
port: https
scheme: https
{{- if .Values.kubeApiServer.serviceMonitor.metricRelabelings }}

View File

@ -1,4 +1,4 @@
{{- if .Values.kubeControllerManager.enabled }}
{{- if and .Values.kubeControllerManager.enabled .Values.kubeControllerManager.service.enabled }}
apiVersion: v1
kind: Service
metadata:

View File

@ -1,4 +1,4 @@
{{- if .Values.kubeControllerManager.enabled }}
{{- if and .Values.kubeControllerManager.enabled .Values.kubeControllerManager.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
@ -22,6 +22,9 @@ spec:
interval: {{ .Values.kubeControllerManager.serviceMonitor.interval }}
{{- end }}
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
{{- if .Values.kubeControllerManager.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeControllerManager.serviceMonitor.proxyUrl}}
{{- end }}
{{- if .Values.kubeControllerManager.serviceMonitor.https }}
scheme: https
tlsConfig:

View File

@ -22,6 +22,9 @@ spec:
interval: {{ .Values.kubeDns.serviceMonitor.interval }}
{{- end }}
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
{{- if .Values.kubeDns.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeDns.serviceMonitor.proxyUrl}}
{{- end }}
{{- if .Values.kubeDns.serviceMonitor.dnsmasqMetricRelabelings }}
metricRelabelings:
{{ tpl (toYaml .Values.kubeDns.serviceMonitor.dnsmasqMetricRelabelings | indent 4) . }}

View File

@ -1,4 +1,4 @@
{{- if .Values.kubeEtcd.enabled }}
{{- if and .Values.kubeEtcd.enabled .Values.kubeEtcd.service.enabled }}
apiVersion: v1
kind: Service
metadata:

View File

@ -1,4 +1,4 @@
{{- if .Values.kubeEtcd.enabled }}
{{- if and .Values.kubeEtcd.enabled .Values.kubeEtcd.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
@ -22,6 +22,9 @@ spec:
interval: {{ .Values.kubeEtcd.serviceMonitor.interval }}
{{- end }}
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
{{- if .Values.kubeEtcd.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeEtcd.serviceMonitor.proxyUrl}}
{{- end }}
{{- if eq .Values.kubeEtcd.serviceMonitor.scheme "https" }}
scheme: https
tlsConfig:

View File

@ -1,4 +1,4 @@
{{- if .Values.kubeProxy.enabled }}
{{- if and .Values.kubeProxy.enabled .Values.kubeProxy.service.enabled }}
apiVersion: v1
kind: Service
metadata:

View File

@ -1,4 +1,4 @@
{{- if .Values.kubeProxy.enabled }}
{{- if and .Values.kubeProxy.enabled .Values.kubeProxy.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
@ -22,6 +22,9 @@ spec:
interval: {{ .Values.kubeProxy.serviceMonitor.interval }}
{{- end }}
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
{{- if .Values.kubeProxy.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeProxy.serviceMonitor.proxyUrl}}
{{- end }}
{{- if .Values.kubeProxy.serviceMonitor.https }}
scheme: https
tlsConfig:

View File

@ -1,4 +1,4 @@
{{- if .Values.kubeScheduler.enabled }}
{{- if and .Values.kubeScheduler.enabled .Values.kubeScheduler.service.enabled }}
apiVersion: v1
kind: Service
metadata:

View File

@ -1,4 +1,4 @@
{{- if .Values.kubeScheduler.enabled }}
{{- if and .Values.kubeScheduler.enabled .Values.kubeScheduler.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
@ -22,6 +22,9 @@ spec:
interval: {{ .Values.kubeScheduler.serviceMonitor.interval }}
{{- end }}
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
{{- if .Values.kubeScheduler.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeScheduler.serviceMonitor.proxyUrl}}
{{- end }}
{{- if .Values.kubeScheduler.serviceMonitor.https }}
scheme: https
tlsConfig:

View File

@ -14,6 +14,9 @@ spec:
{{- if .Values.kubeStateMetrics.serviceMonitor.interval }}
interval: {{ .Values.kubeStateMetrics.serviceMonitor.interval }}
{{- end }}
{{- if .Values.kubeStateMetrics.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeStateMetrics.serviceMonitor.proxyUrl}}
{{- end }}
honorLabels: true
{{- if .Values.kubeStateMetrics.serviceMonitor.metricRelabelings }}
metricRelabelings:
@ -22,6 +25,11 @@ spec:
{{- if .Values.kubeStateMetrics.serviceMonitor.relabelings }}
relabelings:
{{ toYaml .Values.kubeStateMetrics.serviceMonitor.relabelings | indent 4 }}
{{- end }}
{{- if .Values.kubeStateMetrics.serviceMonitor.namespaceOverride }}
namespaceSelector:
matchNames:
- {{ .Values.kubeStateMetrics.serviceMonitor.namespaceOverride }}
{{- end }}
selector:
matchLabels:

View File

@ -15,6 +15,9 @@ spec:
{{- if .Values.kubelet.serviceMonitor.interval }}
interval: {{ .Values.kubelet.serviceMonitor.interval }}
{{- end }}
{{- if .Values.kubelet.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl}}
{{- end }}
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true

View File

@ -18,6 +18,9 @@ spec:
{{- if .Values.nodeExporter.serviceMonitor.interval }}
interval: {{ .Values.nodeExporter.serviceMonitor.interval }}
{{- end }}
{{- if .Values.kubeApiServer.serviceMonitor.proxyUrl }}
proxyUrl: {{ .Values.kubeApiServer.serviceMonitor.proxyUrl}}
{{- end }}
{{- if .Values.nodeExporter.serviceMonitor.scrapeTimeout }}
scrapeTimeout: {{ .Values.nodeExporter.serviceMonitor.scrapeTimeout }}
{{- end }}

View File

@ -18,7 +18,7 @@ metadata:
namespace: {{ template "kube-prometheus-stack.namespace" . }}
spec:
secretName: {{ template "kube-prometheus-stack.fullname" . }}-root-cert
duration: 43800h # 5y
duration: 43800h0m0s # 5y
issuerRef:
name: {{ template "kube-prometheus-stack.fullname" . }}-self-signed-issuer
commonName: "ca.webhook.kube-prometheus-stack"
@ -43,7 +43,7 @@ metadata:
namespace: {{ template "kube-prometheus-stack.namespace" . }}
spec:
secretName: {{ template "kube-prometheus-stack.fullname" . }}-admission
duration: 8760h # 1y
duration: 8760h0m0s # 1y
issuerRef:
{{- if .Values.prometheusOperator.admissionWebhooks.certManager.issuerRef }}
{{- toYaml .Values.prometheusOperator.admissionWebhooks.certManager.issuerRef | nindent 4 }}

View File

@ -150,14 +150,24 @@ spec:
{{ else }}
probeNamespaceSelector: {}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.remoteRead }}
{{- if (or .Values.prometheus.prometheusSpec.remoteRead .Values.prometheus.prometheusSpec.additionalRemoteRead) }}
remoteRead:
{{- if .Values.prometheus.prometheusSpec.remoteRead }}
{{ toYaml .Values.prometheus.prometheusSpec.remoteRead | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.remoteWrite }}
{{- if .Values.prometheus.prometheusSpec.additionalRemoteRead }}
{{ toYaml .Values.prometheus.prometheusSpec.additionalRemoteRead | indent 4 }}
{{- end }}
{{- end }}
{{- if (or .Values.prometheus.prometheusSpec.remoteWrite .Values.prometheus.prometheusSpec.additionalRemoteWrite) }}
remoteWrite:
{{- if .Values.prometheus.prometheusSpec.remoteWrite }}
{{ toYaml .Values.prometheus.prometheusSpec.remoteWrite | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.additionalRemoteWrite }}
{{ toYaml .Values.prometheus.prometheusSpec.additionalRemoteWrite | indent 4 }}
{{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.securityContext }}
securityContext:
{{ toYaml .Values.prometheus.prometheusSpec.securityContext | indent 4 }}

View File

@ -30,7 +30,7 @@ spec:
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 25
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
@ -48,7 +48,7 @@ spec:
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and

View File

@ -16,7 +16,7 @@ metadata:
{{- end }}
spec:
type: {{ .Values.prometheus.thanosService.type }}
clusterIP: None
clusterIP: {{ .Values.prometheus.thanosService.clusterIP }}
ports:
- name: {{ .Values.prometheus.thanosService.portName }}
port: {{ .Values.prometheus.thanosService.port }}

View File

@ -117,6 +117,10 @@ alertmanager:
##
enabled: true
## Annotations for Alertmanager
##
annotations: {}
## Api that prometheus will use to communicate with alertmanager. Possible values are v1, v2
##
apiVersion: v2
@ -194,7 +198,7 @@ alertmanager:
# *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:>
# *Runbook:* <{{ .Annotations.runbook }}|:spiral_note_pad:>
# *Details:*
# {{ range .Labels.SortedPairs }} *{{ .Name }}:* `{{ .Value }}`
# {{ range .Labels.SortedPairs }} - *{{ .Name }}:* `{{ .Value }}`
# {{ end }}
# {{ end }}
# {{ end }}
@ -345,6 +349,10 @@ alertmanager:
interval: ""
selfMonitor: true
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
## scheme: HTTP scheme to use for scraping. Can be used with `tlsConfig` for example if using istio mTLS.
scheme: ""
@ -375,7 +383,7 @@ alertmanager:
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#alertmanagerspec
##
alertmanagerSpec:
## Standard objects metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
## Standard object's metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
## Metadata Labels and Annotations gets propagated to the Alertmanager pods.
##
podMetadata: {}
@ -747,6 +755,10 @@ kubeApiServer:
## Scrape interval. If not set, the Prometheus default scrape interval is used.
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
jobLabel: component
selector:
matchLabels:
@ -771,6 +783,10 @@ kubelet:
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
## Enable scraping the kubelet over https. For requirements to enable this see
## https://github.com/prometheus-operator/prometheus-operator/issues/926
##
@ -891,16 +907,22 @@ kubeControllerManager:
## If using kubeControllerManager.endpoints only the port and targetPort are used
##
service:
enabled: true
port: 10252
targetPort: 10252
# selector:
# component: kube-controller-manager
serviceMonitor:
enabled: true
## Scrape interval. If not set, the Prometheus default scrape interval is used.
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
## Enable scraping kube-controller-manager over https.
## Requires proper certs (not self-signed) and delegated authentication/authorization checks
##
@ -943,6 +965,10 @@ coreDns:
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
## metric relabel configs to apply to samples before ingestion.
##
metricRelabelings: []
@ -978,6 +1004,10 @@ kubeDns:
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
## metric relabel configs to apply to samples before ingestion.
##
metricRelabelings: []
@ -1024,6 +1054,7 @@ kubeEtcd:
## Etcd service. If using kubeEtcd.endpoints only the port and targetPort are used
##
service:
enabled: true
port: 2379
targetPort: 2379
# selector:
@ -1041,9 +1072,13 @@ kubeEtcd:
## keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client-key
##
serviceMonitor:
enabled: true
## Scrape interval. If not set, the Prometheus default scrape interval is used.
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
scheme: http
insecureSkipVerify: false
serverName: ""
@ -1084,15 +1119,20 @@ kubeScheduler:
## If using kubeScheduler.endpoints only the port and targetPort are used
##
service:
enabled: true
port: 10251
targetPort: 10251
# selector:
# component: kube-scheduler
serviceMonitor:
enabled: true
## Scrape interval. If not set, the Prometheus default scrape interval is used.
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
## Enable scraping kube-scheduler over https.
## Requires proper certs (not self-signed) and delegated authentication/authorization checks
##
@ -1135,16 +1175,22 @@ kubeProxy:
# - 10.141.4.24
service:
enabled: true
port: 10249
targetPort: 10249
# selector:
# k8s-app: kube-proxy
serviceMonitor:
enabled: true
## Scrape interval. If not set, the Prometheus default scrape interval is used.
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
## Enable scraping kube-proxy over https.
## Requires proper certs (not self-signed) and delegated authentication/authorization checks
##
@ -1173,9 +1219,15 @@ kubeStateMetrics:
## Scrape interval. If not set, the Prometheus default scrape interval is used.
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
## Override serviceMonitor selector
##
selectorOverride: {}
## Override namespace selector
##
namespaceOverride: ""
## metric relabel configs to apply to samples before ingestion.
##
@ -1217,6 +1269,10 @@ nodeExporter:
##
interval: ""
## proxyUrl: URL of a proxy that should be used for scraping.
##
proxyUrl: ""
## How long until a scrape request times out. If not set, the Prometheus default scape timeout is used.
##
scrapeTimeout: ""
@ -1527,6 +1583,7 @@ prometheus:
serviceAccount:
create: true
name: ""
annotations: {}
# Service for thanos service discovery on sidecar
# Enable this can make Thanos Query can use
@ -1540,6 +1597,7 @@ prometheus:
portName: grpc
port: 10901
targetPort: "grpc"
clusterIP: "None"
## Service type
##
@ -1814,7 +1872,7 @@ prometheus:
##
image:
repository: quay.io/prometheus/prometheus
tag: v2.24.0
tag: v2.26.0
sha: ""
## Tolerations for use with node taints
@ -2035,7 +2093,7 @@ prometheus:
##
routePrefix: /
## Standard objects metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
## Standard object's metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
## Metadata Labels and Annotations gets propagated to the prometheus pods.
##
podMetadata: {}
@ -2072,11 +2130,15 @@ prometheus:
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#remotereadspec
remoteRead: []
# - url: http://remote1/read
## additionalRemoteRead is appended to remoteRead
additionalRemoteRead: []
## The remote_write spec configuration for Prometheus.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#remotewritespec
remoteWrite: []
# - url: http://remote1/push
## additionalRemoteWrite is appended to remoteWrite
additionalRemoteWrite: []
## Enable/Disable Grafana dashboards provisioning for prometheus remote write feature
remoteWriteDashboards: false

View File

@ -1,39 +0,0 @@
diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml
index b430951..8358704 100644
--- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml
+++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml
@@ -71,34 +71,6 @@ spec:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
- - alert: etcdHighNumberOfFailedGRPCRequests
- annotations:
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
- expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
- /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
- > 1
- for: 10m
- labels:
- severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
- - alert: etcdHighNumberOfFailedGRPCRequests
- annotations:
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
- expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
- /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
- > 5
- for: 5m
- labels:
- severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdGRPCRequestsSlow
annotations:

View File

@ -1,8 +1,10 @@
#!/bin/bash
VERSION=14.3.0
VERSION=14.9.0
rm -rf charts/kube-prometheus-stack
curl -L -s -o - https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-${VERSION}/kube-prometheus-stack-${VERSION}.tgz | tar xfz - -C charts
patch -p3 -i remove_etcd_grpc_alerts.patch --no-backup-if-mismatch
# The grpc alerts could be re-enabled with etcd 3.5
# https://github.com/etcd-io/etcd/pull/12196
patch -p0 -i adjust_alarms.patch --no-backup-if-mismatch