feat: kube-prometheus-stack version bump, adjust filesystem alerts
This commit is contained in:
parent
0d2452c18f
commit
bb53c8cf35
@ -2,7 +2,7 @@ apiVersion: v2
|
||||
name: kubezero-metrics
|
||||
description: KubeZero Umbrella Chart for prometheus-operator
|
||||
type: application
|
||||
version: 0.3.4
|
||||
version: 0.3.5
|
||||
home: https://kubezero.com
|
||||
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
|
||||
keywords:
|
||||
@ -16,7 +16,7 @@ dependencies:
|
||||
version: ">= 0.1.3"
|
||||
repository: https://zero-down-time.github.io/kubezero/
|
||||
- name: kube-prometheus-stack
|
||||
version: 14.3.0
|
||||
version: 14.9.0
|
||||
# Switch back to upstream once all alerts are fixed, e.g. etcd gRPC
|
||||
# repository: https://prometheus-community.github.io/helm-charts
|
||||
- name: prometheus-adapter
|
||||
|
66
charts/kubezero-metrics/adjust_alarms.patch
Normal file
66
charts/kubezero-metrics/adjust_alarms.patch
Normal file
@ -0,0 +1,66 @@
|
||||
diff -turN charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/etcd.yaml
|
||||
--- charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml 2021-04-14 22:13:29.000000000 +0200
|
||||
+++ charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/etcd.yaml 2021-04-15 14:43:03.074281889 +0200
|
||||
@@ -54,34 +54,6 @@
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- - alert: etcdHighNumberOfFailedGRPCRequests
|
||||
- annotations:
|
||||
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
- expr: |-
|
||||
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
- /
|
||||
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
- > 1
|
||||
- for: 10m
|
||||
- labels:
|
||||
- severity: warning
|
||||
-{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
-{{- end }}
|
||||
- - alert: etcdHighNumberOfFailedGRPCRequests
|
||||
- annotations:
|
||||
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
- expr: |-
|
||||
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
- /
|
||||
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
- > 5
|
||||
- for: 5m
|
||||
- labels:
|
||||
- severity: critical
|
||||
-{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
-{{- end }}
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
diff -turN charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/node-exporter.yaml
|
||||
--- charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml 2021-04-14 22:13:29.000000000 +0200
|
||||
+++ charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/node-exporter.yaml 2021-04-15 14:49:41.614282790 +0200
|
||||
@@ -30,7 +30,7 @@
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
|
||||
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 25
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
@@ -48,7 +48,7 @@
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
|
||||
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
@@ -259,4 +259,4 @@
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
-{{- end }}
|
||||
\ No newline at end of file
|
||||
+{{- end }}
|
@ -15,11 +15,11 @@ dependencies:
|
||||
- condition: nodeExporter.enabled
|
||||
name: prometheus-node-exporter
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
version: 1.16.*
|
||||
version: 1.17.*
|
||||
- condition: grafana.enabled
|
||||
name: grafana
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: 6.6.*
|
||||
version: 6.7.*
|
||||
description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator.
|
||||
home: https://github.com/prometheus-operator/kube-prometheus
|
||||
icon: https://raw.githubusercontent.com/prometheus/prometheus.github.io/master/assets/prometheus_logo-cb55bb5c346.png
|
||||
@ -44,4 +44,4 @@ sources:
|
||||
- https://github.com/prometheus-community/helm-charts
|
||||
- https://github.com/prometheus-operator/kube-prometheus
|
||||
type: application
|
||||
version: 14.3.0
|
||||
version: 14.9.0
|
||||
|
@ -1,5 +1,5 @@
|
||||
apiVersion: v2
|
||||
appVersion: 7.4.3
|
||||
appVersion: 7.5.3
|
||||
description: The leading tool for querying and visualizing time series and metrics.
|
||||
home: https://grafana.net
|
||||
icon: https://raw.githubusercontent.com/grafana/grafana/master/public/img/logo_transparent_400x.png
|
||||
@ -19,4 +19,4 @@ name: grafana
|
||||
sources:
|
||||
- https://github.com/grafana/grafana
|
||||
type: application
|
||||
version: 6.6.3
|
||||
version: 6.7.4
|
||||
|
@ -59,8 +59,8 @@ This version requires Helm >= 3.1.0.
|
||||
| `securityContext` | Deployment securityContext | `{"runAsUser": 472, "runAsGroup": 472, "fsGroup": 472}` |
|
||||
| `priorityClassName` | Name of Priority Class to assign pods | `nil` |
|
||||
| `image.repository` | Image repository | `grafana/grafana` |
|
||||
| `image.tag` | Image tag (`Must be >= 5.0.0`) | `7.4.3` |
|
||||
| `image.sha` | Image sha (optional) | `16dc29783ec7d4a23fa19207507586344c6797023604347eb3e8ea5ae431e181` |
|
||||
| `image.tag` | Image tag (`Must be >= 5.0.0`) | `7.4.5` |
|
||||
| `image.sha` | Image sha (optional) | `2b56f6106ddc376bb46d974230d530754bf65a640dfbc5245191d72d3b49efc6` |
|
||||
| `image.pullPolicy` | Image pull policy | `IfNotPresent` |
|
||||
| `image.pullSecrets` | Image pull secrets | `{}` |
|
||||
| `service.type` | Kubernetes service type | `ClusterIP` |
|
||||
@ -242,6 +242,9 @@ ingress:
|
||||
|
||||
### Example of extraVolumeMounts
|
||||
|
||||
A volume can be of type persistentVolumeClaim or hostPath, but not both at the same time.
|
||||
If neither the existingClaim nor the hostPath argument is given, the type is emptyDir.
|
||||
|
||||
```yaml
|
||||
- extraVolumeMounts:
|
||||
- name: plugins
|
||||
@ -249,6 +252,10 @@ ingress:
|
||||
subPath: configs/grafana/plugins
|
||||
existingClaim: existing-grafana-claim
|
||||
readOnly: false
|
||||
- name: dashboards
|
||||
mountPath: /var/lib/grafana/dashboards
|
||||
hostPath: /usr/shared/grafana/dashboards
|
||||
readOnly: false
|
||||
```
|
||||
|
||||
## Import dashboards
|
||||
|
@ -479,8 +479,15 @@ volumes:
|
||||
{{- end }}
|
||||
{{- range .Values.extraVolumeMounts }}
|
||||
- name: {{ .name }}
|
||||
{{- if .existingClaim }}
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ .existingClaim }}
|
||||
{{- else if .hostPath }}
|
||||
hostPath:
|
||||
path: {{ .hostPath }}
|
||||
{{- else }}
|
||||
emptyDir: {}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- range .Values.extraEmptyDirMounts }}
|
||||
- name: {{ .name }}
|
||||
|
@ -17,19 +17,8 @@ spec:
|
||||
privileged: false
|
||||
allowPrivilegeEscalation: false
|
||||
requiredDropCapabilities:
|
||||
# Default set from Docker, without DAC_OVERRIDE or CHOWN
|
||||
- FOWNER
|
||||
- FSETID
|
||||
- KILL
|
||||
- SETGID
|
||||
- SETUID
|
||||
- SETPCAP
|
||||
- NET_BIND_SERVICE
|
||||
- NET_RAW
|
||||
- SYS_CHROOT
|
||||
- MKNOD
|
||||
- AUDIT_WRITE
|
||||
- SETFCAP
|
||||
# Default set from Docker, with DAC_OVERRIDE and CHOWN
|
||||
- ALL
|
||||
volumes:
|
||||
- 'configMap'
|
||||
- 'emptyDir'
|
||||
@ -42,12 +31,20 @@ spec:
|
||||
hostIPC: false
|
||||
hostPID: false
|
||||
runAsUser:
|
||||
rule: 'RunAsAny'
|
||||
rule: 'MustRunAsNonRoot'
|
||||
seLinux:
|
||||
rule: 'RunAsAny'
|
||||
supplementalGroups:
|
||||
rule: 'RunAsAny'
|
||||
rule: 'MustRunAs'
|
||||
ranges:
|
||||
# Forbid adding the root group.
|
||||
- min: 1
|
||||
max: 65535
|
||||
fsGroup:
|
||||
rule: 'RunAsAny'
|
||||
rule: 'MustRunAs'
|
||||
ranges:
|
||||
# Forbid adding the root group.
|
||||
- min: 1
|
||||
max: 65535
|
||||
readOnlyRootFilesystem: false
|
||||
{{- end }}
|
||||
|
@ -53,7 +53,7 @@ livenessProbe:
|
||||
|
||||
image:
|
||||
repository: grafana/grafana
|
||||
tag: 7.4.3
|
||||
tag: 7.5.3
|
||||
sha: ""
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
@ -401,10 +401,14 @@ extraSecretMounts: []
|
||||
## Additional grafana server volume mounts
|
||||
# Defines additional volume mounts.
|
||||
extraVolumeMounts: []
|
||||
# - name: extra-volume
|
||||
# mountPath: /mnt/volume
|
||||
# - name: extra-volume-0
|
||||
# mountPath: /mnt/volume0
|
||||
# readOnly: true
|
||||
# existingClaim: volume-claim
|
||||
# - name: extra-volume-1
|
||||
# mountPath: /mnt/volume1
|
||||
# readOnly: true
|
||||
# hostPath: /usr/shared/
|
||||
|
||||
## Pass the plugins you want installed as a list.
|
||||
##
|
||||
|
@ -15,4 +15,4 @@ maintainers:
|
||||
name: kube-state-metrics
|
||||
sources:
|
||||
- https://github.com/kubernetes/kube-state-metrics/
|
||||
version: 2.13.0
|
||||
version: 2.13.2
|
||||
|
@ -1,7 +1,7 @@
|
||||
{{- if and (eq .Values.rbac.create true) (eq .Values.rbac.useClusterRole false) -}}
|
||||
{{- range (split "," $.Values.namespace) }}
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1beta1
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
labels:
|
||||
|
@ -14,4 +14,4 @@ maintainers:
|
||||
name: prometheus-node-exporter
|
||||
sources:
|
||||
- https://github.com/prometheus/node_exporter/
|
||||
version: 1.16.2
|
||||
version: 1.17.0
|
||||
|
@ -28,6 +28,10 @@ spec:
|
||||
{{- end }}
|
||||
{{- if .Values.priorityClassName }}
|
||||
priorityClassName: {{ .Values.priorityClassName }}
|
||||
{{- end }}
|
||||
{{- if .Values.extraInitContainers }}
|
||||
initContainers:
|
||||
{{ toYaml .Values.extraInitContainers | nindent 6 }}
|
||||
{{- end }}
|
||||
containers:
|
||||
- name: node-exporter
|
||||
|
@ -168,3 +168,7 @@ sidecarVolumeMount: []
|
||||
## - name: collector-textfiles
|
||||
## mountPath: /run/prometheus
|
||||
## readOnly: false
|
||||
|
||||
## Additional InitContainers to initialize the pod
|
||||
##
|
||||
extraInitContainers: []
|
||||
|
@ -7,6 +7,10 @@ metadata:
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}-alertmanager
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.alertmanager.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.alertmanager.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
{{- if .Values.alertmanager.alertmanagerSpec.image }}
|
||||
image: {{ .Values.alertmanager.alertmanagerSpec.image.repository }}:{{ .Values.alertmanager.alertmanagerSpec.image.tag }}
|
||||
|
@ -21,6 +21,9 @@ spec:
|
||||
{{- if .Values.alertmanager.serviceMonitor.interval }}
|
||||
interval: {{ .Values.alertmanager.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
{{- if .Values.alertmanager.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.alertmanager.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
{{- if .Values.alertmanager.serviceMonitor.scheme }}
|
||||
scheme: {{ .Values.alertmanager.serviceMonitor.scheme }}
|
||||
{{- end }}
|
||||
|
@ -21,6 +21,9 @@ spec:
|
||||
{{- if .Values.coreDns.serviceMonitor.interval}}
|
||||
interval: {{ .Values.coreDns.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
{{- if .Values.coreDns.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.coreDns.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
{{- if .Values.coreDns.serviceMonitor.metricRelabelings }}
|
||||
metricRelabelings:
|
||||
|
@ -13,6 +13,9 @@ spec:
|
||||
{{- if .Values.kubeApiServer.serviceMonitor.interval }}
|
||||
interval: {{ .Values.kubeApiServer.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeApiServer.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.kubeApiServer.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
port: https
|
||||
scheme: https
|
||||
{{- if .Values.kubeApiServer.serviceMonitor.metricRelabelings }}
|
||||
|
@ -1,4 +1,4 @@
|
||||
{{- if .Values.kubeControllerManager.enabled }}
|
||||
{{- if and .Values.kubeControllerManager.enabled .Values.kubeControllerManager.service.enabled }}
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
|
@ -1,4 +1,4 @@
|
||||
{{- if .Values.kubeControllerManager.enabled }}
|
||||
{{- if and .Values.kubeControllerManager.enabled .Values.kubeControllerManager.serviceMonitor.enabled }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
@ -22,6 +22,9 @@ spec:
|
||||
interval: {{ .Values.kubeControllerManager.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
{{- if .Values.kubeControllerManager.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.kubeControllerManager.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeControllerManager.serviceMonitor.https }}
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
|
@ -22,6 +22,9 @@ spec:
|
||||
interval: {{ .Values.kubeDns.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
{{- if .Values.kubeDns.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.kubeDns.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeDns.serviceMonitor.dnsmasqMetricRelabelings }}
|
||||
metricRelabelings:
|
||||
{{ tpl (toYaml .Values.kubeDns.serviceMonitor.dnsmasqMetricRelabelings | indent 4) . }}
|
||||
|
@ -1,4 +1,4 @@
|
||||
{{- if .Values.kubeEtcd.enabled }}
|
||||
{{- if and .Values.kubeEtcd.enabled .Values.kubeEtcd.service.enabled }}
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
|
@ -1,4 +1,4 @@
|
||||
{{- if .Values.kubeEtcd.enabled }}
|
||||
{{- if and .Values.kubeEtcd.enabled .Values.kubeEtcd.serviceMonitor.enabled }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
@ -22,6 +22,9 @@ spec:
|
||||
interval: {{ .Values.kubeEtcd.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
{{- if .Values.kubeEtcd.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.kubeEtcd.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
{{- if eq .Values.kubeEtcd.serviceMonitor.scheme "https" }}
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
|
@ -1,4 +1,4 @@
|
||||
{{- if .Values.kubeProxy.enabled }}
|
||||
{{- if and .Values.kubeProxy.enabled .Values.kubeProxy.service.enabled }}
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
|
@ -1,4 +1,4 @@
|
||||
{{- if .Values.kubeProxy.enabled }}
|
||||
{{- if and .Values.kubeProxy.enabled .Values.kubeProxy.serviceMonitor.enabled }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
@ -22,6 +22,9 @@ spec:
|
||||
interval: {{ .Values.kubeProxy.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
{{- if .Values.kubeProxy.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.kubeProxy.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeProxy.serviceMonitor.https }}
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
|
@ -1,4 +1,4 @@
|
||||
{{- if .Values.kubeScheduler.enabled }}
|
||||
{{- if and .Values.kubeScheduler.enabled .Values.kubeScheduler.service.enabled }}
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
|
@ -1,4 +1,4 @@
|
||||
{{- if .Values.kubeScheduler.enabled }}
|
||||
{{- if and .Values.kubeScheduler.enabled .Values.kubeScheduler.serviceMonitor.enabled }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
@ -22,6 +22,9 @@ spec:
|
||||
interval: {{ .Values.kubeScheduler.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
{{- if .Values.kubeScheduler.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.kubeScheduler.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeScheduler.serviceMonitor.https }}
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
|
@ -14,6 +14,9 @@ spec:
|
||||
{{- if .Values.kubeStateMetrics.serviceMonitor.interval }}
|
||||
interval: {{ .Values.kubeStateMetrics.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeStateMetrics.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.kubeStateMetrics.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
honorLabels: true
|
||||
{{- if .Values.kubeStateMetrics.serviceMonitor.metricRelabelings }}
|
||||
metricRelabelings:
|
||||
@ -22,6 +25,11 @@ spec:
|
||||
{{- if .Values.kubeStateMetrics.serviceMonitor.relabelings }}
|
||||
relabelings:
|
||||
{{ toYaml .Values.kubeStateMetrics.serviceMonitor.relabelings | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeStateMetrics.serviceMonitor.namespaceOverride }}
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- {{ .Values.kubeStateMetrics.serviceMonitor.namespaceOverride }}
|
||||
{{- end }}
|
||||
selector:
|
||||
matchLabels:
|
||||
|
@ -15,6 +15,9 @@ spec:
|
||||
{{- if .Values.kubelet.serviceMonitor.interval }}
|
||||
interval: {{ .Values.kubelet.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubelet.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
tlsConfig:
|
||||
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
insecureSkipVerify: true
|
||||
|
@ -18,6 +18,9 @@ spec:
|
||||
{{- if .Values.nodeExporter.serviceMonitor.interval }}
|
||||
interval: {{ .Values.nodeExporter.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
{{- /* Use the node-exporter's own serviceMonitor.proxyUrl value (not the kubeApiServer one), matching the other ServiceMonitor templates. */}}
{{- if .Values.nodeExporter.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.nodeExporter.serviceMonitor.proxyUrl }}
|
||||
{{- end }}
|
||||
{{- if .Values.nodeExporter.serviceMonitor.scrapeTimeout }}
|
||||
scrapeTimeout: {{ .Values.nodeExporter.serviceMonitor.scrapeTimeout }}
|
||||
{{- end }}
|
||||
|
@ -18,7 +18,7 @@ metadata:
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
spec:
|
||||
secretName: {{ template "kube-prometheus-stack.fullname" . }}-root-cert
|
||||
duration: 43800h # 5y
|
||||
duration: 43800h0m0s # 5y
|
||||
issuerRef:
|
||||
name: {{ template "kube-prometheus-stack.fullname" . }}-self-signed-issuer
|
||||
commonName: "ca.webhook.kube-prometheus-stack"
|
||||
@ -43,7 +43,7 @@ metadata:
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
spec:
|
||||
secretName: {{ template "kube-prometheus-stack.fullname" . }}-admission
|
||||
duration: 8760h # 1y
|
||||
duration: 8760h0m0s # 1y
|
||||
issuerRef:
|
||||
{{- if .Values.prometheusOperator.admissionWebhooks.certManager.issuerRef }}
|
||||
{{- toYaml .Values.prometheusOperator.admissionWebhooks.certManager.issuerRef | nindent 4 }}
|
||||
|
@ -150,14 +150,24 @@ spec:
|
||||
{{ else }}
|
||||
probeNamespaceSelector: {}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.prometheusSpec.remoteRead }}
|
||||
{{- if (or .Values.prometheus.prometheusSpec.remoteRead .Values.prometheus.prometheusSpec.additionalRemoteRead) }}
|
||||
remoteRead:
|
||||
{{- if .Values.prometheus.prometheusSpec.remoteRead }}
|
||||
{{ toYaml .Values.prometheus.prometheusSpec.remoteRead | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.prometheusSpec.remoteWrite }}
|
||||
{{- if .Values.prometheus.prometheusSpec.additionalRemoteRead }}
|
||||
{{ toYaml .Values.prometheus.prometheusSpec.additionalRemoteRead | indent 4 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if (or .Values.prometheus.prometheusSpec.remoteWrite .Values.prometheus.prometheusSpec.additionalRemoteWrite) }}
|
||||
remoteWrite:
|
||||
{{- if .Values.prometheus.prometheusSpec.remoteWrite }}
|
||||
{{ toYaml .Values.prometheus.prometheusSpec.remoteWrite | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.prometheusSpec.additionalRemoteWrite }}
|
||||
{{ toYaml .Values.prometheus.prometheusSpec.additionalRemoteWrite | indent 4 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.prometheusSpec.securityContext }}
|
||||
securityContext:
|
||||
{{ toYaml .Values.prometheus.prometheusSpec.securityContext | indent 4 }}
|
||||
|
@ -30,7 +30,7 @@ spec:
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 25
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
@ -48,7 +48,7 @@ spec:
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
|
@ -16,7 +16,7 @@ metadata:
|
||||
{{- end }}
|
||||
spec:
|
||||
type: {{ .Values.prometheus.thanosService.type }}
|
||||
clusterIP: None
|
||||
clusterIP: {{ .Values.prometheus.thanosService.clusterIP }}
|
||||
ports:
|
||||
- name: {{ .Values.prometheus.thanosService.portName }}
|
||||
port: {{ .Values.prometheus.thanosService.port }}
|
@ -117,6 +117,10 @@ alertmanager:
|
||||
##
|
||||
enabled: true
|
||||
|
||||
## Annotations for Alertmanager
|
||||
##
|
||||
annotations: {}
|
||||
|
||||
## Api that prometheus will use to communicate with alertmanager. Possible values are v1, v2
|
||||
##
|
||||
apiVersion: v2
|
||||
@ -194,7 +198,7 @@ alertmanager:
|
||||
# *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:>
|
||||
# *Runbook:* <{{ .Annotations.runbook }}|:spiral_note_pad:>
|
||||
# *Details:*
|
||||
# {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
|
||||
# {{ range .Labels.SortedPairs }} - *{{ .Name }}:* `{{ .Value }}`
|
||||
# {{ end }}
|
||||
# {{ end }}
|
||||
# {{ end }}
|
||||
@ -345,6 +349,10 @@ alertmanager:
|
||||
interval: ""
|
||||
selfMonitor: true
|
||||
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
|
||||
## scheme: HTTP scheme to use for scraping. Can be used with `tlsConfig` for example if using istio mTLS.
|
||||
scheme: ""
|
||||
|
||||
@ -375,7 +383,7 @@ alertmanager:
|
||||
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#alertmanagerspec
|
||||
##
|
||||
alertmanagerSpec:
|
||||
## Standard object’s metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
|
||||
## Standard object's metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
|
||||
## Metadata Labels and Annotations get propagated to the Alertmanager pods.
|
||||
##
|
||||
podMetadata: {}
|
||||
@ -747,6 +755,10 @@ kubeApiServer:
|
||||
## Scrape interval. If not set, the Prometheus default scrape interval is used.
|
||||
##
|
||||
interval: ""
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
|
||||
jobLabel: component
|
||||
selector:
|
||||
matchLabels:
|
||||
@ -771,6 +783,10 @@ kubelet:
|
||||
##
|
||||
interval: ""
|
||||
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
|
||||
## Enable scraping the kubelet over https. For requirements to enable this see
|
||||
## https://github.com/prometheus-operator/prometheus-operator/issues/926
|
||||
##
|
||||
@ -891,16 +907,22 @@ kubeControllerManager:
|
||||
## If using kubeControllerManager.endpoints only the port and targetPort are used
|
||||
##
|
||||
service:
|
||||
enabled: true
|
||||
port: 10252
|
||||
targetPort: 10252
|
||||
# selector:
|
||||
# component: kube-controller-manager
|
||||
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
## Scrape interval. If not set, the Prometheus default scrape interval is used.
|
||||
##
|
||||
interval: ""
|
||||
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
|
||||
## Enable scraping kube-controller-manager over https.
|
||||
## Requires proper certs (not self-signed) and delegated authentication/authorization checks
|
||||
##
|
||||
@ -943,6 +965,10 @@ coreDns:
|
||||
##
|
||||
interval: ""
|
||||
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
|
||||
## metric relabel configs to apply to samples before ingestion.
|
||||
##
|
||||
metricRelabelings: []
|
||||
@ -978,6 +1004,10 @@ kubeDns:
|
||||
##
|
||||
interval: ""
|
||||
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
|
||||
## metric relabel configs to apply to samples before ingestion.
|
||||
##
|
||||
metricRelabelings: []
|
||||
@ -1024,6 +1054,7 @@ kubeEtcd:
|
||||
## Etcd service. If using kubeEtcd.endpoints only the port and targetPort are used
|
||||
##
|
||||
service:
|
||||
enabled: true
|
||||
port: 2379
|
||||
targetPort: 2379
|
||||
# selector:
|
||||
@ -1041,9 +1072,13 @@ kubeEtcd:
|
||||
## keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client-key
|
||||
##
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
## Scrape interval. If not set, the Prometheus default scrape interval is used.
|
||||
##
|
||||
interval: ""
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
scheme: http
|
||||
insecureSkipVerify: false
|
||||
serverName: ""
|
||||
@ -1084,15 +1119,20 @@ kubeScheduler:
|
||||
## If using kubeScheduler.endpoints only the port and targetPort are used
|
||||
##
|
||||
service:
|
||||
enabled: true
|
||||
port: 10251
|
||||
targetPort: 10251
|
||||
# selector:
|
||||
# component: kube-scheduler
|
||||
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
## Scrape interval. If not set, the Prometheus default scrape interval is used.
|
||||
##
|
||||
interval: ""
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
## Enable scraping kube-scheduler over https.
|
||||
## Requires proper certs (not self-signed) and delegated authentication/authorization checks
|
||||
##
|
||||
@ -1135,16 +1175,22 @@ kubeProxy:
|
||||
# - 10.141.4.24
|
||||
|
||||
service:
|
||||
enabled: true
|
||||
port: 10249
|
||||
targetPort: 10249
|
||||
# selector:
|
||||
# k8s-app: kube-proxy
|
||||
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
## Scrape interval. If not set, the Prometheus default scrape interval is used.
|
||||
##
|
||||
interval: ""
|
||||
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
|
||||
## Enable scraping kube-proxy over https.
|
||||
## Requires proper certs (not self-signed) and delegated authentication/authorization checks
|
||||
##
|
||||
@ -1173,9 +1219,15 @@ kubeStateMetrics:
|
||||
## Scrape interval. If not set, the Prometheus default scrape interval is used.
|
||||
##
|
||||
interval: ""
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
## Override serviceMonitor selector
|
||||
##
|
||||
selectorOverride: {}
|
||||
## Override namespace selector
|
||||
##
|
||||
namespaceOverride: ""
|
||||
|
||||
## metric relabel configs to apply to samples before ingestion.
|
||||
##
|
||||
@ -1217,6 +1269,10 @@ nodeExporter:
|
||||
##
|
||||
interval: ""
|
||||
|
||||
## proxyUrl: URL of a proxy that should be used for scraping.
|
||||
##
|
||||
proxyUrl: ""
|
||||
|
||||
## How long until a scrape request times out. If not set, the Prometheus default scrape timeout is used.
|
||||
##
|
||||
scrapeTimeout: ""
|
||||
@ -1527,6 +1583,7 @@ prometheus:
|
||||
serviceAccount:
|
||||
create: true
|
||||
name: ""
|
||||
annotations: {}
|
||||
|
||||
# Service for thanos service discovery on sidecar
|
||||
# Enabling this allows Thanos Query to use
|
||||
@ -1540,6 +1597,7 @@ prometheus:
|
||||
portName: grpc
|
||||
port: 10901
|
||||
targetPort: "grpc"
|
||||
clusterIP: "None"
|
||||
|
||||
## Service type
|
||||
##
|
||||
@ -1814,7 +1872,7 @@ prometheus:
|
||||
##
|
||||
image:
|
||||
repository: quay.io/prometheus/prometheus
|
||||
tag: v2.24.0
|
||||
tag: v2.26.0
|
||||
sha: ""
|
||||
|
||||
## Tolerations for use with node taints
|
||||
@ -2035,7 +2093,7 @@ prometheus:
|
||||
##
|
||||
routePrefix: /
|
||||
|
||||
## Standard object’s metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
|
||||
## Standard object's metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
|
||||
## Metadata Labels and Annotations get propagated to the prometheus pods.
|
||||
##
|
||||
podMetadata: {}
|
||||
@ -2072,11 +2130,15 @@ prometheus:
|
||||
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#remotereadspec
|
||||
remoteRead: []
|
||||
# - url: http://remote1/read
|
||||
## additionalRemoteRead is appended to remoteRead
|
||||
additionalRemoteRead: []
|
||||
|
||||
## The remote_write spec configuration for Prometheus.
|
||||
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#remotewritespec
|
||||
remoteWrite: []
|
||||
# - url: http://remote1/push
|
||||
## additionalRemoteWrite is appended to remoteWrite
|
||||
additionalRemoteWrite: []
|
||||
|
||||
## Enable/Disable Grafana dashboards provisioning for prometheus remote write feature
|
||||
remoteWriteDashboards: false
|
||||
|
@ -1,39 +0,0 @@
|
||||
diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml
|
||||
index b430951..8358704 100644
|
||||
--- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml
|
||||
+++ b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml
|
||||
@@ -71,34 +71,6 @@ spec:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
-{{- end }}
|
||||
- - alert: etcdHighNumberOfFailedGRPCRequests
|
||||
- annotations:
|
||||
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
- expr: |-
|
||||
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
- /
|
||||
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
- > 1
|
||||
- for: 10m
|
||||
- labels:
|
||||
- severity: warning
|
||||
-{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
-{{- end }}
|
||||
- - alert: etcdHighNumberOfFailedGRPCRequests
|
||||
- annotations:
|
||||
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
- expr: |-
|
||||
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
- /
|
||||
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
- > 5
|
||||
- for: 5m
|
||||
- labels:
|
||||
- severity: critical
|
||||
-{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
@ -1,8 +1,10 @@
|
||||
#!/bin/bash
|
||||
|
||||
VERSION=14.3.0
|
||||
VERSION=14.9.0
|
||||
|
||||
rm -rf charts/kube-prometheus-stack
|
||||
curl -L -s -o - https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-${VERSION}/kube-prometheus-stack-${VERSION}.tgz | tar xfz - -C charts
|
||||
|
||||
patch -p3 -i remove_etcd_grpc_alerts.patch --no-backup-if-mismatch
|
||||
# The grpc alerts could be re-enabled with etcd 3.5
|
||||
# https://github.com/etcd-io/etcd/pull/12196
|
||||
patch -p0 -i adjust_alarms.patch --no-backup-if-mismatch
|
||||
|
Loading…
Reference in New Issue
Block a user