feat(metrics): Use jsonnet prometheus-operator rules directly
This commit is contained in:
parent
d99565a457
commit
148cd40ca0
2
charts/kubezero-metrics/.gitignore
vendored
2
charts/kubezero-metrics/.gitignore
vendored
@ -1,2 +0,0 @@
|
||||
dashboards/vendor
|
||||
dashboards/jb-linux-amd64
|
@ -26,4 +26,4 @@ README.md.gotmpl
|
||||
*.patch
|
||||
*.sh
|
||||
*.py
|
||||
dashboards
|
||||
configs
|
||||
|
@ -2,7 +2,7 @@ apiVersion: v2
|
||||
name: kubezero-metrics
|
||||
description: KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
|
||||
type: application
|
||||
version: 0.4.8
|
||||
version: 0.5.1
|
||||
home: https://kubezero.com
|
||||
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
|
||||
keywords:
|
||||
|
@ -1,8 +1,8 @@
|
||||
# kubezero-metrics
|
||||
|
||||
![Version: 0.4.1](https://img.shields.io/badge/Version-0.4.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
|
||||
![Version: 0.5.1](https://img.shields.io/badge/Version-0.5.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
|
||||
|
||||
KubeZero Umbrella Chart for prometheus-operator
|
||||
KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
|
||||
|
||||
**Homepage:** <https://kubezero.com>
|
||||
|
||||
@ -18,9 +18,10 @@ Kubernetes: `>= 1.18.0`
|
||||
|
||||
| Repository | Name | Version |
|
||||
|------------|------|---------|
|
||||
| | kube-prometheus-stack | 16.12.0 |
|
||||
| https://prometheus-community.github.io/helm-charts | prometheus-adapter | 2.14.2 |
|
||||
| https://zero-down-time.github.io/kubezero/ | kubezero-lib | >= 0.1.3 |
|
||||
| | kube-prometheus-stack | 18.1.0 |
|
||||
| | prometheus-pushgateway | 1.10.1 |
|
||||
| https://prometheus-community.github.io/helm-charts | prometheus-adapter | 2.17 |
|
||||
| https://zero-down-time.github.io/kubezero/ | kubezero-lib | >= 0.1.4 |
|
||||
|
||||
## Values
|
||||
|
||||
@ -41,10 +42,42 @@ Kubernetes: `>= 1.18.0`
|
||||
| istio.prometheus.gateway | string | `"istio-ingress/ingressgateway"` | |
|
||||
| istio.prometheus.ipBlocks | list | `[]` | |
|
||||
| istio.prometheus.url | string | `""` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[0].name | string | `"SNS_FORWARDER_ARN_PREFIX"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[0].valueFrom.fieldRef.fieldPath | string | `"metadata.annotations['kubezero.com/sns_forwarder_ARN_PREFIX']"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[1].name | string | `"AWS_ROLE_ARN"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[1].valueFrom.fieldRef.fieldPath | string | `"metadata.annotations['kubezero.com/sns_forwarder_AWS_ROLE_ARN']"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[2].name | string | `"AWS_WEB_IDENTITY_TOKEN_FILE"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[2].value | string | `"/var/run/secrets/sts.amazonaws.com/serviceaccount/token"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[3].name | string | `"AWS_STS_REGIONAL_ENDPOINTS"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[3].value | string | `"regional"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].image | string | `"datareply/alertmanager-sns-forwarder:latest"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].imagePullPolicy | string | `"Always"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].livenessProbe.httpGet.path | string | `"/health"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].livenessProbe.httpGet.port | string | `"webhook-port"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].livenessProbe.initialDelaySeconds | int | `30` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].livenessProbe.timeoutSeconds | int | `10` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].name | string | `"alertmanager-sns-forwarder"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].ports[0].containerPort | int | `9087` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].ports[0].name | string | `"webhook-port"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].readinessProbe.httpGet.path | string | `"/health"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].readinessProbe.httpGet.port | string | `"webhook-port"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].readinessProbe.initialDelaySeconds | int | `10` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].readinessProbe.timeoutSeconds | int | `10` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].resources.limits.cpu | string | `"100m"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].resources.limits.memory | string | `"64Mi"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].resources.requests.cpu | string | `"25m"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].resources.requests.memory | string | `"32Mi"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].volumeMounts[0].mountPath | string | `"/var/run/secrets/sts.amazonaws.com/serviceaccount/"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].volumeMounts[0].name | string | `"aws-token"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].volumeMounts[0].readOnly | bool | `true` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.logFormat | string | `"json"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.volumes[0].name | string | `"aws-token"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.volumes[0].projected.sources[0].serviceAccountToken.audience | string | `"sts.amazonaws.com"` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.volumes[0].projected.sources[0].serviceAccountToken.expirationSeconds | int | `86400` | |
|
||||
| kube-prometheus-stack.alertmanager.alertmanagerSpec.volumes[0].projected.sources[0].serviceAccountToken.path | string | `"token"` | |
|
||||
| kube-prometheus-stack.alertmanager.enabled | bool | `false` | |
|
||||
| kube-prometheus-stack.coreDns.enabled | bool | `true` | |
|
||||
| kube-prometheus-stack.defaultRules.create | bool | `true` | |
|
||||
| kube-prometheus-stack.defaultRules.create | bool | `false` | |
|
||||
| kube-prometheus-stack.global.rbac.pspEnabled | bool | `false` | |
|
||||
| kube-prometheus-stack.grafana."grafana.ini"."auth.anonymous".enabled | bool | `true` | |
|
||||
| kube-prometheus-stack.grafana."grafana.ini".alerting.enabled | bool | `false` | |
|
||||
@ -113,7 +146,6 @@ Kubernetes: `>= 1.18.0`
|
||||
| kube-prometheus-stack.prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues | bool | `false` | |
|
||||
| kube-prometheus-stack.prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.accessModes[0] | string | `"ReadWriteOnce"` | |
|
||||
| kube-prometheus-stack.prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage | string | `"16Gi"` | |
|
||||
| kube-prometheus-stack.prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.storageClassName | string | `"ebs-sc-gp3-xfs"` | |
|
||||
| kube-prometheus-stack.prometheus.prometheusSpec.walCompression | bool | `true` | |
|
||||
| kube-prometheus-stack.prometheusOperator.admissionWebhooks.patch.nodeSelector."node-role.kubernetes.io/master" | string | `""` | |
|
||||
| kube-prometheus-stack.prometheusOperator.admissionWebhooks.patch.tolerations[0].effect | string | `"NoSchedule"` | |
|
||||
@ -146,6 +178,8 @@ Kubernetes: `>= 1.18.0`
|
||||
| prometheus-adapter.rules.resource.window | string | `"5m"` | |
|
||||
| prometheus-adapter.tolerations[0].effect | string | `"NoSchedule"` | |
|
||||
| prometheus-adapter.tolerations[0].key | string | `"node-role.kubernetes.io/master"` | |
|
||||
| prometheus-pushgateway.enabled | bool | `false` | |
|
||||
| prometheus-pushgateway.serviceMonitor.enabled | bool | `true` | |
|
||||
|
||||
# Dashboards
|
||||
|
||||
@ -153,3 +187,5 @@ Kubernetes: `>= 1.18.0`
|
||||
- https://grafana.com/api/dashboards/9578/revisions/4/download
|
||||
## Prometheus
|
||||
- https://grafana.com/api/dashboards/3662/revisions/2/download
|
||||
## AlertManager SNS Forwarder
|
||||
- https://github.com/DataReply/alertmanager-sns-forwarder
|
||||
|
@ -1,175 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
|
||||
{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: alertmanager.rules
|
||||
rules:
|
||||
- alert: AlertmanagerFailedReload
|
||||
annotations:
|
||||
description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedreload
|
||||
summary: Reloading an Alertmanager configuration has failed.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AlertmanagerMembersInconsistent
|
||||
annotations:
|
||||
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagermembersinconsistent
|
||||
summary: A member of an Alertmanager cluster has not found all other cluster members.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
< on (namespace,service) group_left
|
||||
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AlertmanagerFailedToSendAlerts
|
||||
annotations:
|
||||
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedtosendalerts
|
||||
summary: An Alertmanager instance failed to send notifications.
|
||||
expr: |-
|
||||
(
|
||||
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
> 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts
|
||||
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
|
||||
expr: |-
|
||||
min by (namespace,service, integration) (
|
||||
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts
|
||||
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
|
||||
expr: |-
|
||||
min by (namespace,service, integration) (
|
||||
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AlertmanagerConfigInconsistent
|
||||
annotations:
|
||||
description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerconfiginconsistent
|
||||
summary: Alertmanager instances within the same cluster have different configurations.
|
||||
expr: |-
|
||||
count by (namespace,service) (
|
||||
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
|
||||
)
|
||||
!= 1
|
||||
for: 20m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AlertmanagerClusterDown
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterdown
|
||||
summary: Half or more of the Alertmanager instances within the same cluster are down.
|
||||
expr: |-
|
||||
(
|
||||
count by (namespace,service) (
|
||||
avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5
|
||||
)
|
||||
/
|
||||
count by (namespace,service) (
|
||||
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
|
||||
)
|
||||
)
|
||||
>= 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AlertmanagerClusterCrashlooping
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclustercrashlooping
|
||||
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
|
||||
expr: |-
|
||||
(
|
||||
count by (namespace,service) (
|
||||
changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4
|
||||
)
|
||||
/
|
||||
count by (namespace,service) (
|
||||
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
|
||||
)
|
||||
)
|
||||
>= 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,151 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/etcd3_alert.rules.yml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: etcd
|
||||
rules:
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
|
||||
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.'
|
||||
expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}'
|
||||
expr: |-
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.05
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHTTPRequestsSlow
|
||||
annotations:
|
||||
message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,60 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: general.rules
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-targetdown
|
||||
summary: One or more targets are unreachable.
|
||||
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
|
||||
'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-watchdog
|
||||
summary: An alert that should always be firing to certify that Alertmanager is working properly.
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: none
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,163 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
sum by (cluster, namespace, pod, container) (
|
||||
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
|
||||
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
|
||||
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
|
||||
- expr: |-
|
||||
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_working_set_bytes
|
||||
- expr: |-
|
||||
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_rss
|
||||
- expr: |-
|
||||
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_cache
|
||||
- expr: |-
|
||||
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_swap
|
||||
- expr: |-
|
||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
||||
group_left() max by (namespace, pod) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
|
||||
- expr: |-
|
||||
sum by (namespace, cluster) (
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, container, cluster) (
|
||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
|
||||
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace_memory:kube_pod_container_resource_requests:sum
|
||||
- expr: |-
|
||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
||||
group_left() max by (namespace, pod) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
|
||||
- expr: |-
|
||||
sum by (namespace, cluster) (
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, container, cluster) (
|
||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
|
||||
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace_cpu:kube_pod_container_resource_requests:sum
|
||||
- expr: |-
|
||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
||||
group_left() max by (namespace, pod) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
|
||||
- expr: |-
|
||||
sum by (namespace, cluster) (
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, container, cluster) (
|
||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
|
||||
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace_memory:kube_pod_container_resource_limits:sum
|
||||
- expr: |-
|
||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
||||
group_left() max by (namespace, pod) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
|
||||
- expr: |-
|
||||
sum by (namespace, cluster) (
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, container, cluster) (
|
||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
|
||||
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace_cpu:kube_pod_container_resource_limits:sum
|
||||
- expr: |-
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
||||
"replicaset", "$1", "owner_name", "(.*)"
|
||||
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
|
||||
1, max by (replicaset, namespace, owner_name) (
|
||||
kube_replicaset_owner{job="kube-state-metrics"}
|
||||
)
|
||||
),
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- expr: |-
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- expr: |-
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
{{- end }}
|
@ -1,128 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverAvailability }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-availability.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- interval: 3m
|
||||
name: kube-apiserver-availability.rules
|
||||
rules:
|
||||
- expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
|
||||
labels:
|
||||
verb: read
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
labels:
|
||||
verb: write
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: |-
|
||||
1 - (
|
||||
(
|
||||
# write too slow
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||
-
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
|
||||
) +
|
||||
(
|
||||
# read too slow
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="1"}[30d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="5"}[30d]))
|
||||
+
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="40"}[30d]))
|
||||
)
|
||||
) +
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d)
|
||||
labels:
|
||||
verb: all
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |-
|
||||
1 - (
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
|
||||
-
|
||||
(
|
||||
# too slow
|
||||
(
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30d]))
|
||||
+
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30d]))
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |-
|
||||
1 - (
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||
-
|
||||
sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:availability30d
|
||||
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
{{- end }}
|
@ -1,328 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-apiserver-burnrate.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-burnrate.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-apiserver-burnrate.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate6h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate6h
|
||||
{{- end }}
|
@ -1,49 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-apiserver-histogram.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-histogram.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-apiserver-histogram.rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
verb: read
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
verb: write
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
{{- end }}
|
@ -1,95 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-slos" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-apiserver-slos
|
||||
rules:
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
description: The API server is burning too much error budget.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
|
||||
summary: The API server is burning too much error budget.
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
|
||||
for: 2m
|
||||
labels:
|
||||
long: 1h
|
||||
severity: critical
|
||||
short: 5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
description: The API server is burning too much error budget.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
|
||||
summary: The API server is burning too much error budget.
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
|
||||
for: 15m
|
||||
labels:
|
||||
long: 6h
|
||||
severity: critical
|
||||
short: 30m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
description: The API server is burning too much error budget.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
|
||||
summary: The API server is burning too much error budget.
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
|
||||
for: 1h
|
||||
labels:
|
||||
long: 1d
|
||||
severity: warning
|
||||
short: 2h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
description: The API server is burning too much error budget.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
|
||||
summary: The API server is burning too much error budget.
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
|
||||
for: 3h
|
||||
labels:
|
||||
long: 3d
|
||||
severity: warning
|
||||
short: 6h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,358 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-apiserver.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate6h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate6h
|
||||
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
verb: read
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
verb: write
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
{{- end }}
|
@ -1,31 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusGeneral }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-general.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-prometheus-general.rules
|
||||
rules:
|
||||
- expr: count without(instance, pod, node) (up == 1)
|
||||
record: count:up1
|
||||
- expr: count without(instance, pod, node) (up == 0)
|
||||
record: count:up0
|
||||
{{- end }}
|
@ -1,39 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-prometheus-node-recording.rules
|
||||
rules:
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
|
||||
record: instance:node_cpu:rate:sum
|
||||
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_receive_bytes:rate:sum
|
||||
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_transmit_bytes:rate:sum
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
|
||||
record: instance:node_cpu:ratio
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
|
||||
record: cluster:node_cpu:sum_rate5m
|
||||
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
||||
record: cluster:node_cpu:ratio
|
||||
{{- end }}
|
@ -1,63 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-scheduler.rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
{{- end }}
|
@ -1,87 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-state-metrics-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubeStateMetrics }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-state-metrics" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-state-metrics
|
||||
rules:
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricslisterrors
|
||||
summary: kube-state-metrics is experiencing errors in list operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricswatcherrors
|
||||
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeStateMetricsShardingMismatch
|
||||
annotations:
|
||||
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardingmismatch
|
||||
summary: kube-state-metrics sharding is misconfigured.
|
||||
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeStateMetricsShardsMissing
|
||||
annotations:
|
||||
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardsmissing
|
||||
summary: kube-state-metrics shards are missing.
|
||||
expr: |-
|
||||
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
|
||||
-
|
||||
sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) )
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,39 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubelet.enabled .Values.defaultRules.rules.kubelet }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubelet.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubelet.rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
{{- end }}
|
@ -1,301 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }}
|
||||
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-apps
|
||||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping
|
||||
summary: Pod is crash looping.
|
||||
expr: |-
|
||||
increase(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) > 0
|
||||
and
|
||||
kube_pod_container_status_waiting{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} == 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready
|
||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||
expr: |-
|
||||
sum by (namespace, pod) (
|
||||
max by(namespace, pod) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown"}
|
||||
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
|
||||
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch
|
||||
summary: Deployment generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
>
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch
|
||||
summary: StatefulSet generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout
|
||||
summary: StatefulSet update has not been rolled out.
|
||||
expr: |-
|
||||
(
|
||||
max without (revision) (
|
||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
unless
|
||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
)
|
||||
*
|
||||
(
|
||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck
|
||||
summary: DaemonSet rollout is stuck.
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) or (
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontainerwaiting
|
||||
summary: Pod container waiting longer than 1 hour
|
||||
expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled
|
||||
summary: DaemonSet pods are not scheduled.
|
||||
expr: |-
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
-
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled
|
||||
summary: DaemonSet pods are misscheduled.
|
||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeJobCompletion
|
||||
annotations:
|
||||
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than 12 hours to complete.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion
|
||||
summary: Job did not complete in time
|
||||
expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 12h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed
|
||||
summary: Job failed to complete.
|
||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch
|
||||
summary: HPA has not matched descired number of replicas.
|
||||
expr: |-
|
||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
>
|
||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
<
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
||||
and
|
||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout
|
||||
summary: HPA is running at max replicas
|
||||
expr: |-
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
==
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,159 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-resources
|
||||
rules:
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{})
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="cpu"})
|
||||
>
|
||||
((count(kube_node_status_allocatable{resource="cpu"}) > 1) - 1) / count(kube_node_status_allocatable{resource="cpu"})
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{})
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="memory"})
|
||||
>
|
||||
((count(kube_node_status_allocatable{resource="memory"}) > 1) - 1)
|
||||
/
|
||||
count(kube_node_status_allocatable{resource="memory"})
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster has overcommitted CPU resource requests for Namespaces.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuquotaovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="cpu"})
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster has overcommitted memory resource requests for Namespaces.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryquotaovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="memory",job="kube-state-metrics"})
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaalmostfull
|
||||
summary: Namespace quota is going to be full.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 0.9 < 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotafullyused
|
||||
summary: Namespace quota is fully used.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
== 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded
|
||||
summary: Namespace quota has exceeded the limits.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh
|
||||
summary: Processes experience elevated CPU throttling.
|
||||
expr: |-
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
|
||||
/
|
||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
|
||||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,80 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }}
|
||||
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-storage
|
||||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors
|
||||
summary: PersistentVolume is having issues with provisioning.
|
||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,100 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-apiserver" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-apiserver
|
||||
rules:
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AggregatedAPIErrors
|
||||
annotations:
|
||||
description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapierrors
|
||||
summary: An aggregated API has reported errors.
|
||||
expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if semverCompare ">=1.18.0-0" $kubeTargetVersion }}
|
||||
- alert: AggregatedAPIDown
|
||||
annotations:
|
||||
description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapidown
|
||||
summary: An aggregated API is down.
|
||||
expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeApiServer.enabled }}
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
description: KubeAPI has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="apiserver"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
- alert: KubeAPITerminatedRequests
|
||||
annotations:
|
||||
description: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapiterminatedrequests
|
||||
summary: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
|
||||
expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,41 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeControllerManager.enabled }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-controller-manager" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-controller-manager
|
||||
rules:
|
||||
{{- if .Values.kubeControllerManager.enabled }}
|
||||
- alert: KubeControllerManagerDown
|
||||
annotations:
|
||||
description: KubeControllerManager has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kube-controller-manager"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,188 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-kubelet" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-kubelet
|
||||
rules:
|
||||
- alert: KubeNodeNotReady
|
||||
annotations:
|
||||
description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready
|
||||
summary: Node is not ready.
|
||||
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeNodeUnreachable
|
||||
annotations:
|
||||
description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodeunreachable
|
||||
summary: Node is unreachable.
|
||||
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeletTooManyPods
|
||||
annotations:
|
||||
description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods
|
||||
summary: Kubelet is running at capacity.
|
||||
expr: |-
|
||||
count by(node) (
|
||||
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
|
||||
)
|
||||
/
|
||||
max by(node) (
|
||||
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
|
||||
) > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodereadinessflapping
|
||||
summary: Node readiness status is flapping.
|
||||
expr: sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletplegdurationhigh
|
||||
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
|
||||
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletpodstartuplatencyhigh
|
||||
summary: Kubelet Pod startup latency is too high.
|
||||
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificateexpiration
|
||||
summary: Kubelet client certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificateexpiration
|
||||
summary: Kubelet client certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificateexpiration
|
||||
summary: Kubelet server certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificateexpiration
|
||||
summary: Kubelet server certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeletClientCertificateRenewalErrors
|
||||
annotations:
|
||||
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificaterenewalerrors
|
||||
summary: Kubelet has failed to renew its client certificate.
|
||||
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeletServerCertificateRenewalErrors
|
||||
annotations:
|
||||
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificaterenewalerrors
|
||||
summary: Kubelet has failed to renew its server certificate.
|
||||
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheusOperator.kubeletService.enabled }}
|
||||
- alert: KubeletDown
|
||||
annotations:
|
||||
description: Kubelet has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,41 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-scheduler" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-scheduler
|
||||
rules:
|
||||
{{- if .Values.kubeScheduler.enabled }}
|
||||
- alert: KubeSchedulerDown
|
||||
annotations:
|
||||
description: KubeScheduler has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kube-scheduler"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,55 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system
|
||||
rules:
|
||||
- alert: KubeVersionMismatch
|
||||
annotations:
|
||||
description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch
|
||||
summary: Different semantic versions of Kubernetes components running.
|
||||
expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: KubeClientErrors
|
||||
annotations:
|
||||
description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
|
||||
summary: Kubernetes API server client is experiencing errors.
|
||||
expr: |-
|
||||
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
|
||||
/
|
||||
sum(rate(rest_client_requests_total[5m])) by (instance, job))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,79 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-exporter.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
count without (cpu) (
|
||||
count without (mode) (
|
||||
node_cpu_seconds_total{job="node-exporter"}
|
||||
)
|
||||
)
|
||||
record: instance:node_num_cpu:sum
|
||||
- expr: |-
|
||||
1 - avg without (cpu, mode) (
|
||||
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m])
|
||||
)
|
||||
record: instance:node_cpu_utilisation:rate5m
|
||||
- expr: |-
|
||||
(
|
||||
node_load1{job="node-exporter"}
|
||||
/
|
||||
instance:node_num_cpu:sum{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_load1_per_cpu:ratio
|
||||
- expr: |-
|
||||
1 - (
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"}
|
||||
/
|
||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_memory_utilisation:ratio
|
||||
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
|
||||
record: instance:node_vmstat_pgmajfault:rate5m
|
||||
- expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate5m
|
||||
- expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_bytes_excluding_lo:rate5m
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_drop_excluding_lo:rate5m
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate5m
|
||||
{{- end }}
|
@ -1,308 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-exporter" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter
|
||||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 25
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack are getting close to the limit.
|
||||
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description: Node Exporter text file collector failed to scrape.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: node_textfile_scrape_error{job="node-exporter"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: |-
|
||||
(
|
||||
node_timex_offset_seconds > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: |-
|
||||
min_over_time(node_timex_sync_status[5m]) == 0
|
||||
and
|
||||
node_timex_maxerror_seconds >= 16
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddegraded
|
||||
summary: RAID Array is degraded
|
||||
expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddiskfailure
|
||||
summary: Failed device in RAID array
|
||||
expr: node_md_disks{state="failed"} > 0
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,39 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-network
|
||||
rules:
|
||||
- alert: NodeNetworkInterfaceFlapping
|
||||
annotations:
|
||||
description: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkinterfaceflapping
|
||||
summary: Network interface is often changin it's status
|
||||
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,51 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
topk by(namespace, pod) (1,
|
||||
max by (node, namespace, pod) (
|
||||
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
|
||||
))
|
||||
record: 'node_namespace_pod:kube_pod_info:'
|
||||
- expr: |-
|
||||
count by (cluster, node) (sum by (node, cpu) (
|
||||
node_cpu_seconds_total{job="node-exporter"}
|
||||
* on (namespace, pod) group_left(node)
|
||||
topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
|
||||
))
|
||||
record: node:node_num_cpu:sum
|
||||
- expr: |-
|
||||
sum(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"} or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"} +
|
||||
node_memory_Cached_bytes{job="node-exporter"} +
|
||||
node_memory_MemFree_bytes{job="node-exporter"} +
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
) by (cluster)
|
||||
record: :node_memory_MemAvailable_bytes:sum
|
||||
{{- end }}
|
@ -1,113 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-operator-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read the following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }}
|
||||
{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: prometheus-operator
|
||||
rules:
|
||||
- alert: PrometheusOperatorListErrors
|
||||
annotations:
|
||||
description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorlisterrors
|
||||
summary: Errors while performing list operations in controller.
|
||||
expr: (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusOperatorWatchErrors
|
||||
annotations:
|
||||
description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorwatcherrors
|
||||
summary: Errors while performing watch operations in controller.
|
||||
expr: (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusOperatorSyncFailed
|
||||
annotations:
|
||||
description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorsyncfailed
|
||||
summary: Last controller reconciliation failed
|
||||
expr: min_over_time(prometheus_operator_syncs{status="failed",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusOperatorReconcileErrors
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorreconcileerrors
|
||||
summary: Errors while reconciling controller.
|
||||
expr: (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusOperatorNodeLookupErrors
|
||||
annotations:
|
||||
description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatornodelookuperrors
|
||||
summary: Errors while reconciling Prometheus.
|
||||
expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusOperatorNotReady
|
||||
annotations:
|
||||
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatornotready
|
||||
summary: Prometheus operator not ready
|
||||
expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusOperatorRejectedResources
|
||||
annotations:
|
||||
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorrejectedresources
|
||||
summary: Resources rejected by Prometheus operator
|
||||
expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@ -1,307 +0,0 @@
|
||||
{{- /*
|
||||
Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read the following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
|
||||
{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: prometheus
|
||||
rules:
|
||||
- alert: PrometheusBadConfig
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusbadconfig
|
||||
summary: Failed Prometheus configuration reload.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusNotificationQueueRunningFull
|
||||
annotations:
|
||||
description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotificationqueuerunningfull
|
||||
summary: Prometheus alert notification queue predicted to run full in less than 30m.
|
||||
expr: |-
|
||||
# Without min_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30)
|
||||
>
|
||||
min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
|
||||
annotations:
|
||||
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstosomealertmanagers
|
||||
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
|
||||
expr: |-
|
||||
(
|
||||
rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
* 100
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusNotConnectedToAlertmanagers
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotconnectedtoalertmanagers
|
||||
summary: Prometheus is not connected to any Alertmanagers.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusTSDBReloadsFailing
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbreloadsfailing
|
||||
summary: Prometheus has issues reloading blocks from disk.
|
||||
expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusTSDBCompactionsFailing
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbcompactionsfailing
|
||||
summary: Prometheus has issues compacting blocks.
|
||||
expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusNotIngestingSamples
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotingestingsamples
|
||||
summary: Prometheus is not ingesting samples.
|
||||
expr: |-
|
||||
(
|
||||
rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
|
||||
and
|
||||
(
|
||||
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
|
||||
or
|
||||
sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
|
||||
)
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusDuplicateTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusduplicatetimestamps
|
||||
summary: Prometheus is dropping samples with duplicate timestamps.
|
||||
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusOutOfOrderTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoutofordertimestamps
|
||||
summary: Prometheus drops samples with out-of-order timestamps.
|
||||
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusRemoteStorageFailures
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotestoragefailures
|
||||
summary: Prometheus fails to send samples to remote storage.
|
||||
expr: |-
|
||||
(
|
||||
(rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
|
||||
/
|
||||
(
|
||||
(rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
|
||||
+
|
||||
(rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
|
||||
)
|
||||
)
|
||||
* 100
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusRemoteWriteBehind
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritebehind
|
||||
summary: Prometheus remote write is behind.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
- ignoring(remote_name, url) group_right
|
||||
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
> 120
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusRemoteWriteDesiredShards
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritedesiredshards
|
||||
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
max_over_time(prometheus_remote_storage_shards_desired{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
>
|
||||
max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusRuleFailures
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusrulefailures
|
||||
summary: Prometheus is failing rule evaluations.
|
||||
expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusMissingRuleEvaluations
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusmissingruleevaluations
|
||||
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
||||
expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusTargetLimitHit
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustargetlimithit
|
||||
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
|
||||
expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusLabelLimitHit
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuslabellimithit
|
||||
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
|
||||
expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusTargetSyncFailure
|
||||
annotations:
|
||||
description: '{{`{{`}} printf "%.0f" $value {{`}}`}} targets in Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} have failed to sync because invalid configuration was supplied.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustargetsyncfailure
|
||||
summary: Prometheus has failed to sync targets.
|
||||
expr: increase(prometheus_target_sync_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[30m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||
annotations:
|
||||
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstoanyalertmanager
|
||||
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
||||
expr: |-
|
||||
min without (alertmanager) (
|
||||
rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
|
||||
)
|
||||
* 100
|
||||
> 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
2
charts/kubezero-metrics/configs/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
vendor
|
||||
jb-linux-amd64
|
21
charts/kubezero-metrics/configs/build.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/bin/bash -ex
|
||||
|
||||
JB='./jb-linux-amd64'
|
||||
|
||||
which jsonnet > /dev/null || { echo "Required jsonnet not found!"; exit 1; }
|
||||
[ -x $JB ] || { wget https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.4.0/jb-linux-amd64 && chmod +x $JB; }
|
||||
#which gojsontoyaml || go install github.com/brancz/gojsontoyaml@latest
|
||||
|
||||
[ -r jsonnetfile.json ] || $JB init
|
||||
if [ -r jsonnetfile.lock.json ]; then
|
||||
$JB update
|
||||
else
|
||||
$JB install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@main
|
||||
fi
|
||||
|
||||
rm -rf dashboards && mkdir -p dashboards
|
||||
jsonnet -J vendor -m dashboards -e '(import "mixin.libsonnet").grafanaDashboards'
|
||||
|
||||
rm -rf rules && mkdir -p rules
|
||||
#jsonnet -J vendor -m rules rules.libsonnet | xargs -I{} sh -c 'cat {} | gojsontoyaml > {}.yaml' -- {}
|
||||
jsonnet -J vendor -m rules rules.libsonnet
|
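A minimal sketch of the optional JSON-to-YAML conversion hinted at by the commented-out gojsontoyaml lines in build.sh above; the loop and the .yaml output names are assumptions, not part of the committed script:

# Assumes gojsontoyaml is installed (see the commented-out install hint above);
# converts each generated PrometheusRule manifest from JSON to YAML.
for f in rules/*-prometheusRule; do
  gojsontoyaml < "$f" > "${f}.yaml"   # e.g. rules/etcd-mixin-prometheusRule.yaml
done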
15
charts/kubezero-metrics/configs/jsonnetfile.json
Normal file
@ -0,0 +1,15 @@
|
||||
{
|
||||
"version": 1,
|
||||
"dependencies": [
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/prometheus-operator/kube-prometheus.git",
|
||||
"subdir": "jsonnet/kube-prometheus"
|
||||
}
|
||||
},
|
||||
"version": "main"
|
||||
}
|
||||
],
|
||||
"legacyImports": true
|
||||
}
|
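For reference, a sketch of re-pinning the kube-prometheus dependency to a release branch instead of main; the branch name below is an assumption:

# Re-pin the dependency (updates jsonnetfile.json and the lock file).
./jb-linux-amd64 install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.9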
160
charts/kubezero-metrics/configs/jsonnetfile.lock.json
Normal file
@ -0,0 +1,160 @@
|
||||
{
|
||||
"version": 1,
|
||||
"dependencies": [
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/brancz/kubernetes-grafana.git",
|
||||
"subdir": "grafana"
|
||||
}
|
||||
},
|
||||
"version": "c3b14b24b83cfe9abf1064649d19e2d679f033fb",
|
||||
"sum": "YrE4DNQsWgYWs6h0j/FjQETt8xDXdYdsslb1WK7xQEk="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/etcd-io/etcd.git",
|
||||
"subdir": "contrib/mixin"
|
||||
}
|
||||
},
|
||||
"version": "3df272774672366beb02c5447782805ab5fec957",
|
||||
"sum": "5XhYOigrKipOWDbIn9hlrz7JcbelzvJnormxSaup9JI="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet-lib.git",
|
||||
"subdir": "grafonnet"
|
||||
}
|
||||
},
|
||||
"version": "19b27b272abf4263af1365ec485784c49815a332",
|
||||
"sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/jsonnet-libs.git",
|
||||
"subdir": "grafana-builder"
|
||||
}
|
||||
},
|
||||
"version": "b9941184e4c9987077898e788343e21a800d8465",
|
||||
"sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git",
|
||||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "ff4641bcd83314c955150bea6b147df9ca335c4a",
|
||||
"sum": "oUVGwcCbmdH8qz9B+lbRawI9s23GY9HeW7MwYZRbZ/0="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git",
|
||||
"subdir": "lib/promgrafonnet"
|
||||
}
|
||||
},
|
||||
"version": "ff4641bcd83314c955150bea6b147df9ca335c4a",
|
||||
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kubernetes/kube-state-metrics.git",
|
||||
"subdir": "jsonnet/kube-state-metrics"
|
||||
}
|
||||
},
|
||||
"version": "8dab6f7472c26987ab7f8899a4a2f753fed8e8a8",
|
||||
"sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kubernetes/kube-state-metrics.git",
|
||||
"subdir": "jsonnet/kube-state-metrics-mixin"
|
||||
}
|
||||
},
|
||||
"version": "8dab6f7472c26987ab7f8899a4a2f753fed8e8a8",
|
||||
"sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/prometheus-operator/kube-prometheus.git",
|
||||
"subdir": "jsonnet/kube-prometheus"
|
||||
}
|
||||
},
|
||||
"version": "a2eee1803a074fb40cad109d690732c22f0130cf",
|
||||
"sum": "kqVnoNBux2YF1s03m+O3w/5jreAnjXx2/NjvNP1Hoy4="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/prometheus-operator/prometheus-operator.git",
|
||||
"subdir": "jsonnet/mixin"
|
||||
}
|
||||
},
|
||||
"version": "c6b5cc2e178002683ab3afe8d023a8bb2508ba19",
|
||||
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=",
|
||||
"name": "prometheus-operator-mixin"
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/prometheus-operator/prometheus-operator.git",
|
||||
"subdir": "jsonnet/prometheus-operator"
|
||||
}
|
||||
},
|
||||
"version": "c6b5cc2e178002683ab3afe8d023a8bb2508ba19",
|
||||
"sum": "sECNXs/aIEreFUma1BWVyknBygqh3AVJEB3msmrAYYY="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/prometheus/alertmanager.git",
|
||||
"subdir": "doc/alertmanager-mixin"
|
||||
}
|
||||
},
|
||||
"version": "e35efbddb66a73fd8723be5334477e76f21fbd19",
|
||||
"sum": "pep+dHzfIjh2SU5pEkwilMCAT/NoL6YYflV4x8cr7vU=",
|
||||
"name": "alertmanager"
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/prometheus/node_exporter.git",
|
||||
"subdir": "docs/node-mixin"
|
||||
}
|
||||
},
|
||||
"version": "0e6b23c338e98809c9872c70a2f5dfa8d6d370d4",
|
||||
"sum": "MnfAA4+l2BkgJncnYfV8uHC7CxHZut8+ap8KkEqyB5Y="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/prometheus/prometheus.git",
|
||||
"subdir": "documentation/prometheus-mixin"
|
||||
}
|
||||
},
|
||||
"version": "a05b510fc32c3ecc2fc369002576179ae1cbcc23",
|
||||
"sum": "m4VHwft4fUcxzL4+52lLZG/V5aH5ZEdjaweb88vISL0=",
|
||||
"name": "prometheus"
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/thanos-io/thanos.git",
|
||||
"subdir": "mixin"
|
||||
}
|
||||
},
|
||||
"version": "360b39e1c6ab3ac8dcefa225a6205142f9362c68",
|
||||
"sum": "Og+wEHfgzXBvBLAeeQvGNoiCw3FY4LQHlJdpsG/owj8=",
|
||||
"name": "thanos-mixin"
|
||||
}
|
||||
],
|
||||
"legacyImports": false
|
||||
}
|
49
charts/kubezero-metrics/configs/k8s-dashboards.yaml
Normal file
@ -0,0 +1,49 @@
|
||||
configmap: grafana-dashboards-k8s
|
||||
gzip: true
|
||||
folder: Kubernetes
|
||||
dashboards:
|
||||
- name: coreDNS
|
||||
url: https://grafana.com/api/dashboards/12539/revisions/5/download
|
||||
tags: ['kubernetes', 'DNS']
|
||||
- name: etcd
|
||||
url: https://grafana.com/api/dashboards/3070/revisions/3/download
|
||||
tags: ['kubernetes', 'etcd']
|
||||
- name: node
|
||||
url: https://grafana.com/api/dashboards/1860/revisions/23/download
|
||||
tags: ['kubernetes']
|
||||
# cd dashboards; for f in *.json; do echo "- name: ${f%%.json}" >> ../dashboards.yaml; echo " url: file://dashboards/$f" >> ../dashboards.yaml; done; cd -
|
||||
- name: apiserver
|
||||
url: file://dashboards/apiserver.json
|
||||
- name: cluster-total
|
||||
url: file://dashboards/cluster-total.json
|
||||
- name: controller-manager
|
||||
url: file://dashboards/controller-manager.json
|
||||
- name: k8s-resources-cluster
|
||||
url: file://dashboards/k8s-resources-cluster.json
|
||||
- name: k8s-resources-namespace
|
||||
url: file://dashboards/k8s-resources-namespace.json
|
||||
- name: k8s-resources-node
|
||||
url: file://dashboards/k8s-resources-node.json
|
||||
- name: k8s-resources-pod
|
||||
url: file://dashboards/k8s-resources-pod.json
|
||||
- name: k8s-resources-workload
|
||||
url: file://dashboards/k8s-resources-workload.json
|
||||
- name: k8s-resources-workloads-namespace
|
||||
url: file://dashboards/k8s-resources-workloads-namespace.json
|
||||
- name: kubelet
|
||||
url: file://dashboards/kubelet.json
|
||||
- name: namespace-by-pod
|
||||
url: file://dashboards/namespace-by-pod.json
|
||||
- name: namespace-by-workload
|
||||
url: file://dashboards/namespace-by-workload.json
|
||||
- name: persistentvolumesusage
|
||||
url: file://dashboards/persistentvolumesusage.json
|
||||
tags: ['kubernetes', 'csi']
|
||||
- name: pod-total
|
||||
url: file://dashboards/pod-total.json
|
||||
- name: proxy
|
||||
url: file://dashboards/proxy.json
|
||||
- name: scheduler
|
||||
url: file://dashboards/scheduler.json
|
||||
- name: workload-total
|
||||
url: file://dashboards/workload-total.json
|
18
charts/kubezero-metrics/configs/k8s-rules.yaml
Normal file
@ -0,0 +1,18 @@
|
||||
# cd rules; for f in *-prometheusRule; do echo "- name: ${f%%-prometheusRule}" >> ../k8s-rules.yaml; echo " url: file://rules/$f" >> ../k8s-rules.yaml; done; cd -
|
||||
rules:
|
||||
- name: alertmanager
|
||||
url: file://rules/alertmanager-prometheusRule
|
||||
- name: etcd-mixin
|
||||
url: file://rules/etcd-mixin-prometheusRule
|
||||
- name: kube-prometheus
|
||||
url: file://rules/kube-prometheus-prometheusRule
|
||||
- name: kube-state-metrics
|
||||
url: file://rules/kube-state-metrics-prometheusRule
|
||||
- name: kubernetes
|
||||
url: file://rules/kubernetes-prometheusRule
|
||||
- name: node-exporter
|
||||
url: file://rules/node-exporter-prometheusRule
|
||||
- name: prometheus-operator
|
||||
url: file://rules/prometheus-operator-prometheusRule
|
||||
- name: prometheus
|
||||
url: file://rules/prometheus-prometheusRule
|
49
charts/kubezero-metrics/configs/rules.libsonnet
Normal file
@ -0,0 +1,49 @@
|
||||
local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet');
|
||||
|
||||
local etcdMixin = addMixin({
|
||||
name: 'etcd',
|
||||
mixin: (import 'github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet') +
|
||||
{
|
||||
_config+: {
|
||||
etcd_instance_labels: 'instance, pod',
|
||||
}
|
||||
} +
|
||||
// Remove both etcdHighNumberOfFailedGRPCRequests from etcd-mixin for now
|
||||
{
|
||||
prometheusAlerts+: {
|
||||
groups: std.map(
|
||||
function(group)
|
||||
if group.name == 'etcd' then
|
||||
group {
|
||||
rules: std.filter(
|
||||
function(rule)
|
||||
rule.alert != 'etcdHighNumberOfFailedGRPCRequests',
|
||||
group.rules
|
||||
),
|
||||
}
|
||||
else
|
||||
group,
|
||||
super.groups
|
||||
),
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
local kp = (import 'kube-prometheus/main.libsonnet') +
|
||||
{
|
||||
values+:: {
|
||||
common+: {
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// We just want the Prometheus Rules
|
||||
{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } +
|
||||
{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } +
|
||||
{ 'alertmanager-prometheusRule': kp.alertmanager.prometheusRule } +
|
||||
{ 'kube-state-metrics-prometheusRule': kp.kubeStateMetrics.prometheusRule } +
|
||||
{ 'kubernetes-prometheusRule': kp.kubernetesControlPlane.prometheusRule } +
|
||||
{ 'node-exporter-prometheusRule': kp.nodeExporter.prometheusRule } +
|
||||
{ 'prometheus-prometheusRule': kp.prometheus.prometheusRule } +
|
||||
{ 'etcd-mixin-prometheusRule': etcdMixin.prometheusRules }
|
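A quick sanity check for the etcd alert filter above, assuming build.sh has been run; this grep is illustrative and not part of the commit:

# The rendered etcd rules should no longer contain the dropped alert.
grep -q etcdHighNumberOfFailedGRPCRequests rules/etcd-mixin-prometheusRule \
  && echo "alert still present" \
  || echo "etcdHighNumberOfFailedGRPCRequests filtered out"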
@ -0,0 +1,129 @@
|
||||
{
|
||||
"apiVersion": "monitoring.coreos.com/v1",
|
||||
"kind": "PrometheusRule",
|
||||
"metadata": {
|
||||
"labels": {
|
||||
"app.kubernetes.io/component": "alert-router",
|
||||
"app.kubernetes.io/name": "alertmanager",
|
||||
"app.kubernetes.io/part-of": "kube-prometheus",
|
||||
"app.kubernetes.io/version": "0.23.0",
|
||||
"prometheus": "k8s",
|
||||
"role": "alert-rules"
|
||||
},
|
||||
"name": "alertmanager-main-rules",
|
||||
"namespace": "monitoring"
|
||||
},
|
||||
"spec": {
|
||||
"groups": [
|
||||
{
|
||||
"name": "alertmanager.rules",
|
||||
"rules": [
|
||||
{
|
||||
"alert": "AlertmanagerFailedReload",
|
||||
"annotations": {
|
||||
"description": "Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload",
|
||||
"summary": "Reloading an Alertmanager configuration has failed."
|
||||
},
|
||||
"expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(alertmanager_config_last_reload_successful{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]) == 0\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "AlertmanagerMembersInconsistent",
|
||||
"annotations": {
|
||||
"description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent",
|
||||
"summary": "A member of an Alertmanager cluster has not found all other cluster members."
|
||||
},
|
||||
"expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n< on (namespace,service) group_left\n count by (namespace,service) (max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]))\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "AlertmanagerFailedToSendAlerts",
|
||||
"annotations": {
|
||||
"description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts",
|
||||
"summary": "An Alertmanager instance failed to send notifications."
|
||||
},
|
||||
"expr": "(\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n)\n> 0.01\n",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "AlertmanagerClusterFailedToSendAlerts",
|
||||
"annotations": {
|
||||
"description": "The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts",
|
||||
"summary": "All Alertmanager instances in a cluster failed to send notifications to a critical integration."
|
||||
},
|
||||
"expr": "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n)\n> 0.01\n",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "AlertmanagerClusterFailedToSendAlerts",
|
||||
"annotations": {
|
||||
"description": "The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts",
|
||||
"summary": "All Alertmanager instances in a cluster failed to send notifications to a non-critical integration."
|
||||
},
|
||||
"expr": "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n)\n> 0.01\n",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "AlertmanagerConfigInconsistent",
|
||||
"annotations": {
|
||||
"description": "Alertmanager instances within the {{$labels.job}} cluster have different configurations.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent",
|
||||
"summary": "Alertmanager instances within the same cluster have different configurations."
|
||||
},
|
||||
"expr": "count by (namespace,service) (\n count_values by (namespace,service) (\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\",namespace=\"monitoring\"})\n)\n!= 1\n",
|
||||
"for": "20m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "AlertmanagerClusterDown",
|
||||
"annotations": {
|
||||
"description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown",
|
||||
"summary": "Half or more of the Alertmanager instances within the same cluster are down."
|
||||
},
|
||||
"expr": "(\n count by (namespace,service) (\n avg_over_time(up{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]) < 0.5\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "AlertmanagerClusterCrashlooping",
|
||||
"annotations": {
|
||||
"description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping",
|
||||
"summary": "Half or more of the Alertmanager instances within the same cluster are crashlooping."
|
||||
},
|
||||
"expr": "(\n count by (namespace,service) (\n changes(process_start_time_seconds{job=\"alertmanager-main\",namespace=\"monitoring\"}[10m]) > 4\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
161
charts/kubezero-metrics/configs/rules/etcd-mixin-prometheusRule
Normal file
@ -0,0 +1,161 @@
|
||||
{
|
||||
"apiVersion": "monitoring.coreos.com/v1",
|
||||
"kind": "PrometheusRule",
|
||||
"metadata": {
|
||||
"labels": {
|
||||
"prometheus": "k8s"
|
||||
},
|
||||
"name": "etcd",
|
||||
"namespace": "monitoring"
|
||||
},
|
||||
"spec": {
|
||||
"groups": [
|
||||
{
|
||||
"name": "etcd",
|
||||
"rules": [
|
||||
{
|
||||
"alert": "etcdMembersDown",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": members are down ({{ $value }}).",
|
||||
"summary": "etcd cluster members are down."
|
||||
},
|
||||
"expr": "max without (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdInsufficientMembers",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }}).",
|
||||
"summary": "etcd cluster has insufficient number of members."
|
||||
},
|
||||
"expr": "sum(up{job=~\".*etcd.*\"} == bool 1) without (instance, pod) < ((count(up{job=~\".*etcd.*\"}) without (instance, pod) + 1) / 2)\n",
|
||||
"for": "3m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdNoLeader",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader.",
|
||||
"summary": "etcd cluster has no leader."
|
||||
},
|
||||
"expr": "etcd_server_has_leader{job=~\".*etcd.*\"} == 0\n",
|
||||
"for": "1m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdHighNumberOfLeaderChanges",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.",
|
||||
"summary": "etcd cluster has high number of leader changes."
|
||||
},
|
||||
"expr": "increase((max without (instance, pod) (etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}))[15m:1m]) >= 4\n",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdGRPCRequestsSlow",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }}.",
|
||||
"summary": "etcd grpc requests are slow"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\", grpc_type=\"unary\"}[5m])) without(grpc_type))\n> 0.15\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdMemberCommunicationSlow",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.",
|
||||
"summary": "etcd cluster member communication is slow."
|
||||
},
|
||||
"expr": "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.15\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdHighNumberOfFailedProposals",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.",
|
||||
"summary": "etcd cluster has high number of proposal failures."
|
||||
},
|
||||
"expr": "rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdHighFsyncDurations",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.",
|
||||
"summary": "etcd cluster 99th percentile fsync durations are too high."
|
||||
},
|
||||
"expr": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.5\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdHighFsyncDurations",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}."
|
||||
},
|
||||
"expr": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 1\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdHighCommitDurations",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.",
|
||||
"summary": "etcd cluster 99th percentile commit durations are too high."
|
||||
},
|
||||
"expr": "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.25\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdBackendQuotaLowSpace",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full."
|
||||
},
|
||||
"expr": "(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "etcdExcessiveDatabaseGrowth",
|
||||
"annotations": {
|
||||
"description": "etcd cluster \"{{ $labels.job }}\": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive."
|
||||
},
|
||||
"expr": "increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
@ -0,0 +1,109 @@
|
||||
{
|
||||
"apiVersion": "monitoring.coreos.com/v1",
|
||||
"kind": "PrometheusRule",
|
||||
"metadata": {
|
||||
"labels": {
|
||||
"app.kubernetes.io/component": "exporter",
|
||||
"app.kubernetes.io/name": "kube-prometheus",
|
||||
"app.kubernetes.io/part-of": "kube-prometheus",
|
||||
"prometheus": "k8s",
|
||||
"role": "alert-rules"
|
||||
},
|
||||
"name": "kube-prometheus-rules",
|
||||
"namespace": "monitoring"
|
||||
},
|
||||
"spec": {
|
||||
"groups": [
|
||||
{
|
||||
"name": "general.rules",
|
||||
"rules": [
|
||||
{
|
||||
"alert": "TargetDown",
|
||||
"annotations": {
|
||||
"description": "{{ printf \"%.4g\" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/targetdown",
|
||||
"summary": "One or more targets are unreachable."
|
||||
},
|
||||
"expr": "100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "Watchdog",
|
||||
"annotations": {
|
||||
"description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.\n",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/watchdog",
|
||||
"summary": "An alert that should always be firing to certify that Alertmanager is working properly."
|
||||
},
|
||||
"expr": "vector(1)",
|
||||
"labels": {
|
||||
"severity": "none"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "node-network",
|
||||
"rules": [
|
||||
{
|
||||
"alert": "NodeNetworkInterfaceFlapping",
|
||||
"annotations": {
|
||||
"description": "Network interface \"{{ $labels.device }}\" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping",
|
||||
"summary": "Network interface is often changing its status"
|
||||
},
|
||||
"expr": "changes(node_network_up{job=\"node-exporter\",device!~\"veth.+\"}[2m]) > 2\n",
|
||||
"for": "2m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "kube-prometheus-node-recording.rules",
|
||||
"rules": [
|
||||
{
|
||||
"expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[3m])) BY (instance)",
|
||||
"record": "instance:node_cpu:rate:sum"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total[3m])) BY (instance)",
|
||||
"record": "instance:node_network_receive_bytes:rate:sum"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)",
|
||||
"record": "instance:node_network_transmit_bytes:rate:sum"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)",
|
||||
"record": "instance:node_cpu:ratio"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m]))",
|
||||
"record": "cluster:node_cpu:sum_rate5m"
|
||||
},
|
||||
{
|
||||
"expr": "cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))",
|
||||
"record": "cluster:node_cpu:ratio"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "kube-prometheus-general.rules",
|
||||
"rules": [
|
||||
{
|
||||
"expr": "count without(instance, pod, node) (up == 1)",
|
||||
"record": "count:up1"
|
||||
},
|
||||
{
|
||||
"expr": "count without(instance, pod, node) (up == 0)",
|
||||
"record": "count:up0"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
@ -0,0 +1,77 @@
|
||||
{
|
||||
"apiVersion": "monitoring.coreos.com/v1",
|
||||
"kind": "PrometheusRule",
|
||||
"metadata": {
|
||||
"labels": {
|
||||
"app.kubernetes.io/component": "exporter",
|
||||
"app.kubernetes.io/name": "kube-state-metrics",
|
||||
"app.kubernetes.io/part-of": "kube-prometheus",
|
||||
"app.kubernetes.io/version": "2.2.1",
|
||||
"prometheus": "k8s",
|
||||
"role": "alert-rules"
|
||||
},
|
||||
"name": "kube-state-metrics-rules",
|
||||
"namespace": "monitoring"
|
||||
},
|
||||
"spec": {
|
||||
"groups": [
|
||||
{
|
||||
"name": "kube-state-metrics",
|
||||
"rules": [
|
||||
{
|
||||
"alert": "KubeStateMetricsListErrors",
|
||||
"annotations": {
|
||||
"description": "kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors",
|
||||
"summary": "kube-state-metrics is experiencing errors in list operations."
|
||||
},
|
||||
"expr": "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "KubeStateMetricsWatchErrors",
|
||||
"annotations": {
|
||||
"description": "kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors",
|
||||
"summary": "kube-state-metrics is experiencing errors in watch operations."
|
||||
},
|
||||
"expr": "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "KubeStateMetricsShardingMismatch",
|
||||
"annotations": {
|
||||
"description": "kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch",
|
||||
"summary": "kube-state-metrics sharding is misconfigured."
|
||||
},
|
||||
"expr": "stdvar (kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) != 0\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "KubeStateMetricsShardsMissing",
|
||||
"annotations": {
|
||||
"description": "kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing",
|
||||
"summary": "kube-state-metrics shards are missing."
|
||||
},
|
||||
"expr": "2^max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1\n -\nsum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"}) )\n!= 0\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
1178
charts/kubezero-metrics/configs/rules/kubernetes-prometheusRule
Normal file
File diff suppressed because it is too large
@ -0,0 +1,305 @@
|
||||
{
|
||||
"apiVersion": "monitoring.coreos.com/v1",
|
||||
"kind": "PrometheusRule",
|
||||
"metadata": {
|
||||
"labels": {
|
||||
"app.kubernetes.io/component": "exporter",
|
||||
"app.kubernetes.io/name": "node-exporter",
|
||||
"app.kubernetes.io/part-of": "kube-prometheus",
|
||||
"app.kubernetes.io/version": "1.2.2",
|
||||
"prometheus": "k8s",
|
||||
"role": "alert-rules"
|
||||
},
|
||||
"name": "node-exporter-rules",
|
||||
"namespace": "monitoring"
|
||||
},
|
||||
"spec": {
|
||||
"groups": [
|
||||
{
|
||||
"name": "node-exporter",
|
||||
"rules": [
|
||||
{
|
||||
"alert": "NodeFilesystemSpaceFillingUp",
|
||||
"annotations": {
|
||||
"description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup",
|
||||
"summary": "Filesystem is predicted to run out of space within the next 24 hours."
|
||||
},
|
||||
"expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
|
||||
"for": "1h",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeFilesystemSpaceFillingUp",
|
||||
"annotations": {
|
||||
"description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up fast.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup",
|
||||
"summary": "Filesystem is predicted to run out of space within the next 4 hours."
|
||||
},
|
||||
"expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
|
||||
"for": "1h",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeFilesystemAlmostOutOfSpace",
|
||||
"annotations": {
|
||||
"description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace",
|
||||
"summary": "Filesystem has less than 5% space left."
|
||||
},
|
||||
"expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
|
||||
"for": "30m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeFilesystemAlmostOutOfSpace",
|
||||
"annotations": {
|
||||
"description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace",
|
||||
"summary": "Filesystem has less than 3% space left."
|
||||
},
|
||||
"expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
|
||||
"for": "30m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeFilesystemFilesFillingUp",
|
||||
"annotations": {
|
||||
"description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup",
|
||||
"summary": "Filesystem is predicted to run out of inodes within the next 24 hours."
|
||||
},
|
||||
"expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
|
||||
"for": "1h",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeFilesystemFilesFillingUp",
|
||||
"annotations": {
|
||||
"description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up fast.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup",
|
||||
"summary": "Filesystem is predicted to run out of inodes within the next 4 hours."
|
||||
},
|
||||
"expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
|
||||
"for": "1h",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeFilesystemAlmostOutOfFiles",
|
||||
"annotations": {
|
||||
"description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles",
|
||||
"summary": "Filesystem has less than 5% inodes left."
|
||||
},
|
||||
"expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
|
||||
"for": "1h",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeFilesystemAlmostOutOfFiles",
|
||||
"annotations": {
|
||||
"description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles",
|
||||
"summary": "Filesystem has less than 3% inodes left."
|
||||
},
|
||||
"expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
|
||||
"for": "1h",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeNetworkReceiveErrs",
|
||||
"annotations": {
|
||||
"description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs",
|
||||
"summary": "Network interface is reporting many receive errors."
|
||||
},
|
||||
"expr": "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01\n",
|
||||
"for": "1h",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeNetworkTransmitErrs",
|
||||
"annotations": {
|
||||
"description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs",
|
||||
"summary": "Network interface is reporting many transmit errors."
|
||||
},
|
||||
"expr": "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01\n",
|
||||
"for": "1h",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeHighNumberConntrackEntriesUsed",
|
||||
"annotations": {
|
||||
"description": "{{ $value | humanizePercentage }} of conntrack entries are used.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused",
|
||||
"summary": "Number of conntrack are getting close to the limit."
|
||||
},
|
||||
"expr": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeTextFileCollectorScrapeError",
|
||||
"annotations": {
|
||||
"description": "Node Exporter text file collector failed to scrape.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror",
|
||||
"summary": "Node Exporter text file collector failed to scrape."
|
||||
},
|
||||
"expr": "node_textfile_scrape_error{job=\"node-exporter\"} == 1\n",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeClockSkewDetected",
|
||||
"annotations": {
|
||||
"description": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected",
|
||||
"summary": "Clock skew detected."
|
||||
},
|
||||
"expr": "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeClockNotSynchronising",
|
||||
"annotations": {
|
||||
"description": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising",
|
||||
"summary": "Clock not synchronising."
|
||||
},
|
||||
"expr": "min_over_time(node_timex_sync_status[5m]) == 0\nand\nnode_timex_maxerror_seconds >= 16\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeRAIDDegraded",
|
||||
"annotations": {
|
||||
"description": "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded",
|
||||
"summary": "RAID Array is degraded"
|
||||
},
|
||||
"expr": "node_md_disks_required - ignoring (state) (node_md_disks{state=\"active\"}) > 0\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeRAIDDiskFailure",
|
||||
"annotations": {
|
||||
"description": "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure",
|
||||
"summary": "Failed device in RAID array"
|
||||
},
|
||||
"expr": "node_md_disks{state=\"failed\"} > 0\n",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeFileDescriptorLimit",
|
||||
"annotations": {
|
||||
"description": "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit",
|
||||
"summary": "Kernel is predicted to exhaust file descriptors limit soon."
|
||||
},
|
||||
"expr": "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70\n)\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "NodeFileDescriptorLimit",
|
||||
"annotations": {
|
||||
"description": "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit",
|
||||
"summary": "Kernel is predicted to exhaust file descriptors limit soon."
|
||||
},
|
||||
"expr": "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90\n)\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "node-exporter.rules",
|
||||
"rules": [
|
||||
{
|
||||
"expr": "count without (cpu, mode) (\n node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}\n)\n",
|
||||
"record": "instance:node_num_cpu:sum"
|
||||
},
|
||||
{
|
||||
"expr": "1 - avg without (cpu, mode) (\n rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[5m])\n)\n",
|
||||
"record": "instance:node_cpu_utilisation:rate5m"
|
||||
},
|
||||
{
|
||||
"expr": "(\n node_load1{job=\"node-exporter\"}\n/\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n",
|
||||
"record": "instance:node_load1_per_cpu:ratio"
|
||||
},
|
||||
{
|
||||
"expr": "1 - (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"}\n +\n node_memory_Cached_bytes{job=\"node-exporter\"}\n +\n node_memory_MemFree_bytes{job=\"node-exporter\"}\n +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n",
|
||||
"record": "instance:node_memory_utilisation:ratio"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[5m])\n",
|
||||
"record": "instance:node_vmstat_pgmajfault:rate5m"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[5m])\n",
|
||||
"record": "instance_device:node_disk_io_time_seconds:rate5m"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_io_time_weighted_seconds_total{job=\"node-exporter\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[5m])\n",
|
||||
"record": "instance_device:node_disk_io_time_weighted_seconds:rate5m"
|
||||
},
|
||||
{
|
||||
"expr": "sum without (device) (\n rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n",
|
||||
"record": "instance:node_network_receive_bytes_excluding_lo:rate5m"
|
||||
},
|
||||
{
|
||||
"expr": "sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n",
|
||||
"record": "instance:node_network_transmit_bytes_excluding_lo:rate5m"
|
||||
},
|
||||
{
|
||||
"expr": "sum without (device) (\n rate(node_network_receive_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n",
|
||||
"record": "instance:node_network_receive_drop_excluding_lo:rate5m"
|
||||
},
|
||||
{
|
||||
"expr": "sum without (device) (\n rate(node_network_transmit_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n",
|
||||
"record": "instance:node_network_transmit_drop_excluding_lo:rate5m"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
@ -0,0 +1,116 @@
|
||||
{
|
||||
"apiVersion": "monitoring.coreos.com/v1",
|
||||
"kind": "PrometheusRule",
|
||||
"metadata": {
|
||||
"labels": {
|
||||
"app.kubernetes.io/component": "controller",
|
||||
"app.kubernetes.io/name": "prometheus-operator",
|
||||
"app.kubernetes.io/part-of": "kube-prometheus",
|
||||
"app.kubernetes.io/version": "0.51.1",
|
||||
"prometheus": "k8s",
|
||||
"role": "alert-rules"
|
||||
},
|
||||
"name": "prometheus-operator-rules",
|
||||
"namespace": "monitoring"
|
||||
},
|
||||
"spec": {
|
||||
"groups": [
|
||||
{
|
||||
"name": "prometheus-operator",
|
||||
"rules": [
|
||||
{
|
||||
"alert": "PrometheusOperatorListErrors",
|
||||
"annotations": {
|
||||
"description": "Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors",
|
||||
"summary": "Errors while performing list operations in controller."
|
||||
},
|
||||
"expr": "(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m]))) > 0.4\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusOperatorWatchErrors",
|
||||
"annotations": {
|
||||
"description": "Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors",
|
||||
"summary": "Errors while performing watch operations in controller."
|
||||
},
|
||||
"expr": "(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m]))) > 0.4\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusOperatorSyncFailed",
|
||||
"annotations": {
|
||||
"description": "Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed",
|
||||
"summary": "Last controller reconciliation failed"
|
||||
},
|
||||
"expr": "min_over_time(prometheus_operator_syncs{status=\"failed\",job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) > 0\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusOperatorReconcileErrors",
|
||||
"annotations": {
|
||||
"description": "{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors",
|
||||
"summary": "Errors while reconciling controller."
|
||||
},
|
||||
"expr": "(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) > 0.1\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusOperatorNodeLookupErrors",
|
||||
"annotations": {
|
||||
"description": "Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors",
|
||||
"summary": "Errors while reconciling Prometheus."
|
||||
},
|
||||
"expr": "rate(prometheus_operator_node_address_lookup_errors_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) > 0.1\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusOperatorNotReady",
|
||||
"annotations": {
|
||||
"description": "Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready",
|
||||
"summary": "Prometheus operator not ready"
|
||||
},
|
||||
"expr": "min by(namespace, controller) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) == 0)\n",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusOperatorRejectedResources",
|
||||
"annotations": {
|
||||
"description": "Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf \"%0.0f\" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources",
|
||||
"summary": "Resources rejected by Prometheus operator"
|
||||
},
|
||||
"expr": "min_over_time(prometheus_operator_managed_resources{state=\"rejected\",job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) > 0\n",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
259
charts/kubezero-metrics/configs/rules/prometheus-prometheusRule
Normal file
@ -0,0 +1,259 @@
|
||||
{
|
||||
"apiVersion": "monitoring.coreos.com/v1",
|
||||
"kind": "PrometheusRule",
|
||||
"metadata": {
|
||||
"labels": {
|
||||
"app.kubernetes.io/component": "prometheus",
|
||||
"app.kubernetes.io/name": "prometheus",
|
||||
"app.kubernetes.io/part-of": "kube-prometheus",
|
||||
"app.kubernetes.io/version": "2.30.0",
|
||||
"prometheus": "k8s",
|
||||
"role": "alert-rules"
|
||||
},
|
||||
"name": "prometheus-k8s-prometheus-rules",
|
||||
"namespace": "monitoring"
|
||||
},
|
||||
"spec": {
|
||||
"groups": [
|
||||
{
|
||||
"name": "prometheus",
|
||||
"rules": [
|
||||
{
|
||||
"alert": "PrometheusBadConfig",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig",
|
||||
"summary": "Failed Prometheus configuration reload."
|
||||
},
|
||||
"expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_config_last_reload_successful{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) == 0\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusNotificationQueueRunningFull",
|
||||
"annotations": {
|
||||
"description": "Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull",
|
||||
"summary": "Prometheus alert notification queue predicted to run full in less than 30m."
|
||||
},
|
||||
"expr": "# Without min_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m], 60 * 30)\n>\n min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusErrorSendingAlertsToSomeAlertmanagers",
|
||||
"annotations": {
|
||||
"description": "{{ printf \"%.1f\" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers",
|
||||
"summary": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager."
|
||||
},
|
||||
"expr": "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusNotConnectedToAlertmanagers",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers",
|
||||
"summary": "Prometheus is not connected to any Alertmanagers."
|
||||
},
|
||||
"expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) < 1\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusTSDBReloadsFailing",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing",
|
||||
"summary": "Prometheus has issues reloading blocks from disk."
|
||||
},
|
||||
"expr": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[3h]) > 0\n",
|
||||
"for": "4h",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusTSDBCompactionsFailing",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing",
|
||||
"summary": "Prometheus has issues compacting blocks."
|
||||
},
|
||||
"expr": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[3h]) > 0\n",
|
||||
"for": "4h",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusNotIngestingSamples",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples",
|
||||
"summary": "Prometheus is not ingesting samples."
|
||||
},
|
||||
"expr": "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) <= 0\nand\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n )\n)\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusDuplicateTimestamps",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with different values but duplicated timestamp.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps",
|
||||
"summary": "Prometheus is dropping samples with duplicate timestamps."
|
||||
},
|
||||
"expr": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusOutOfOrderTimestamps",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with timestamps arriving out of order.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps",
|
||||
"summary": "Prometheus drops samples with out-of-order timestamps."
|
||||
},
|
||||
"expr": "rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n",
|
||||
"for": "10m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusRemoteStorageFailures",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf \"%.1f\" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures",
|
||||
"summary": "Prometheus fails to send samples to remote storage."
|
||||
},
|
||||
"expr": "(\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n/\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n +\n (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n )\n)\n* 100\n> 1\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusRemoteWriteBehind",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf \"%.1f\" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind",
|
||||
"summary": "Prometheus remote write is behind."
|
||||
},
|
||||
"expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n- ignoring(remote_name, url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n> 120\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusRemoteWriteDesiredShards",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus-k8s\",namespace=\"monitoring\"}` $labels.instance | query | first | value }}.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards",
|
||||
"summary": "Prometheus remote write desired shards calculation wants to run more than configured max shards."
|
||||
},
|
||||
"expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n>\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusRuleFailures",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf \"%.0f\" $value }} rules in the last 5m.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures",
|
||||
"summary": "Prometheus is failing rule evaluations."
|
||||
},
|
||||
"expr": "increase(prometheus_rule_evaluation_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusMissingRuleEvaluations",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf \"%.0f\" $value }} rule group evaluations in the last 5m.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations",
|
||||
"summary": "Prometheus is missing rule evaluations due to slow rule group evaluation."
|
||||
},
|
||||
"expr": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusTargetLimitHit",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because the number of targets exceeded the configured target_limit.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit",
|
||||
"summary": "Prometheus has dropped targets because some scrape configs have exceeded the targets limit."
|
||||
},
|
||||
"expr": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusLabelLimitHit",
|
||||
"annotations": {
|
||||
"description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit",
|
||||
"summary": "Prometheus has dropped targets because some scrape configs have exceeded the labels limit."
|
||||
},
|
||||
"expr": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "warning"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusTargetSyncFailure",
|
||||
"annotations": {
|
||||
"description": "{{ printf \"%.0f\" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure",
|
||||
"summary": "Prometheus has failed to sync targets."
|
||||
},
|
||||
"expr": "increase(prometheus_target_sync_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[30m]) > 0\n",
|
||||
"for": "5m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
},
|
||||
{
|
||||
"alert": "PrometheusErrorSendingAlertsToAnyAlertmanager",
|
||||
"annotations": {
|
||||
"description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.",
|
||||
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager",
|
||||
"summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager."
|
||||
},
|
||||
"expr": "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n)\n* 100\n> 3\n",
|
||||
"for": "15m",
|
||||
"labels": {
|
||||
"severity": "critical"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
[ -x ./jb-linux-amd64 ] || wget https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.4.0/jb-linux-amd64 && chmod +x ./jb-linux-amd64
|
||||
|
||||
./jb-linux-amd64 update
|
||||
|
||||
mkdir -p kube-mixin
|
||||
#jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusAlerts)' > kube-mixin/alerts.yml
|
||||
#jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusRules)' > kube-mixin/rules.yml
|
||||
jsonnet -J vendor -m kube-mixin -e '(import "mixin.libsonnet").grafanaDashboards'
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"version": 1,
|
||||
"dependencies": [
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git",
|
||||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "master"
|
||||
}
|
||||
],
|
||||
"legacyImports": true
|
||||
}
|
@ -1,36 +0,0 @@
|
||||
{
|
||||
"version": 1,
|
||||
"dependencies": [
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet-lib.git",
|
||||
"subdir": "grafonnet"
|
||||
}
|
||||
},
|
||||
"version": "19b27b272abf4263af1365ec485784c49815a332",
|
||||
"sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/jsonnet-libs.git",
|
||||
"subdir": "grafana-builder"
|
||||
}
|
||||
},
|
||||
"version": "14bc3cec71f32ba3cc490a47a22004a744b6bb4e",
|
||||
"sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git",
|
||||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "ff4641bcd83314c955150bea6b147df9ca335c4a",
|
||||
"sum": "oUVGwcCbmdH8qz9B+lbRawI9s23GY9HeW7MwYZRbZ/0="
|
||||
}
|
||||
],
|
||||
"legacyImports": false
|
||||
}
|
@ -1,49 +0,0 @@
|
||||
configmap: grafana-dashboards-kube-mixin
|
||||
gzip: true
|
||||
folder: Kubernetes
|
||||
dashboards:
|
||||
- name: coreDNS
|
||||
url: https://grafana.com/api/dashboards/12539/revisions/5/download
|
||||
tags: ['kubernetes', 'DNS']
|
||||
- name: etcd
|
||||
url: https://grafana.com/api/dashboards/3070/revisions/3/download
|
||||
tags: ['kubernetes', 'etcd']
|
||||
- name: node
|
||||
url: https://grafana.com/api/dashboards/1860/revisions/23/download
|
||||
tags: ['kubernetes']
|
||||
# cd kube-mixin; for f in *.json; do echo "- name: ${f%%.json}" >> ../dashboards.yaml; echo " url: file://kube-mixin/$f" >> ../dashboards.yaml; done; cd -
|
||||
- name: apiserver
|
||||
url: file://kube-mixin/apiserver.json
|
||||
- name: cluster-total
|
||||
url: file://kube-mixin/cluster-total.json
|
||||
- name: controller-manager
|
||||
url: file://kube-mixin/controller-manager.json
|
||||
- name: k8s-resources-cluster
|
||||
url: file://kube-mixin/k8s-resources-cluster.json
|
||||
- name: k8s-resources-namespace
|
||||
url: file://kube-mixin/k8s-resources-namespace.json
|
||||
- name: k8s-resources-node
|
||||
url: file://kube-mixin/k8s-resources-node.json
|
||||
- name: k8s-resources-pod
|
||||
url: file://kube-mixin/k8s-resources-pod.json
|
||||
- name: k8s-resources-workload
|
||||
url: file://kube-mixin/k8s-resources-workload.json
|
||||
- name: k8s-resources-workloads-namespace
|
||||
url: file://kube-mixin/k8s-resources-workloads-namespace.json
|
||||
- name: kubelet
|
||||
url: file://kube-mixin/kubelet.json
|
||||
- name: namespace-by-pod
|
||||
url: file://kube-mixin/namespace-by-pod.json
|
||||
- name: namespace-by-workload
|
||||
url: file://kube-mixin/namespace-by-workload.json
|
||||
- name: persistentvolumesusage
|
||||
url: file://kube-mixin/persistentvolumesusage.json
|
||||
tags: ['kubernetes', 'csi']
|
||||
- name: pod-total
|
||||
url: file://kube-mixin/pod-total.json
|
||||
- name: proxy
|
||||
url: file://kube-mixin/proxy.json
|
||||
- name: scheduler
|
||||
url: file://kube-mixin/scheduler.json
|
||||
- name: workload-total
|
||||
url: file://kube-mixin/workload-total.json
|
@ -88,12 +88,12 @@ for b in config['dashboards']:
|
||||
traverse_json(obj)
|
||||
|
||||
# Set default time in all charts to 1h
|
||||
obj['time'] = { "from": "now-1h", "to": "now" }
|
||||
obj['time'] = {"from": "now-1h", "to": "now"}
|
||||
obj['refresh'] = "30s"
|
||||
|
||||
# set tags
|
||||
if 'tags' in b:
|
||||
obj['tags'] = b['tags']
|
||||
obj['tags'] = b['tags']
|
||||
|
||||
text = json.dumps(obj, indent=2)
|
||||
|
||||
|
84
charts/kubezero-metrics/sync_prometheus_rules.py
Executable file
@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import yaml
|
||||
import requests
|
||||
import textwrap
|
||||
|
||||
config_file = sys.argv[1]
|
||||
configmap_folder = sys.argv[2]
|
||||
|
||||
|
||||
# read config file
|
||||
with open(config_file, 'r') as yaml_contents:
|
||||
config = yaml.safe_load(yaml_contents.read())
|
||||
|
||||
|
||||
def base_rule(config):
|
||||
rule = ''
|
||||
if 'condition' in config:
|
||||
# use index function to make go template happy if '-' in names
|
||||
if '-' in config['condition'] and "index" not in config['condition']:
|
||||
tokens = config['condition'].split('.')
|
||||
rule = '''{{- if index .Values %(condition)s }}
|
||||
''' % {'condition': ' '.join(f'"{w}"' for w in tokens[2:])}
|
||||
|
||||
else:
|
||||
rule = '''{{- if %(condition)s }}
|
||||
''' % config
|
||||
|
||||
# Base rule for KubeZero
|
||||
rule += '''apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%%s-%%s" (include "kubezero-lib.fullname" $) "%(name)s" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kubezero-lib.labels" . | nindent 4 }}
|
||||
''' % config
|
||||
|
||||
return rule
|
||||
|
||||
|
||||
for r in config['rules']:
|
||||
if not r['url'].startswith('file://'):
|
||||
response = requests.get(r['url'])
|
||||
if response.status_code != 200:
|
||||
print('Skipping the file, response code %s is not 200' % response.status_code)
|
||||
continue
|
||||
raw_text = response.text
|
||||
else:
|
||||
with open(r['url'].replace('file://', ''), 'r') as file_contents:
|
||||
raw_text = file_contents.read()
|
||||
|
||||
obj = json.loads(raw_text)
|
||||
|
||||
rule = base_rule(r)
|
||||
|
||||
text = yaml.dump(obj['spec'], default_flow_style=False, width=1000, indent=2)
|
||||
|
||||
# Encode {{ }} for helm
|
||||
text = text.replace("{{", "{{`{{").replace("}}", "}}`}}").replace("{{`{{", "{{`{{`}}").replace("}}`}}", "{{`}}`}}")
|
||||
text = textwrap.indent(text, ' '*2)
|
||||
|
||||
# add support for additionalRuleLabels from Helm values
|
||||
helm_labels = '''{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
'''
|
||||
helm_labels = textwrap.indent(helm_labels, ' '*8)
|
||||
|
||||
text = text.replace("labels:\n", "labels:\n{}".format(helm_labels))
|
||||
|
||||
rule += '''spec:\n'''
|
||||
rule += text+'\n'
|
||||
|
||||
if 'condition' in r:
|
||||
rule += '{{- end }}'+'\n'
|
||||
|
||||
# Write Configmap
|
||||
configmap_file = os.path.join(configmap_folder, r['name'] + '.yaml')
|
||||
with open(configmap_file, 'w') as f:
|
||||
f.write(rule)
|
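For reference, the two text transformations the script applies can be exercised in isolation. This is a minimal standalone sketch, assuming only Python 3; the condition string below is a hypothetical example value, not one taken from the chart:

# Sketch of the brace escaping and the index-based condition guard from sync_prometheus_rules.py.
sample = 'description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod }} failed to reload.'

# Wrap Prometheus template braces in Helm backtick literals, same replace chain as the script.
escaped = sample.replace("{{", "{{`{{").replace("}}", "}}`}}") \
                .replace("{{`{{", "{{`{{`}}").replace("}}`}}", "{{`}}`}}")
print(escaped)
# -> description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} failed to reload.

# Build the `index`-based guard used when a values path contains a dash,
# mirroring the tokens[2:] handling in base_rule(). Hypothetical condition value:
condition = '.Values.kube-prometheus-stack.defaultRules.create'
tokens = condition.split('.')
guard = '{{- if index .Values %s }}' % ' '.join(f'"{w}"' for w in tokens[2:])
print(guard)
# -> {{- if index .Values "kube-prometheus-stack" "defaultRules" "create" }}

Quoting the braces as {{` ... `}} literals lets the upstream rule text, which itself uses Prometheus {{ }} templating, pass through Helm's renderer unchanged; the escaped form is what shows up in the generated templates below.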
@ -1,7 +1,7 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "grafana-dashboards-kube-mixin" | trunc 63 | trimSuffix "-" }}
|
||||
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "grafana-dashboards-k8s" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
|
114
charts/kubezero-metrics/templates/rules/alertmanager.yaml
Normal file
@ -0,0 +1,114 @@
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "alertmanager" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kubezero-lib.labels" . | nindent 4 }}
|
||||
spec:
|
||||
groups:
|
||||
- name: alertmanager.rules
|
||||
rules:
|
||||
- alert: AlertmanagerFailedReload
|
||||
annotations:
|
||||
description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
|
||||
summary: Reloading an Alertmanager configuration has failed.
|
||||
expr: '# Without max_over_time, failed scrapes could create false negatives, see
|
||||
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
|
||||
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerMembersInconsistent
|
||||
annotations:
|
||||
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
|
||||
summary: A member of an Alertmanager cluster has not found all other cluster members.
|
||||
expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n< on (namespace,service) group_left\n count by (namespace,service) (max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]))\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerFailedToSendAlerts
|
||||
annotations:
|
||||
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
|
||||
summary: An Alertmanager instance failed to send notifications.
|
||||
expr: "(\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n)\n> 0.01\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
|
||||
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
|
||||
expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n)\n> 0.01\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
|
||||
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
|
||||
expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n)\n> 0.01\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: AlertmanagerConfigInconsistent
|
||||
annotations:
|
||||
description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
|
||||
summary: Alertmanager instances within the same cluster have different configurations.
|
||||
expr: "count by (namespace,service) (\n count_values by (namespace,service) (\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\",namespace=\"monitoring\"})\n)\n!= 1\n"
|
||||
for: 20m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerClusterDown
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
|
||||
summary: Half or more of the Alertmanager instances within the same cluster are down.
|
||||
expr: "(\n count by (namespace,service) (\n avg_over_time(up{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]) < 0.5\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: AlertmanagerClusterCrashlooping
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
|
||||
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
|
||||
expr: "(\n count by (namespace,service) (\n changes(process_start_time_seconds{job=\"alertmanager-main\",namespace=\"monitoring\"}[10m]) > 4\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
|
195
charts/kubezero-metrics/templates/rules/etcd-mixin.yaml
Normal file
@ -0,0 +1,195 @@
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "etcd-mixin" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kubezero-lib.labels" . | nindent 4 }}
|
||||
spec:
|
||||
groups:
|
||||
- name: etcd
|
||||
rules:
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
|
||||
summary: etcd cluster members are down.
|
||||
expr: "max without (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n"
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
|
||||
summary: etcd cluster has insufficient number of members.
|
||||
expr: 'sum(up{job=~".*etcd.*"} == bool 1) without (instance, pod) < ((count(up{job=~".*etcd.*"}) without (instance, pod) + 1) / 2)
|
||||
|
||||
'
|
||||
for: 3m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
|
||||
summary: etcd cluster has no leader.
|
||||
expr: 'etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
|
||||
'
|
||||
for: 1m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
summary: etcd cluster has high number of leader changes.
|
||||
expr: 'increase((max without (instance, pod) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
||||
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: "100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 1\n"
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: "100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 5\n"
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd grpc requests are slow
|
||||
expr: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
|
||||
> 0.15
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster member communication is slow.
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
|
||||
> 0.15
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster has high number of proposal failures.
|
||||
expr: 'rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
|
||||
> 0.5
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
|
||||
> 1
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster 99th percentile commit durations are too high.
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
|
||||
> 0.25
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: etcdBackendQuotaLowSpace
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
|
||||
expr: '(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.'
|
||||
expr: 'increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
charts/kubezero-metrics/templates/rules/kube-prometheus.yaml (new file, 81 lines)
@@ -0,0 +1,81 @@
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "kube-prometheus" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kubezero-lib.labels" . | nindent 4 }}
|
||||
spec:
|
||||
groups:
|
||||
- name: general.rules
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
||||
summary: One or more targets are unreachable.
|
||||
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
|
||||
'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
||||
summary: An alert that should always be firing to certify that Alertmanager is working properly.
|
||||
expr: vector(1)
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: none
|
||||
- name: node-network
|
||||
rules:
|
||||
- alert: NodeNetworkInterfaceFlapping
|
||||
annotations:
|
||||
description: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
|
||||
summary: Network interface is often changing its status
|
||||
expr: 'changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
|
||||
|
||||
'
|
||||
for: 2m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- name: kube-prometheus-node-recording.rules
|
||||
rules:
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
|
||||
record: instance:node_cpu:rate:sum
|
||||
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_receive_bytes:rate:sum
|
||||
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_transmit_bytes:rate:sum
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
|
||||
record: instance:node_cpu:ratio
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
|
||||
record: cluster:node_cpu:sum_rate5m
|
||||
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
||||
record: cluster:node_cpu:ratio
|
||||
- name: kube-prometheus-general.rules
|
||||
rules:
|
||||
- expr: count without(instance, pod, node) (up == 1)
|
||||
record: count:up1
|
||||
- expr: count without(instance, pod, node) (up == 0)
|
||||
record: count:up0
|
||||
charts/kubezero-metrics/templates/rules/kube-state-metrics.yaml (new file, 62 lines)
@@ -0,0 +1,62 @@
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "kube-state-metrics" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kubezero-lib.labels" . | nindent 4 }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-state-metrics
|
||||
rules:
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
|
||||
summary: kube-state-metrics is experiencing errors in list operations.
|
||||
expr: "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
|
||||
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||
expr: "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsShardingMismatch
|
||||
annotations:
|
||||
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
|
||||
summary: kube-state-metrics sharding is misconfigured.
|
||||
expr: 'stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
|
||||
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsShardsMissing
|
||||
annotations:
|
||||
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
|
||||
summary: kube-state-metrics shards are missing.
|
||||
expr: "2^max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1\n -\nsum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"}) )\n!= 0\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
charts/kubezero-metrics/templates/rules/kubernetes.yaml (new file, 1174 lines)
File diff suppressed because it is too large
charts/kubezero-metrics/templates/rules/node-exporter.yaml (new file, 273 lines)
@@ -0,0 +1,273 @@
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "node-exporter" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kubezero-lib.labels" . | nindent 4 }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter
|
||||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 30m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 30m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n"
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: 'rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
|
||||
|
||||
'
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: 'rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
|
||||
|
||||
'
|
||||
for: 1h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack entries is getting close to the limit.
|
||||
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description: Node Exporter text file collector failed to scrape.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: 'node_textfile_scrape_error{job="node-exporter"} == 1
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: 'min_over_time(node_timex_sync_status[5m]) == 0
|
||||
|
||||
and
|
||||
|
||||
node_timex_maxerror_seconds >= 16
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||
summary: RAID Array is degraded
|
||||
expr: 'node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
|
||||
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||
summary: Failed device in RAID array
|
||||
expr: 'node_md_disks{state="failed"} > 0
|
||||
|
||||
'
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- name: node-exporter.rules
|
||||
rules:
|
||||
- expr: "count without (cpu, mode) (\n node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}\n)\n"
|
||||
record: instance:node_num_cpu:sum
|
||||
- expr: "1 - avg without (cpu, mode) (\n rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[5m])\n)\n"
|
||||
record: instance:node_cpu_utilisation:rate5m
|
||||
- expr: "(\n node_load1{job=\"node-exporter\"}\n/\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n"
|
||||
record: instance:node_load1_per_cpu:ratio
|
||||
- expr: "1 - (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"}\n +\n node_memory_Cached_bytes{job=\"node-exporter\"}\n +\n node_memory_MemFree_bytes{job=\"node-exporter\"}\n +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n"
|
||||
record: instance:node_memory_utilisation:ratio
|
||||
- expr: 'rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
|
||||
|
||||
'
|
||||
record: instance:node_vmstat_pgmajfault:rate5m
|
||||
- expr: 'rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
|
||||
|
||||
'
|
||||
record: instance_device:node_disk_io_time_seconds:rate5m
|
||||
- expr: 'rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
|
||||
|
||||
'
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
|
||||
- expr: "sum without (device) (\n rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n"
|
||||
record: instance:node_network_receive_bytes_excluding_lo:rate5m
|
||||
- expr: "sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n"
|
||||
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
|
||||
- expr: "sum without (device) (\n rate(node_network_receive_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n"
|
||||
record: instance:node_network_receive_drop_excluding_lo:rate5m
|
||||
- expr: "sum without (device) (\n rate(node_network_transmit_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n"
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate5m
|
||||
charts/kubezero-metrics/templates/rules/prometheus-operator.yaml (new file, 110 lines)
@@ -0,0 +1,110 @@
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "prometheus-operator" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kubezero-lib.labels" . | nindent 4 }}
|
||||
spec:
|
||||
groups:
|
||||
- name: prometheus-operator
|
||||
rules:
|
||||
- alert: PrometheusOperatorListErrors
|
||||
annotations:
|
||||
description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
|
||||
summary: Errors while performing list operations in controller.
|
||||
expr: '(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
|
||||
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorWatchErrors
|
||||
annotations:
|
||||
description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
|
||||
summary: Errors while performing watch operations in controller.
|
||||
expr: '(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
|
||||
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorSyncFailed
|
||||
annotations:
|
||||
description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed
|
||||
summary: Last controller reconciliation failed
|
||||
expr: 'min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorReconcileErrors
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
|
||||
summary: Errors while reconciling controller.
|
||||
expr: '(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorNodeLookupErrors
|
||||
annotations:
|
||||
description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors
|
||||
summary: Errors while reconciling Prometheus.
|
||||
expr: 'rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorNotReady
|
||||
annotations:
|
||||
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
|
||||
summary: Prometheus operator not ready
|
||||
expr: 'min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
|
||||
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorRejectedResources
|
||||
annotations:
|
||||
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources
|
||||
summary: Resources rejected by Prometheus operator
|
||||
expr: 'min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
|
||||
|
||||
'
|
||||
for: 5m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
charts/kubezero-metrics/templates/rules/prometheus.yaml (new file, 258 lines)
@@ -0,0 +1,258 @@
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "prometheus" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kubezero-lib.labels" . | nindent 4 }}
|
||||
spec:
|
||||
groups:
|
||||
- name: prometheus
|
||||
rules:
|
||||
- alert: PrometheusBadConfig
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
|
||||
summary: Failed Prometheus configuration reload.
|
||||
expr: '# Without max_over_time, failed scrapes could create false negatives, see
|
||||
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
|
||||
max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: PrometheusNotificationQueueRunningFull
|
||||
annotations:
|
||||
description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
|
||||
summary: Prometheus alert notification queue predicted to run full in less than 30m.
|
||||
expr: "# Without min_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m], 60 * 30)\n>\n min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
|
||||
annotations:
|
||||
description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
|
||||
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
|
||||
expr: "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusNotConnectedToAlertmanagers
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
|
||||
summary: Prometheus is not connected to any Alertmanagers.
|
||||
expr: '# Without max_over_time, failed scrapes could create false negatives, see
|
||||
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
|
||||
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusTSDBReloadsFailing
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
|
||||
summary: Prometheus has issues reloading blocks from disk.
|
||||
expr: 'increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
|
||||
|
||||
'
|
||||
for: 4h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusTSDBCompactionsFailing
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
|
||||
summary: Prometheus has issues compacting blocks.
|
||||
expr: 'increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
|
||||
|
||||
'
|
||||
for: 4h
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusNotIngestingSamples
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
|
||||
summary: Prometheus is not ingesting samples.
|
||||
expr: "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) <= 0\nand\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n )\n)\n"
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusDuplicateTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
|
||||
summary: Prometheus is dropping samples with duplicate timestamps.
|
||||
expr: 'rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusOutOfOrderTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
|
||||
summary: Prometheus drops samples with out-of-order timestamps.
|
||||
expr: 'rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||
|
||||
'
|
||||
for: 10m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusRemoteStorageFailures
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
|
||||
summary: Prometheus fails to send samples to remote storage.
|
||||
expr: "(\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n/\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n +\n (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n )\n)\n* 100\n> 1\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: PrometheusRemoteWriteBehind
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
|
||||
summary: Prometheus remote write is behind.
|
||||
expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n- ignoring(remote_name, url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n> 120\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: PrometheusRemoteWriteDesiredShards
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value {{`}}`}}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
|
||||
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
|
||||
expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n>\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n"
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: warning
|
||||
- alert: PrometheusRuleFailures
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
|
||||
summary: Prometheus is failing rule evaluations.
|
||||
expr: 'increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||
|
||||
'
|
||||
for: 15m
|
||||
labels:
|
||||
{{- if .Values.additionalRuleLabels }}
|
||||
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
severity: critical
|
||||
- alert: PrometheusMissingRuleEvaluations
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
|
||||
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
||||
      expr: 'increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0

        '
      for: 15m
      labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
        severity: warning
    - alert: PrometheusTargetLimitHit
      annotations:
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
      expr: 'increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0

        '
      for: 15m
      labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
        severity: warning
    - alert: PrometheusLabelLimitHit
      annotations:
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
        summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
      expr: 'increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0

        '
      for: 15m
      labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
        severity: warning
    - alert: PrometheusTargetSyncFailure
      annotations:
        description: '{{`{{`}} printf "%.0f" $value {{`}}`}} targets in Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} have failed to sync because invalid configuration was supplied.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
        summary: Prometheus has failed to sync targets.
      expr: 'increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0

        '
      for: 5m
      labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
        severity: critical
    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
      annotations:
        description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
      expr: "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n)\n* 100\n> 3\n"
      for: 15m
      labels:
{{- if .Values.additionalRuleLabels }}
{{- toYaml .Values.additionalRuleLabels | nindent 8 }}
{{- end }}
        severity: critical
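These synced rule templates escape Prometheus's own template braces from Helm with {{` ... `}} and splice .Values.additionalRuleLabels into every alert via nindent 8. As a rough illustration only (not part of the commit), one entry would render roughly as follows if the commented clusterName/awsRegion example from values.yaml were enabled:

# Illustrative rendered output; assumes additionalRuleLabels carries clusterName and awsRegion
- alert: PrometheusTargetLimitHit
  annotations:
    description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
    summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
  expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
  for: 15m
  labels:
    clusterName: myTestCluster
    awsRegion: eu-central-1
    severity: warning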
@ -9,20 +9,20 @@ helm pull prometheus-community/kube-prometheus-stack --untar --untardir charts -
rm -rf charts/prometheus-pushgateway
helm pull prometheus-community/prometheus-pushgateway --untar --untardir charts --version $PG_VER

# The grpc alerts could be re-enabled with etcd 3.5
# https://github.com/etcd-io/etcd/pull/12196
patch -p0 -i zdt.patch --no-backup-if-mismatch

patch -p0 -i zdt-pushgateway.patch --no-backup-if-mismatch

# Create ZDT dashboard configmap
cd dashboards
# Create ZDT dashboard, alerts etc configmaps
cd configs
./build.sh

../sync_grafana_dashboards.py metrics-dashboards.yaml ../templates/grafana-dashboards-metrics.yaml
../sync_grafana_dashboards.py k8s-dashboards.yaml ../templates/grafana-dashboards-k8s.yaml
../sync_grafana_dashboards.py zdt-dashboards.yaml ../templates/grafana-dashboards-zdt.yaml

../sync_prometheus_rules.py k8s-rules.yaml ../templates/rules
cd -

# Delete not used upstream dashboards
rm -rf charts/kube-prometheus-stack/templates/grafana/dashboards-1.14
# Delete not used upstream dashboards or rules
rm -rf charts/kube-prometheus-stack/templates/grafana/dashboards-1.14 charts/kube-prometheus-stack/templates/prometheus/rules-1.14
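For context, sync_grafana_dashboards.py turns each dashboards list into a Helm template whose JSON is shipped as ConfigMaps picked up by the Grafana dashboard sidecar. A rough sketch of the shape of such a generated object; the name and JSON payload below are placeholders, not taken from this commit:

# Hypothetical sketch only; real names, labels and data come from the sync script
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-zdt
  labels:
    grafana_dashboard: "1"
data:
  zdt-home.json: |-
    { "title": "ZDT Home", "panels": [] }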
@ -1,16 +1,14 @@
#additionalRuleLabels:
#  clusterName: myTestCluster
#  awsRegion: eu-central-1

kube-prometheus-stack:
  global:
    rbac:
      pspEnabled: false

  defaultRules:
    create: true
    # Disable kubeapi error budget rules for now as they are buggy
    rules:
      kubeApiserverSlos: false
    #additionalRuleLabels:
    #  clusterName: myTestCluster
    #  awsRegion: eu-central-1
    create: false

  coreDns:
    enabled: true
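With the bundled rules now coming from the jsonnet-generated templates, the upstream chart's defaultRules.create is switched off. A minimal sketch of a cluster-specific values override, assuming you wanted the upstream rules back while keeping the buggy kubeapi error-budget rules disabled:

# Sketch of a possible per-cluster override; not shipped by this chart
kube-prometheus-stack:
  defaultRules:
    create: true
    rules:
      kubeApiserverSlos: false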
@ -18,62 +18,3 @@ diff -tubr charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl charts
{{- end}}
- name: {{ .Chart.Name }}
{{- if .Values.image.sha }}
diff -tubr charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/etcd.yaml
--- charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml 2021-06-26 19:54:57.000000000 +0200
+++ charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/etcd.yaml 2021-07-01 13:31:20.750609396 +0200
@@ -54,34 +54,6 @@
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- - alert: etcdHighNumberOfFailedGRPCRequests
- annotations:
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
- expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
- /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
- > 1
- for: 10m
- labels:
- severity: warning
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
- - alert: etcdHighNumberOfFailedGRPCRequests
- annotations:
- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
- expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
- /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
- > 5
- for: 5m
- labels:
- severity: critical
-{{- if .Values.defaultRules.additionalRuleLabels }}
-{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
-{{- end }}
- alert: etcdGRPCRequestsSlow
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
diff -tubr charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/node-exporter.yaml
--- charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml 2021-06-26 19:54:57.000000000 +0200
+++ charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/node-exporter.yaml 2021-07-01 13:31:20.753942729 +0200
@@ -31,7 +31,7 @@
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 25
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
@@ -50,7 +50,7 @@
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
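After zdt.patch is applied, the node-exporter filesystem predictions only fire below 25% free for the 24h window and 10% for the 4h window, instead of the upstream 40%/15%. A sketch of the patched 24h expression; the remaining conditions of the upstream rule are not shown in the hunk above and are omitted here as well:

# Sketch of the patched threshold only; further clauses of the upstream rule elided
expr: |-
  (
    node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 25
  and
    predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
  )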