diff --git a/charts/kubezero-metrics/.gitignore b/charts/kubezero-metrics/.gitignore
deleted file mode 100644
index 26187bf2..00000000
--- a/charts/kubezero-metrics/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-dashboards/vendor
-dashboards/jb-linux-amd64
diff --git a/charts/kubezero-metrics/.helmignore b/charts/kubezero-metrics/.helmignore
index 150ee457..c85ebc47 100644
--- a/charts/kubezero-metrics/.helmignore
+++ b/charts/kubezero-metrics/.helmignore
@@ -26,4 +26,4 @@ README.md.gotmpl
 *.patch
 *.sh
 *.py
-dashboards
+configs
diff --git a/charts/kubezero-metrics/Chart.yaml b/charts/kubezero-metrics/Chart.yaml
index 27429e19..6a3e45d9 100644
--- a/charts/kubezero-metrics/Chart.yaml
+++ b/charts/kubezero-metrics/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-metrics
 description: KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
 type: application
-version: 0.4.8
+version: 0.5.1
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:
diff --git a/charts/kubezero-metrics/README.md b/charts/kubezero-metrics/README.md
index a0e52796..9c6f50e4 100644
--- a/charts/kubezero-metrics/README.md
+++ b/charts/kubezero-metrics/README.md
@@ -1,8 +1,8 @@
 # kubezero-metrics
 
-![Version: 0.4.1](https://img.shields.io/badge/Version-0.4.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
+![Version: 0.5.1](https://img.shields.io/badge/Version-0.5.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
 
-KubeZero Umbrella Chart for prometheus-operator
+KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
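The 0.4.8 → 0.5.1 bump goes together with the dependency changes listed in the README table further down (kube-prometheus-stack 18.1.0, a new optional prometheus-pushgateway 1.10.1, prometheus-adapter 2.17, kubezero-lib >= 0.1.4). A rough sketch of what the updated `dependencies` block in Chart.yaml would contain — only the names, versions and repository URLs are taken from this diff; the condition flags are assumptions:

```yaml
# Sketch only: names/versions/repositories come from the README dependency table in this diff;
# the condition keys are assumed, not confirmed by the patch.
dependencies:
  - name: kubezero-lib
    version: ">= 0.1.4"
    repository: https://zero-down-time.github.io/kubezero/
  - name: kube-prometheus-stack
    version: 18.1.0          # vendored locally; no repository URL shown in the README table
  - name: prometheus-adapter
    version: "2.17"
    repository: https://prometheus-community.github.io/helm-charts
    condition: prometheus-adapter.enabled        # assumed flag
  - name: prometheus-pushgateway
    version: 1.10.1          # vendored locally
    condition: prometheus-pushgateway.enabled    # matches the new default prometheus-pushgateway.enabled=false
```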
**Homepage:** @@ -18,9 +18,10 @@ Kubernetes: `>= 1.18.0` | Repository | Name | Version | |------------|------|---------| -| | kube-prometheus-stack | 16.12.0 | -| https://prometheus-community.github.io/helm-charts | prometheus-adapter | 2.14.2 | -| https://zero-down-time.github.io/kubezero/ | kubezero-lib | >= 0.1.3 | +| | kube-prometheus-stack | 18.1.0 | +| | prometheus-pushgateway | 1.10.1 | +| https://prometheus-community.github.io/helm-charts | prometheus-adapter | 2.17 | +| https://zero-down-time.github.io/kubezero/ | kubezero-lib | >= 0.1.4 | ## Values @@ -41,10 +42,42 @@ Kubernetes: `>= 1.18.0` | istio.prometheus.gateway | string | `"istio-ingress/ingressgateway"` | | | istio.prometheus.ipBlocks | list | `[]` | | | istio.prometheus.url | string | `""` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[0].name | string | `"SNS_FORWARDER_ARN_PREFIX"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[0].valueFrom.fieldRef.fieldPath | string | `"metadata.annotations['kubezero.com/sns_forwarder_ARN_PREFIX']"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[1].name | string | `"AWS_ROLE_ARN"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[1].valueFrom.fieldRef.fieldPath | string | `"metadata.annotations['kubezero.com/sns_forwarder_AWS_ROLE_ARN']"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[2].name | string | `"AWS_WEB_IDENTITY_TOKEN_FILE"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[2].value | string | `"/var/run/secrets/sts.amazonaws.com/serviceaccount/token"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[3].name | string | `"AWS_STS_REGIONAL_ENDPOINTS"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].env[3].value | string | `"regional"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].image | string | `"datareply/alertmanager-sns-forwarder:latest"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].imagePullPolicy | string | `"Always"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].livenessProbe.httpGet.path | string | `"/health"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].livenessProbe.httpGet.port | string | `"webhook-port"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].livenessProbe.initialDelaySeconds | int | `30` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].livenessProbe.timeoutSeconds | int | `10` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].name | string | `"alertmanager-sns-forwarder"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].ports[0].containerPort | int | `9087` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].ports[0].name | string | `"webhook-port"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].readinessProbe.httpGet.path | string | `"/health"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].readinessProbe.httpGet.port | string | `"webhook-port"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].readinessProbe.initialDelaySeconds | int | `10` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].readinessProbe.timeoutSeconds | int | `10` | | +| 
kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].resources.limits.cpu | string | `"100m"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].resources.limits.memory | string | `"64Mi"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].resources.requests.cpu | string | `"25m"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].resources.requests.memory | string | `"32Mi"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].volumeMounts[0].mountPath | string | `"/var/run/secrets/sts.amazonaws.com/serviceaccount/"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].volumeMounts[0].name | string | `"aws-token"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.containers[0].volumeMounts[0].readOnly | bool | `true` | | | kube-prometheus-stack.alertmanager.alertmanagerSpec.logFormat | string | `"json"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.volumes[0].name | string | `"aws-token"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.volumes[0].projected.sources[0].serviceAccountToken.audience | string | `"sts.amazonaws.com"` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.volumes[0].projected.sources[0].serviceAccountToken.expirationSeconds | int | `86400` | | +| kube-prometheus-stack.alertmanager.alertmanagerSpec.volumes[0].projected.sources[0].serviceAccountToken.path | string | `"token"` | | | kube-prometheus-stack.alertmanager.enabled | bool | `false` | | | kube-prometheus-stack.coreDns.enabled | bool | `true` | | -| kube-prometheus-stack.defaultRules.create | bool | `true` | | +| kube-prometheus-stack.defaultRules.create | bool | `false` | | | kube-prometheus-stack.global.rbac.pspEnabled | bool | `false` | | | kube-prometheus-stack.grafana."grafana.ini"."auth.anonymous".enabled | bool | `true` | | | kube-prometheus-stack.grafana."grafana.ini".alerting.enabled | bool | `false` | | @@ -113,7 +146,6 @@ Kubernetes: `>= 1.18.0` | kube-prometheus-stack.prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues | bool | `false` | | | kube-prometheus-stack.prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.accessModes[0] | string | `"ReadWriteOnce"` | | | kube-prometheus-stack.prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage | string | `"16Gi"` | | -| kube-prometheus-stack.prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.storageClassName | string | `"ebs-sc-gp3-xfs"` | | | kube-prometheus-stack.prometheus.prometheusSpec.walCompression | bool | `true` | | | kube-prometheus-stack.prometheusOperator.admissionWebhooks.patch.nodeSelector."node-role.kubernetes.io/master" | string | `""` | | | kube-prometheus-stack.prometheusOperator.admissionWebhooks.patch.tolerations[0].effect | string | `"NoSchedule"` | | @@ -146,6 +178,8 @@ Kubernetes: `>= 1.18.0` | prometheus-adapter.rules.resource.window | string | `"5m"` | | | prometheus-adapter.tolerations[0].effect | string | `"NoSchedule"` | | | prometheus-adapter.tolerations[0].key | string | `"node-role.kubernetes.io/master"` | | +| prometheus-pushgateway.enabled | bool | `false` | | +| prometheus-pushgateway.serviceMonitor.enabled | bool | `true` | | # Dashboards @@ -153,3 +187,5 @@ Kubernetes: `>= 1.18.0` - https://grafana.com/api/dashboards/9578/revisions/4/download ## Prometheus - https://grafana.com/api/dashboards/3662/revisions/2/download +## AlertManager SNS Forwarder +- 
https://github.com/DataReply/alertmanager-sns-forwarder diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml deleted file mode 100644 index 34eecbe2..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml +++ /dev/null @@ -1,175 +0,0 @@ -{{- /* -Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }} -{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: alertmanager.rules - rules: - - alert: AlertmanagerFailedReload - annotations: - description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedreload - summary: Reloading an Alertmanager configuration has failed. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerMembersInconsistent - annotations: - description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagermembersinconsistent - summary: A member of an Alertmanager cluster has not found all other cluster members. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) - < on (namespace,service) group_left - count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerFailedToSendAlerts - annotations: - description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedtosendalerts - summary: An Alertmanager instance failed to send notifications. - expr: |- - ( - rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) - / - rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerClusterFailedToSendAlerts - annotations: - description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts - summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. - expr: |- - min by (namespace,service, integration) ( - rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) - / - rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerClusterFailedToSendAlerts - annotations: - description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts - summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. - expr: |- - min by (namespace,service, integration) ( - rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) - / - rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerConfigInconsistent - annotations: - description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations. 
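These vendored Alertmanager rule templates are removed from the chart, and the README values change above also flips `kube-prometheus-stack.defaultRules.create` to `false`, so any of these alerts that are still wanted have to be provided separately. For reference, this is roughly what one of the deleted templates renders to once Helm resolves the job and namespace variables — the concrete job name and namespace below are hypothetical placeholders, not values taken from this diff:

```yaml
# Rendered sketch of the AlertmanagerFailedReload rule above. The job name and the
# "monitoring" namespace are hypothetical; expression, duration and severity are
# copied from the deleted template.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: metrics-alertmanager.rules   # hypothetical name
  namespace: monitoring              # hypothetical namespace
spec:
  groups:
    - name: alertmanager.rules
      rules:
        - alert: AlertmanagerFailedReload
          expr: |-
            # Without max_over_time, failed scrapes could create false negatives.
            max_over_time(alertmanager_config_last_reload_successful{job="metrics-alertmanager",namespace="monitoring"}[5m]) == 0
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: Reloading an Alertmanager configuration has failed.
```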
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerconfiginconsistent - summary: Alertmanager instances within the same cluster have different configurations. - expr: |- - count by (namespace,service) ( - count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) - ) - != 1 - for: 20m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerClusterDown - annotations: - description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterdown - summary: Half or more of the Alertmanager instances within the same cluster are down. - expr: |- - ( - count by (namespace,service) ( - avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5 - ) - / - count by (namespace,service) ( - up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerClusterCrashlooping - annotations: - description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclustercrashlooping - summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. - expr: |- - ( - count by (namespace,service) ( - changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4 - ) - / - count by (namespace,service) ( - up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml deleted file mode 100644 index 53995c5f..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml +++ /dev/null @@ -1,151 +0,0 @@ -{{- /* -Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/etcd3_alert.rules.yml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) 
"etcd" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: etcd - rules: - - alert: etcdInsufficientMembers - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' - expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) - for: 3m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdNoLeader - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.' - expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 - for: 1m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfLeaderChanges - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.' - expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdGRPCRequestsSlow - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le)) - > 0.15 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdMemberCommunicationSlow - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.15 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedProposals - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' 
- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighFsyncDurations - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.5 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighCommitDurations - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.25 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedHTTPRequests - annotations: - message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}' - expr: |- - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - BY (method) > 0.01 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedHTTPRequests - annotations: - message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - BY (method) > 0.05 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHTTPRequestsSlow - annotations: - message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow. - expr: |- - histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml deleted file mode 100644 index e41175f5..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml +++ /dev/null @@ -1,60 +0,0 @@ -{{- /* -Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml -Do not change in-place! 
In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: general.rules - rules: - - alert: TargetDown - annotations: - description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-targetdown - summary: One or more targets are unreachable. - expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: Watchdog - annotations: - description: 'This is an alert meant to ensure that the entire alerting pipeline is functional. - - This alert is always firing, therefore it should always be firing in Alertmanager - - and always fire against a receiver. There are integrations with various notification - - mechanisms that send a notification when this alert is not firing. For example the - - "DeadMansSnitch" integration in PagerDuty. - - ' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-watchdog - summary: An alert that should always be firing to certify that Alertmanager is working properly. - expr: vector(1) - labels: - severity: none -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml deleted file mode 100644 index 86b46eaf..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml +++ /dev/null @@ -1,163 +0,0 @@ -{{- /* -Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! 
In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: k8s.rules - rules: - - expr: |- - sum by (cluster, namespace, pod, container) ( - irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) - ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( - 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - - expr: |- - container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_working_set_bytes - - expr: |- - container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_rss - - expr: |- - container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_cache - - expr: |- - container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_swap - - expr: |- - kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} - ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_memory:kube_pod_container_resource_requests:sum - - expr: |- - kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max 
by (namespace, pod) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} - ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_cpu:kube_pod_container_resource_requests:sum - - expr: |- - kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} - ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_memory:kube_pod_container_resource_limits:sum - - expr: |- - kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} - ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_cpu:kube_pod_container_resource_limits:sum - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( - 1, max by (replicaset, namespace, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics"} - ) - ), - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: deployment - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: daemonset - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: statefulset - record: namespace_workload_pod:kube_pod_owner:relabel -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml deleted file mode 100644 index 9de49f17..00000000 --- 
a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml +++ /dev/null @@ -1,128 +0,0 @@ -{{- /* -Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverAvailability }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-availability.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - interval: 3m - name: kube-apiserver-availability.rules - rules: - - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 - record: code_verb:apiserver_request_total:increase30d - - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) - labels: - verb: read - record: code:apiserver_request_total:increase30d - - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - labels: - verb: write - record: code:apiserver_request_total:increase30d - - expr: |- - 1 - ( - ( - # write too slow - sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) + - ( - # read too slow - sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) - - - ( - ( - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="1"}[30d])) - or - vector(0) - ) - + - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="5"}[30d])) - + - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="40"}[30d])) - ) - ) + - # errors - sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) - ) - / - sum by (cluster) (code:apiserver_request_total:increase30d) - labels: - verb: all - record: apiserver_request:availability30d - - expr: |- - 1 - ( - sum by (cluster) (increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) - - - ( - # too slow - ( - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30d])) - or - vector(0) - ) - + - sum by (cluster) 
(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30d])) - + - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30d])) - ) - + - # errors - sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) - ) - / - sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) - labels: - verb: read - record: apiserver_request:availability30d - - expr: |- - 1 - ( - ( - # too slow - sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) - + - # errors - sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) - ) - / - sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) - labels: - verb: write - record: apiserver_request:availability30d - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: code_resource:apiserver_request_total:rate5m - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: code_resource:apiserver_request_total:rate5m - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) - record: code_verb:apiserver_request_total:increase1h -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-burnrate.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-burnrate.rules.yaml deleted file mode 100644 index f2b8563b..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-burnrate.rules.yaml +++ /dev/null @@ -1,328 +0,0 @@ -{{- /* -Generated from 'kube-apiserver-burnrate.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) 
"kube-apiserver-burnrate.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-apiserver-burnrate.rules - rules: - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) - labels: - verb: read - record: apiserver_request:burnrate1d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) - labels: - verb: read - record: apiserver_request:burnrate1h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) - labels: - verb: read - record: apiserver_request:burnrate2h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m])) - or - vector(0) - ) - + - sum by (cluster) 
(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) - labels: - verb: read - record: apiserver_request:burnrate30m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) - labels: - verb: read - record: apiserver_request:burnrate3d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: apiserver_request:burnrate5m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) - labels: - verb: read - record: apiserver_request:burnrate6h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) - ) - / - sum by (cluster) 
(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - labels: - verb: write - record: apiserver_request:burnrate1d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - labels: - verb: write - record: apiserver_request:burnrate1h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - labels: - verb: write - record: apiserver_request:burnrate2h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - labels: - verb: write - record: apiserver_request:burnrate30m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - labels: - verb: write - record: apiserver_request:burnrate3d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: apiserver_request:burnrate5m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) - ) - / - sum by (cluster) 
(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - labels: - verb: write - record: apiserver_request:burnrate6h -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-histogram.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-histogram.rules.yaml deleted file mode 100644 index f5886a15..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-histogram.rules.yaml +++ /dev/null @@ -1,49 +0,0 @@ -{{- /* -Generated from 'kube-apiserver-histogram.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-histogram.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-apiserver-histogram.rules - rules: - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 - labels: - quantile: '0.99' - verb: read - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 - labels: - quantile: '0.99' - verb: write - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: '0.99' - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: '0.9' - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: '0.5' - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile -{{- end }} \ No newline at end of file diff --git 
a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml deleted file mode 100644 index 22ede051..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml +++ /dev/null @@ -1,95 +0,0 @@ -{{- /* -Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-slos" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-apiserver-slos - rules: - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) - and - sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m - labels: - long: 1h - severity: critical - short: 5m -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) - and - sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) - for: 15m - labels: - long: 6h - severity: critical - short: 30m -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. 
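The removed kube-apiserver-slos group implements multiwindow, multi-burn-rate alerting on the apiserver_request:burnrate<window> series recorded by the kube-apiserver.rules group (deleted further below): each KubeAPIErrorBudgetBurn alert pairs a long and a short window (1h/5m at 14.4x, 6h/30m at 6x, 1d/2h at 3x, 3d/6h at 1x the 1% error budget), so it only fires while the budget is still actively being burned. With the 30-day SLO window these factors are conventionally derived from, a burn rate of 14.4 held for one hour consumes roughly 2% of the budget (14.4 / 720 ≈ 0.02). A condensed sketch of one recording/alerting pair, stripped of the Helm templating for readability:

    groups:
      - name: apiserver-burnrate-sketch
        rules:
          # share of write requests over the last hour that were slower than 1s or returned 5xx
          - record: apiserver_request:burnrate1h
            expr: |
              (
                sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
                -
                sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
                +
                sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
              )
              /
              sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
            labels:
              verb: write
          # fire only while both the long and the short window exceed 14.4x the 1% budget rate
          - alert: KubeAPIErrorBudgetBurn
            expr: |
              sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
              and
              sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
            for: 2m
            labels:
              long: 1h
              severity: critical
              short: 5m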
- expr: |- - sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) - and - sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h - labels: - long: 1d - severity: warning - short: 2h -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) - and - sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h - labels: - long: 3d - severity: warning - short: 6h -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml deleted file mode 100644 index 4bb373d6..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml +++ /dev/null @@ -1,358 +0,0 @@ -{{- /* -Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . 
| indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-apiserver.rules - rules: - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) - labels: - verb: read - record: apiserver_request:burnrate1d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) - labels: - verb: read - record: apiserver_request:burnrate1h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) - labels: - verb: read - record: apiserver_request:burnrate2h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) - ) - ) - + - # errors - sum by (cluster) 
(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) - labels: - verb: read - record: apiserver_request:burnrate30m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) - labels: - verb: read - record: apiserver_request:burnrate3d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: apiserver_request:burnrate5m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) - + - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) - labels: - verb: read - record: apiserver_request:burnrate6h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - labels: - verb: write - record: apiserver_request:burnrate1d - - expr: |- - ( - ( - # too slow - sum by (cluster) 
(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - labels: - verb: write - record: apiserver_request:burnrate1h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - labels: - verb: write - record: apiserver_request:burnrate2h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - labels: - verb: write - record: apiserver_request:burnrate30m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - labels: - verb: write - record: apiserver_request:burnrate3d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: apiserver_request:burnrate5m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - - - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - labels: - verb: write - record: apiserver_request:burnrate6h - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: 
read - record: code_resource:apiserver_request_total:rate5m - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: code_resource:apiserver_request_total:rate5m - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 - labels: - quantile: '0.99' - verb: read - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 - labels: - quantile: '0.99' - verb: write - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: '0.99' - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: '0.9' - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: '0.5' - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml deleted file mode 100644 index 1b95d214..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml +++ /dev/null @@ -1,31 +0,0 @@ -{{- /* -Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusGeneral }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-general.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . 
| indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-prometheus-general.rules - rules: - - expr: count without(instance, pod, node) (up == 1) - record: count:up1 - - expr: count without(instance, pod, node) (up == 0) - record: count:up0 -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml deleted file mode 100644 index 2d38ef86..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -{{- /* -Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . 
| indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) - record: instance:node_cpu:rate:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - record: cluster:node_cpu:ratio -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml deleted file mode 100644 index 4e441964..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml +++ /dev/null @@ -1,63 +0,0 @@ -{{- /* -Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . 
| indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-scheduler.rules - rules: - - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml deleted file mode 100644 index f7e0c439..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml +++ /dev/null @@ -1,87 +0,0 @@ -{{- /* -Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-state-metrics-prometheusRule.yaml -Do not change in-place! 
In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubeStateMetrics }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-state-metrics" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-state-metrics - rules: - - alert: KubeStateMetricsListErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricslisterrors - summary: kube-state-metrics is experiencing errors in list operations. - expr: |- - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStateMetricsWatchErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricswatcherrors - summary: kube-state-metrics is experiencing errors in watch operations. - expr: |- - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStateMetricsShardingMismatch - annotations: - description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardingmismatch - summary: kube-state-metrics sharding is misconfigured. - expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStateMetricsShardsMissing - annotations: - description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. 
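The KubeStateMetricsShardsMissing expression below treats each shard ordinal as one bit: with N configured shards (--total-shards=N) the complete set sums to 2^N - 1, so the difference is zero only when every ordinal from 0 to N-1 is reporting. For example, with four shards and only ordinals 0, 1 and 3 present, 2^4 - 1 - (2^0 + 2^1 + 2^3) = 15 - 11 = 4 ≠ 0, and the alert fires for the missing shard 2.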
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardsmissing - summary: kube-state-metrics shards are missing. - expr: |- - 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 - - - sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) - != 0 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml deleted file mode 100644 index a0732146..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -{{- /* -Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubelet.enabled .Values.defaultRules.rules.kubelet }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubelet.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . 
| indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubelet.rules - rules: - - expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: '0.99' - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: '0.9' - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: '0.5' - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml deleted file mode 100644 index cb7a13c5..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml +++ /dev/null @@ -1,301 +0,0 @@ -{{- /* -Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }} -{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping - summary: Pod is crash looping. 
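The kubelet.rules group removed just above follows the same quantile-recording pattern as the deleted scheduler and apiserver histogram rules: take the per-bucket rate of a *_bucket histogram, aggregate it with the le label preserved, and store the histogram_quantile result under a node_quantile:*/cluster_quantile:* name with a quantile label. A trimmed sketch of one such rule, with the on(instance) node-name join dropped for brevity:

    - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
      expr: |
        histogram_quantile(0.99,
          sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le))
      labels:
        quantile: '0.99'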
- expr: |- - increase(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) > 0 - and - kube_pod_container_status_waiting{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} == 1 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubePodNotReady - annotations: - description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready - summary: Pod has been in a non-ready state for more than 15 minutes. - expr: |- - sum by (namespace, pod) ( - max by(namespace, pod) ( - kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown"} - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( - 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) - ) - ) > 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDeploymentGenerationMismatch - annotations: - description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch - summary: Deployment generation mismatch due to possible roll-back - expr: |- - kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDeploymentReplicasMismatch - annotations: - description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: |- - ( - kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - > - kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) - == - 0 - ) - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStatefulSetReplicasMismatch - annotations: - description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch - summary: Deployment has not matched the expected number of replicas. 
- expr: |- - ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) - == - 0 - ) - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStatefulSetGenerationMismatch - annotations: - description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch - summary: StatefulSet generation mismatch due to possible roll-back - expr: |- - kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout - summary: StatefulSet update has not been rolled out. - expr: |- - ( - max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDaemonSetRolloutStuck - annotations: - description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck - summary: DaemonSet rollout is stuck. 
- expr: |- - ( - ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) or ( - kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - 0 - ) or ( - kube_daemonset_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) - ) and ( - changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeContainerWaiting - annotations: - description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontainerwaiting - summary: Pod container waiting longer than 1 hour - expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDaemonSetNotScheduled - annotations: - description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled - summary: DaemonSet pods are not scheduled. - expr: |- - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDaemonSetMisScheduled - annotations: - description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled - summary: DaemonSet pods are misscheduled. - expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeJobCompletion - annotations: - description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than 12 hours to complete. 
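Most of the workload-rollout alerts above (KubeDeploymentReplicasMismatch, KubeStatefulSetReplicasMismatch, KubeStatefulSetUpdateNotRolledOut, KubeDaemonSetRolloutStuck) share the same guard: the replica or scheduling mismatch is ANDed with a changes(...[5m or 10m]) == 0 check on the corresponding updated-replicas metric, e.g. (namespace filter dropped here)

    changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) == 0

so the alert stays silent while a rollout is still making progress and only fires once the updated-replica count has stopped moving for the lookback window, plus the 15m for: clause.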
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion - summary: Job did not complete in time - expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 12h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeJobFailed - annotations: - description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed - summary: Job failed to complete. - expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeHpaReplicasMismatch - annotations: - description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch - summary: HPA has not matched descired number of replicas. - expr: |- - (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) - and - (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - > - kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) - and - (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - < - kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) - and - changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeHpaMaxedOut - annotations: - description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout - summary: HPA is running at max replicas - expr: |- - kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - == - kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml deleted file mode 100644 index 7a2ecbc3..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml +++ /dev/null @@ -1,159 +0,0 @@ -{{- /* -Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: |- - sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - / - sum(kube_node_status_allocatable{resource="cpu"}) - > - ((count(kube_node_status_allocatable{resource="cpu"}) > 1) - 1) / count(kube_node_status_allocatable{resource="cpu"}) - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeMemoryOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. 
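The KubeCPUOvercommit threshold above is ((count(nodes) > 1) - 1) / count(nodes), i.e. requested CPU is compared against what N - 1 nodes can hold: with three nodes the alert fires once the sum of pod CPU requests exceeds 2/3 ≈ 67% of total allocatable, the point at which the workload could no longer be rescheduled after losing a node (assuming roughly uniform node sizes). The > 1 comparison inside the count also means the rule yields no result, and therefore never fires, on a single-node cluster; the KubeMemoryOvercommit expression below applies the same logic to memory.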
- expr: |- - sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - / - sum(kube_node_status_allocatable{resource="memory"}) - > - ((count(kube_node_status_allocatable{resource="memory"}) > 1) - 1) - / - count(kube_node_status_allocatable{resource="memory"}) - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeCPUQuotaOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuquotaovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: |- - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) - / - sum(kube_node_status_allocatable{resource="cpu"}) - > 1.5 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeMemoryQuotaOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryquotaovercommit - summary: Cluster has overcommitted memory resource requests. - expr: |- - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) - / - sum(kube_node_status_allocatable{resource="memory",job="kube-state-metrics"}) - > 1.5 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeQuotaAlmostFull - annotations: - description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaalmostfull - summary: Namespace quota is going to be full. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 0.9 < 1 - for: 15m - labels: - severity: info -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeQuotaFullyUsed - annotations: - description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotafullyused - summary: Namespace quota is fully used. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - == 1 - for: 15m - labels: - severity: info -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeQuotaExceeded - annotations: - description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded - summary: Namespace quota has exceeded the limits. 
- expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 1 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: CPUThrottlingHigh - annotations: - description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh - summary: Processes experience elevated CPU throttling. - expr: |- - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) - > ( 25 / 100 ) - for: 15m - labels: - severity: info -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml deleted file mode 100644 index 96d3c6fe..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml +++ /dev/null @@ -1,80 +0,0 @@ -{{- /* -Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }} -{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-storage - rules: - - alert: KubePersistentVolumeFillingUp - annotations: - description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. 
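Of the two KubePersistentVolumeFillingUp variants below, the critical one fires once less than 3% of the volume is free, while the warning one combines a 15% free-space check with a linear forecast: predict_linear fits the last 6h of kubelet_volume_stats_available_bytes and projects it 4 days ahead (4 * 24 * 3600 seconds), alerting when the projection goes negative, i.e. (namespace filter omitted here for brevity)

    predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0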
- expr: |- - ( - kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} - ) < 0.03 - and - kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0 - for: 1m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubePersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: |- - ( - kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} - ) < 0.15 - and - kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubePersistentVolumeErrors - annotations: - description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors - summary: PersistentVolume is having issues with provisioning. - expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml deleted file mode 100644 index c08baede..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml +++ /dev/null @@ -1,100 +0,0 @@ -{{- /* -Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! 
In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-apiserver" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-system-apiserver - rules: - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AggregatedAPIErrors - annotations: - description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapierrors - summary: An aggregated API has reported errors. - expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- if semverCompare ">=1.18.0-0" $kubeTargetVersion }} - - alert: AggregatedAPIDown - annotations: - description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapidown - summary: An aggregated API is down. - expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeApiServer.enabled }} - - alert: KubeAPIDown - annotations: - description: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown - summary: Target disappeared from Prometheus target discovery. - expr: absent(up{job="apiserver"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} - - alert: KubeAPITerminatedRequests - annotations: - description: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapiterminatedrequests - summary: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests. - expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml deleted file mode 100644 index 07ff37a2..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml +++ /dev/null @@ -1,41 +0,0 @@ -{{- /* -Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeControllerManager.enabled }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-controller-manager" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . 
| indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-system-controller-manager - rules: -{{- if .Values.kubeControllerManager.enabled }} - - alert: KubeControllerManagerDown - annotations: - description: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown - summary: Target disappeared from Prometheus target discovery. - expr: absent(up{job="kube-controller-manager"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml deleted file mode 100644 index 1ea531f2..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml +++ /dev/null @@ -1,188 +0,0 @@ -{{- /* -Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-kubelet" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-system-kubelet - rules: - - alert: KubeNodeNotReady - annotations: - description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready - summary: Node is not ready. - expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeNodeUnreachable - annotations: - description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodeunreachable - summary: Node is unreachable. 
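The controller-manager group above is additionally gated on .Values.kubeControllerManager.enabled, and the kubelet group that follows on defaultRules.rules.kubernetesSystem, so whole rule groups can be switched off from values without touching the templates. A sketch of the relevant toggles (key names as referenced in the guards above, values illustrative):

  kube-prometheus-stack:
    defaultRules:
      create: true
      rules:
        kubernetesSystem: true
        kubernetesStorage: true
        kubeScheduler: false     # example: drop the scheduler rule group entirely
    kubeControllerManager:
      enabled: false             # also disables the KubeControllerManagerDown rule above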
- expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletTooManyPods - annotations: - description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods - summary: Kubelet is running at capacity. - expr: |- - count by(node) ( - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) - ) - / - max by(node) ( - kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 - ) > 0.95 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeNodeReadinessFlapping - annotations: - description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodereadinessflapping - summary: Node readiness status is flapping. - expr: sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletPlegDurationHigh - annotations: - description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletplegdurationhigh - summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. - expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletPodStartUpLatencyHigh - annotations: - description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletpodstartuplatencyhigh - summary: Kubelet Pod startup latency is too high. - expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. 
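The sequences written as {{`{{`}} ... {{`}}`}} in the descriptions above are Helm's way of emitting literal Prometheus template delimiters: the backtick-quoted delimiters are raw string literals that Helm simply prints, so the alert that reaches Prometheus contains a plain Go template. The kubelet certificate description above, for instance, renders to roughly:

  description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.

Prometheus then expands {{ $labels.node }} and {{ $value | humanizeDuration }} at alert time.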
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: kubelet_certificate_manager_client_ttl_seconds < 604800 - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: kubelet_certificate_manager_client_ttl_seconds < 86400 - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: kubelet_certificate_manager_server_ttl_seconds < 604800 - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: kubelet_certificate_manager_server_ttl_seconds < 86400 - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletClientCertificateRenewalErrors - annotations: - description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes). - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificaterenewalerrors - summary: Kubelet has failed to renew its client certificate. - expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletServerCertificateRenewalErrors - annotations: - description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes). - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificaterenewalerrors - summary: Kubelet has failed to renew its server certificate. 
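The certificate alerts above come in pairs that tighten the TTL threshold while escalating severity: 604800 s (7 days) warns, 86400 s (24 hours) is critical. The same pattern extends naturally to an earlier informational tier; a hypothetical sketch (this extra tier is not part of the upstream group):

  - alert: KubeletClientCertificateExpiration
    annotations:
      summary: Kubelet client certificate expires within 30 days.
    expr: kubelet_certificate_manager_client_ttl_seconds < 2592000   # 30 * 24 * 3600
    labels:
      severity: info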
- expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- if .Values.prometheusOperator.kubeletService.enabled }} - - alert: KubeletDown - annotations: - description: Kubelet has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown - summary: Target disappeared from Prometheus target discovery. - expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml deleted file mode 100644 index 47376e3b..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml +++ /dev/null @@ -1,41 +0,0 @@ -{{- /* -Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-scheduler" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-system-scheduler - rules: -{{- if .Values.kubeScheduler.enabled }} - - alert: KubeSchedulerDown - annotations: - description: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown - summary: Target disappeared from Prometheus target discovery. 
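KubeletDown above, like KubeAPIDown and KubeSchedulerDown, relies on absent(up{job=...} == 1): the expression only returns a value when no series with up == 1 exists for that job, so it fires both when the target has vanished from service discovery and when every remaining scrape is failing. The same shape works for any scraped component; a hypothetical example:

  - alert: MyExporterDown               # hypothetical job name, following the pattern above
    expr: absent(up{job="my-exporter"} == 1)
    for: 15m
    labels:
      severity: critical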
- expr: absent(up{job="kube-scheduler"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml deleted file mode 100644 index 657e5c23..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml +++ /dev/null @@ -1,55 +0,0 @@ -{{- /* -Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-system - rules: - - alert: KubeVersionMismatch - annotations: - description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch - summary: Different semantic versions of Kubernetes components running. - expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientErrors - annotations: - description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors - summary: Kubernetes API server client is experiencing errors. 
- expr: |- - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) - / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) - > 0.01 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml deleted file mode 100644 index f734e8dc..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml +++ /dev/null @@ -1,79 +0,0 @@ -{{- /* -Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-exporter.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . 
| indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node-exporter.rules - rules: - - expr: |- - count without (cpu) ( - count without (mode) ( - node_cpu_seconds_total{job="node-exporter"} - ) - ) - record: instance:node_num_cpu:sum - - expr: |- - 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) - ) - record: instance:node_cpu_utilisation:rate5m - - expr: |- - ( - node_load1{job="node-exporter"} - / - instance:node_num_cpu:sum{job="node-exporter"} - ) - record: instance:node_load1_per_cpu:ratio - - expr: |- - 1 - ( - node_memory_MemAvailable_bytes{job="node-exporter"} - / - node_memory_MemTotal_bytes{job="node-exporter"} - ) - record: instance:node_memory_utilisation:ratio - - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) - record: instance:node_vmstat_pgmajfault:rate5m - - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) - record: instance_device:node_disk_io_time_seconds:rate5m - - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate5m - - expr: |- - sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) - ) - record: instance:node_network_receive_bytes_excluding_lo:rate5m - - expr: |- - sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) - ) - record: instance:node_network_transmit_bytes_excluding_lo:rate5m - - expr: |- - sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) - ) - record: instance:node_network_receive_drop_excluding_lo:rate5m - - expr: |- - sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) - ) - record: instance:node_network_transmit_drop_excluding_lo:rate5m -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml deleted file mode 100644 index a563ff6f..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml +++ /dev/null @@ -1,308 +0,0 @@ -{{- /* -Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) 
"node-exporter" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node-exporter - rules: - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 25 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 4 hours. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace - summary: Filesystem has less than 5% space left. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace - summary: Filesystem has less than 3% space left. 
- expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles - summary: Filesystem has less than 5% inodes left. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles - summary: Filesystem has less than 3% inodes left. 
- expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs - summary: Network interface is reporting many receive errors. - expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeNetworkTransmitErrs - annotations: - description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs - summary: Network interface is reporting many transmit errors. - expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused - summary: Number of conntrack are getting close to the limit. - expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeTextFileCollectorScrapeError - annotations: - description: Node Exporter text file collector failed to scrape. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodetextfilecollectorscrapeerror - summary: Node Exporter text file collector failed to scrape. - expr: node_textfile_scrape_error{job="node-exporter"} == 1 - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeClockSkewDetected - annotations: - description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected - summary: Clock skew detected. 
- expr: |- - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeClockNotSynchronising - annotations: - description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising - summary: Clock not synchronising. - expr: |- - min_over_time(node_timex_sync_status[5m]) == 0 - and - node_timex_maxerror_seconds >= 16 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeRAIDDegraded - annotations: - description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddegraded - summary: RAID Array is degraded - expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeRAIDDiskFailure - annotations: - description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddiskfailure - summary: Failed device in RAID array - expr: node_md_disks{state="failed"} > 0 - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeFileDescriptorLimit - annotations: - description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefiledescriptorlimit - summary: Kernel is predicted to exhaust file descriptors limit soon. - expr: |- - ( - node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 - ) - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeFileDescriptorLimit - annotations: - description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefiledescriptorlimit - summary: Kernel is predicted to exhaust file descriptors limit soon. 
- expr: |- - ( - node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 - ) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml deleted file mode 100644 index a9033a9f..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml +++ /dev/null @@ -1,39 +0,0 @@ -{{- /* -Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node-network - rules: - - alert: NodeNetworkInterfaceFlapping - annotations: - description: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkinterfaceflapping - summary: Network interface is often changin it's status - expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml deleted file mode 100644 index c6dd9e18..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml +++ /dev/null @@ -1,51 +0,0 @@ -{{- /* -Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml -Do not change in-place! 
In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node.rules - rules: - - expr: |- - topk by(namespace, pod) (1, - max by (node, namespace, pod) ( - label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") - )) - record: 'node_namespace_pod:kube_pod_info:' - - expr: |- - count by (cluster, node) (sum by (node, cpu) ( - node_cpu_seconds_total{job="node-exporter"} - * on (namespace, pod) group_left(node) - topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:) - )) - record: node:node_num_cpu:sum - - expr: |- - sum( - node_memory_MemAvailable_bytes{job="node-exporter"} or - ( - node_memory_Buffers_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Slab_bytes{job="node-exporter"} - ) - ) by (cluster) - record: :node_memory_MemAvailable_bytes:sum -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml deleted file mode 100644 index d3010bcd..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml +++ /dev/null @@ -1,113 +0,0 @@ -{{- /* -Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-operator-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }} -{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . 
}} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: prometheus-operator - rules: - - alert: PrometheusOperatorListErrors - annotations: - description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorlisterrors - summary: Errors while performing list operations in controller. - expr: (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusOperatorWatchErrors - annotations: - description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorwatcherrors - summary: Errors while performing watch operations in controller. - expr: (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusOperatorSyncFailed - annotations: - description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorsyncfailed - summary: Last controller reconciliation failed - expr: min_over_time(prometheus_operator_syncs{status="failed",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusOperatorReconcileErrors - annotations: - description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorreconcileerrors - summary: Errors while reconciling controller. 
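Unlike the kubelet and apiserver groups, the operator rules above do not hardcode a job label: $operatorJob and $namespace are computed at template time from the chart fullname and release namespace, so the selectors match the operator Deployment this release actually created. Rendered, an expression such as PrometheusOperatorSyncFailed ends up in roughly this shape (release fullname and namespace are illustrative assumptions):

  # Assuming a fullname of "metrics-kube-prometheus-stack" released into "monitoring":
  expr: min_over_time(prometheus_operator_syncs{status="failed",job="metrics-kube-prometheus-stack-operator",namespace="monitoring"}[5m]) > 0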
- expr: (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusOperatorNodeLookupErrors - annotations: - description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatornodelookuperrors - summary: Errors while reconciling Prometheus. - expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusOperatorNotReady - annotations: - description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatornotready - summary: Prometheus operator not ready - expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0) - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusOperatorRejectedResources - annotations: - description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatorrejectedresources - summary: Resources rejected by Prometheus operator - expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml b/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml deleted file mode 100644 index 10479429..00000000 --- a/charts/kubezero-metrics/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml +++ /dev/null @@ -1,307 +0,0 @@ -{{- /* -Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml -Do not change in-place! 
In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }} -{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: prometheus - rules: - - alert: PrometheusBadConfig - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusbadconfig - summary: Failed Prometheus configuration reload. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusNotificationQueueRunningFull - annotations: - description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotificationqueuerunningfull - summary: Prometheus alert notification queue predicted to run full in less than 30m. - expr: |- - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) - > - min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - ) - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers - annotations: - description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstosomealertmanagers - summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. 
- expr: |- - ( - rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - / - rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - ) - * 100 - > 1 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusNotConnectedToAlertmanagers - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotconnectedtoalertmanagers - summary: Prometheus is not connected to any Alertmanagers. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTSDBReloadsFailing - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbreloadsfailing - summary: Prometheus has issues reloading blocks from disk. - expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 - for: 4h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTSDBCompactionsFailing - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbcompactionsfailing - summary: Prometheus has issues compacting blocks. - expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 - for: 4h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusNotIngestingSamples - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotingestingsamples - summary: Prometheus is not ingesting samples. 
- expr: |- - ( - rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 - and - ( - sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0 - or - sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0 - ) - ) - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusDuplicateTimestamps - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusduplicatetimestamps - summary: Prometheus is dropping samples with duplicate timestamps. - expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusOutOfOrderTimestamps - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoutofordertimestamps - summary: Prometheus drops samples with out-of-order timestamps. - expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusRemoteStorageFailures - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}} - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotestoragefailures - summary: Prometheus fails to send samples to remote storage. 
- expr: |- - ( - (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) - / - ( - (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) - + - (rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) - ) - ) - * 100 - > 1 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusRemoteWriteBehind - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritebehind - summary: Prometheus remote write is behind. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - - ignoring(remote_name, url) group_right - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - ) - > 120 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusRemoteWriteDesiredShards - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritedesiredshards - summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- ( - max_over_time(prometheus_remote_storage_shards_desired{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - > - max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - ) - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusRuleFailures - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusrulefailures - summary: Prometheus is failing rule evaluations. - expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusMissingRuleEvaluations - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusmissingruleevaluations - summary: Prometheus is missing rule evaluations due to slow rule group evaluation. - expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTargetLimitHit - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustargetlimithit - summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. - expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusLabelLimitHit - annotations: - description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuslabellimithit - summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. 
- expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTargetSyncFailure - annotations: - description: '{{`{{`}} printf "%.0f" $value {{`}}`}} targets in Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} have failed to sync because invalid configuration was supplied.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustargetsyncfailure - summary: Prometheus has failed to sync targets. - expr: increase(prometheus_target_sync_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[30m]) > 0 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstoanyalertmanager - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: |- - min without (alertmanager) ( - rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m]) - / - rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kubezero-metrics/configs/.gitignore b/charts/kubezero-metrics/configs/.gitignore new file mode 100644 index 00000000..b7de3ec0 --- /dev/null +++ b/charts/kubezero-metrics/configs/.gitignore @@ -0,0 +1,2 @@ +vendor +jb-linux-amd64 diff --git a/charts/kubezero-metrics/configs/build.sh b/charts/kubezero-metrics/configs/build.sh new file mode 100755 index 00000000..6d3bc970 --- /dev/null +++ b/charts/kubezero-metrics/configs/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash -ex + +JB='./jb-linux-amd64' + +which jsonnet > /dev/null || { echo "Required jsonnet not found!"; exit 1; } +[ -x $JB ] || { wget https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.4.0/jb-linux-amd64 && chmod +x $JB; } +#which gojsontoyaml || go install github.com/brancz/gojsontoyaml@latest + +[ -r jsonnetfile.json ] || $JB init +if [ -r jsonnetfile.lock.json ]; then + $JB update +else + $JB install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@main +fi + +rm -rf dashboards && mkdir -p dashboards +jsonnet -J vendor -m dashboards -e '(import "mixin.libsonnet").grafanaDashboards' + +rm -rf rules && mkdir -p rules +#jsonnet -J vendor -m rules rules.libsonnet | xargs -I{} sh -c 'cat {} | gojsontoyaml > {}.yaml' -- {} +jsonnet -J vendor -m rules rules.libsonnet diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/apiserver.json b/charts/kubezero-metrics/configs/dashboards/apiserver.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/apiserver.json rename to
charts/kubezero-metrics/configs/dashboards/apiserver.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/cluster-total.json b/charts/kubezero-metrics/configs/dashboards/cluster-total.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/cluster-total.json rename to charts/kubezero-metrics/configs/dashboards/cluster-total.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/controller-manager.json b/charts/kubezero-metrics/configs/dashboards/controller-manager.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/controller-manager.json rename to charts/kubezero-metrics/configs/dashboards/controller-manager.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-cluster.json b/charts/kubezero-metrics/configs/dashboards/k8s-resources-cluster.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-cluster.json rename to charts/kubezero-metrics/configs/dashboards/k8s-resources-cluster.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-namespace.json b/charts/kubezero-metrics/configs/dashboards/k8s-resources-namespace.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-namespace.json rename to charts/kubezero-metrics/configs/dashboards/k8s-resources-namespace.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-node.json b/charts/kubezero-metrics/configs/dashboards/k8s-resources-node.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-node.json rename to charts/kubezero-metrics/configs/dashboards/k8s-resources-node.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-pod.json b/charts/kubezero-metrics/configs/dashboards/k8s-resources-pod.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-pod.json rename to charts/kubezero-metrics/configs/dashboards/k8s-resources-pod.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-workload.json b/charts/kubezero-metrics/configs/dashboards/k8s-resources-workload.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-workload.json rename to charts/kubezero-metrics/configs/dashboards/k8s-resources-workload.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-workloads-namespace.json b/charts/kubezero-metrics/configs/dashboards/k8s-resources-workloads-namespace.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/k8s-resources-workloads-namespace.json rename to charts/kubezero-metrics/configs/dashboards/k8s-resources-workloads-namespace.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/kubelet.json b/charts/kubezero-metrics/configs/dashboards/kubelet.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/kubelet.json rename to charts/kubezero-metrics/configs/dashboards/kubelet.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/namespace-by-pod.json b/charts/kubezero-metrics/configs/dashboards/namespace-by-pod.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/namespace-by-pod.json rename to charts/kubezero-metrics/configs/dashboards/namespace-by-pod.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/namespace-by-workload.json 
b/charts/kubezero-metrics/configs/dashboards/namespace-by-workload.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/namespace-by-workload.json rename to charts/kubezero-metrics/configs/dashboards/namespace-by-workload.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/persistentvolumesusage.json b/charts/kubezero-metrics/configs/dashboards/persistentvolumesusage.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/persistentvolumesusage.json rename to charts/kubezero-metrics/configs/dashboards/persistentvolumesusage.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/pod-total.json b/charts/kubezero-metrics/configs/dashboards/pod-total.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/pod-total.json rename to charts/kubezero-metrics/configs/dashboards/pod-total.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/proxy.json b/charts/kubezero-metrics/configs/dashboards/proxy.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/proxy.json rename to charts/kubezero-metrics/configs/dashboards/proxy.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/scheduler.json b/charts/kubezero-metrics/configs/dashboards/scheduler.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/scheduler.json rename to charts/kubezero-metrics/configs/dashboards/scheduler.json diff --git a/charts/kubezero-metrics/dashboards/kube-mixin/workload-total.json b/charts/kubezero-metrics/configs/dashboards/workload-total.json similarity index 100% rename from charts/kubezero-metrics/dashboards/kube-mixin/workload-total.json rename to charts/kubezero-metrics/configs/dashboards/workload-total.json diff --git a/charts/kubezero-metrics/configs/jsonnetfile.json b/charts/kubezero-metrics/configs/jsonnetfile.json new file mode 100644 index 00000000..97f3b96f --- /dev/null +++ b/charts/kubezero-metrics/configs/jsonnetfile.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/kube-prometheus.git", + "subdir": "jsonnet/kube-prometheus" + } + }, + "version": "main" + } + ], + "legacyImports": true +} diff --git a/charts/kubezero-metrics/configs/jsonnetfile.lock.json b/charts/kubezero-metrics/configs/jsonnetfile.lock.json new file mode 100644 index 00000000..bdb5d22d --- /dev/null +++ b/charts/kubezero-metrics/configs/jsonnetfile.lock.json @@ -0,0 +1,160 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/brancz/kubernetes-grafana.git", + "subdir": "grafana" + } + }, + "version": "c3b14b24b83cfe9abf1064649d19e2d679f033fb", + "sum": "YrE4DNQsWgYWs6h0j/FjQETt8xDXdYdsslb1WK7xQEk=" + }, + { + "source": { + "git": { + "remote": "https://github.com/etcd-io/etcd.git", + "subdir": "contrib/mixin" + } + }, + "version": "3df272774672366beb02c5447782805ab5fec957", + "sum": "5XhYOigrKipOWDbIn9hlrz7JcbelzvJnormxSaup9JI=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "19b27b272abf4263af1365ec485784c49815a332", + "sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "grafana-builder" + } + }, + "version": "b9941184e4c9987077898e788343e21a800d8465", + "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" + }, + 
{ + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", + "subdir": "" + } + }, + "version": "ff4641bcd83314c955150bea6b147df9ca335c4a", + "sum": "oUVGwcCbmdH8qz9B+lbRawI9s23GY9HeW7MwYZRbZ/0=" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", + "subdir": "lib/promgrafonnet" + } + }, + "version": "ff4641bcd83314c955150bea6b147df9ca335c4a", + "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes/kube-state-metrics.git", + "subdir": "jsonnet/kube-state-metrics" + } + }, + "version": "8dab6f7472c26987ab7f8899a4a2f753fed8e8a8", + "sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE=" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes/kube-state-metrics.git", + "subdir": "jsonnet/kube-state-metrics-mixin" + } + }, + "version": "8dab6f7472c26987ab7f8899a4a2f753fed8e8a8", + "sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/kube-prometheus.git", + "subdir": "jsonnet/kube-prometheus" + } + }, + "version": "a2eee1803a074fb40cad109d690732c22f0130cf", + "sum": "kqVnoNBux2YF1s03m+O3w/5jreAnjXx2/NjvNP1Hoy4=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/prometheus-operator.git", + "subdir": "jsonnet/mixin" + } + }, + "version": "c6b5cc2e178002683ab3afe8d023a8bb2508ba19", + "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=", + "name": "prometheus-operator-mixin" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/prometheus-operator.git", + "subdir": "jsonnet/prometheus-operator" + } + }, + "version": "c6b5cc2e178002683ab3afe8d023a8bb2508ba19", + "sum": "sECNXs/aIEreFUma1BWVyknBygqh3AVJEB3msmrAYYY=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/alertmanager.git", + "subdir": "doc/alertmanager-mixin" + } + }, + "version": "e35efbddb66a73fd8723be5334477e76f21fbd19", + "sum": "pep+dHzfIjh2SU5pEkwilMCAT/NoL6YYflV4x8cr7vU=", + "name": "alertmanager" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/node_exporter.git", + "subdir": "docs/node-mixin" + } + }, + "version": "0e6b23c338e98809c9872c70a2f5dfa8d6d370d4", + "sum": "MnfAA4+l2BkgJncnYfV8uHC7CxHZut8+ap8KkEqyB5Y=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/prometheus.git", + "subdir": "documentation/prometheus-mixin" + } + }, + "version": "a05b510fc32c3ecc2fc369002576179ae1cbcc23", + "sum": "m4VHwft4fUcxzL4+52lLZG/V5aH5ZEdjaweb88vISL0=", + "name": "prometheus" + }, + { + "source": { + "git": { + "remote": "https://github.com/thanos-io/thanos.git", + "subdir": "mixin" + } + }, + "version": "360b39e1c6ab3ac8dcefa225a6205142f9362c68", + "sum": "Og+wEHfgzXBvBLAeeQvGNoiCw3FY4LQHlJdpsG/owj8=", + "name": "thanos-mixin" + } + ], + "legacyImports": false +} diff --git a/charts/kubezero-metrics/configs/k8s-dashboards.yaml b/charts/kubezero-metrics/configs/k8s-dashboards.yaml new file mode 100644 index 00000000..99758679 --- /dev/null +++ b/charts/kubezero-metrics/configs/k8s-dashboards.yaml @@ -0,0 +1,49 @@ +configmap: grafana-dashboards-k8s +gzip: true +folder: Kubernetes +dashboards: +- name: coreDNS + url: https://grafana.com/api/dashboards/12539/revisions/5/download + tags: ['kubernetes', 'DNS'] +- name: etcd + url: https://grafana.com/api/dashboards/3070/revisions/3/download + tags: ['kubernetes', 
'etcd'] +- name: node + url: https://grafana.com/api/dashboards/1860/revisions/23/download + tags: ['kubernetes'] +# cd dashboards; for f in *.json; do echo "- name: ${f%%.json}" >> ../dashboards.yaml; echo " url: file://dashboards/$f" >> ../dashboards.yaml; done; cd - +- name: apiserver + url: file://dashboards/apiserver.json +- name: cluster-total + url: file://dashboards/cluster-total.json +- name: controller-manager + url: file://dashboards/controller-manager.json +- name: k8s-resources-cluster + url: file://dashboards/k8s-resources-cluster.json +- name: k8s-resources-namespace + url: file://dashboards/k8s-resources-namespace.json +- name: k8s-resources-node + url: file://dashboards/k8s-resources-node.json +- name: k8s-resources-pod + url: file://dashboards/k8s-resources-pod.json +- name: k8s-resources-workload + url: file://dashboards/k8s-resources-workload.json +- name: k8s-resources-workloads-namespace + url: file://dashboards/k8s-resources-workloads-namespace.json +- name: kubelet + url: file://dashboards/kubelet.json +- name: namespace-by-pod + url: file://dashboards/namespace-by-pod.json +- name: namespace-by-workload + url: file://dashboards/namespace-by-workload.json +- name: persistentvolumesusage + url: file://dashboards/persistentvolumesusage.json + tags: ['kubernetes', 'csi'] +- name: pod-total + url: file://dashboards/pod-total.json +- name: proxy + url: file://dashboards/proxy.json +- name: scheduler + url: file://dashboards/scheduler.json +- name: workload-total + url: file://dashboards/workload-total.json diff --git a/charts/kubezero-metrics/configs/k8s-rules.yaml b/charts/kubezero-metrics/configs/k8s-rules.yaml new file mode 100644 index 00000000..cef2ba59 --- /dev/null +++ b/charts/kubezero-metrics/configs/k8s-rules.yaml @@ -0,0 +1,18 @@ + # cd rules; for f in *-prometheusRule; do echo "- name: ${f%%-prometheusRule}" >> ../k8s-rules.yaml; echo " url: file://rules/$f" >> ../k8s-rules.yaml; done; cd - +rules: +- name: alertmanager + url: file://rules/alertmanager-prometheusRule +- name: etcd-mixin + url: file://rules/etcd-mixin-prometheusRule +- name: kube-prometheus + url: file://rules/kube-prometheus-prometheusRule +- name: kube-state-metrics + url: file://rules/kube-state-metrics-prometheusRule +- name: kubernetes + url: file://rules/kubernetes-prometheusRule +- name: node-exporter + url: file://rules/node-exporter-prometheusRule +- name: prometheus-operator + url: file://rules/prometheus-operator-prometheusRule +- name: prometheus + url: file://rules/prometheus-prometheusRule diff --git a/charts/kubezero-metrics/dashboards/metrics-dashboards.yaml b/charts/kubezero-metrics/configs/metrics-dashboards.yaml similarity index 100% rename from charts/kubezero-metrics/dashboards/metrics-dashboards.yaml rename to charts/kubezero-metrics/configs/metrics-dashboards.yaml diff --git a/charts/kubezero-metrics/dashboards/mixin.libsonnet b/charts/kubezero-metrics/configs/mixin.libsonnet similarity index 100% rename from charts/kubezero-metrics/dashboards/mixin.libsonnet rename to charts/kubezero-metrics/configs/mixin.libsonnet diff --git a/charts/kubezero-metrics/configs/rules.libsonnet b/charts/kubezero-metrics/configs/rules.libsonnet new file mode 100644 index 00000000..f90a2afc --- /dev/null +++ b/charts/kubezero-metrics/configs/rules.libsonnet @@ -0,0 +1,49 @@ +local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet'); + +local etcdMixin = addMixin({ + name: 'etcd', + mixin: (import 'github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet') + + { + _config+: { + 
etcd_instance_labels: 'instance, pod', + } + } + + // Remove both etcdHighNumberOfFailedGRPCRequests from etcd-mixin for now + { + prometheusAlerts+: { + groups: std.map( + function(group) + if group.name == 'etcd' then + group { + rules: std.filter( + function(rule) + rule.alert != 'etcdHighNumberOfFailedGRPCRequests', + group.rules + ), + } + else + group, + super.groups + ), + }, + }, + }); + +local kp = (import 'kube-prometheus/main.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + } + }; + +// We just want the Prometheus Rules +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + +{ 'alertmanager-prometheusRule': kp.alertmanager.prometheusRule } + +{ 'kube-state-metrics-prometheusRule': kp.kubeStateMetrics.prometheusRule } + +{ 'kubernetes-prometheusRule': kp.kubernetesControlPlane.prometheusRule } + +{ 'node-exporter-prometheusRule': kp.nodeExporter.prometheusRule } + +{ 'prometheus-prometheusRule': kp.prometheus.prometheusRule } + +{ 'etcd-mixin-prometheusRule': etcdMixin.prometheusRules } diff --git a/charts/kubezero-metrics/configs/rules/alertmanager-prometheusRule b/charts/kubezero-metrics/configs/rules/alertmanager-prometheusRule new file mode 100644 index 00000000..99d1647c --- /dev/null +++ b/charts/kubezero-metrics/configs/rules/alertmanager-prometheusRule @@ -0,0 +1,129 @@ +{ + "apiVersion": "monitoring.coreos.com/v1", + "kind": "PrometheusRule", + "metadata": { + "labels": { + "app.kubernetes.io/component": "alert-router", + "app.kubernetes.io/name": "alertmanager", + "app.kubernetes.io/part-of": "kube-prometheus", + "app.kubernetes.io/version": "0.23.0", + "prometheus": "k8s", + "role": "alert-rules" + }, + "name": "alertmanager-main-rules", + "namespace": "monitoring" + }, + "spec": { + "groups": [ + { + "name": "alertmanager.rules", + "rules": [ + { + "alert": "AlertmanagerFailedReload", + "annotations": { + "description": "Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload", + "summary": "Reloading an Alertmanager configuration has failed." + }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(alertmanager_config_last_reload_successful{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]) == 0\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "AlertmanagerMembersInconsistent", + "annotations": { + "description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent", + "summary": "A member of an Alertmanager cluster has not found all other cluster members." 
+ }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n< on (namespace,service) group_left\n count by (namespace,service) (max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]))\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "AlertmanagerFailedToSendAlerts", + "annotations": { + "description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts", + "summary": "An Alertmanager instance failed to send notifications." + }, + "expr": "(\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n)\n> 0.01\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "AlertmanagerClusterFailedToSendAlerts", + "annotations": { + "description": "The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts", + "summary": "All Alertmanager instances in a cluster failed to send notifications to a critical integration." + }, + "expr": "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n)\n> 0.01\n", + "for": "5m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "AlertmanagerClusterFailedToSendAlerts", + "annotations": { + "description": "The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts", + "summary": "All Alertmanager instances in a cluster failed to send notifications to a non-critical integration." + }, + "expr": "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n)\n> 0.01\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "AlertmanagerConfigInconsistent", + "annotations": { + "description": "Alertmanager instances within the {{$labels.job}} cluster have different configurations.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent", + "summary": "Alertmanager instances within the same cluster have different configurations." 
+ }, + "expr": "count by (namespace,service) (\n count_values by (namespace,service) (\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\",namespace=\"monitoring\"})\n)\n!= 1\n", + "for": "20m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "AlertmanagerClusterDown", + "annotations": { + "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown", + "summary": "Half or more of the Alertmanager instances within the same cluster are down." + }, + "expr": "(\n count by (namespace,service) (\n avg_over_time(up{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]) < 0.5\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n", + "for": "5m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "AlertmanagerClusterCrashlooping", + "annotations": { + "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping", + "summary": "Half or more of the Alertmanager instances within the same cluster are crashlooping." + }, + "expr": "(\n count by (namespace,service) (\n changes(process_start_time_seconds{job=\"alertmanager-main\",namespace=\"monitoring\"}[10m]) > 4\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n", + "for": "5m", + "labels": { + "severity": "critical" + } + } + ] + } + ] + } +} diff --git a/charts/kubezero-metrics/configs/rules/etcd-mixin-prometheusRule b/charts/kubezero-metrics/configs/rules/etcd-mixin-prometheusRule new file mode 100644 index 00000000..c54efb47 --- /dev/null +++ b/charts/kubezero-metrics/configs/rules/etcd-mixin-prometheusRule @@ -0,0 +1,161 @@ +{ + "apiVersion": "monitoring.coreos.com/v1", + "kind": "PrometheusRule", + "metadata": { + "labels": { + "prometheus": "k8s" + }, + "name": "etcd", + "namespace": "monitoring" + }, + "spec": { + "groups": [ + { + "name": "etcd", + "rules": [ + { + "alert": "etcdMembersDown", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": members are down ({{ $value }}).", + "summary": "etcd cluster members are down." + }, + "expr": "max without (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdInsufficientMembers", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }}).", + "summary": "etcd cluster has insufficient number of members." + }, + "expr": "sum(up{job=~\".*etcd.*\"} == bool 1) without (instance, pod) < ((count(up{job=~\".*etcd.*\"}) without (instance, pod) + 1) / 2)\n", + "for": "3m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdNoLeader", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader.", + "summary": "etcd cluster has no leader." 
+ }, + "expr": "etcd_server_has_leader{job=~\".*etcd.*\"} == 0\n", + "for": "1m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdHighNumberOfLeaderChanges", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.", + "summary": "etcd cluster has high number of leader changes." + }, + "expr": "increase((max without (instance, pod) (etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}))[15m:1m]) >= 4\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdGRPCRequestsSlow", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }}.", + "summary": "etcd grpc requests are slow" + }, + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\", grpc_type=\"unary\"}[5m])) without(grpc_type))\n> 0.15\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdMemberCommunicationSlow", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.", + "summary": "etcd cluster member communication is slow." + }, + "expr": "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.15\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdHighNumberOfFailedProposals", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.", + "summary": "etcd cluster has high number of proposal failures." + }, + "expr": "rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdHighFsyncDurations", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.", + "summary": "etcd cluster 99th percentile fsync durations are too high." + }, + "expr": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.5\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdHighFsyncDurations", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}." + }, + "expr": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 1\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdHighCommitDurations", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.", + "summary": "etcd cluster 99th percentile commit durations are too high." 
+ }, + "expr": "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.25\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdBackendQuotaLowSpace", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full." + }, + "expr": "(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdExcessiveDatabaseGrowth", + "annotations": { + "description": "etcd cluster \"{{ $labels.job }}\": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive." + }, + "expr": "increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50\n", + "for": "10m", + "labels": { + "severity": "warning" + } + } + ] + } + ] + } +} diff --git a/charts/kubezero-metrics/configs/rules/kube-prometheus-prometheusRule b/charts/kubezero-metrics/configs/rules/kube-prometheus-prometheusRule new file mode 100644 index 00000000..cfd34465 --- /dev/null +++ b/charts/kubezero-metrics/configs/rules/kube-prometheus-prometheusRule @@ -0,0 +1,109 @@ +{ + "apiVersion": "monitoring.coreos.com/v1", + "kind": "PrometheusRule", + "metadata": { + "labels": { + "app.kubernetes.io/component": "exporter", + "app.kubernetes.io/name": "kube-prometheus", + "app.kubernetes.io/part-of": "kube-prometheus", + "prometheus": "k8s", + "role": "alert-rules" + }, + "name": "kube-prometheus-rules", + "namespace": "monitoring" + }, + "spec": { + "groups": [ + { + "name": "general.rules", + "rules": [ + { + "alert": "TargetDown", + "annotations": { + "description": "{{ printf \"%.4g\" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/targetdown", + "summary": "One or more targets are unreachable." + }, + "expr": "100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "Watchdog", + "annotations": { + "description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.\n", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/watchdog", + "summary": "An alert that should always be firing to certify that Alertmanager is working properly." 
+ }, + "expr": "vector(1)", + "labels": { + "severity": "none" + } + } + ] + }, + { + "name": "node-network", + "rules": [ + { + "alert": "NodeNetworkInterfaceFlapping", + "annotations": { + "description": "Network interface \"{{ $labels.device }}\" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping", + "summary": "Network interface is often changing its status" + }, + "expr": "changes(node_network_up{job=\"node-exporter\",device!~\"veth.+\"}[2m]) > 2\n", + "for": "2m", + "labels": { + "severity": "warning" + } + } + ] + }, + { + "name": "kube-prometheus-node-recording.rules", + "rules": [ + { + "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[3m])) BY (instance)", + "record": "instance:node_cpu:rate:sum" + }, + { + "expr": "sum(rate(node_network_receive_bytes_total[3m])) BY (instance)", + "record": "instance:node_network_receive_bytes:rate:sum" + }, + { + "expr": "sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)", + "record": "instance:node_network_transmit_bytes:rate:sum" + }, + { + "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)", + "record": "instance:node_cpu:ratio" + }, + { + "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m]))", + "record": "cluster:node_cpu:sum_rate5m" + }, + { + "expr": "cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))", + "record": "cluster:node_cpu:ratio" + } + ] + }, + { + "name": "kube-prometheus-general.rules", + "rules": [ + { + "expr": "count without(instance, pod, node) (up == 1)", + "record": "count:up1" + }, + { + "expr": "count without(instance, pod, node) (up == 0)", + "record": "count:up0" + } + ] + } + ] + } +} diff --git a/charts/kubezero-metrics/configs/rules/kube-state-metrics-prometheusRule b/charts/kubezero-metrics/configs/rules/kube-state-metrics-prometheusRule new file mode 100644 index 00000000..832ed4ee --- /dev/null +++ b/charts/kubezero-metrics/configs/rules/kube-state-metrics-prometheusRule @@ -0,0 +1,77 @@ +{ + "apiVersion": "monitoring.coreos.com/v1", + "kind": "PrometheusRule", + "metadata": { + "labels": { + "app.kubernetes.io/component": "exporter", + "app.kubernetes.io/name": "kube-state-metrics", + "app.kubernetes.io/part-of": "kube-prometheus", + "app.kubernetes.io/version": "2.2.1", + "prometheus": "k8s", + "role": "alert-rules" + }, + "name": "kube-state-metrics-rules", + "namespace": "monitoring" + }, + "spec": { + "groups": [ + { + "name": "kube-state-metrics", + "rules": [ + { + "alert": "KubeStateMetricsListErrors", + "annotations": { + "description": "kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors", + "summary": "kube-state-metrics is experiencing errors in list operations." 
+ }, + "expr": "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeStateMetricsWatchErrors", + "annotations": { + "description": "kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors", + "summary": "kube-state-metrics is experiencing errors in watch operations." + }, + "expr": "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeStateMetricsShardingMismatch", + "annotations": { + "description": "kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch", + "summary": "kube-state-metrics sharding is misconfigured." + }, + "expr": "stdvar (kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) != 0\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeStateMetricsShardsMissing", + "annotations": { + "description": "kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing", + "summary": "kube-state-metrics shards are missing." + }, + "expr": "2^max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1\n -\nsum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"}) )\n!= 0\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + } + ] + } +} diff --git a/charts/kubezero-metrics/configs/rules/kubernetes-prometheusRule b/charts/kubezero-metrics/configs/rules/kubernetes-prometheusRule new file mode 100644 index 00000000..ec6402e4 --- /dev/null +++ b/charts/kubezero-metrics/configs/rules/kubernetes-prometheusRule @@ -0,0 +1,1178 @@ +{ + "apiVersion": "monitoring.coreos.com/v1", + "kind": "PrometheusRule", + "metadata": { + "labels": { + "app.kubernetes.io/name": "kube-prometheus", + "app.kubernetes.io/part-of": "kube-prometheus", + "prometheus": "k8s", + "role": "alert-rules" + }, + "name": "kubernetes-monitoring-rules", + "namespace": "monitoring" + }, + "spec": { + "groups": [ + { + "name": "kubernetes-apps", + "rules": [ + { + "alert": "KubePodCrashLooping", + "annotations": { + "description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} times / 10 minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping", + "summary": "Pod is crash looping." 
+ }, + "expr": "increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[10m]) > 0\nand\nkube_pod_container_status_waiting{job=\"kube-state-metrics\"} == 1\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubePodNotReady", + "annotations": { + "description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready", + "summary": "Pod has been in a non-ready state for more than 15 minutes." + }, + "expr": "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeDeploymentGenerationMismatch", + "annotations": { + "description": "Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch", + "summary": "Deployment generation mismatch due to possible roll-back" + }, + "expr": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeDeploymentReplicasMismatch", + "annotations": { + "description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch", + "summary": "Deployment has not matched the expected number of replicas." + }, + "expr": "(\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n >\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n) and (\n changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeStatefulSetReplicasMismatch", + "annotations": { + "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch", + "summary": "Deployment has not matched the expected number of replicas." 
+ }, + "expr": "(\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeStatefulSetGenerationMismatch", + "annotations": { + "description": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch", + "summary": "StatefulSet generation mismatch due to possible roll-back" + }, + "expr": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeStatefulSetUpdateNotRolledOut", + "annotations": { + "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout", + "summary": "StatefulSet update has not been rolled out." + }, + "expr": "(\n max without (revision) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeDaemonSetRolloutStuck", + "annotations": { + "description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck", + "summary": "DaemonSet rollout is stuck." 
+ }, + "expr": "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeContainerWaiting", + "annotations": { + "description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting", + "summary": "Pod container waiting longer than 1 hour" + }, + "expr": "sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeDaemonSetNotScheduled", + "annotations": { + "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled", + "summary": "DaemonSet pods are not scheduled." + }, + "expr": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeDaemonSetMisScheduled", + "annotations": { + "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled", + "summary": "DaemonSet pods are misscheduled." + }, + "expr": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeJobCompletion", + "annotations": { + "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobcompletion", + "summary": "Job did not complete in time" + }, + "expr": "kube_job_spec_completions{job=\"kube-state-metrics\"} - kube_job_status_succeeded{job=\"kube-state-metrics\"} > 0\n", + "for": "12h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeJobFailed", + "annotations": { + "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed", + "summary": "Job failed to complete." 
+ }, + "expr": "kube_job_failed{job=\"kube-state-metrics\"} > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeHpaReplicasMismatch", + "annotations": { + "description": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch", + "summary": "HPA has not matched desired number of replicas." + }, + "expr": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n >\nkube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n <\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}[15m]) == 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeHpaMaxedOut", + "annotations": { + "description": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout", + "summary": "HPA is running at max replicas" + }, + "expr": "kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}\n", + "for": "15m", + "labels": { + "severity": "warning" + } + } + ] + }, + { + "name": "kubernetes-resources", + "rules": [ + { + "alert": "KubeCPUOvercommit", + "annotations": { + "description": "Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit", + "summary": "Cluster has overcommitted CPU resource requests." + }, + "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource=\"cpu\"}) - max(kube_node_status_allocatable{resource=\"cpu\"})) > 0\nand\n(sum(kube_node_status_allocatable{resource=\"cpu\"}) - max(kube_node_status_allocatable{resource=\"cpu\"})) > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeMemoryOvercommit", + "annotations": { + "description": "Cluster has overcommitted memory resource requests for Pods by {{ $value }} bytes and cannot tolerate node failure.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit", + "summary": "Cluster has overcommitted memory resource requests."
+ }, + "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource=\"memory\"}) - max(kube_node_status_allocatable{resource=\"memory\"})) > 0\nand\n(sum(kube_node_status_allocatable{resource=\"memory\"}) - max(kube_node_status_allocatable{resource=\"memory\"})) > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeCPUQuotaOvercommit", + "annotations": { + "description": "Cluster has overcommitted CPU resource requests for Namespaces.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit", + "summary": "Cluster has overcommitted CPU resource requests." + }, + "expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\nsum(kube_node_status_allocatable{resource=\"cpu\"})\n > 1.5\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeMemoryQuotaOvercommit", + "annotations": { + "description": "Cluster has overcommitted memory resource requests for Namespaces.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit", + "summary": "Cluster has overcommitted memory resource requests." + }, + "expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable{resource=\"memory\",job=\"kube-state-metrics\"})\n > 1.5\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeQuotaAlmostFull", + "annotations": { + "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull", + "summary": "Namespace quota is going to be full." + }, + "expr": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.9 < 1\n", + "for": "15m", + "labels": { + "severity": "info" + } + }, + { + "alert": "KubeQuotaFullyUsed", + "annotations": { + "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused", + "summary": "Namespace quota is fully used." + }, + "expr": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n == 1\n", + "for": "15m", + "labels": { + "severity": "info" + } + }, + { + "alert": "KubeQuotaExceeded", + "annotations": { + "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded", + "summary": "Namespace quota has exceeded the limits." 
+ }, + "expr": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 1\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "CPUThrottlingHigh", + "annotations": { + "description": "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh", + "summary": "Processes experience elevated CPU throttling." + }, + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)\n > ( 25 / 100 )\n", + "for": "15m", + "labels": { + "severity": "info" + } + } + ] + }, + { + "name": "kubernetes-storage", + "rules": [ + { + "alert": "KubePersistentVolumeFillingUp", + "annotations": { + "description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup", + "summary": "PersistentVolume is filling up." + }, + "expr": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\n", + "for": "1m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubePersistentVolumeFillingUp", + "annotations": { + "description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup", + "summary": "PersistentVolume is filling up." + }, + "expr": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubePersistentVolumeErrors", + "annotations": { + "description": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors", + "summary": "PersistentVolume is having issues with provisioning." 
+ }, + "expr": "kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"} > 0\n", + "for": "5m", + "labels": { + "severity": "critical" + } + } + ] + }, + { + "name": "kubernetes-system", + "rules": [ + { + "alert": "KubeVersionMismatch", + "annotations": { + "description": "There are {{ $value }} different semantic versions of Kubernetes components running.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch", + "summary": "Different semantic versions of Kubernetes components running." + }, + "expr": "count(count by (git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) > 1\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeClientErrors", + "annotations": { + "description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors", + "summary": "Kubernetes API server client is experiencing errors." + }, + "expr": "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance, job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))\n> 0.01\n", + "for": "15m", + "labels": { + "severity": "warning" + } + } + ] + }, + { + "name": "kube-apiserver-slos", + "rules": [ + { + "alert": "KubeAPIErrorBudgetBurn", + "annotations": { + "description": "The API server is burning too much error budget.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn", + "summary": "The API server is burning too much error budget." + }, + "expr": "sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)\nand\nsum(apiserver_request:burnrate5m) > (14.40 * 0.01000)\n", + "for": "2m", + "labels": { + "long": "1h", + "severity": "critical", + "short": "5m" + } + }, + { + "alert": "KubeAPIErrorBudgetBurn", + "annotations": { + "description": "The API server is burning too much error budget.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn", + "summary": "The API server is burning too much error budget." + }, + "expr": "sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)\nand\nsum(apiserver_request:burnrate30m) > (6.00 * 0.01000)\n", + "for": "15m", + "labels": { + "long": "6h", + "severity": "critical", + "short": "30m" + } + }, + { + "alert": "KubeAPIErrorBudgetBurn", + "annotations": { + "description": "The API server is burning too much error budget.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn", + "summary": "The API server is burning too much error budget." + }, + "expr": "sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)\nand\nsum(apiserver_request:burnrate2h) > (3.00 * 0.01000)\n", + "for": "1h", + "labels": { + "long": "1d", + "severity": "warning", + "short": "2h" + } + }, + { + "alert": "KubeAPIErrorBudgetBurn", + "annotations": { + "description": "The API server is burning too much error budget.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn", + "summary": "The API server is burning too much error budget." 
+ }, + "expr": "sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)\nand\nsum(apiserver_request:burnrate6h) > (1.00 * 0.01000)\n", + "for": "3h", + "labels": { + "long": "3d", + "severity": "warning", + "short": "6h" + } + } + ] + }, + { + "name": "kubernetes-system-apiserver", + "rules": [ + { + "alert": "KubeClientCertificateExpiration", + "annotations": { + "description": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration", + "summary": "Client certificate is about to expire." + }, + "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeClientCertificateExpiration", + "annotations": { + "description": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration", + "summary": "Client certificate is about to expire." + }, + "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400\n", + "labels": { + "severity": "critical" + } + }, + { + "alert": "AggregatedAPIErrors", + "annotations": { + "description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors", + "summary": "An aggregated API has reported errors." + }, + "expr": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "AggregatedAPIDown", + "annotations": { + "description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown", + "summary": "An aggregated API is down." + }, + "expr": "(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeAPIDown", + "annotations": { + "description": "KubeAPI has disappeared from Prometheus target discovery.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown", + "summary": "Target disappeared from Prometheus target discovery." + }, + "expr": "absent(up{job=\"apiserver\"} == 1)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeAPITerminatedRequests", + "annotations": { + "description": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests", + "summary": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests." 
+ }, + "expr": "sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) / ( sum(rate(apiserver_request_total{job=\"apiserver\"}[10m])) + sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) ) > 0.20\n", + "for": "5m", + "labels": { + "severity": "warning" + } + } + ] + }, + { + "name": "kubernetes-system-kubelet", + "rules": [ + { + "alert": "KubeNodeNotReady", + "annotations": { + "description": "{{ $labels.node }} has been unready for more than 15 minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready", + "summary": "Node is not ready." + }, + "expr": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeNodeUnreachable", + "annotations": { + "description": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable", + "summary": "Node is unreachable." + }, + "expr": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletTooManyPods", + "annotations": { + "description": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods", + "summary": "Kubelet is running at capacity." + }, + "expr": "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n", + "for": "15m", + "labels": { + "severity": "info" + } + }, + { + "alert": "KubeNodeReadinessFlapping", + "annotations": { + "description": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping", + "summary": "Node readiness status is flapping." + }, + "expr": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (node) > 2\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletPlegDurationHigh", + "annotations": { + "description": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh", + "summary": "Kubelet Pod Lifecycle Event Generator is taking too long to relist." 
+ }, + "expr": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletPodStartUpLatencyHigh", + "annotations": { + "description": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh", + "summary": "Kubelet Pod startup latency is too high." + }, + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} > 60\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletClientCertificateExpiration", + "annotations": { + "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration", + "summary": "Kubelet client certificate is about to expire." + }, + "expr": "kubelet_certificate_manager_client_ttl_seconds < 604800\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletClientCertificateExpiration", + "annotations": { + "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration", + "summary": "Kubelet client certificate is about to expire." + }, + "expr": "kubelet_certificate_manager_client_ttl_seconds < 86400\n", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeletServerCertificateExpiration", + "annotations": { + "description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration", + "summary": "Kubelet server certificate is about to expire." + }, + "expr": "kubelet_certificate_manager_server_ttl_seconds < 604800\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletServerCertificateExpiration", + "annotations": { + "description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration", + "summary": "Kubelet server certificate is about to expire." + }, + "expr": "kubelet_certificate_manager_server_ttl_seconds < 86400\n", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeletClientCertificateRenewalErrors", + "annotations": { + "description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors", + "summary": "Kubelet has failed to renew its client certificate." 
+ }, + "expr": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletServerCertificateRenewalErrors", + "annotations": { + "description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors", + "summary": "Kubelet has failed to renew its server certificate." + }, + "expr": "increase(kubelet_server_expiration_renew_errors[5m]) > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletDown", + "annotations": { + "description": "Kubelet has disappeared from Prometheus target discovery.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown", + "summary": "Target disappeared from Prometheus target discovery." + }, + "expr": "absent(up{job=\"kubelet\", metrics_path=\"/metrics\"} == 1)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + }, + { + "name": "kubernetes-system-scheduler", + "rules": [ + { + "alert": "KubeSchedulerDown", + "annotations": { + "description": "KubeScheduler has disappeared from Prometheus target discovery.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown", + "summary": "Target disappeared from Prometheus target discovery." + }, + "expr": "absent(up{job=\"kube-scheduler\"} == 1)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + }, + { + "name": "kubernetes-system-controller-manager", + "rules": [ + { + "alert": "KubeControllerManagerDown", + "annotations": { + "description": "KubeControllerManager has disappeared from Prometheus target discovery.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown", + "summary": "Target disappeared from Prometheus target discovery." 
+ }, + "expr": "absent(up{job=\"kube-controller-manager\"} == 1)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + }, + { + "name": "kube-apiserver-burnrate.rules", + "rules": [ + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate1d" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate1h" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate2h" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) 
(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate30m" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate3d" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate5m" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate6h" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) 
(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate1d" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate1h" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate2h" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate30m" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate3d" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate5m" + }, + { + "expr": "(\n (\n # too slow\n sum by (cluster) 
(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate6h" + } + ] + }, + { + "name": "kube-apiserver-histogram.rules", + "rules": [ + { + "expr": "histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) > 0\n", + "labels": { + "quantile": "0.99", + "verb": "read" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) > 0\n", + "labels": { + "quantile": "0.99", + "verb": "write" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.99" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.9" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.5" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + } + ] + }, + { + "interval": "3m", + "name": "kube-apiserver-availability.rules", + "rules": [ + { + "expr": "avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"LIST|GET\"})\n", + "labels": { + "verb": "read" + }, + "record": "code:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n", + "labels": { + "verb": "write" + }, + "record": "code:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (cluster, verb, scope) (increase(apiserver_request_duration_seconds_count[1h]))\n", + "record": "cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h" + }, + { + "expr": "sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[30d]) * 24 * 30)\n", + "record": "cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d" + }, + { + "expr": "sum by (cluster, verb, scope, le) 
(increase(apiserver_request_duration_seconds_bucket[1h]))\n", + "record": "cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h" + }, + { + "expr": "sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[30d]) * 24 * 30)\n", + "record": "cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d" + }, + { + "expr": "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n", + "labels": { + "verb": "all" + }, + "record": "apiserver_request:availability30d" + }, + { + "expr": "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:availability30d" + }, + { + "expr": "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:availability30d" + }, + { + "expr": "sum by (cluster,code,resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n", + "labels": { + "verb": "read" + }, + "record": "code_resource:apiserver_request_total:rate5m" + }, + { + "expr": "sum by (cluster,code,resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n", + "labels": { + "verb": "write" + }, + 
"record": "code_resource:apiserver_request_total:rate5m" + }, + { + "expr": "sum by (cluster, code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\",code=~\"2..\"}[1h]))\n", + "record": "code_verb:apiserver_request_total:increase1h" + }, + { + "expr": "sum by (cluster, code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\",code=~\"3..\"}[1h]))\n", + "record": "code_verb:apiserver_request_total:increase1h" + }, + { + "expr": "sum by (cluster, code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\",code=~\"4..\"}[1h]))\n", + "record": "code_verb:apiserver_request_total:increase1h" + }, + { + "expr": "sum by (cluster, code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n", + "record": "code_verb:apiserver_request_total:increase1h" + } + ] + }, + { + "name": "k8s.rules", + "rules": [ + { + "expr": "sum by (cluster, namespace, pod, container) (\n irate(container_cpu_usage_seconds_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}[5m])\n) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (\n 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", + "record": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate" + }, + { + "expr": "container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", + "record": "node_namespace_pod_container:container_memory_working_set_bytes" + }, + { + "expr": "container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", + "record": "node_namespace_pod_container:container_memory_rss" + }, + { + "expr": "container_memory_cache{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", + "record": "node_namespace_pod_container:container_memory_cache" + }, + { + "expr": "container_memory_swap{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", + "record": "node_namespace_pod_container:container_memory_swap" + }, + { + "expr": "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n", + "record": "cluster:namespace:pod_memory:active:kube_pod_container_resource_requests" + }, + { + "expr": "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n", + "record": "namespace_memory:kube_pod_container_resource_requests:sum" + }, + { + "expr": 
"kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n", + "record": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests" + }, + { + "expr": "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n", + "record": "namespace_cpu:kube_pod_container_resource_requests:sum" + }, + { + "expr": "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n", + "record": "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits" + }, + { + "expr": "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n", + "record": "namespace_memory:kube_pod_container_resource_limits:sum" + }, + { + "expr": "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n )\n", + "record": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits" + }, + { + "expr": "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n", + "record": "namespace_cpu:kube_pod_container_resource_limits:sum" + }, + { + "expr": "max by (cluster, namespace, workload, pod) (\n label_replace(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"},\n \"replicaset\", \"$1\", \"owner_name\", \"(.*)\"\n ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (\n 1, max by (replicaset, namespace, owner_name) (\n kube_replicaset_owner{job=\"kube-state-metrics\"}\n )\n ),\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n", + "labels": { + "workload_type": "deployment" + }, + "record": "namespace_workload_pod:kube_pod_owner:relabel" + }, + { + "expr": "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n", + "labels": { + "workload_type": "daemonset" + }, + "record": "namespace_workload_pod:kube_pod_owner:relabel" + }, + { + "expr": "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n", + "labels": { + "workload_type": "statefulset" + }, + "record": "namespace_workload_pod:kube_pod_owner:relabel" + } + ] + }, + { + "name": 
"kube-scheduler.rules", + "rules": [ + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.99" + }, + "record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.99" + }, + "record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.99" + }, + "record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.9" + }, + "record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.9" + }, + "record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.9" + }, + "record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.5" + }, + "record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.5" + }, + "record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.5" + }, + "record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" + } + ] + }, + { + "name": "node.rules", + "rules": [ + { + "expr": "topk by(namespace, pod) (1,\n max by (node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n", + "record": "node_namespace_pod:kube_pod_info:" + }, + { + "expr": "count by (cluster, node) (sum by (node, cpu) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n* on (namespace, pod) group_left(node)\n topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)\n))\n", + "record": "node:node_num_cpu:sum" + }, + { + "expr": "sum(\n node_memory_MemAvailable_bytes{job=\"node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"} +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n) by (cluster)\n", + "record": 
":node_memory_MemAvailable_bytes:sum" + } + ] + }, + { + "name": "kubelet.rules", + "rules": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n", + "labels": { + "quantile": "0.99" + }, + "record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n", + "labels": { + "quantile": "0.9" + }, + "record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n", + "labels": { + "quantile": "0.5" + }, + "record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile" + } + ] + } + ] + } +} diff --git a/charts/kubezero-metrics/configs/rules/node-exporter-prometheusRule b/charts/kubezero-metrics/configs/rules/node-exporter-prometheusRule new file mode 100644 index 00000000..85ddccb7 --- /dev/null +++ b/charts/kubezero-metrics/configs/rules/node-exporter-prometheusRule @@ -0,0 +1,305 @@ +{ + "apiVersion": "monitoring.coreos.com/v1", + "kind": "PrometheusRule", + "metadata": { + "labels": { + "app.kubernetes.io/component": "exporter", + "app.kubernetes.io/name": "node-exporter", + "app.kubernetes.io/part-of": "kube-prometheus", + "app.kubernetes.io/version": "1.2.2", + "prometheus": "k8s", + "role": "alert-rules" + }, + "name": "node-exporter-rules", + "namespace": "monitoring" + }, + "spec": { + "groups": [ + { + "name": "node-exporter", + "rules": [ + { + "alert": "NodeFilesystemSpaceFillingUp", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup", + "summary": "Filesystem is predicted to run out of space within the next 24 hours." + }, + "expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFilesystemSpaceFillingUp", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up fast.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup", + "summary": "Filesystem is predicted to run out of space within the next 4 hours." 
+ }, + "expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeFilesystemAlmostOutOfSpace", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace", + "summary": "Filesystem has less than 5% space left." + }, + "expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "for": "30m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFilesystemAlmostOutOfSpace", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace", + "summary": "Filesystem has less than 3% space left." + }, + "expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "for": "30m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeFilesystemFilesFillingUp", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup", + "summary": "Filesystem is predicted to run out of inodes within the next 24 hours." + }, + "expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFilesystemFilesFillingUp", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up fast.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup", + "summary": "Filesystem is predicted to run out of inodes within the next 4 hours." 
+ }, + "expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeFilesystemAlmostOutOfFiles", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles", + "summary": "Filesystem has less than 5% inodes left." + }, + "expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFilesystemAlmostOutOfFiles", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles", + "summary": "Filesystem has less than 3% inodes left." + }, + "expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeNetworkReceiveErrs", + "annotations": { + "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs", + "summary": "Network interface is reporting many receive errors." + }, + "expr": "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeNetworkTransmitErrs", + "annotations": { + "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs", + "summary": "Network interface is reporting many transmit errors." + }, + "expr": "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeHighNumberConntrackEntriesUsed", + "annotations": { + "description": "{{ $value | humanizePercentage }} of conntrack entries are used.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused", + "summary": "Number of conntrack are getting close to the limit." 
+ }, + "expr": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeTextFileCollectorScrapeError", + "annotations": { + "description": "Node Exporter text file collector failed to scrape.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror", + "summary": "Node Exporter text file collector failed to scrape." + }, + "expr": "node_textfile_scrape_error{job=\"node-exporter\"} == 1\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeClockSkewDetected", + "annotations": { + "description": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected", + "summary": "Clock skew detected." + }, + "expr": "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeClockNotSynchronising", + "annotations": { + "description": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising", + "summary": "Clock not synchronising." + }, + "expr": "min_over_time(node_timex_sync_status[5m]) == 0\nand\nnode_timex_maxerror_seconds >= 16\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeRAIDDegraded", + "annotations": { + "description": "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded", + "summary": "RAID Array is degraded" + }, + "expr": "node_md_disks_required - ignoring (state) (node_md_disks{state=\"active\"}) > 0\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeRAIDDiskFailure", + "annotations": { + "description": "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure", + "summary": "Failed device in RAID array" + }, + "expr": "node_md_disks{state=\"failed\"} > 0\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFileDescriptorLimit", + "annotations": { + "description": "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit", + "summary": "Kernel is predicted to exhaust file descriptors limit soon." 
+ }, + "expr": "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70\n)\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFileDescriptorLimit", + "annotations": { + "description": "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit", + "summary": "Kernel is predicted to exhaust file descriptors limit soon." + }, + "expr": "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90\n)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + }, + { + "name": "node-exporter.rules", + "rules": [ + { + "expr": "count without (cpu, mode) (\n node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}\n)\n", + "record": "instance:node_num_cpu:sum" + }, + { + "expr": "1 - avg without (cpu, mode) (\n rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[5m])\n)\n", + "record": "instance:node_cpu_utilisation:rate5m" + }, + { + "expr": "(\n node_load1{job=\"node-exporter\"}\n/\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n", + "record": "instance:node_load1_per_cpu:ratio" + }, + { + "expr": "1 - (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"}\n +\n node_memory_Cached_bytes{job=\"node-exporter\"}\n +\n node_memory_MemFree_bytes{job=\"node-exporter\"}\n +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n", + "record": "instance:node_memory_utilisation:ratio" + }, + { + "expr": "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[5m])\n", + "record": "instance:node_vmstat_pgmajfault:rate5m" + }, + { + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[5m])\n", + "record": "instance_device:node_disk_io_time_seconds:rate5m" + }, + { + "expr": "rate(node_disk_io_time_weighted_seconds_total{job=\"node-exporter\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[5m])\n", + "record": "instance_device:node_disk_io_time_weighted_seconds:rate5m" + }, + { + "expr": "sum without (device) (\n rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n", + "record": "instance:node_network_receive_bytes_excluding_lo:rate5m" + }, + { + "expr": "sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n", + "record": "instance:node_network_transmit_bytes_excluding_lo:rate5m" + }, + { + "expr": "sum without (device) (\n rate(node_network_receive_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n", + "record": "instance:node_network_receive_drop_excluding_lo:rate5m" + }, + { + "expr": "sum without (device) (\n rate(node_network_transmit_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n", + "record": "instance:node_network_transmit_drop_excluding_lo:rate5m" + } + ] + } + ] + } +} diff --git a/charts/kubezero-metrics/configs/rules/prometheus-operator-prometheusRule b/charts/kubezero-metrics/configs/rules/prometheus-operator-prometheusRule new file mode 100644 index 00000000..be7b9988 --- /dev/null +++ b/charts/kubezero-metrics/configs/rules/prometheus-operator-prometheusRule @@ -0,0 +1,116 @@ +{ + "apiVersion": "monitoring.coreos.com/v1", + "kind": "PrometheusRule", + "metadata": { + 
"labels": { + "app.kubernetes.io/component": "controller", + "app.kubernetes.io/name": "prometheus-operator", + "app.kubernetes.io/part-of": "kube-prometheus", + "app.kubernetes.io/version": "0.51.1", + "prometheus": "k8s", + "role": "alert-rules" + }, + "name": "prometheus-operator-rules", + "namespace": "monitoring" + }, + "spec": { + "groups": [ + { + "name": "prometheus-operator", + "rules": [ + { + "alert": "PrometheusOperatorListErrors", + "annotations": { + "description": "Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors", + "summary": "Errors while performing list operations in controller." + }, + "expr": "(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m]))) > 0.4\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusOperatorWatchErrors", + "annotations": { + "description": "Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors", + "summary": "Errors while performing watch operations in controller." + }, + "expr": "(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m]))) > 0.4\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusOperatorSyncFailed", + "annotations": { + "description": "Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed", + "summary": "Last controller reconciliation failed" + }, + "expr": "min_over_time(prometheus_operator_syncs{status=\"failed\",job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusOperatorReconcileErrors", + "annotations": { + "description": "{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors", + "summary": "Errors while reconciling controller." 
+ }, + "expr": "(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) > 0.1\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusOperatorNodeLookupErrors", + "annotations": { + "description": "Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors", + "summary": "Errors while reconciling Prometheus." + }, + "expr": "rate(prometheus_operator_node_address_lookup_errors_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) > 0.1\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusOperatorNotReady", + "annotations": { + "description": "Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready", + "summary": "Prometheus operator not ready" + }, + "expr": "min by(namespace, controller) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) == 0)\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusOperatorRejectedResources", + "annotations": { + "description": "Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf \"%0.0f\" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources", + "summary": "Resources rejected by Prometheus operator" + }, + "expr": "min_over_time(prometheus_operator_managed_resources{state=\"rejected\",job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) > 0\n", + "for": "5m", + "labels": { + "severity": "warning" + } + } + ] + } + ] + } +} diff --git a/charts/kubezero-metrics/configs/rules/prometheus-prometheusRule b/charts/kubezero-metrics/configs/rules/prometheus-prometheusRule new file mode 100644 index 00000000..638ce0d9 --- /dev/null +++ b/charts/kubezero-metrics/configs/rules/prometheus-prometheusRule @@ -0,0 +1,259 @@ +{ + "apiVersion": "monitoring.coreos.com/v1", + "kind": "PrometheusRule", + "metadata": { + "labels": { + "app.kubernetes.io/component": "prometheus", + "app.kubernetes.io/name": "prometheus", + "app.kubernetes.io/part-of": "kube-prometheus", + "app.kubernetes.io/version": "2.30.0", + "prometheus": "k8s", + "role": "alert-rules" + }, + "name": "prometheus-k8s-prometheus-rules", + "namespace": "monitoring" + }, + "spec": { + "groups": [ + { + "name": "prometheus", + "rules": [ + { + "alert": "PrometheusBadConfig", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig", + "summary": "Failed Prometheus configuration reload." 
+ }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_config_last_reload_successful{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) == 0\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusNotificationQueueRunningFull", + "annotations": { + "description": "Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull", + "summary": "Prometheus alert notification queue predicted to run full in less than 30m." + }, + "expr": "# Without min_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m], 60 * 30)\n>\n min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusErrorSendingAlertsToSomeAlertmanagers", + "annotations": { + "description": "{{ printf \"%.1f\" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers", + "summary": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager." + }, + "expr": "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusNotConnectedToAlertmanagers", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers", + "summary": "Prometheus is not connected to any Alertmanagers." + }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) < 1\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusTSDBReloadsFailing", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing", + "summary": "Prometheus has issues reloading blocks from disk." 
+ }, + "expr": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[3h]) > 0\n", + "for": "4h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusTSDBCompactionsFailing", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing", + "summary": "Prometheus has issues compacting blocks." + }, + "expr": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[3h]) > 0\n", + "for": "4h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusNotIngestingSamples", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples", + "summary": "Prometheus is not ingesting samples." + }, + "expr": "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) <= 0\nand\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n )\n)\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusDuplicateTimestamps", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with different values but duplicated timestamp.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps", + "summary": "Prometheus is dropping samples with duplicate timestamps." + }, + "expr": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusOutOfOrderTimestamps", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with timestamps arriving out of order.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps", + "summary": "Prometheus drops samples with out-of-order timestamps." + }, + "expr": "rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusRemoteStorageFailures", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf \"%.1f\" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures", + "summary": "Prometheus fails to send samples to remote storage." 
+ }, + "expr": "(\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n/\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n +\n (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n )\n)\n* 100\n> 1\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusRemoteWriteBehind", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf \"%.1f\" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind", + "summary": "Prometheus remote write is behind." + }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n- ignoring(remote_name, url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n> 120\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusRemoteWriteDesiredShards", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus-k8s\",namespace=\"monitoring\"}` $labels.instance | query | first | value }}.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards", + "summary": "Prometheus remote write desired shards calculation wants to run more than configured max shards." + }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n>\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusRuleFailures", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf \"%.0f\" $value }} rules in the last 5m.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures", + "summary": "Prometheus is failing rule evaluations." 
+ }, + "expr": "increase(prometheus_rule_evaluation_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusMissingRuleEvaluations", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf \"%.0f\" $value }} rule group evaluations in the last 5m.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations", + "summary": "Prometheus is missing rule evaluations due to slow rule group evaluation." + }, + "expr": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusTargetLimitHit", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because the number of targets exceeded the configured target_limit.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit", + "summary": "Prometheus has dropped targets because some scrape configs have exceeded the targets limit." + }, + "expr": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusLabelLimitHit", + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit", + "summary": "Prometheus has dropped targets because some scrape configs have exceeded the labels limit." + }, + "expr": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusTargetSyncFailure", + "annotations": { + "description": "{{ printf \"%.0f\" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure", + "summary": "Prometheus has failed to sync targets." + }, + "expr": "increase(prometheus_target_sync_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[30m]) > 0\n", + "for": "5m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusErrorSendingAlertsToAnyAlertmanager", + "annotations": { + "description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.", + "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager", + "summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager." 
+ }, + "expr": "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n)\n* 100\n> 3\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + } + ] + } +} diff --git a/charts/kubezero-metrics/dashboards/zdt-dashboards.yaml b/charts/kubezero-metrics/configs/zdt-dashboards.yaml similarity index 100% rename from charts/kubezero-metrics/dashboards/zdt-dashboards.yaml rename to charts/kubezero-metrics/configs/zdt-dashboards.yaml diff --git a/charts/kubezero-metrics/dashboards/zdt/home.json b/charts/kubezero-metrics/configs/zdt/home.json similarity index 100% rename from charts/kubezero-metrics/dashboards/zdt/home.json rename to charts/kubezero-metrics/configs/zdt/home.json diff --git a/charts/kubezero-metrics/dashboards/build.sh b/charts/kubezero-metrics/dashboards/build.sh deleted file mode 100755 index 49fb3368..00000000 --- a/charts/kubezero-metrics/dashboards/build.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -[ -x ./jb-linux-amd64 ] || wget https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.4.0/jb-linux-amd64 && chmod +x ./jb-linux-amd64 - -./jb-linux-amd64 update - -mkdir -p kube-mixin -#jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusAlerts)' > kube-mixin/alerts.yml -#jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusRules)' > kube-mixin/rules.yml -jsonnet -J vendor -m kube-mixin -e '(import "mixin.libsonnet").grafanaDashboards' diff --git a/charts/kubezero-metrics/dashboards/jsonnetfile.json b/charts/kubezero-metrics/dashboards/jsonnetfile.json deleted file mode 100644 index 4c799a46..00000000 --- a/charts/kubezero-metrics/dashboards/jsonnetfile.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "version": 1, - "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", - "subdir": "" - } - }, - "version": "master" - } - ], - "legacyImports": true -} diff --git a/charts/kubezero-metrics/dashboards/jsonnetfile.lock.json b/charts/kubezero-metrics/dashboards/jsonnetfile.lock.json deleted file mode 100644 index 621d592c..00000000 --- a/charts/kubezero-metrics/dashboards/jsonnetfile.lock.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "version": 1, - "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" - } - }, - "version": "19b27b272abf4263af1365ec485784c49815a332", - "sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w=" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "grafana-builder" - } - }, - "version": "14bc3cec71f32ba3cc490a47a22004a744b6bb4e", - "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" - }, - { - "source": { - "git": { - "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", - "subdir": "" - } - }, - "version": "ff4641bcd83314c955150bea6b147df9ca335c4a", - "sum": "oUVGwcCbmdH8qz9B+lbRawI9s23GY9HeW7MwYZRbZ/0=" - } - ], - "legacyImports": false -} diff --git a/charts/kubezero-metrics/dashboards/k8s-dashboards.yaml b/charts/kubezero-metrics/dashboards/k8s-dashboards.yaml deleted file mode 100644 index 5bc9a180..00000000 --- a/charts/kubezero-metrics/dashboards/k8s-dashboards.yaml +++ /dev/null @@ -1,49 +0,0 @@ -configmap: grafana-dashboards-kube-mixin -gzip: true 
-folder: Kubernetes -dashboards: -- name: coreDNS - url: https://grafana.com/api/dashboards/12539/revisions/5/download - tags: ['kubernetes', 'DNS'] -- name: etcd - url: https://grafana.com/api/dashboards/3070/revisions/3/download - tags: ['kubernetes', 'etcd'] -- name: node - url: https://grafana.com/api/dashboards/1860/revisions/23/download - tags: ['kubernetes'] -# cd kube-mixin; for f in *.json; do echo "- name: ${f%%.json}" >> ../dashboards.yaml; echo " url: file://kube-mixin/$f" >> ../dashboards.yaml; done; cd - -- name: apiserver - url: file://kube-mixin/apiserver.json -- name: cluster-total - url: file://kube-mixin/cluster-total.json -- name: controller-manager - url: file://kube-mixin/controller-manager.json -- name: k8s-resources-cluster - url: file://kube-mixin/k8s-resources-cluster.json -- name: k8s-resources-namespace - url: file://kube-mixin/k8s-resources-namespace.json -- name: k8s-resources-node - url: file://kube-mixin/k8s-resources-node.json -- name: k8s-resources-pod - url: file://kube-mixin/k8s-resources-pod.json -- name: k8s-resources-workload - url: file://kube-mixin/k8s-resources-workload.json -- name: k8s-resources-workloads-namespace - url: file://kube-mixin/k8s-resources-workloads-namespace.json -- name: kubelet - url: file://kube-mixin/kubelet.json -- name: namespace-by-pod - url: file://kube-mixin/namespace-by-pod.json -- name: namespace-by-workload - url: file://kube-mixin/namespace-by-workload.json -- name: persistentvolumesusage - url: file://kube-mixin/persistentvolumesusage.json - tags: ['kubernetes', 'csi'] -- name: pod-total - url: file://kube-mixin/pod-total.json -- name: proxy - url: file://kube-mixin/proxy.json -- name: scheduler - url: file://kube-mixin/scheduler.json -- name: workload-total - url: file://kube-mixin/workload-total.json diff --git a/charts/kubezero-metrics/sync_grafana_dashboards.py b/charts/kubezero-metrics/sync_grafana_dashboards.py index 7cb42749..cc0444d5 100755 --- a/charts/kubezero-metrics/sync_grafana_dashboards.py +++ b/charts/kubezero-metrics/sync_grafana_dashboards.py @@ -86,14 +86,14 @@ for b in config['dashboards']: # replace datasources traverse_json(obj) - + # Set default tim in all charts to 1h - obj['time'] = { "from": "now-1h", "to": "now" } + obj['time'] = {"from": "now-1h", "to": "now"} obj['refresh'] = "30s" # set tags if 'tags' in b: - obj['tags'] = b['tags'] + obj['tags'] = b['tags'] text = json.dumps(obj, indent=2) diff --git a/charts/kubezero-metrics/sync_prometheus_rules.py b/charts/kubezero-metrics/sync_prometheus_rules.py new file mode 100755 index 00000000..fc601b01 --- /dev/null +++ b/charts/kubezero-metrics/sync_prometheus_rules.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import sys +import os +import json +import yaml +import requests +import textwrap + +config_file = sys.argv[1] +configmap_folder = sys.argv[2] + + +# read config file +with open(config_file, 'r') as yaml_contents: + config = yaml.safe_load(yaml_contents.read()) + + +def base_rule(config): + rule = '' + if 'condition' in config: + # use index function to make go template happy if '-' in names + if '-' in config['condition'] and "index" not in config['condition']: + tokens = config['condition'].split('.') + rule = '''{{- if index .Values %(condition)s }} + ''' % {'condition': ' '.join(f'"{w}"' for w in tokens[2:])} + + else: + rule = '''{{- if %(condition)s }} + ''' % config + + # Base rule for KubeZero + rule += '''apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%%s-%%s" (include 
"kubezero-lib.fullname" $) "%(name)s" | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "kubezero-lib.labels" . | nindent 4 }} +''' % config + + return rule + + +for r in config['rules']: + if not r['url'].startswith('file://'): + response = requests.get(r['url']) + if response.status_code != 200: + print('Skipping the file, response code %s not equals 200' % response.status_code) + continue + raw_text = response.text + else: + with open(r['url'].replace('file://', ''), 'r') as file_contents: + raw_text = file_contents.read() + + obj = json.loads(raw_text) + + rule = base_rule(r) + + text = yaml.dump(obj['spec'], default_flow_style=False, width=1000, indent=2) + + # Encode {{ }} for helm + text = text.replace("{{", "{{`{{").replace("}}", "}}`}}").replace("{{`{{", "{{`{{`}}").replace("}}`}}", "{{`}}`}}") + text = textwrap.indent(text, ' '*2) + + # add support for additionalRuleLabels from Helm values + helm_labels = '''{{- if .Values.additionalRuleLabels }} +{{- toYaml .Values.additionalRuleLabels | nindent 8 }} +{{- end }} +''' + helm_labels = textwrap.indent(helm_labels, ' '*8) + + text = text.replace("labels:\n", "labels:\n{}".format(helm_labels)) + + rule += '''spec:\n''' + rule += text+'\n' + + if 'condition' in r: + rule += '{{- end }}'+'\n' + + # Write Configmap + configmap_file = os.path.join(configmap_folder, r['name'] + '.yaml') + with open(configmap_file, 'w') as f: + f.write(rule) diff --git a/charts/kubezero-metrics/templates/grafana-dashboards-k8s.yaml b/charts/kubezero-metrics/templates/grafana-dashboards-k8s.yaml index 0a3bd02b..fa35e552 100644 --- a/charts/kubezero-metrics/templates/grafana-dashboards-k8s.yaml +++ b/charts/kubezero-metrics/templates/grafana-dashboards-k8s.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "grafana-dashboards-kube-mixin" | trunc 63 | trimSuffix "-" }} + name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "grafana-dashboards-k8s" | trunc 63 | trimSuffix "-" }} namespace: {{ .Release.Namespace }} labels: grafana_dashboard: "1" diff --git a/charts/kubezero-metrics/templates/rules/alertmanager.yaml b/charts/kubezero-metrics/templates/rules/alertmanager.yaml new file mode 100644 index 00000000..8b52dfe9 --- /dev/null +++ b/charts/kubezero-metrics/templates/rules/alertmanager.yaml @@ -0,0 +1,114 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "alertmanager" | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "kubezero-lib.labels" . | nindent 4 }} +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. + expr: '# Without max_over_time, failed scrapes could create false negatives, see + + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
+ + max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n< on (namespace,service) group_left\n count by (namespace,service) (max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]))\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. + expr: "(\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m])\n)\n> 0.01\n" + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. + expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n)\n> 0.01\n" + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. + expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n)\n> 0.01\n" + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. + expr: "count by (namespace,service) (\n count_values by (namespace,service) (\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\",namespace=\"monitoring\"})\n)\n!= 1\n" + for: 20m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: "(\n count by (namespace,service) (\n avg_over_time(up{job=\"alertmanager-main\",namespace=\"monitoring\"}[5m]) < 0.5\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n" + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. 
+ expr: "(\n count by (namespace,service) (\n changes(process_start_time_seconds{job=\"alertmanager-main\",namespace=\"monitoring\"}[10m]) > 4\n )\n/\n count by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"monitoring\"}\n )\n)\n>= 0.5\n" + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + diff --git a/charts/kubezero-metrics/templates/rules/etcd-mixin.yaml b/charts/kubezero-metrics/templates/rules/etcd-mixin.yaml new file mode 100644 index 00000000..ef8f7a26 --- /dev/null +++ b/charts/kubezero-metrics/templates/rules/etcd-mixin.yaml @@ -0,0 +1,195 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "etcd-mixin" | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "kubezero-lib.labels" . | nindent 4 }} +spec: + groups: + - name: etcd + rules: + - alert: etcdMembersDown + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).' + summary: etcd cluster members are down. + expr: "max without (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n" + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: etcdInsufficientMembers + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' + summary: etcd cluster has insufficient number of members. + expr: 'sum(up{job=~".*etcd.*"} == bool 1) without (instance, pod) < ((count(up{job=~".*etcd.*"}) without (instance, pod) + 1) / 2) + + ' + for: 3m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: etcdNoLeader + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.' + summary: etcd cluster has no leader. + expr: 'etcd_server_has_leader{job=~".*etcd.*"} == 0 + + ' + for: 1m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: etcdHighNumberOfLeaderChanges + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + summary: etcd cluster has high number of leader changes. + expr: 'increase((max without (instance, pod) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 + + ' + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' 
+ summary: etcd cluster has high number of failed grpc requests. + expr: "100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 1\n" + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: etcd cluster has high number of failed grpc requests. + expr: "100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 5\n" + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: etcdGRPCRequestsSlow + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: etcd grpc requests are slow + expr: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type)) + + > 0.15 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: etcdMemberCommunicationSlow + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: etcd cluster member communication is slow. + expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) + + > 0.15 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: etcdHighNumberOfFailedProposals + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: etcd cluster has high number of proposal failures. + expr: 'rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: etcdHighFsyncDurations + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: etcd cluster 99th percentile fsync durations are too high. 
+ expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + + > 0.5 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: etcdHighFsyncDurations + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + + > 1 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: etcdHighCommitDurations + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: etcd cluster 99th percentile commit durations are too high. + expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + + > 0.25 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: etcdBackendQuotaLowSpace + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' + expr: '(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: etcdExcessiveDatabaseGrowth + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.' + expr: 'increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + diff --git a/charts/kubezero-metrics/templates/rules/kube-prometheus.yaml b/charts/kubezero-metrics/templates/rules/kube-prometheus.yaml new file mode 100644 index 00000000..b86ed68e --- /dev/null +++ b/charts/kubezero-metrics/templates/rules/kube-prometheus.yaml @@ -0,0 +1,81 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "kube-prometheus" | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "kubezero-lib.labels" . | nindent 4 }} +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown + summary: One or more targets are unreachable. 
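+ # Share of scrape targets that are down, grouped by job, namespace and service; fires when more than 10% stay down for 10m.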
+ expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: Watchdog + annotations: + description: 'This is an alert meant to ensure that the entire alerting pipeline is functional. + + This alert is always firing, therefore it should always be firing in Alertmanager + + and always fire against a receiver. There are integrations with various notification + + mechanisms that send a notification when this alert is not firing. For example the + + "DeadMansSnitch" integration in PagerDuty. + + ' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog + summary: An alert that should always be firing to certify that Alertmanager is working properly. + expr: vector(1) + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: none + - name: node-network + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + description: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping + summary: Network interface is often changing its status + expr: 'changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + + ' + for: 2m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kube-prometheus-general.rules + rules: + - expr: count without(instance, pod, node) (up == 1) + record: count:up1 + - expr: count without(instance, pod, node) (up == 0) + record: count:up0 + diff --git a/charts/kubezero-metrics/templates/rules/kube-state-metrics.yaml b/charts/kubezero-metrics/templates/rules/kube-state-metrics.yaml new file mode 100644 index 00000000..92f0727c --- /dev/null +++ b/charts/kubezero-metrics/templates/rules/kube-state-metrics.yaml @@ -0,0 +1,62 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "kube-state-metrics" | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "kubezero-lib.labels" . 
| nindent 4 }} +spec: + groups: + - name: kube-state-metrics + rules: + - alert: KubeStateMetricsListErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors + summary: kube-state-metrics is experiencing errors in list operations. + expr: "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors + summary: kube-state-metrics is experiencing errors in watch operations. + expr: "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m]))\n /\nsum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m])))\n> 0.01\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch + summary: kube-state-metrics sharding is misconfigured. + expr: 'stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing + summary: kube-state-metrics shards are missing. + expr: "2^max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1\n -\nsum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"}) )\n!= 0\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + diff --git a/charts/kubezero-metrics/templates/rules/kubernetes.yaml b/charts/kubezero-metrics/templates/rules/kubernetes.yaml new file mode 100644 index 00000000..dfc35931 --- /dev/null +++ b/charts/kubezero-metrics/templates/rules/kubernetes.yaml @@ -0,0 +1,1174 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "kubernetes" | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "kubezero-lib.labels" . 
| nindent 4 }} +spec: + groups: + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping + summary: Pod is crash looping. + expr: 'increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) > 0 + + and + + kube_pod_container_status_waiting{job="kube-state-metrics"} == 1 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: "(\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n >\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n) and (\n changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch + summary: Deployment has not matched the expected number of replicas. 
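+ # Ready replicas differ from the StatefulSet's replica count while no rollout progress was observed over the last 10m.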
+ expr: "(\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: "(\n max without (revision) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. + expr: "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeContainerWaiting + annotations: + description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: 'sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + + ' + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0\n" + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeJobCompletion + annotations: + description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than 12 hours to complete. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobcompletion + summary: Job did not complete in time + expr: 'kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + + ' + for: 12h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed + summary: Job failed to complete. + expr: 'kube_job_failed{job="kube-state-metrics"} > 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch + summary: HPA has not matched descired number of replicas. 
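+ # Desired and current replica counts diverge while the current count sits strictly between min and max replicas and has not changed in 15m.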
+ expr: "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n >\nkube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n <\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}[15m]) == 0\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout + summary: HPA is running at max replicas + expr: "kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + + and + + (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Pods by {{`{{`}} $value {{`}}`}} bytes and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: 'sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + + and + + (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. 
+ expr: "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\nsum(kube_node_status_allocatable{resource=\"cpu\"})\n > 1.5\n" + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. + expr: "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable{resource=\"memory\",job=\"kube-state-metrics\"})\n > 1.5\n" + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull + summary: Namespace quota is going to be full. + expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.9 < 1\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused + summary: Namespace quota is fully used. + expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n == 1\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded + summary: Namespace quota has exceeded the limits. + expr: "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 1\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. 
+ expr: "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)\n > ( 25 / 100 )\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: info + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\n" + for: 1m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\n" + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + + ' + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - name: kubernetes-system + rules: + - alert: KubeVersionMismatch + annotations: + description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch + summary: Different semantic versions of Kubernetes components running. 
+ expr: 'count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeClientErrors + annotations: + description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors + summary: Kubernetes API server client is experiencing errors. + expr: "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance, job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))\n> 0.01\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - name: kube-apiserver-slos + rules: + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: 'sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + + and + + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + + ' + for: 2m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + long: 1h + severity: critical + short: 5m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: 'sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + + and + + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + long: 6h + severity: critical + short: 30m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: 'sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + + and + + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + + ' + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + long: 1d + severity: warning + short: 2h + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. 
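+ # Slowest of the multi-window burn-rate pairs (3d long / 6h short); both windows must exceed 1x the allowed burn rate of the 99% availability error budget.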
+ expr: 'sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + + and + + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + + ' + for: 3h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + long: 3d + severity: warning + short: 6h + - name: kubernetes-system-apiserver + rules: + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: AggregatedAPIErrors + annotations: + description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors + summary: An aggregated API has reported errors. + expr: 'sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: AggregatedAPIDown + annotations: + description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown + summary: An aggregated API is down. + expr: '(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + + ' + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown + summary: Target disappeared from Prometheus target discovery. 
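+ # absent() only returns a value when no apiserver target reports up == 1, i.e. the job has vanished from target discovery.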
+ expr: 'absent(up{job="apiserver"} == 1) + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: KubeAPITerminatedRequests + annotations: + description: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests + summary: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests. + expr: 'sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + + ' + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady + annotations: + description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready + summary: Node is not ready. + expr: 'kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeNodeUnreachable + annotations: + description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable + summary: Node is unreachable. + expr: '(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeletTooManyPods + annotations: + description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods + summary: Kubelet is running at capacity. + expr: "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) > 0.95\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: info + - alert: KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping + summary: Node readiness status is flapping. 
+ expr: 'sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + expr: 'node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + + ' + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh + summary: Kubelet Pod startup latency is too high. + expr: 'histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: 'kubelet_certificate_manager_client_ttl_seconds < 604800 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: 'kubelet_certificate_manager_client_ttl_seconds < 86400 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. 
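+ # 604800s = 7 days; the critical variant below alerts at 86400s = 24 hours.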
+ expr: 'kubelet_certificate_manager_server_ttl_seconds < 604800 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: 'kubelet_certificate_manager_server_ttl_seconds < 86400 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: KubeletClientCertificateRenewalErrors + annotations: + description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors + summary: Kubelet has failed to renew its client certificate. + expr: 'increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeletServerCertificateRenewalErrors + annotations: + description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors + summary: Kubelet has failed to renew its server certificate. + expr: 'increase(kubelet_server_expiration_renew_errors[5m]) > 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: KubeletDown + annotations: + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown + summary: Target disappeared from Prometheus target discovery. + expr: 'absent(up{job="kubelet", metrics_path="/metrics"} == 1) + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - name: kubernetes-system-scheduler + rules: + - alert: KubeSchedulerDown + annotations: + description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown + summary: Target disappeared from Prometheus target discovery. + expr: 'absent(up{job="kube-scheduler"} == 1) + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - name: kubernetes-system-controller-manager + rules: + - alert: KubeControllerManagerDown + annotations: + description: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown + summary: Target disappeared from Prometheus target discovery. 
+ expr: 'absent(up{job="kube-controller-manager"} == 1) + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - name: kube-apiserver-burnrate.rules + rules: + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: apiserver_request:burnrate1d + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: apiserver_request:burnrate1h + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: apiserver_request:burnrate2h + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n -\n (\n (\n sum by (cluster) 
(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: apiserver_request:burnrate30m + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: apiserver_request:burnrate3d + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: apiserver_request:burnrate5m + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) 
(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: apiserver_request:burnrate6h + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: apiserver_request:burnrate1d + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: apiserver_request:burnrate1h + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: apiserver_request:burnrate2h + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: apiserver_request:burnrate30m + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) 
(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: apiserver_request:burnrate3d + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: apiserver_request:burnrate5m + - expr: "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: apiserver_request:burnrate6h + - name: kube-apiserver-histogram.rules + rules: + - expr: 'histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.99' + verb: read + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.99' + verb: write + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.99' + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.9' + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: 
'histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.5' + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - interval: 3m + name: kube-apiserver-availability.rules + rules: + - expr: 'avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 + + ' + record: code_verb:apiserver_request_total:increase30d + - expr: 'sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: code:apiserver_request_total:increase30d + - expr: 'sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: code:apiserver_request_total:increase30d + - expr: 'sum by (cluster, verb, scope) (increase(apiserver_request_duration_seconds_count[1h])) + + ' + record: cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h + - expr: 'sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[30d]) * 24 * 30) + + ' + record: cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d + - expr: 'sum by (cluster, verb, scope, le) (increase(apiserver_request_duration_seconds_bucket[1h])) + + ' + record: cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h + - expr: 'sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[30d]) * 24 * 30) + + ' + record: cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d + - expr: "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"\ + })\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: all + record: apiserver_request:availability30d + - expr: "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by 
(cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"40\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: apiserver_request:availability30d + - expr: "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: apiserver_request:availability30d + - expr: 'sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: 'sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: 'sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + + ' + record: code_verb:apiserver_request_total:increase1h + - expr: 'sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + + ' + record: code_verb:apiserver_request_total:increase1h + - expr: 'sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + + ' + record: code_verb:apiserver_request_total:increase1h + - expr: 'sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + + ' + record: code_verb:apiserver_request_total:increase1h + - name: k8s.rules + rules: + - expr: "sum by (cluster, namespace, pod, container) (\n irate(container_cpu_usage_seconds_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}[5m])\n) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (\n 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n" + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + - expr: "container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", 
image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n" + record: node_namespace_pod_container:container_memory_working_set_bytes + - expr: "container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n" + record: node_namespace_pod_container:container_memory_rss + - expr: "container_memory_cache{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n" + record: node_namespace_pod_container:container_memory_cache + - expr: "container_memory_swap{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n" + record: node_namespace_pod_container:container_memory_swap + - expr: "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n" + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests + - expr: "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n" + record: namespace_memory:kube_pod_container_resource_requests:sum + - expr: "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n" + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests + - expr: "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n" + record: namespace_cpu:kube_pod_container_resource_requests:sum + - expr: "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n" + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits + - expr: "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n" + record: namespace_memory:kube_pod_container_resource_limits:sum + - expr: "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n )\n" 
+ record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits + - expr: "sum by (namespace, cluster) (\n sum by (namespace, pod, cluster) (\n max by (namespace, pod, container, cluster) (\n kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"}\n ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n" + record: namespace_cpu:kube_pod_container_resource_limits:sum + - expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"},\n \"replicaset\", \"$1\", \"owner_name\", \"(.*)\"\n ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (\n 1, max by (replicaset, namespace, owner_name) (\n kube_replicaset_owner{job=\"kube-state-metrics\"}\n )\n ),\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + workload_type: deployment + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + workload_type: daemonset + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n" + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + workload_type: statefulset + record: namespace_workload_pod:kube_pod_owner:relabel + - name: kube-scheduler.rules + rules: + - expr: 'histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.99' + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.99' + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.99' + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.9' + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile 
+ - expr: 'histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.9' + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.9' + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.5' + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.5' + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.5' + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - name: node.rules + rules: + - expr: "topk by(namespace, pod) (1,\n max by (node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n" + record: 'node_namespace_pod:kube_pod_info:' + - expr: "count by (cluster, node) (sum by (node, cpu) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n* on (namespace, pod) group_left(node)\n topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)\n))\n" + record: node:node_num_cpu:sum + - expr: "sum(\n node_memory_MemAvailable_bytes{job=\"node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"} +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n) by (cluster)\n" + record: :node_memory_MemAvailable_bytes:sum + - name: kubelet.rules + rules: + - expr: 'histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.99' + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + 
quantile: '0.9' + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: 'histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + quantile: '0.5' + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + diff --git a/charts/kubezero-metrics/templates/rules/node-exporter.yaml b/charts/kubezero-metrics/templates/rules/node-exporter.yaml new file mode 100644 index 00000000..9b56db16 --- /dev/null +++ b/charts/kubezero-metrics/templates/rules/node-exporter.yaml @@ -0,0 +1,273 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "node-exporter" | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "kubezero-lib.labels" . | nindent 4 }} +spec: + groups: + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. 
+ expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n" + for: 30m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n" + for: 30m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. 
+ expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: 'rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + + ' + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: 'rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + + ' + for: 1h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit. + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector failed to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. 
+ expr: 'node_textfile_scrape_error{job="node-exporter"} == 1 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected. + expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n" + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. + expr: 'min_over_time(node_timex_sync_status[5m]) == 0 + + and + + node_timex_maxerror_seconds >= 16 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded + expr: 'node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array + expr: 'node_md_disks{state="failed"} > 0 + + ' + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70\n)\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90\n)\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - name: node-exporter.rules + rules: + - expr: "count without (cpu, mode) (\n node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}\n)\n" + record: instance:node_num_cpu:sum + - expr: "1 - avg without (cpu, mode) (\n rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[5m])\n)\n" + record: instance:node_cpu_utilisation:rate5m + - expr: "(\n node_load1{job=\"node-exporter\"}\n/\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n" + record: instance:node_load1_per_cpu:ratio + - expr: "1 - (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"}\n +\n node_memory_Cached_bytes{job=\"node-exporter\"}\n +\n node_memory_MemFree_bytes{job=\"node-exporter\"}\n +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n" + record: instance:node_memory_utilisation:ratio + - expr: 'rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + + ' + record: instance:node_vmstat_pgmajfault:rate5m + - expr: 'rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + + ' + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: 'rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + + ' + record: instance_device:node_disk_io_time_weighted_seconds:rate5m + - expr: "sum without (device) (\n rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n" + record: instance:node_network_receive_bytes_excluding_lo:rate5m + - expr: "sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n" + record: instance:node_network_transmit_bytes_excluding_lo:rate5m + - expr: "sum without (device) (\n rate(node_network_receive_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n" + record: instance:node_network_receive_drop_excluding_lo:rate5m + - expr: "sum without (device) (\n rate(node_network_transmit_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)\n" + record: instance:node_network_transmit_drop_excluding_lo:rate5m + diff --git a/charts/kubezero-metrics/templates/rules/prometheus-operator.yaml b/charts/kubezero-metrics/templates/rules/prometheus-operator.yaml new file mode 100644 index 00000000..56193e6d --- /dev/null +++ b/charts/kubezero-metrics/templates/rules/prometheus-operator.yaml @@ -0,0 +1,110 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "prometheus-operator" | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "kubezero-lib.labels" . | nindent 4 }} +spec: + groups: + - name: prometheus-operator + rules: + - alert: PrometheusOperatorListErrors + annotations: + description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors + summary: Errors while performing list operations in controller. + expr: '(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusOperatorWatchErrors + annotations: + description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors + summary: Errors while performing watch operations in controller. + expr: '(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusOperatorSyncFailed + annotations: + description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed + summary: Last controller reconciliation failed + expr: 'min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusOperatorReconcileErrors + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors + summary: Errors while reconciling controller. + expr: '(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors + summary: Errors while reconciling Prometheus. 
+ expr: 'rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusOperatorNotReady + annotations: + description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready + summary: Prometheus operator not ready + expr: 'min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) + + ' + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusOperatorRejectedResources + annotations: + description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources + summary: Resources rejected by Prometheus operator + expr: 'min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 + + ' + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + diff --git a/charts/kubezero-metrics/templates/rules/prometheus.yaml b/charts/kubezero-metrics/templates/rules/prometheus.yaml new file mode 100644 index 00000000..7215ea26 --- /dev/null +++ b/charts/kubezero-metrics/templates/rules/prometheus.yaml @@ -0,0 +1,258 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "prometheus" | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "kubezero-lib.labels" . | nindent 4 }} +spec: + groups: + - name: prometheus + rules: + - alert: PrometheusBadConfig + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig + summary: Failed Prometheus configuration reload. + expr: '# Without max_over_time, failed scrapes could create false negatives, see + + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + + max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull + summary: Prometheus alert notification queue predicted to run full in less than 30m. 
+ expr: "# Without min_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m], 60 * 30)\n>\n min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers + annotations: + description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers + summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. + expr: "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers + summary: Prometheus is not connected to any Alertmanagers. + expr: '# Without max_over_time, failed scrapes could create false negatives, see + + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + + max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing + summary: Prometheus has issues reloading blocks from disk. + expr: 'increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + + ' + for: 4h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing + summary: Prometheus has issues compacting blocks. 
+ expr: 'increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + + ' + for: 4h + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples + summary: Prometheus is not ingesting samples. + expr: "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) <= 0\nand\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-k8s\",namespace=\"monitoring\"}) > 0\n )\n)\n" + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusDuplicateTimestamps + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps + summary: Prometheus is dropping samples with duplicate timestamps. + expr: 'rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusOutOfOrderTimestamps + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps + summary: Prometheus drops samples with out-of-order timestamps. + expr: 'rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + + ' + for: 10m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusRemoteStorageFailures + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures + summary: Prometheus fails to send samples to remote storage. 
+ expr: "(\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n/\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n +\n (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n )\n)\n* 100\n> 1\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: PrometheusRemoteWriteBehind + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind + summary: Prometheus remote write is behind. + expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n- ignoring(remote_name, url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n> 120\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: PrometheusRemoteWriteDesiredShards + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value {{`}}`}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards + summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. + expr: "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n>\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusRuleFailures + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures + summary: Prometheus is failing rule evaluations. + expr: 'increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: PrometheusMissingRuleEvaluations + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations + summary: Prometheus is missing rule evaluations due to slow rule group evaluation. + expr: 'increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusTargetLimitHit + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. + expr: 'increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. + expr: 'increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + + ' + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: warning + - alert: PrometheusTargetSyncFailure + annotations: + description: '{{`{{`}} printf "%.0f" $value {{`}}`}} targets in Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} have failed to sync because invalid configuration was supplied.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure + summary: Prometheus has failed to sync targets. 
+ expr: 'increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0 + + ' + for: 5m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n)\n* 100\n> 3\n" + for: 15m + labels: + {{- if .Values.additionalRuleLabels }} + {{- toYaml .Values.additionalRuleLabels | nindent 8 }} + {{- end }} + severity: critical + diff --git a/charts/kubezero-metrics/update.sh b/charts/kubezero-metrics/update.sh index a136b64b..f3bc613e 100755 --- a/charts/kubezero-metrics/update.sh +++ b/charts/kubezero-metrics/update.sh @@ -9,20 +9,20 @@ helm pull prometheus-community/kube-prometheus-stack --untar --untardir charts - rm -rf charts/prometheus-pushgateway helm pull prometheus-community/prometheus-pushgateway --untar --untardir charts --version $PG_VER -# The grpc alerts could be re-enabled with etcd 3.5 -# https://github.com/etcd-io/etcd/pull/12196 patch -p0 -i zdt.patch --no-backup-if-mismatch patch -p0 -i zdt-pushgateway.patch --no-backup-if-mismatch -# Create ZDT dashboard configmap -cd dashboards +# Create ZDT dashboard, alerts etc configmaps +cd configs ./build.sh ../sync_grafana_dashboards.py metrics-dashboards.yaml ../templates/grafana-dashboards-metrics.yaml ../sync_grafana_dashboards.py k8s-dashboards.yaml ../templates/grafana-dashboards-k8s.yaml ../sync_grafana_dashboards.py zdt-dashboards.yaml ../templates/grafana-dashboards-zdt.yaml + +../sync_prometheus_rules.py k8s-rules.yaml ../templates/rules cd - -# Delete not used upstream dashboards -rm -rf charts/kube-prometheus-stack/templates/grafana/dashboards-1.14 +# Delete not used upstream dashboards or rules +rm -rf charts/kube-prometheus-stack/templates/grafana/dashboards-1.14 charts/kube-prometheus-stack/templates/prometheus/rules-1.14 diff --git a/charts/kubezero-metrics/values.yaml b/charts/kubezero-metrics/values.yaml index 04272a20..9e07d608 100644 --- a/charts/kubezero-metrics/values.yaml +++ b/charts/kubezero-metrics/values.yaml @@ -1,16 +1,14 @@ +#additionalRuleLabels: +# clusterName: myTestCluster +# awsRegion: eu-central-1 + kube-prometheus-stack: global: rbac: pspEnabled: false defaultRules: - create: true - # Disable kubeapi error budget rules for now as they are buggy - rules: - kubeApiserverSlos: false - #additionalRuleLabels: - # clusterName: myTestCluster - # awsRegion: eu-central-1 + create: false coreDns: enabled: true diff --git a/charts/kubezero-metrics/zdt.patch b/charts/kubezero-metrics/zdt.patch index ef018ca0..68c06715 100644 --- a/charts/kubezero-metrics/zdt.patch +++ b/charts/kubezero-metrics/zdt.patch @@ -18,62 +18,3 @@ diff -tubr charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl charts {{- end}} - name: {{ .Chart.Name }} {{- if .Values.image.sha }} 
-diff -tubr charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/etcd.yaml ---- charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml 2021-06-26 19:54:57.000000000 +0200 -+++ charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/etcd.yaml 2021-07-01 13:31:20.750609396 +0200 -@@ -54,34 +54,6 @@ - {{- if .Values.defaultRules.additionalRuleLabels }} - {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} - {{- end }} -- - alert: etcdHighNumberOfFailedGRPCRequests -- annotations: -- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' -- expr: |- -- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) -- / -- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) -- > 1 -- for: 10m -- labels: -- severity: warning --{{- if .Values.defaultRules.additionalRuleLabels }} --{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} --{{- end }} -- - alert: etcdHighNumberOfFailedGRPCRequests -- annotations: -- message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' -- expr: |- -- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) -- / -- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) -- > 5 -- for: 5m -- labels: -- severity: critical --{{- if .Values.defaultRules.additionalRuleLabels }} --{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} --{{- end }} - - alert: etcdGRPCRequestsSlow - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' -diff -tubr charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/node-exporter.yaml ---- charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml 2021-06-26 19:54:57.000000000 +0200 -+++ charts/kube-prometheus-stack.zdt/templates/prometheus/rules-1.14/node-exporter.yaml 2021-07-01 13:31:20.753942729 +0200 -@@ -31,7 +31,7 @@ - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: |- - ( -- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 -+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 25 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and -@@ -50,7 +50,7 @@ - summary: Filesystem is predicted to run out of space within the next 4 hours. 
- expr: |- - ( -- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 -+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and