feat: add Grafana dashboards and Prometheus rules for the openEBS LVM driver

Stefan Reimer 2022-03-08 11:41:47 +01:00
parent 0e5d6b6c31
commit c584600630
28 changed files with 3704 additions and 19 deletions

View File

@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-ci
 description: KubeZero umbrella chart for all things CI
 type: application
-version: 0.4.25
+version: 0.4.26
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:

View File

@@ -17,7 +17,7 @@ gitea:
   enabled: false
   image:
-    tag: 1.15.10
+    tag: 1.16.1
   rootless: true
   securityContext:
@@ -117,10 +117,10 @@ jenkins:
       - kubernetes:1.31.3
       - workflow-aggregator:2.6
       - git:4.10.3
-      - configuration-as-code:1.55.1
+      - configuration-as-code:1346.ve8cfa_3473c94
       - antisamy-markup-formatter:2.7
       - prometheus:2.0.10
-      - htmlpublisher:1.28
+      - htmlpublisher:1.29
       - build-discarder:60.v1747b0eb632a
   serviceAccountAgent:

charts/kubezero-storage/.gitignore (vendored, new file)
View File

@@ -0,0 +1,2 @@
jsonnet/vendor/
jsonnet/tmp/

View File

@@ -0,0 +1 @@
jsonnet

View File

@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-storage
 description: KubeZero umbrella chart for all things storage incl. AWS EBS/EFS, openEBS-lvm, gemini
 type: application
-version: 0.5.4
+version: 0.5.7
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:
@@ -28,7 +28,7 @@ dependencies:
     condition: gemini.enabled
     # repository: https://charts.fairwinds.com/stable
   - name: aws-ebs-csi-driver
-    version: 2.6.2
+    version: 2.6.3
     condition: aws-ebs-csi-driver.enabled
     # repository: https://kubernetes-sigs.github.io/aws-ebs-csi-driver
   - name: aws-efs-csi-driver

View File

@@ -1,6 +1,6 @@
 # kubezero-storage

-![Version: 0.5.3](https://img.shields.io/badge/Version-0.5.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
+![Version: 0.5.7](https://img.shields.io/badge/Version-0.5.7-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)

 KubeZero umbrella chart for all things storage incl. AWS EBS/EFS, openEBS-lvm, gemini
@@ -18,7 +18,7 @@ Kubernetes: `>= 1.20.0`
 | Repository | Name | Version |
 |------------|------|---------|
-| | aws-ebs-csi-driver | 2.6.2 |
+| | aws-ebs-csi-driver | 2.6.3 |
 | | aws-efs-csi-driver | 2.2.3 |
 | | gemini | 0.0.8 |
 | | lvm-localpv | 0.8.6 |
@@ -73,7 +73,6 @@ Kubernetes: `>= 1.20.0`
 | aws-efs-csi-driver.node.tolerations[0].key | string | `"kubezero-workergroup"` | |
 | aws-efs-csi-driver.node.tolerations[0].operator | string | `"Exists"` | |
 | aws-efs-csi-driver.replicaCount | int | `1` | |
-| aws-efs-csi-driver.storageClasses[0].name | string | `"efs-sc"` | |
 | gemini.enabled | bool | `false` | |
 | gemini.resources.limits.cpu | string | `"400m"` | |
 | gemini.resources.limits.memory | string | `"128Mi"` | |
@@ -90,6 +89,7 @@ Kubernetes: `>= 1.20.0`
 | lvm-localpv.lvmNode.tolerations[0].effect | string | `"NoSchedule"` | |
 | lvm-localpv.lvmNode.tolerations[0].key | string | `"kubezero-workergroup"` | |
 | lvm-localpv.lvmNode.tolerations[0].operator | string | `"Exists"` | |
+| lvm-localpv.prometheus.enabled | bool | `false` | |
 | lvm-localpv.storageClass.default | bool | `false` | |
 | lvm-localpv.storageClass.vgpattern | string | `""` | |
 | snapshotController.enabled | bool | `true` | |
@@ -107,3 +107,4 @@ Kubernetes: `>= 1.20.0`
 - https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment

 ## Resources
+- https://github.com/openebs/monitoring/blob/develop/docs/openebs-mixin-user-guide.md#install-openebs-mixin-in-existing-prometheus-stack

View File

@@ -19,3 +19,4 @@
 - https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment

 ## Resources
+- https://github.com/openebs/monitoring/blob/develop/docs/openebs-mixin-user-guide.md#install-openebs-mixin-in-existing-prometheus-stack

View File

@ -1,5 +1,9 @@
# Helm chart # Helm chart
## v2.6.3
* Bump app/driver to version `v1.5.1`
## v2.6.2 ## v2.6.2
* Update csi-resizer version to v1.1.0 * Update csi-resizer version to v1.1.0

View File

@@ -3,7 +3,7 @@ annotations:
   - kind: added
     description: Custom controller.updateStrategy to set controller deployment strategy.
 apiVersion: v2
-appVersion: 1.5.0
+appVersion: 1.5.1
 description: A Helm chart for AWS EBS CSI Driver
 home: https://github.com/kubernetes-sigs/aws-ebs-csi-driver
 keywords:
@@ -19,4 +19,4 @@ maintainers:
 name: aws-ebs-csi-driver
 sources:
   - https://github.com/kubernetes-sigs/aws-ebs-csi-driver
-version: 2.6.2
+version: 2.6.3

View File

@@ -1,9 +1,9 @@
 apiVersion: v1
-appVersion: 0.1.0
+appVersion: "1.0"
 description: Automated backup and restore of PersistentVolumes using the VolumeSnapshot
   API
 maintainers:
 - email: robertb@fairwinds.com
   name: rbren
 name: gemini
-version: 0.0.8
+version: 1.0.0

View File

@@ -28,7 +28,7 @@ Your cluster must support the [VolumeSnapshot API](https://kubernetes.io/docs/co
 |-----|------|---------|-------------|
 | image.pullPolicy | string | `"Always"` | imagePullPolicy - Highly recommended to leave this as `Always` |
 | image.repository | string | `"quay.io/fairwinds/gemini"` | Repository for the gemini image |
-| image.tag | string | `"0.1"` | The gemini image tag to use |
+| image.tag | string | `nil` | The gemini image tag to use. Defaults to .Chart.appVersion |
 | rbac.create | bool | `true` | If true, create a new ServiceAccount and attach permissions |
 | rbac.serviceAccountName | string | `nil` | |
 | verbosity | int | `5` | How verbose the controller logs should be |

View File

@@ -31,7 +31,7 @@ spec:
         - -v
         - {{ . | quote }}
       {{- end }}
-      image: '{{.Values.image.repository}}:{{.Values.image.tag}}'
+      image: "{{.Values.image.repository}}:{{ .Values.image.tag | default .Chart.AppVersion }}"
       imagePullPolicy: '{{.Values.image.pullPolicy}}'
       name: gemini-controller
       resources:

View File

@@ -3,8 +3,8 @@ image:
   pullPolicy: Always
   # image.repository -- Repository for the gemini image
   repository: quay.io/fairwinds/gemini
-  # image.tag -- The gemini image tag to use
-  tag: "0.1"
+  # image.tag -- The gemini image tag to use. Defaults to .Chart.appVersion
+  tag:
 rbac:
   # rbac.create -- If true, create a new ServiceAccount and attach permissions

View File

@@ -0,0 +1,35 @@
JSONNET_ARGS := -n 2 --max-blank-lines 2 --string-style s --comment-style s
ifneq (,$(shell which jsonnetfmt))
  JSONNET_FMT_CMD := jsonnetfmt
else
  JSONNET_FMT_CMD := jsonnet
  JSONNET_FMT_ARGS := fmt $(JSONNET_ARGS)
endif
JSONNET_FMT := $(JSONNET_FMT_CMD) $(JSONNET_FMT_ARGS)

dashboardsDirPath=dashboards
rulessDirPath=rules

all: fmt generate lint

fmt:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

generate: clean
	jsonnet -J vendor -m rules -e '(import "prometheusRules.libsonnet")'
	jsonnet -J vendor -m dashboards -e '(import "dashboards.libsonnet")'

render: generate
	../../kubezero-metrics/sync_grafana_dashboards.py dashboards.yaml ../templates/lvm/grafana-dashboards.yaml
	../../kubezero-metrics/sync_prometheus_rules.py rules.yaml ../templates/lvm/

lint:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		while read f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
		done

clean:
	rm -rf $(dashboardsDirPath) $(rulessDirPath)
	mkdir -p $(dashboardsDirPath) $(rulessDirPath)
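
The Makefile above drives the whole mixin pipeline: `generate` evaluates the jsonnet entry points against the vendored libraries, and `render` syncs the output into the chart's Helm templates. A minimal sketch of the assumed local workflow, using jsonnet-bundler (`jb`) to populate the gitignored jsonnet/vendor/ tree from the lock file added later in this commit:

    # Run from charts/kubezero-storage; assumes jb and jsonnet are on the PATH.
    cd jsonnet
    jb install    # fetch the pinned kube-prometheus and openebs/monitoring deps into vendor/
    make render   # regenerate rules/ and dashboards/, then sync them into ../templates/lvm/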

View File

@@ -0,0 +1,24 @@
{
  _config+:: {
    dashboards+: {
      cStor: false,
      jiva: false,
      localPV: false,
      lvmLocalPV: true,
      deviceLocalPV: false,
      zfsLocalPV: false,
      ndm: false,
      npd: false,
    },
    alertRules+: {
      cStor: false,
      jiva: false,
      localPV: false,
      lvmLocalPV: true,
      deviceLocalPV: false,
      zfsLocalPV: false,
      ndm: false,
      npd: false,
    },
  },
}

View File

@@ -0,0 +1,10 @@
local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet');

local openebsMixin = addMixin({
  name: 'openebs',
  mixin: (import 'github.com/openebs/monitoring/jsonnet/openebs-mixin/mixin.libsonnet') +
    (import 'config.libsonnet'),
});

{ ['openebs-' + name]: openebsMixin.grafanaDashboards[name] for name in std.objectFields(openebsMixin.grafanaDashboards) }

View File

@@ -0,0 +1,10 @@
configmap: openebs-lvmlocalpv-grafana-dashboard
gzip: true
folder: KubeZero
condition: 'index .Values "lvm-localpv" "prometheus" "enabled"'
dashboards:
  - name: openebs-localpv-pool
    url: file://dashboards/openebs-lvmlocalpv-pool.json
    tags:
      - storage
      - openEBS

File diff suppressed because it is too large

View File

@@ -0,0 +1,24 @@
{
  "version": 1,
  "dependencies": [
    {
      "source": {
        "git": {
          "remote": "https://github.com/prometheus-operator/kube-prometheus.git",
          "subdir": "jsonnet/kube-prometheus"
        }
      },
      "version": "release-0.9"
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/openebs/monitoring.git",
          "subdir": ""
        }
      },
      "version": "develop"
    }
  ],
  "legacyImports": true
}

View File

@@ -0,0 +1,170 @@
{
  "version": 1,
  "dependencies": [
    {
      "source": {
        "git": {
          "remote": "https://github.com/brancz/kubernetes-grafana.git",
          "subdir": "grafana"
        }
      },
      "version": "90f38916f1f8a310a715d18e36f787f84df4ddf5",
      "sum": "0kZ1pnuIirDtbg6F9at5+NQOwKNONIGEPq0eECzvRkI="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/etcd-io/etcd.git",
          "subdir": "contrib/mixin"
        }
      },
      "version": "fa191c64bd3c3eaa6cc167e837d7a73e3b7d7c72",
      "sum": "W/Azptf1PoqjyMwJON96UY69MFugDA4IAYiKURscryc="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/grafana/grafonnet-lib.git",
          "subdir": "grafonnet"
        }
      },
      "version": "6db00c292d3a1c71661fc875f90e0ec7caa538c2",
      "sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/grafana/jsonnet-libs.git",
          "subdir": "grafana-builder"
        }
      },
      "version": "03d32a72a2a0bf0ee00ffc853be5f07ad3bafcbe",
      "sum": "0KkygBQd/AFzUvVzezE4qF/uDYgrwUXVpZfINBti0oc="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git",
          "subdir": ""
        }
      },
      "version": "fb9d8ed4bc4a3d6efac525f72e8a0d2c583a0fe2",
      "sum": "xjKkdp+5fkekCNBUIgZCHTRmVdUEmQNFKslrL2Ho8gs="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git",
          "subdir": "lib/promgrafonnet"
        }
      },
      "version": "177bc8ec789fa049a9585713d232035b159f8c92",
      "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/kubernetes/kube-state-metrics.git",
          "subdir": "jsonnet/kube-state-metrics"
        }
      },
      "version": "f170cc73f11c1580d7f38af746be0f2fa79c6a1e",
      "sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/kubernetes/kube-state-metrics.git",
          "subdir": "jsonnet/kube-state-metrics-mixin"
        }
      },
      "version": "f170cc73f11c1580d7f38af746be0f2fa79c6a1e",
      "sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/openebs/monitoring.git",
          "subdir": ""
        }
      },
      "version": "87b902c85429eedce78033049de9280f39e81372",
      "sum": "TyL/PPFwpRRzAOwYVlvgBsSHS6/+Z6TgRGFe8beJcpE="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/prometheus-operator/kube-prometheus.git",
          "subdir": "jsonnet/kube-prometheus"
        }
      },
      "version": "c8db3c85db6e13b94726b7c5d630a6ebe7038cf9",
      "sum": "7DfVEg2kDzIQPgxTThnjQ4S2ZksrF0hkNLysXP6Wv24="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/prometheus-operator/prometheus-operator.git",
          "subdir": "jsonnet/mixin"
        }
      },
      "version": "83fe36566f4e0894eb5ffcd2638a0f039a17bdeb",
      "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=",
      "name": "prometheus-operator-mixin"
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/prometheus-operator/prometheus-operator.git",
          "subdir": "jsonnet/prometheus-operator"
        }
      },
      "version": "83fe36566f4e0894eb5ffcd2638a0f039a17bdeb",
      "sum": "J1G++A8hrtr3+OZQMmcNeb1w/C30bXqqwpwHL/Xhsd4="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/prometheus/alertmanager.git",
          "subdir": "doc/alertmanager-mixin"
        }
      },
      "version": "b408b522bc653d014e53035e59fa394cc1edd762",
      "sum": "pep+dHzfIjh2SU5pEkwilMCAT/NoL6YYflV4x8cr7vU=",
      "name": "alertmanager"
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/prometheus/node_exporter.git",
          "subdir": "docs/node-mixin"
        }
      },
      "version": "832909dd257eb368cf83363ffcae3ab84cb4bcb1",
      "sum": "MmxGhE2PJ1a52mk2x7vDpMT2at4Jglbud/rK74CB5i0="
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/prometheus/prometheus.git",
          "subdir": "documentation/prometheus-mixin"
        }
      },
      "version": "751ca03faddc9c64089c41d0da370a3a0b477742",
      "sum": "AS8WYFi/z10BZSF6DFkKBscjB32XDMM7iIso7CO/FyI=",
      "name": "prometheus"
    },
    {
      "source": {
        "git": {
          "remote": "https://github.com/thanos-io/thanos.git",
          "subdir": "mixin"
        }
      },
      "version": "ff363498fc95cfe17de894d7237bcf38bdd0bc36",
      "sum": "cajthvLKDjYgYHCKQU2g/pTMRkxcbuJEvTnCyJOihl8=",
      "name": "thanos-mixin"
    }
  ],
  "legacyImports": false
}
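
The lock file pins every transitive jsonnet dependency to an exact commit and content hash, so `make generate` is reproducible. A hypothetical refresh of the pins (rewrites the lock file and the vendor tree):

    cd jsonnet
    jb update    # re-resolve the branches in jsonnetfile.json and update jsonnetfile.lock.json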

View File

@@ -0,0 +1,9 @@
local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet');

local openebsMixin = addMixin({
  name: 'openebs',
  mixin: (import 'github.com/openebs/monitoring/jsonnet/openebs-mixin/mixin.libsonnet') +
    (import 'config.libsonnet'),
});

{ 'openebs-mixin-prometheusRules': openebsMixin.prometheusRules }

View File

@@ -0,0 +1,4 @@
rules:
  - name: prometheus-rules
    url: file://rules/openebs-mixin-prometheusRules
    condition: 'index .Values "lvm-localpv" "prometheus" "enabled"'

View File

@@ -0,0 +1,156 @@
{
  "apiVersion": "monitoring.coreos.com/v1",
  "kind": "PrometheusRule",
  "metadata": {
    "labels": {
      "prometheus": "k8s"
    },
    "name": "openebs",
    "namespace": "monitoring"
  },
  "spec": {
    "groups": [
      {
        "name": "lvm-pool",
        "rules": [
          {
            "alert": "LVMVolumeGroupMissingPhysicalVolume",
            "annotations": {
              "componentType": "volume group",
              "description": "LVM volume group '{{ $labels.name }}' on node '{{ $labels.instance }}' is missing {{ $value }} underlying physical volume(s).",
              "summary": "LVM volume group '{{ $labels.name }}' is missing the underlying physical volume.",
              "vgName": "{{ $labels.name }}"
            },
            "expr": "lvm_vg_missing_pv_count > 0",
            "for": "5m",
            "labels": {
              "severity": "critical"
            }
          },
          {
            "alert": "LVMVolumeGroupCapacityLow",
            "annotations": {
              "componentType": "volume group",
              "description": "LVM volume group '{{ $labels.name }}' on node '{{ $labels.instance }}' has {{ with printf \"lvm_vg_free_size_bytes{instance='%s',name='%s'}\" $labels.instance $labels.name | query }} {{ . | first | value }} {{ end }}bytes of space remaining",
              "summary": "LVM volume group '{{ $labels.name }}' is running low on capacity. Already {{ $value }}% of total capacity is consumed.",
              "vgName": "{{ $labels.name }}"
            },
            "expr": "((lvm_vg_total_size_bytes - lvm_vg_free_size_bytes)/lvm_vg_total_size_bytes)*100 > 90",
            "for": "5m",
            "labels": {
              "severity": "critical"
            }
          },
          {
            "alert": "LVMThinPoolCapacityLow",
            "annotations": {
              "componentType": "logical volume",
              "description": "LVM thin pool '{{ $labels.name }}' on node '{{ $labels.instance }}' has {{ with printf \"lvm_lv_total_size_bytes{instance='%s',name='%s',segtype='%s'}-((lvm_lv_used_percent{instance='%s',name='%s',segtype='%s'}*lvm_lv_total_size_bytes{instance='%s',name='%s',segtype='%s'})/100)\" $labels.instance $labels.name $labels.segtype $labels.instance $labels.name $labels.segtype $labels.instance $labels.name $labels.segtype | query }} {{ . | first | value }} {{ end }}bytes of space remaining",
              "lvName": "{{ $labels.name }}",
              "summary": "LVM thin pool '{{ $labels.name }}' is running low on capacity. Already {{ $value }}% of total capacity is consumed."
            },
            "expr": "lvm_lv_used_percent{segtype=\"thin-pool\"} > 90",
            "for": "5m",
            "labels": {
              "severity": "critical"
            }
          }
        ]
      },
      {
        "name": "persistent-volume-claim",
        "rules": [
          {
            "alert": "StalePersistentVolumeClaim",
            "annotations": {
              "description": "Persistent Volume Claim '{{ $labels.persistentvolumeclaim }}' has no consumer",
              "summary": "Persistent Volume Claim '{{ $labels.persistentvolumeclaim }}' in namespace '{{ $labels.namespace }}' is not consumed by any pod in any namespace"
            },
            "expr": "kube_persistentvolumeclaim_info unless (kube_persistentvolumeclaim_info * on(persistentvolumeclaim) group_left kube_pod_spec_volumes_persistentvolumeclaims_info) == 1",
            "for": "5m",
            "labels": {
              "severity": "info"
            }
          },
          {
            "alert": "PendingPersistentVolumeClaim",
            "annotations": {
              "description": "Persistent Volume Claim '{{ $labels.persistentvolumeclaim }}' has been in pending state for more than 5 minutes",
              "summary": "Persistent Volume Claim '{{ $labels.persistentvolumeclaim }}' pending in namespace '{{ $labels.namespace }}'"
            },
            "expr": "kube_persistentvolumeclaim_status_phase{phase=\"Pending\"} == 1",
            "for": "5m",
            "labels": {
              "severity": "warning"
            }
          },
          {
            "alert": "LostPersistentVolumeClaim",
            "annotations": {
              "description": "Persistent Volume Claim '{{ $labels.persistentvolumeclaim }}' has been in lost state for more than 5 minutes",
              "summary": "Persistent Volume Claim '{{ $labels.persistentvolumeclaim }}' in namespace '{{ $labels.namespace }}' lost its corresponding persistent volume"
            },
            "expr": "kube_persistentvolumeclaim_status_phase{phase=\"Lost\"} == 1",
            "for": "5m",
            "labels": {
              "severity": "warning"
            }
          }
        ]
      },
      {
        "name": "volume-node",
        "rules": [
          {
            "alert": "VolumeNodeFileSystemIsReadOnly",
            "annotations": {
              "description": "Persistent Volume's filesystem on node '{{ $labels.node }}' for persistent volume claim '{{ $labels.persistentvolumeclaim }}' has become read-only",
              "summary": "Volume mount failed for persistent volume claim '{{ $labels.persistentvolumeclaim }}' on node '{{ $labels.node }}' due to read-only file-system"
            },
            "expr": "kubelet_volume_stats_inodes * on(node) group_left(reason) problem_counter{reason=\"FilesystemIsReadOnly\"} > 0",
            "for": "5m",
            "labels": {
              "severity": "critical"
            }
          },
          {
            "alert": "VolumeNodeExt4Error",
            "annotations": {
              "description": "Persistent Volume on node '{{ $labels.node }}' for persistent volume claim '{{ $labels.persistentvolumeclaim }}' encountering ext4 filesystem error",
              "summary": "Node '{{ $labels.node }}' has encountered errors on ext4 file-system on volume having claim '{{ $labels.persistentvolumeclaim }}'"
            },
            "expr": "kubelet_volume_stats_inodes * on(node) group_left(reason) problem_counter{reason=\"Ext4Error\"} > 0",
            "for": "5m",
            "labels": {
              "severity": "critical"
            }
          },
          {
            "alert": "VolumeNodeIOError",
            "annotations": {
              "description": "Persistent Volume on node '{{ $labels.node }}' for persistent volume claim '{{ $labels.persistentvolumeclaim }}' encountering errors w.r.t buffer I/O ",
              "summary": "IO errors encountered on volume having persistent volume claim '{{ $labels.persistentvolumeclaim }}' on node '{{ $labels.node }}'"
            },
            "expr": "kubelet_volume_stats_inodes * on(node) group_left(reason) problem_counter{reason=\"IOError\"} > 0",
            "for": "5m",
            "labels": {
              "severity": "critical"
            }
          },
          {
            "alert": "VolumeNodeExt4Warning",
            "annotations": {
              "description": "Persistent Volume on node '{{ $labels.node }}' receiving ext4 filesystem warning for persistent volume claim '{{ $labels.persistentvolumeclaim }}'",
              "summary": "Node '{{ $labels.node }}' has encountered warning on ext4 file-system on volume having claim '{{ $labels.persistentvolumeclaim }}'"
            },
            "expr": "kubelet_volume_stats_inodes * on(node) group_left(reason) problem_counter{reason=\"Ext4Warning\"} > 0",
            "for": "5m",
            "labels": {
              "severity": "critical"
            }
          }
        ]
      }
    ]
  }
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,104 @@
{{- if index .Values "lvm-localpv" "prometheus" "enabled" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "prometheus-rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "kubezero-lib.labels" . | nindent 4 }}
spec:
  groups:
  - name: lvm-pool
    rules:
    - alert: LVMVolumeGroupMissingPhysicalVolume
      annotations:
        componentType: volume group
        description: LVM volume group '{{`{{`}} $labels.name {{`}}`}}' on node '{{`{{`}} $labels.instance {{`}}`}}' is missing {{`{{`}} $value {{`}}`}} underlying physical volume(s).
        summary: LVM volume group '{{`{{`}} $labels.name {{`}}`}}' is missing the underlying physical volume.
        vgName: '{{`{{`}} $labels.name {{`}}`}}'
      expr: lvm_vg_missing_pv_count > 0
      for: 5m
      labels:
        severity: critical
    - alert: LVMVolumeGroupCapacityLow
      annotations:
        componentType: volume group
        description: LVM volume group '{{`{{`}} $labels.name {{`}}`}}' on node '{{`{{`}} $labels.instance {{`}}`}}' has {{`{{`}} with printf "lvm_vg_free_size_bytes{instance='%s',name='%s'}" $labels.instance $labels.name | query {{`}}`}} {{`{{`}} . | first | value {{`}}`}} {{`{{`}} end {{`}}`}}bytes of space remaining
        summary: LVM volume group '{{`{{`}} $labels.name {{`}}`}}' is running low on capacity. Already {{`{{`}} $value {{`}}`}}% of total capacity is consumed.
        vgName: '{{`{{`}} $labels.name {{`}}`}}'
      expr: ((lvm_vg_total_size_bytes - lvm_vg_free_size_bytes)/lvm_vg_total_size_bytes)*100 > 90
      for: 5m
      labels:
        severity: critical
    - alert: LVMThinPoolCapacityLow
      annotations:
        componentType: logical volume
        description: LVM thin pool '{{`{{`}} $labels.name {{`}}`}}' on node '{{`{{`}} $labels.instance {{`}}`}}' has {{`{{`}} with printf "lvm_lv_total_size_bytes{instance='%s',name='%s',segtype='%s'}-((lvm_lv_used_percent{instance='%s',name='%s',segtype='%s'}*lvm_lv_total_size_bytes{instance='%s',name='%s',segtype='%s'})/100)" $labels.instance $labels.name $labels.segtype $labels.instance $labels.name $labels.segtype $labels.instance $labels.name $labels.segtype | query {{`}}`}} {{`{{`}} . | first | value {{`}}`}} {{`{{`}} end {{`}}`}}bytes of space remaining
        lvName: '{{`{{`}} $labels.name {{`}}`}}'
        summary: LVM thin pool '{{`{{`}} $labels.name {{`}}`}}' is running low on capacity. Already {{`{{`}} $value {{`}}`}}% of total capacity is consumed.
      expr: lvm_lv_used_percent{segtype="thin-pool"} > 90
      for: 5m
      labels:
        severity: critical
  - name: persistent-volume-claim
    rules:
    - alert: StalePersistentVolumeClaim
      annotations:
        description: Persistent Volume Claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' has no consumer
        summary: Persistent Volume Claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' in namespace '{{`{{`}} $labels.namespace {{`}}`}}' is not consumed by any pod in any namespace
      expr: kube_persistentvolumeclaim_info unless (kube_persistentvolumeclaim_info * on(persistentvolumeclaim) group_left kube_pod_spec_volumes_persistentvolumeclaims_info) == 1
      for: 5m
      labels:
        severity: info
    - alert: PendingPersistentVolumeClaim
      annotations:
        description: Persistent Volume Claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' has been in pending state for more than 5 minutes
        summary: Persistent Volume Claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' pending in namespace '{{`{{`}} $labels.namespace {{`}}`}}'
      expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
      for: 5m
      labels:
        severity: warning
    - alert: LostPersistentVolumeClaim
      annotations:
        description: Persistent Volume Claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' has been in lost state for more than 5 minutes
        summary: Persistent Volume Claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' in namespace '{{`{{`}} $labels.namespace {{`}}`}}' lost its corresponding persistent volume
      expr: kube_persistentvolumeclaim_status_phase{phase="Lost"} == 1
      for: 5m
      labels:
        severity: warning
  - name: volume-node
    rules:
    - alert: VolumeNodeFileSystemIsReadOnly
      annotations:
        description: Persistent Volume's filesystem on node '{{`{{`}} $labels.node {{`}}`}}' for persistent volume claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' has become read-only
        summary: Volume mount failed for persistent volume claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' on node '{{`{{`}} $labels.node {{`}}`}}' due to read-only file-system
      expr: kubelet_volume_stats_inodes * on(node) group_left(reason) problem_counter{reason="FilesystemIsReadOnly"} > 0
      for: 5m
      labels:
        severity: critical
    - alert: VolumeNodeExt4Error
      annotations:
        description: Persistent Volume on node '{{`{{`}} $labels.node {{`}}`}}' for persistent volume claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' encountering ext4 filesystem error
        summary: Node '{{`{{`}} $labels.node {{`}}`}}' has encountered errors on ext4 file-system on volume having claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}'
      expr: kubelet_volume_stats_inodes * on(node) group_left(reason) problem_counter{reason="Ext4Error"} > 0
      for: 5m
      labels:
        severity: critical
    - alert: VolumeNodeIOError
      annotations:
        description: 'Persistent Volume on node ''{{`{{`}} $labels.node {{`}}`}}'' for persistent volume claim ''{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}'' encountering errors w.r.t buffer I/O '
        summary: IO errors encountered on volume having persistent volume claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}' on node '{{`{{`}} $labels.node {{`}}`}}'
      expr: kubelet_volume_stats_inodes * on(node) group_left(reason) problem_counter{reason="IOError"} > 0
      for: 5m
      labels:
        severity: critical
    - alert: VolumeNodeExt4Warning
      annotations:
        description: Persistent Volume on node '{{`{{`}} $labels.node {{`}}`}}' receiving ext4 filesystem warning for persistent volume claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}'
        summary: Node '{{`{{`}} $labels.node {{`}}`}}' has encountered warning on ext4 file-system on volume having claim '{{`{{`}} $labels.persistentvolumeclaim {{`}}`}}'
      expr: kubelet_volume_stats_inodes * on(node) group_left(reason) problem_counter{reason="Ext4Warning"} > 0
      for: 5m
      labels:
        severity: critical
{{- end }}

View File

@@ -0,0 +1,18 @@
{{- if index .Values "lvm-localpv" "prometheus" "enabled" }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "openebs-monitoring-lvmlocalpv" | trunc 63 | trimSuffix "-" }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "kubezero-lib.labels" . | nindent 4 }}
spec:
  selector:
    matchLabels:
      name: openebs-lvm-node
  namespaceSelector:
    any: true
  endpoints:
  - port: metrics
    path: /metrics
{{- end }}
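
The ServiceMonitor selects any Service labeled `name: openebs-lvm-node` in any namespace and scrapes its `metrics` port. A hypothetical post-deploy check that the monitoring objects actually rendered (object names depend on the release):

    kubectl get servicemonitors,prometheusrules --all-namespaces | grep -i openebs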

View File

@@ -26,3 +26,7 @@ VERSION=$(yq eval '.dependencies[] | select(.name=="aws-efs-csi-driver") | .vers
 rm -rf charts/aws-efs-csi-driver
 curl -L -s -o - https://github.com/kubernetes-sigs/aws-efs-csi-driver/releases/download/helm-chart-aws-efs-csi-driver-${VERSION}/aws-efs-csi-driver-${VERSION}.tgz | tar xfz - -C charts
 patch -i efs.patch -p0 --no-backup-if-mismatch
+
+# Metrics
+cd jsonnet
+make render

View File

@@ -44,6 +44,8 @@ lvm-localpv:
   analytics:
     enabled: false
+  prometheus:
+    enabled: false

 gemini:
   enabled: false
@@ -170,8 +172,12 @@ aws-efs-csi-driver:
       effect: NoSchedule
       operator: Exists
-  storageClasses:
-  - name: efs-sc
+  #storageClasses:
+  #- name: efs-sc
+  #  parameters:
+  #    provisioningMode: efs-ap
+  #    directoryPerms: "700"
+  #    fileSystemId: <efs-id-from-worker-cfn>
   #PersistentVolumes:
   # - name: example-pv
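
All of the new monitoring objects are gated on the same flag and disabled by default. A hypothetical override enabling them at deploy time (release name and namespace are placeholders):

    helm upgrade --install kubezero-storage charts/kubezero-storage \
      --namespace kube-system \
      --set lvm-localpv.prometheus.enabled=true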