{{- /*
PrometheusRule with alerting rules for cert-manager.

Rendered only when the value cert-manager.prometheus.servicemonitor.enabled
is truthy. Alert annotations that must reach Prometheus as template
expressions are wrapped in backtick-quoted Helm string literals so Helm emits
the double curly braces verbatim instead of evaluating them.
*/ -}}
{{- if index .Values "cert-manager" "prometheus" "servicemonitor" "enabled" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kubezero-lib.fullname" $) "prometheus-rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "kubezero-lib.labels" . | nindent 4 }}
spec:
  groups:
    # Availability of cert-manager itself.
    - name: cert-manager
      rules:
        # Fires when no "up" series with job="cert-manager" has existed for 10m.
        - alert: CertManagerAbsent
          annotations:
            description: New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent
            summary: Cert Manager has disappeared from Prometheus service discovery.
          expr: absent(up{job="cert-manager"})
          for: 10m
          labels:
            severity: critical
    # Health of the individual certificates and of ACME issuance.
    # NOTE(review): the dashboard_url values below point at grafana.example.com,
    # which looks like a placeholder host - confirm or make it configurable.
    - name: certificates
      rules:
        # Fires when a certificate has been less than 21 days from its
        # expiration timestamp for 1h.
        - alert: CertManagerCertExpirySoon
          annotations:
            dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
            description: The domain that this cert covers will be unavailable after {{`{{`}} $value | humanizeDuration {{`}}`}}. Clients using endpoints that this cert protects will start to fail in {{`{{`}} $value | humanizeDuration {{`}}`}}.
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon
            summary: The cert `{{`{{`}} $labels.name {{`}}`}}` is {{`{{`}} $value | humanizeDuration {{`}}`}} from expiry, it should have renewed over a week ago.
          expr: "avg by (exported_namespace, namespace, name) (\n certmanager_certificate_expiration_timestamp_seconds - time()\n) < (21 * 24 * 3600) # 21 days in seconds\n"
          for: 1h
          labels:
            severity: warning
        # Fires when a certificate has reported a ready-status condition other
        # than "True" for 10m.
        - alert: CertManagerCertNotReady
          annotations:
            dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
            description: This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead.
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready
            summary: The cert `{{`{{`}} $labels.name {{`}}`}}` is not ready to serve traffic.
          expr: "max by (name, exported_namespace, namespace, condition) (\n certmanager_certificate_ready_status{condition!=\"True\"} == 1\n)\n"
          for: 10m
          labels:
            severity: critical
        # Fires when the per-host 5m rate of ACME client requests answered
        # with HTTP status 429 has been above zero for 5m.
        - alert: CertManagerHittingRateLimits
          annotations:
            dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
            description: Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week.
            runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits
            summary: Cert manager hitting LetsEncrypt rate limits.
          expr: "sum by (host) (\n rate(certmanager_http_acme_client_request_count{status=\"429\"}[5m])\n) > 0\n"
          for: 5m
          labels:
            severity: critical
{{- end }}