81 lines
4.0 KiB
Plaintext
81 lines
4.0 KiB
Plaintext
{
|
|
"apiVersion": "monitoring.coreos.com/v1",
|
|
"kind": "PrometheusRule",
|
|
"metadata": {
|
|
"labels": {
|
|
"prometheus": "k8s"
|
|
},
|
|
"name": "cert-manager",
|
|
"namespace": "monitoring"
|
|
},
|
|
"spec": {
|
|
"groups": [
|
|
{
|
|
"name": "cert-manager",
|
|
"rules": [
|
|
{
|
|
"alert": "CertManagerAbsent",
|
|
"annotations": {
|
|
"description": "New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.",
|
|
"runbook_url": "https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerabsent",
|
|
"summary": "Cert Manager has disappeared from Prometheus service discovery."
|
|
},
|
|
"expr": "absent(up{job=\"cert-manager\"})",
|
|
"for": "10m",
|
|
"labels": {
|
|
"severity": "critical"
|
|
}
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "certificates",
|
|
"rules": [
|
|
{
|
|
"alert": "CertManagerCertExpirySoon",
|
|
"annotations": {
|
|
"dashboard_url": "https://grafana.example.com/d/TvuRo2iMk/cert-manager",
|
|
"description": "The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}.",
|
|
"runbook_url": "https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertexpirysoon",
|
|
"summary": "The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry, it should have renewed over a week ago."
|
|
},
|
|
"expr": "avg by (exported_namespace, namespace, name) (\n certmanager_certificate_expiration_timestamp_seconds - time()\n) < (21 * 24 * 3600) # 21 days in seconds\n",
|
|
"for": "1h",
|
|
"labels": {
|
|
"severity": "warning"
|
|
}
|
|
},
|
|
{
|
|
"alert": "CertManagerCertNotReady",
|
|
"annotations": {
|
|
"dashboard_url": "https://grafana.example.com/d/TvuRo2iMk/cert-manager",
|
|
"description": "This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead.",
|
|
"runbook_url": "https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertnotready",
|
|
"summary": "The cert `{{ $labels.name }}` is not ready to serve traffic."
|
|
},
|
|
"expr": "max by (name, exported_namespace, namespace, condition) (\n certmanager_certificate_ready_status{condition!=\"True\"} == 1\n)\n",
|
|
"for": "10m",
|
|
"labels": {
|
|
"severity": "critical"
|
|
}
|
|
},
|
|
{
|
|
"alert": "CertManagerHittingRateLimits",
|
|
"annotations": {
|
|
"dashboard_url": "https://grafana.example.com/d/TvuRo2iMk/cert-manager",
|
|
"description": "Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week.",
|
|
"runbook_url": "https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerhittingratelimits",
|
|
"summary": "Cert manager hitting LetsEncrypt rate limits."
|
|
},
|
|
"expr": "sum by (host) (\n rate(certmanager_http_acme_client_request_count{status=\"429\"}[5m])\n) > 0\n",
|
|
"for": "5m",
|
|
"labels": {
|
|
"severity": "critical"
|
|
}
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
}
|