{ "apiVersion": "monitoring.coreos.com/v1", "kind": "PrometheusRule", "metadata": { "labels": { "app.kubernetes.io/component": "exporter", "app.kubernetes.io/name": "kube-prometheus", "app.kubernetes.io/part-of": "kube-prometheus", "prometheus": "k8s", "role": "alert-rules" }, "name": "kube-prometheus-rules", "namespace": "monitoring" }, "spec": { "groups": [ { "name": "general.rules", "rules": [ { "alert": "TargetDown", "annotations": { "description": "{{ printf \"%.4g\" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.", "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/targetdown", "summary": "One or more targets are unreachable." }, "expr": "100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10", "for": "10m", "labels": { "severity": "warning" } }, { "alert": "Watchdog", "annotations": { "description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.\n", "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/watchdog", "summary": "An alert that should always be firing to certify that Alertmanager is working properly." }, "expr": "vector(1)", "labels": { "severity": "none" } }, { "alert": "InfoInhibitor", "annotations": { "description": "This is an alert that is used to inhibit info alerts.\nBy themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with\nother alerts.\nThis alert fires whenever there's a severity=\"info\" alert, and stops firing when another alert with a\nseverity of 'warning' or 'critical' starts firing on the same namespace.\nThis alert should be routed to a null receiver and configured to inhibit alerts with severity=\"info\".\n", "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor", "summary": "Info-level alert inhibition." 
}, "expr": "ALERTS{severity = \"info\"} == 1 unless on(namespace) ALERTS{alertname != \"InfoInhibitor\", severity =~ \"warning|critical\", alertstate=\"firing\"} == 1", "labels": { "severity": "none" } } ] }, { "name": "node-network", "rules": [ { "alert": "NodeNetworkInterfaceFlapping", "annotations": { "description": "Network interface \"{{ $labels.device }}\" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}", "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping", "summary": "Network interface is often changing its status" }, "expr": "changes(node_network_up{job=\"node-exporter\",device!~\"veth.+\"}[2m]) > 2\n", "for": "2m", "labels": { "severity": "warning" } } ] }, { "name": "kube-prometheus-node-recording.rules", "rules": [ { "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[3m])) BY (instance)", "record": "instance:node_cpu:rate:sum" }, { "expr": "sum(rate(node_network_receive_bytes_total[3m])) BY (instance)", "record": "instance:node_network_receive_bytes:rate:sum" }, { "expr": "sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)", "record": "instance:node_network_transmit_bytes:rate:sum" }, { "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)", "record": "instance:node_cpu:ratio" }, { "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m]))", "record": "cluster:node_cpu:sum_rate5m" }, { "expr": "cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))", "record": "cluster:node_cpu:ratio" } ] }, { "name": "kube-prometheus-general.rules", "rules": [ { "expr": "count without(instance, pod, node) (up == 1)", "record": "count:up1" }, { "expr": "count without(instance, pod, node) (up == 0)", "record": "count:up0" } ] } ] } }