"groups": - "name": "kube-apiserver-burnrate.rules" "rules": - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[1d])) - ( ( sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d])) "labels": "verb": "read" "record": "apiserver_request:burnrate1d" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[1h])) - ( ( sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h])) "labels": "verb": "read" "record": "apiserver_request:burnrate1h" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[2h])) - ( ( sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h])) "labels": "verb": "read" "record": "apiserver_request:burnrate2h" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[30m])) - ( ( sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m])) "labels": "verb": "read" "record": "apiserver_request:burnrate30m" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[3d])) - ( ( sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d])) "labels": "verb": "read" "record": "apiserver_request:burnrate3d" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[5m])) - ( ( sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m])) "labels": "verb": "read" "record": "apiserver_request:burnrate5m" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[6h])) - ( ( sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h])) "labels": "verb": "read" "record": "apiserver_request:burnrate6h" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) ) + sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) "labels": "verb": "write" "record": "apiserver_request:burnrate1d" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) ) + sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) "labels": "verb": "write" "record": "apiserver_request:burnrate1h" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) ) + sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) "labels": "verb": "write" "record": "apiserver_request:burnrate2h" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) ) + sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) "labels": "verb": "write" "record": "apiserver_request:burnrate30m" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) ) + sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) "labels": "verb": "write" "record": "apiserver_request:burnrate3d" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) ) + sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) "labels": "verb": "write" "record": "apiserver_request:burnrate5m" - "expr": | ( ( # too slow sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) ) + sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) ) / sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) "labels": "verb": "write" "record": "apiserver_request:burnrate6h" - "name": "kube-apiserver-histogram.rules" "rules": - "expr": | histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET"}[5m]))) > 0 "labels": "quantile": "0.99" "verb": "read" "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 "labels": "quantile": "0.99" "verb": "write" "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) "labels": "quantile": "0.99" "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) "labels": "quantile": "0.9" "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) "labels": "quantile": "0.5" "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" - "interval": "3m" "name": "kube-apiserver-availability.rules" "rules": - "expr": | avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 "record": "code_verb:apiserver_request_total:increase30d" - "expr": | sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) "labels": "verb": "read" "record": "code:apiserver_request_total:increase30d" - "expr": | sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) "labels": "verb": "write" "record": "code:apiserver_request_total:increase30d" - "expr": | sum by (cluster, verb, scope) (increase(apiserver_request_duration_seconds_count[1h])) "record": "cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h" - "expr": | sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[30d]) * 24 * 30) "record": "cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d" - "expr": | sum by (cluster, verb, scope, le) (increase(apiserver_request_duration_seconds_bucket[1h])) "record": "cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h" - "expr": | sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[30d]) * 24 * 30) "record": "cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d" - "expr": | 1 - ( ( # write too slow sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) ) + ( # read too slow sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - ( ( sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) or vector(0) ) + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="40"}) ) ) + # errors sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) ) / sum by (cluster) (code:apiserver_request_total:increase30d) "labels": "verb": "all" "record": "apiserver_request:availability30d" - "expr": | 1 - ( sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - ( # too slow ( sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) or vector(0) ) + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="40"}) ) + # errors sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) ) / sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) "labels": "verb": "read" "record": "apiserver_request:availability30d" - "expr": | 1 - ( ( # too slow sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) ) + # errors sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) ) / sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) "labels": "verb": "write" "record": "apiserver_request:availability30d" - "expr": | sum by (cluster,code,resource) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m])) "labels": "verb": "read" "record": "code_resource:apiserver_request_total:rate5m" - "expr": | sum by (cluster,code,resource) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) "labels": "verb": "write" "record": "code_resource:apiserver_request_total:rate5m" - "expr": | sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) "record": "code_verb:apiserver_request_total:increase1h" - "expr": | sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) "record": "code_verb:apiserver_request_total:increase1h" - "expr": | sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) "record": "code_verb:apiserver_request_total:increase1h" - "expr": | sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) "record": "code_verb:apiserver_request_total:increase1h" - "name": "k8s.rules" "rules": - "expr": | sum by (cluster, namespace, pod, container) ( irate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) "record": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate" - "expr": | container_memory_working_set_bytes{job="cadvisor", image!=""} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}) ) "record": "node_namespace_pod_container:container_memory_working_set_bytes" - "expr": | container_memory_rss{job="cadvisor", image!=""} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}) ) "record": "node_namespace_pod_container:container_memory_rss" - "expr": | container_memory_cache{job="cadvisor", image!=""} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}) ) "record": "node_namespace_pod_container:container_memory_cache" - "expr": | container_memory_swap{job="cadvisor", image!=""} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}) ) "record": "node_namespace_pod_container:container_memory_swap" - "expr": | kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) "record": "cluster:namespace:pod_memory:active:kube_pod_container_resource_requests" - "expr": | sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) "record": "namespace_memory:kube_pod_container_resource_requests:sum" - "expr": | kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) "record": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests" - "expr": | sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) "record": "namespace_cpu:kube_pod_container_resource_requests:sum" - "expr": | kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) "record": "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits" - "expr": | sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) "record": "namespace_memory:kube_pod_container_resource_limits:sum" - "expr": | kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) "record": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits" - "expr": | sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) "record": "namespace_cpu:kube_pod_container_resource_limits:sum" - "expr": | max by (cluster, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( 1, max by (replicaset, namespace, owner_name) ( kube_replicaset_owner{job="kube-state-metrics"} ) ), "workload", "$1", "owner_name", "(.*)" ) ) "labels": "workload_type": "deployment" "record": "namespace_workload_pod:kube_pod_owner:relabel" - "expr": | max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ) ) "labels": "workload_type": "daemonset" "record": "namespace_workload_pod:kube_pod_owner:relabel" - "expr": | max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" ) ) "labels": "workload_type": "statefulset" "record": "namespace_workload_pod:kube_pod_owner:relabel" - "name": "kube-scheduler.rules" "rules": - "expr": | histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) "labels": "quantile": "0.99" "record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) "labels": "quantile": "0.99" "record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) "labels": "quantile": "0.99" "record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) "labels": "quantile": "0.9" "record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) "labels": "quantile": "0.9" "record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) "labels": "quantile": "0.9" "record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) "labels": "quantile": "0.5" "record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) "labels": "quantile": "0.5" "record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) "labels": "quantile": "0.5" "record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" - "name": "node.rules" "rules": - "expr": | topk by(namespace, pod) (1, max by (node, namespace, pod) ( label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") )) "record": "node_namespace_pod:kube_pod_info:" - "expr": | count by (cluster, node) (sum by (node, cpu) ( node_cpu_seconds_total{job="node-exporter"} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:) )) "record": "node:node_num_cpu:sum" - "expr": | sum( node_memory_MemAvailable_bytes{job="node-exporter"} or ( node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"} ) ) by (cluster) "record": ":node_memory_MemAvailable_bytes:sum" - "name": "kubelet.rules" "rules": - "expr": | histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) "labels": "quantile": "0.99" "record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) "labels": "quantile": "0.9" "record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile" - "expr": | histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) "labels": "quantile": "0.5" "record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile"