{ "editable": false, "panels": [ { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", "gridPos": { "h": 2, "w": 24, "x": 0, "y": 0 }, "id": 1, "options": { "content": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only." }, "pluginVersion": "v10.4.0", "title": "Notice", "type": "text" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How many percent of requests (both read and write) in 30 days have been answered successfully and fast enough?", "fieldConfig": { "defaults": { "decimals": 3, "unit": "percentunit" } }, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 2 }, "id": 2, "interval": "1m", "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"}" } ], "title": "Availability (30d) > 99.000%", "type": "stat" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How much error budget is left looking at our 99.000% availability guarantees?", "fieldConfig": { "defaults": { "custom": { "fillOpacity": 100 }, "decimals": 3, "unit": "percentunit" } }, "gridPos": { "h": 7, "w": 16, "x": 8, "y": 2 }, "id": 3, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "100 * (apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"} - 0.990000)", "legendFormat": "errorbudget" } ], "title": "ErrorBudget (30d) > 99.000%", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How many percent of read requests (LIST,GET) in 30 days have 
been answered successfully and fast enough?", "fieldConfig": { "defaults": { "decimals": 3, "unit": "percentunit" } }, "gridPos": { "h": 7, "w": 6, "x": 0, "y": 9 }, "id": 4, "interval": "1m", "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "apiserver_request:availability30d{verb=\"read\", cluster=\"$cluster\"}" } ], "title": "Read Availability (30d)", "type": "stat" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How many read requests (LIST,GET) per second do the apiservers get by code?", "fieldConfig": { "defaults": { "custom": { "fillOpacity": 100, "stacking": { "mode": "normal" } }, "unit": "reqps" }, "overrides": [ { "matcher": { "id": "byRegexp", "options": "/2../i" }, "properties": [ { "id": "color", "value": "#56A64B" } ] }, { "matcher": { "id": "byRegexp", "options": "/3../i" }, "properties": [ { "id": "color", "value": "#F2CC0C" } ] }, { "matcher": { "id": "byRegexp", "options": "/4../i" }, "properties": [ { "id": "color", "value": "#3274D9" } ] }, { "matcher": { "id": "byRegexp", "options": "/5../i" }, "properties": [ { "id": "color", "value": "#E02F44" } ] } ] }, "gridPos": { "h": 7, "w": 6, "x": 6, "y": 9 }, "id": 5, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", "legendFormat": "{{ code }}" } ], "title": "Read SLI - Requests", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?", "fieldConfig": { "defaults": { "min": 0, "unit": "percentunit" } }, "gridPos": { "h": 7, "w": 6, "x": 12, "y": 9 }, "id": 6, 
"interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", "legendFormat": "{{ resource }}" } ], "title": "Read SLI - Errors", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?", "fieldConfig": { "defaults": { "unit": "s" } }, "gridPos": { "h": 7, "w": 6, "x": 18, "y": 9 }, "id": 7, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile{verb=\"read\", cluster=\"$cluster\"}", "legendFormat": "{{ resource }}" } ], "title": "Read SLI - Duration", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) in 30 days have been answered successfully and fast enough?", "fieldConfig": { "defaults": { "decimals": 3, "unit": "percentunit" } }, "gridPos": { "h": 7, "w": 6, "x": 0, "y": 16 }, "id": 8, "interval": "1m", "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "apiserver_request:availability30d{verb=\"write\", cluster=\"$cluster\"}" } ], "title": "Write Availability (30d)", "type": "stat" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How many write 
requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?", "fieldConfig": { "defaults": { "custom": { "fillOpacity": 100, "stacking": { "mode": "normal" } }, "unit": "reqps" }, "overrides": [ { "matcher": { "id": "byRegexp", "options": "/2../i" }, "properties": [ { "id": "color", "value": "#56A64B" } ] }, { "matcher": { "id": "byRegexp", "options": "/3../i" }, "properties": [ { "id": "color", "value": "#F2CC0C" } ] }, { "matcher": { "id": "byRegexp", "options": "/4../i" }, "properties": [ { "id": "color", "value": "#3274D9" } ] }, { "matcher": { "id": "byRegexp", "options": "/5../i" }, "properties": [ { "id": "color", "value": "#E02F44" } ] } ] }, "gridPos": { "h": 7, "w": 6, "x": 6, "y": 16 }, "id": 9, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", "legendFormat": "{{ code }}" } ], "title": "Write SLI - Requests", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?", "fieldConfig": { "defaults": { "min": 0, "unit": "percentunit" } }, "gridPos": { "h": 7, "w": 6, "x": 12, "y": 16 }, "id": 10, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", "legendFormat": "{{ resource 
}}" } ], "title": "Write SLI - Errors", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?", "fieldConfig": { "defaults": { "unit": "s" } }, "gridPos": { "h": 7, "w": 6, "x": 18, "y": 16 }, "id": 11, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile{verb=\"write\", cluster=\"$cluster\"}", "legendFormat": "{{ resource }}" } ], "title": "Write SLI - Duration", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "fieldConfig": { "defaults": { "min": 0, "unit": "ops" } }, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "id": 12, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": false }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])) by (instance, name)", "legendFormat": "{{instance}} {{name}}" } ], "title": "Work Queue Add Rate", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "fieldConfig": { "defaults": { "min": 0, "unit": "short" } }, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "id": 13, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": false }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(rate(workqueue_depth{job=\"apiserver\", 
instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])) by (instance, name)", "legendFormat": "{{instance}} {{name}}" } ], "title": "Work Queue Depth", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "fieldConfig": { "defaults": { "min": 0, "unit": "s" } }, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 30 }, "id": 14, "interval": "1m", "options": { "legend": { "asTable": true, "calcs": [ "lastNotNull" ], "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])) by (instance, name, le))", "legendFormat": "{{instance}} {{name}}" } ], "title": "Work Queue Latency", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "fieldConfig": { "defaults": { "unit": "bytes" } }, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 37 }, "id": 15, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "process_resident_memory_bytes{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", "legendFormat": "{{instance}}" } ], "title": "Memory", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "fieldConfig": { "defaults": { "min": 0, "unit": "short" } }, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 37 }, "id": 16, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": 
"rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])", "legendFormat": "{{instance}}" } ], "title": "CPU usage", "type": "timeseries" }, { "datasource": { "type": "datasource", "uid": "-- Mixed --" }, "fieldConfig": { "defaults": { "unit": "short" } }, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 37 }, "id": 17, "interval": "1m", "options": { "legend": { "asTable": true, "placement": "right", "showLegend": true }, "tooltip": { "mode": "single" } }, "pluginVersion": "v10.4.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "go_goroutines{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", "legendFormat": "{{instance}}" } ], "title": "Goroutines", "type": "timeseries" } ], "refresh": "10s", "schemaVersion": 36, "tags": [ "kubernetes" ], "templating": { "list": [ { "current": { "selected": true, "text": "default", "value": "default" }, "hide": 0, "label": "Data source", "name": "datasource", "query": "prometheus", "regex": "", "type": "datasource" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "hide": 2, "label": "cluster", "name": "cluster", "query": "label_values(up{job=\"apiserver\"}, cluster)", "refresh": 2, "sort": 1, "type": "query" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "hide": 0, "includeAll": true, "name": "instance", "query": "label_values(up{job=\"apiserver\", cluster=\"$cluster\"}, instance)", "refresh": 2, "sort": 1, "type": "query" } ] }, "time": { "from": "now-1h", "to": "now" }, "timezone": "UTC", "title": "API server", "uid": "09ec8aa1e996d6ffcd6817bbaff4db1b" }