From a748448c3e02e771d36d01c1d4ac07a107bf0a1e Mon Sep 17 00:00:00 2001
From: Stefan Reimer
Date: Wed, 26 Apr 2023 10:26:30 +0000
Subject: [PATCH] Fix storage tests, add taint handling to CA, first docs draft

---
 Makefile                            |  3 +-
 charts/kubezero-addons/values.yaml  |  2 +
 charts/kubezero-network/values.yaml |  2 +-
 charts/kubezero-storage/Chart.yaml  |  2 +-
 charts/kubezero/values.yaml         |  2 +-
 docs/v1.25.md                       | 59 +++++++++++++++++++++++++++++
 6 files changed, 66 insertions(+), 4 deletions(-)
 create mode 100644 docs/v1.25.md

diff --git a/Makefile b/Makefile
index dabe8ed..eb8ab7e 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,8 @@ IMAGE := kubezero-admin
 REGION := us-east-1
 
 # Also tag as Kubernetes major version
-EXTRA_TAGS = $(shell echo $(TAG) | awk -F '.' '{ print $$1 "." $$2 }')
+MY_TAG = $(shell git describe --tags --match v*.*.* 2>/dev/null || git rev-parse --short HEAD 2>/dev/null)
+EXTRA_TAGS = $(shell echo $(MY_TAG) | awk -F '.' '{ print $$1 "." $$2 }')
 
 include .ci/podman.mk

diff --git a/charts/kubezero-addons/values.yaml b/charts/kubezero-addons/values.yaml
index 1e9646e..44fae66 100644
--- a/charts/kubezero-addons/values.yaml
+++ b/charts/kubezero-addons/values.yaml
@@ -209,6 +209,8 @@ cluster-autoscaler:
   extraArgs:
     scan-interval: 30s
     skip-nodes-with-local-storage: false
+    balance-similar-node-groups: true
+    ignore-taint: "node.cilium.io/agent-not-ready"
 
 #securityContext:
 #  runAsNonRoot: true

diff --git a/charts/kubezero-network/values.yaml b/charts/kubezero-network/values.yaml
index f142c54..3ede868 100644
--- a/charts/kubezero-network/values.yaml
+++ b/charts/kubezero-network/values.yaml
@@ -30,7 +30,7 @@ cilium:
   containerRuntime:
     integration: crio
 
-  # Until we figured out AppArmore on Alpine and Gentoo
+  # remove with 1.26
   securityContext:
     privileged: true
 

diff --git a/charts/kubezero-storage/Chart.yaml b/charts/kubezero-storage/Chart.yaml
index 63a99fd..96f19c2 100644
--- a/charts/kubezero-storage/Chart.yaml
+++ b/charts/kubezero-storage/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-storage
 description: KubeZero umbrella chart for all things storage incl. AWS EBS/EFS, openEBS-lvm, gemini
 type: application
-version: 0.8.1
+version: 0.8.2
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:

diff --git a/charts/kubezero/values.yaml b/charts/kubezero/values.yaml
index 471a55d..187dd6b 100644
--- a/charts/kubezero/values.yaml
+++ b/charts/kubezero/values.yaml
@@ -40,7 +40,7 @@ cert-manager:
 
 storage:
   enabled: false
-  targetRevision: 0.8.1
+  targetRevision: 0.8.2
   aws-ebs-csi-driver:
     enabled: false
   aws-efs-csi-driver:

diff --git a/docs/v1.25.md b/docs/v1.25.md
new file mode 100644
index 0000000..9c71d52
--- /dev/null
+++ b/docs/v1.25.md
@@ -0,0 +1,59 @@
+# KubeZero 1.25
+
+## TODO
+
+## What's new - Major themes
+- AppArmor is now available and enabled on all controller and worker nodes
+- All KubeZero AMIs and containers updated to the latest Alpine 3.17
+- Improved worker initialization ensures Cilium is running before any other pod
+
+## Version upgrades
+- cilium 1.13
+- nvidia-device-plugin
+- aws-neuron-driver
+- aws-node-termination-handler
+- aws-ebs-csi-driver
+- aws-efs-csi-driver
+- istio 1.17
+- argocd 2.6.7
+- fluent-bit 2.0.10
+- fluentd 2.6
+- all things Prometheus
+
+### FeatureGates
+- CustomCPUCFSQuotaPeriod
+- NodeOutOfServiceVolumeDetach
+
+# Upgrade
+`(No, really, you MUST read this before you upgrade)`
+
+Ensure your Kube context points to the correct cluster!
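+For example, with plain kubectl (`<your-cluster-context>` below is a placeholder, not a KubeZero name):
+
+```bash
+# Print the context kubectl currently talks to
+kubectl config current-context
+
+# List all available contexts and switch if the wrong one is active
+kubectl config get-contexts
+kubectl config use-context <your-cluster-context>
+```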
+1. Review the CFN config for controllers and workers; no mandatory changes are required for this release.
+
+2. Upgrade the CFN stacks for the control plane *ONLY*!
+   Updating the worker CFN stacks would trigger rolling updates right away!
+
+3. Trigger the cluster upgrade:
+   `./admin/upgrade_cluster.sh `
+
+4. Review the kubezero config and, if all looks good, commit the ArgoApp resource for KubeZero via regular git:
+   git add / commit / push ``
+   *DO NOT re-enable ArgoCD until all pre-v1.24 workers have been replaced!!!*
+
+5. Reboot the controller(s) one by one.
+   Wait each time for the controller to rejoin and for all pods to be running.
+   This might take a while ...
+
+6. Upgrade the CFN stacks for the workers.
+   This in turn triggers automated worker updates: pods are evicted and new workers are launched in a rolling fashion.
+   Grab a coffee and keep an eye on the cluster to be safe ...
+   Depending on your cluster size, it might take a while to roll over all workers! A quick verification sketch is included at the end of this document.
+
+7. Re-enable ArgoCD by hitting `<return>` on the still-waiting upgrade script.
+
+8. Head over to ArgoCD and sync the KubeZero main module as soon as possible to reduce potential back and forth in case ArgoCD holds legacy state.
+
+
+## Known issues
+So far so good.
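+
+## Post-upgrade verification
+
+Not an official upgrade step, but a quick sanity check that the worker roll-over from step 6 completed; a minimal sketch using plain kubectl (the v1.25.x version string is an example):
+
+```bash
+# Every node should report the new kubelet version, e.g. v1.25.x
+kubectl get nodes -o wide
+
+# Print any pod not in Running/Completed state (the header line always prints);
+# `|| true` keeps grep's non-zero exit on a clean cluster from looking like a failure
+kubectl get pods -A | grep -Ev 'Running|Completed' || true
+```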