From a748448c3e02e771d36d01c1d4ac07a107bf0a1e Mon Sep 17 00:00:00 2001
From: Stefan Reimer
Date: Wed, 26 Apr 2023 10:26:30 +0000
Subject: [PATCH] Fix storage tests, add taint handling to CA, first docs draft

---
 Makefile                            |  3 +-
 charts/kubezero-addons/values.yaml  |  2 +
 charts/kubezero-network/values.yaml |  2 +-
 charts/kubezero-storage/Chart.yaml  |  2 +-
 charts/kubezero/values.yaml         |  2 +-
 docs/v1.25.md                       | 59 +++++++++++++++++++++++++++++
 6 files changed, 66 insertions(+), 4 deletions(-)
 create mode 100644 docs/v1.25.md

diff --git a/Makefile b/Makefile
index dabe8ed..eb8ab7e 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,8 @@ IMAGE := kubezero-admin
 REGION := us-east-1
 
 # Also tag as Kubernetes major version
-EXTRA_TAGS = $(shell echo $(TAG) | awk -F '.' '{ print $$1 "." $$2 }')
+MY_TAG = $(shell git describe --tags --match v*.*.* 2>/dev/null || git rev-parse --short HEAD 2>/dev/null)
+EXTRA_TAGS = $(shell echo $(MY_TAG) | awk -F '.' '{ print $$1 "." $$2 }')
 
 include .ci/podman.mk

diff --git a/charts/kubezero-addons/values.yaml b/charts/kubezero-addons/values.yaml
index 1e9646e..44fae66 100644
--- a/charts/kubezero-addons/values.yaml
+++ b/charts/kubezero-addons/values.yaml
@@ -209,6 +209,8 @@ cluster-autoscaler:
   extraArgs:
     scan-interval: 30s
     skip-nodes-with-local-storage: false
+    balance-similar-node-groups: true
+    ignore-taint: "node.cilium.io/agent-not-ready"
 
 #securityContext:
 #  runAsNonRoot: true

diff --git a/charts/kubezero-network/values.yaml b/charts/kubezero-network/values.yaml
index f142c54..3ede868 100644
--- a/charts/kubezero-network/values.yaml
+++ b/charts/kubezero-network/values.yaml
@@ -30,7 +30,7 @@ cilium:
   containerRuntime:
     integration: crio
 
-  # Until we figured out AppArmore on Alpine and Gentoo
+  # remove with 1.26
   securityContext:
     privileged: true
 

diff --git a/charts/kubezero-storage/Chart.yaml b/charts/kubezero-storage/Chart.yaml
index 63a99fd..96f19c2 100644
--- a/charts/kubezero-storage/Chart.yaml
+++ b/charts/kubezero-storage/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-storage
 description: KubeZero umbrella chart for all things storage incl. AWS EBS/EFS, openEBS-lvm, gemini
 type: application
-version: 0.8.1
+version: 0.8.2
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:

diff --git a/charts/kubezero/values.yaml b/charts/kubezero/values.yaml
index 471a55d..187dd6b 100644
--- a/charts/kubezero/values.yaml
+++ b/charts/kubezero/values.yaml
@@ -40,7 +40,7 @@ cert-manager:
 
 storage:
   enabled: false
-  targetRevision: 0.8.1
+  targetRevision: 0.8.2
   aws-ebs-csi-driver:
     enabled: false
   aws-efs-csi-driver:

diff --git a/docs/v1.25.md b/docs/v1.25.md
new file mode 100644
index 0000000..9c71d52
--- /dev/null
+++ b/docs/v1.25.md
@@ -0,0 +1,59 @@
+# KubeZero 1.25
+
+## TODO
+
+## What's new - Major themes
+- AppArmor is now available and enabled on all controller and worker nodes
+- All KubeZero AMIs and containers updated to the latest Alpine 3.17
+- Improved worker initialization ensures Cilium is running before any other pod
+
+## Version upgrades
+- cilium 1.13
+- nvidia-device-plugin
+- aws-neuron-driver
+- aws-node-termination-handler
+- aws-ebs-csi-driver
+- aws-efs-csi-driver
+- istio 1.17
+- argocd 2.6.7
+- fluent-bit 2.0.10
+- fluentd 2.6
+- all things Prometheus
+
+### FeatureGates
+- CustomCPUCFSQuotaPeriod
+- NodeOutOfServiceVolumeDetach
+
+# Upgrade
+`(No, really, you MUST read this before you upgrade)`
+
+Ensure your Kube context points to the correct cluster!
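+For example, with plain kubectl (`<your-cluster-context>` below is a placeholder, not a KubeZero name):
+
+```bash
+# Print the context kubectl currently talks to
+kubectl config current-context
+
+# List all available contexts and switch if the wrong one is active
+kubectl config get-contexts
+kubectl config use-context <your-cluster-context>
+```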
+1. Review the CFN config for controllers and workers; no mandatory changes are required for this release.
+
+2. Upgrade the CFN stacks for the control plane *ONLY*!
+   Updating the worker CFN stacks would trigger rolling updates right away!
+
+3. Trigger the cluster upgrade:
+   `./admin/upgrade_cluster.sh `
+
+4. Review the kubezero config and, if all looks good, commit the ArgoApp resource for KubeZero via regular git:
+   git add / commit / push ``
+   *DO NOT re-enable ArgoCD until all pre-v1.24 workers have been replaced!!!*
+
+5. Reboot the controller(s) one by one.
+   Wait each time for the controller to rejoin and for all pods to be running.
+   This might take a while ...
+
+6. Upgrade the CFN stacks for the workers.
+   This in turn triggers automated worker updates: pods are evicted and new workers are launched in a rolling fashion.
+   Grab a coffee and keep an eye on the cluster to be safe ...
+   Depending on your cluster size, it might take a while to roll over all workers! A quick verification sketch is included at the end of this document.
+
+7. Re-enable ArgoCD by hitting `<return>` on the still-waiting upgrade script.
+
+8. Head over to ArgoCD and sync the KubeZero main module as soon as possible to reduce potential back and forth in case ArgoCD holds legacy state.
+
+
+## Known issues
+So far so good.
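+
+## Post-upgrade verification
+
+Not an official upgrade step, but a quick sanity check that the worker roll-over from step 6 completed; a minimal sketch using plain kubectl (the v1.25.x version string is an example):
+
+```bash
+# Every node should report the new kubelet version, e.g. v1.25.x
+kubectl get nodes -o wide
+
+# Print any pod not in Running/Completed state (the header line always prints);
+# `|| true` keeps grep's non-zero exit on a clean cluster from looking like a failure
+kubectl get pods -A | grep -Ev 'Running|Completed' || true
+```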