From a49b139e54f9f4d55eb0c5dae5e5c211ca912686 Mon Sep 17 00:00:00 2001 From: Stefan Reimer Date: Wed, 14 Sep 2022 19:08:14 +0200 Subject: [PATCH] feat: latest 1.23 cluster upgrade config --- Dockerfile | 7 +- admin/README.md | 12 ++ admin/kubezero.sh | 70 +++-------- admin/libhelm.sh | 50 ++++++++ admin/migrate_argo.py | 82 ++++++++----- admin/upgrade_cluster.sh | 23 ++-- admin/v1.23/pre-upgrade.sh | 72 +++++++++++ .../templates/ClusterConfiguration.yaml | 2 +- charts/kubezero-addons/Chart.yaml | 9 +- charts/kubezero-addons/README.md.gotmpl | 5 + .../device-plugins/aws-neuron-ds.yaml | 2 - charts/kubezero-addons/values.yaml | 27 ++++ charts/kubezero-ci/Chart.yaml | 4 +- charts/kubezero-ci/values.yaml | 8 +- charts/kubezero-istio/Chart.yaml | 2 +- charts/kubezero-istio/values.yaml | 2 +- charts/kubezero-network/Chart.yaml | 4 +- charts/kubezero-network/values.yaml | 2 - charts/kubezero/Chart.yaml | 2 +- charts/kubezero/templates/addons.yaml | 5 + charts/kubezero/templates/network.yaml | 6 + charts/kubezero/values.yaml | 6 +- docs/v1.23.md | 116 ++++++++++++++++++ scripts/exec_each_node.sh | 2 +- 24 files changed, 408 insertions(+), 112 deletions(-) create mode 100644 admin/README.md mode change 100755 => 100644 admin/libhelm.sh create mode 100644 admin/v1.23/pre-upgrade.sh create mode 100644 docs/v1.23.md diff --git a/Dockerfile b/Dockerfile index fcb87f9..a859a65 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,9 +25,12 @@ RUN cd /etc/apk/keys && \ restic@testing \ helm@testing -RUN helm repo add kubezero https://cdn.zero-downtime.net/charts +RUN helm repo add kubezero https://cdn.zero-downtime.net/charts && \ + mkdir -p /var/lib/kubezero + +ADD admin/kubezero.sh admin/libhelm.sh admin/migrate_argo.py /usr/bin +ADD admin/libhelm.sh admin/v${KUBE_VERSION}/* /var/lib/kubezero -ADD admin/kubezero.sh admin/libhelm.sh /usr/bin ADD charts/kubeadm /charts/kubeadm ADD charts/kubezero /charts/kubezero diff --git a/admin/README.md b/admin/README.md new file mode 100644 index 0000000..389f57c --- /dev/null +++ b/admin/README.md @@ -0,0 +1,12 @@ +# Cluster upgrade flow + +## During 1.23 upgrade +- create new kubezero-values CM if not exists yet, by merging parts of the legacy /etc/kubernetes/kubeadm-values.yaml values with potentially existing values from kubezero ArgoCD app values + + +# General flow + +- No ArgoCD -> user kubezero-values CM +- ArgoCD -> update kubezero-values CM with current values from ArgoCD app values + +- Apply any upgrades / migrations diff --git a/admin/kubezero.sh b/admin/kubezero.sh index 3cc1258..c5e0b83 100755 --- a/admin/kubezero.sh +++ b/admin/kubezero.sh @@ -6,7 +6,7 @@ if [ -n "$DEBUG" ]; then fi # include helm lib -. libhelm.sh +. 
/var/lib/kubezero/libhelm.sh # Export vars to ease use in debug_shell etc export WORKDIR=/tmp/kubezero @@ -70,11 +70,11 @@ parse_kubezero() { [ -f ${HOSTFS}/etc/kubernetes/kubezero.yaml ] && cp ${HOSTFS}/etc/kubernetes/kubezero.yaml ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml fi - export CLUSTERNAME=$(yq eval '.global.clusterName' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml) - export HIGHAVAILABLE=$(yq eval '.global.highAvailable // "false"' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml) + export CLUSTERNAME=$(yq eval '.global.clusterName // .clusterName' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml) + export HIGHAVAILABLE=$(yq eval '.global.highAvailable // .highAvailable // "false"' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml) export ETCD_NODENAME=$(yq eval '.etcd.nodeName' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml) export NODENAME=$(yq eval '.nodeName' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml) - export PROVIDER_ID=$(yq eval '.providerID' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml) + export PROVIDER_ID=$(yq eval '.providerID // ""' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml) export AWS_IAM_AUTH=$(yq eval '.api.awsIamAuth.enabled // "false"' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml) # From here on bail out, allows debug_shell even in error cases @@ -120,33 +120,9 @@ post_kubeadm() { } -cluster_upgrade() { - ### PRE 1.23 specific - ##################### - - # Migrate addons and network values from local kubeadm-values.yaml on controllers into CM - # - remove secrets from addons - # - enable cilium - - if [[ $PROVIDER_ID =~ ^aws ]]; then - REGION=$(echo $PROVIDER_ID | sed -e 's,aws:///,,' -e 's,/.*,,' -e 's/\w$//') - fi - - kubectl get cm -n kube-system kubezero-values || \ - kubectl create configmap -n kube-system kubezero-values \ - --from-literal values.yaml="$(yq e 'del .addons.clusterBackup.repository | del .addons.clusterBackup.password | \ - .addons.clusterBackup.image.tag =strenv(KUBE_VERSION) | \ - .network.cilium.enabled = true | .network.multus.defaultNetworks = ["cilium"] | \ - .network.cilium.cluster.name = strenv(CLUSTERNAME) | \ - .global.clusterName = strenv(CLUSTERNAME) | \ - .global.highAvailable = strenv(HIGHAVAILABLE) | \ - .global.aws.region = strenv(REGION)' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)" - - # Create kubeadm-values CM if not available - kubectl get cm -n kube-system kubeadm-values || \ - kubectl create configmap -n kube-system kubeadm-values - - ##################### +upgrade_cluster() { + # pre upgrade hook + [ -f /var/lib/kubezero/pre-upgrade.sh ] && . /var/lib/kubezero/pre-upgrade.sh render_kubeadm @@ -162,10 +138,8 @@ cluster_upgrade() { cp ${HOSTFS}/etc/kubernetes/admin.conf ${HOSTFS}/root/.kube/config fi - ### POST 1.23 specific - ##################### - - ###################### + # post upgrade hook + [ -f /var/lib/kubezero/post-upgrade.sh ] && . /var/lib/kubezero/post-upgrade.sh # Cleanup after kubeadm on the host rm -rf ${HOSTFS}/etc/kubernetes/tmp @@ -181,7 +155,7 @@ cluster_upgrade() { } -node_upgrade() { +upgrade_node() { echo "Starting node upgrade ..." echo "All done." 
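The pre/post hook sourcing introduced in `upgrade_cluster()` above means each release can ship optional steps under `/var/lib/kubezero/`. A minimal sketch of what such a hook could look like — the file name follows the convention used here, but the cleanup target is only an illustrative example, not part of this patch:

```bash
# Hypothetical /var/lib/kubezero/post-upgrade.sh - sourced by upgrade_cluster(),
# so kubectl access and exported vars like CLUSTERNAME are already available.

# Example only: remove a resource made obsolete by this release
kubectl delete daemonset kube-multus-ds -n kube-system --ignore-not-found=true

echo "post-upgrade hook for cluster $CLUSTERNAME done"
```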
@@ -194,7 +168,7 @@ control_plane_node() { render_kubeadm # Ensure clean slate if bootstrap, restore PKI otherwise - if [[ "$CMD" =~ "^(bootstrap)$" ]]; then + if [[ "$CMD" =~ ^(bootstrap)$ ]]; then rm -rf ${HOSTFS}/var/lib/etcd/member else @@ -211,7 +185,7 @@ control_plane_node() { cp ${WORKDIR}/admin.conf ${HOSTFS}/root/.kube/config # Only restore etcd data during "restore" and none exists already - if [[ "$CMD" =~ "^(restore)$" ]]; then + if [[ "$CMD" =~ ^(restore)$ ]]; then if [ ! -d ${HOSTFS}/var/lib/etcd/member ]; then etcdctl snapshot restore ${HOSTFS}/etc/kubernetes/etcd_snapshot \ --name $ETCD_NODENAME \ @@ -238,7 +212,7 @@ control_plane_node() { _kubeadm init phase preflight _kubeadm init phase kubeconfig all - if [[ "$CMD" =~ "^(join)$" ]]; then + if [[ "$CMD" =~ ^(join)$ ]]; then # Delete any former self in case forseti did not delete yet kubectl delete node ${NODENAME} --wait=true || true # Wait for all pods to be deleted otherwise we end up with stale pods eg. kube-proxy and all goes to .... @@ -297,7 +271,7 @@ control_plane_node() { retry 0 5 30 kubectl cluster-info --request-timeout 3 >/dev/null # Update providerID as underlying VM changed during restore - if [[ "$CMD" =~ "^(restore)$" ]]; then + if [[ "$CMD" =~ ^(restore)$ ]]; then if [ -n "$PROVIDER_ID" ]; then etcdhelper \ -cacert ${HOSTFS}/etc/kubernetes/pki/etcd/ca.crt \ @@ -308,7 +282,7 @@ control_plane_node() { fi fi - if [[ "$CMD" =~ "^(bootstrap|restore)$" ]]; then + if [[ "$CMD" =~ ^(bootstrap|restore)$ ]]; then _kubeadm init phase upload-config all _kubeadm init phase upload-certs --skip-certificate-key-print @@ -319,7 +293,7 @@ control_plane_node() { _kubeadm init phase mark-control-plane _kubeadm init phase kubelet-finalize all - if [[ "$CMD" =~ "^(bootstrap|restore)$" ]]; then + if [[ "$CMD" =~ ^(bootstrap|restore)$ ]]; then _kubeadm init phase addon all fi @@ -343,14 +317,10 @@ control_plane_node() { apply_module() { MODULES=$1 - kubectl get configmap -n kube-system kubezero-values -o yaml | yq '.data."values.yaml"' > $WORKDIR/_values.yaml + get_kubezero_values # Always use embedded kubezero chart - helm template $CHARTS/kubezero -f $WORKDIR/_values.yaml --version ~$KUBE_VERSION --devel --output-dir $WORKDIR - - # Resolve all the all enabled modules - - [ -z "$MODULES" ] && MODULES="$(ls ${WORKDIR}/kubezero/templates | sed -e 's/.yaml//g')" + helm template $CHARTS/kubezero -f $WORKDIR/kubezero-values.yaml --version ~$KUBE_VERSION --devel --output-dir $WORKDIR # CRDs first for t in $MODULES; do @@ -415,8 +385,8 @@ parse_kubezero # Execute tasks for t in $@; do case "$t" in - cluster_upgrade) cluster_upgrade;; - node_upgrade) node_upgrade;; + upgrade_cluster) upgrade_cluster;; + upgrade_node) upgrade_node;; bootstrap) control_plane_node bootstrap;; join) control_plane_node join;; restore) control_plane_node restore;; diff --git a/admin/libhelm.sh b/admin/libhelm.sh old mode 100755 new mode 100644 index b572e85..ae169c0 --- a/admin/libhelm.sh +++ b/admin/libhelm.sh @@ -20,6 +20,53 @@ function chart_location() { } +function argo_used() { + kubectl get application kubezero -n argocd && rc=$? || rc=$? 
+ return $rc +} + + +# get kubezero-values from ArgoCD if available or use in-cluster CM without Argo +function get_kubezero_values() { + argo_used && \ + { kubectl get application kubezero -n argocd -o yaml | yq .spec.source.helm.values > ${WORKDIR}/kubezero-values.yaml; } || \ + { kubectl get configmap -n kube-system kubezero-values -o yaml | yq '.data."values.yaml"' > ${WORKDIR}/kubezero-values.yaml ;} +} + + +function disable_argo() { + cat > _argoapp_patch.yaml < $WORKDIR/values.yaml + echo "using values for $module: " + cat $WORKDIR/values.yaml + if [ $action == "crds" ]; then # Allow custom CRD handling declare -F ${module}-crds && ${module}-crds || _crds diff --git a/admin/migrate_argo.py b/admin/migrate_argo.py index ee98add..083a8d5 100755 --- a/admin/migrate_argo.py +++ b/admin/migrate_argo.py @@ -2,13 +2,44 @@ import sys import argparse import io -import ruamel.yaml +import yaml + +DEFAULT_VERSION = "1.23.10-2" -yaml = ruamel.yaml.YAML() -yaml.preserve_quotes = True -yaml.explicit_start = True -yaml.indent(mapping=2, sequence=4, offset=2) +def migrate(values): + """Actual changes here""" + + # migrate ClusterName to clusterName + if "ClusterName" in values: + values["clusterName"] = values["ClusterName"] + values.pop("ClusterName") + + # Create new clusterwide cloudprovider data if possible + try: + if values["cert-manager"]["clusterIssuer"]["solvers"][0]["dns01"]["route53"]["regions"]: + if "global" not in values: + values["global"] = {} + if "aws" not in values["global"]: + values["global"]["aws"] = {} + + values["global"]["aws"]["region"] = values["cert-manager"]["clusterIssuer"]["solvers"][0]["dns01"]["route53"]["region"] + except KeyError: + pass + + return values + + +def str_presenter(dumper, data): + if len(data.splitlines()) > 1: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + +yaml.add_representer(str, str_presenter) + +# to use with safe_dump: +yaml.representer.SafeRepresenter.add_representer(str, str_presenter) def rec_sort(d): @@ -31,14 +62,17 @@ def rec_sort(d): if isinstance(d, list): for idx, elem in enumerate(d): d[idx] = rec_sort(elem) + return d -parser = argparse.ArgumentParser(description="Migrate ArgoCD Kubezero values to new cluster config") +parser = argparse.ArgumentParser( + description="Migrate ArgoCD Kubezero values to new cluster config" +) parser.add_argument( "--version", dest="version", - default="1.23.10", + default=DEFAULT_VERSION, action="store", required=False, help="Update KubeZero version", @@ -46,37 +80,25 @@ parser.add_argument( args = parser.parse_args() -application = yaml.load(sys.stdin) +application = yaml.safe_load(sys.stdin) # Set version from cmd line if args.version: application["spec"]["source"]["targetRevision"] = args.version # Extract Helm values -values = yaml.load(application["spec"]["source"]["helm"]["values"]) - -### Do your thing - -# migrate ClusterName to clusterName -if "ClusterName" in values: - values["clusterName"] = values["ClusterName"] - values.pop("ClusterName") - -# Create new clusterwide cloudprovider data if possible -try: - if values["cert-manager"]["clusterIssuer"]["solvers"][0]["dns01"]["route53"]["regions"]: - if "aws" not in values: - values["aws"] = {} - values["aws"]["region"] = values["cert-manager"]["clusterIssuer"]["solvers"][0]["dns01"]["route53"]["region"] -except KeyError: - pass - -### End +values = yaml.safe_load(application["spec"]["source"]["helm"]["values"]) # Merge new 
values buffer = io.StringIO() -yaml.dump(rec_sort(values), buffer) +yaml.safe_dump( + rec_sort(migrate(values)), + buffer, + default_flow_style=False, + indent=2, + sort_keys=False, +) application["spec"]["source"]["helm"]["values"] = buffer.getvalue() -# Dump final yaml -yaml.dump(application, sys.stdout) +# Output new Application resource +yaml.dump(application, sys.stdout, default_flow_style=False) diff --git a/admin/upgrade_cluster.sh b/admin/upgrade_cluster.sh index 67faa8e..5ed60f7 100755 --- a/admin/upgrade_cluster.sh +++ b/admin/upgrade_cluster.sh @@ -1,6 +1,9 @@ #!/bin/bash -e -VERSION="v1.23.10-1" +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +. $SCRIPT_DIR/libhelm.sh + +VERSION="v1.23.10-2" [ -n "$DEBUG" ] && set -x @@ -120,19 +123,21 @@ EOF kubectl delete pod kubezero-upgrade-${VERSION//.} -n kube-system } -all_nodes_upgrade "mount --make-shared /host/sys/fs/cgroup; mount --make-shared /host/sys;" +argo_used && disable_argo -control_plane_upgrade cluster_upgrade +#all_nodes_upgrade "mount --make-shared /host/sys/fs/cgroup; mount --make-shared /host/sys;" -echo "Adjust kubezero-values CM !!" -read +control_plane_upgrade upgrade_cluster + +#echo "Adjust kubezero-values CM !!" +#read #kubectl delete ds kube-multus-ds -n kube-system control_plane_upgrade "apply_network, apply_addons" -exit 0 -kubectl rollout restart daemonset/calico-node -n kube-system -kubectl rollout restart daemonset/cilium -n kube-system +#kubectl rollout restart daemonset/calico-node -n kube-system +#kubectl rollout restart daemonset/cilium -n kube-system +#kubectl rollout restart daemonset/kube-multus-ds -n kube-system -kubectl rollout restart daemonset/kube-multus-ds -n kube-system +argo_used && enable_argo diff --git a/admin/v1.23/pre-upgrade.sh b/admin/v1.23/pre-upgrade.sh new file mode 100644 index 0000000..97c7ba9 --- /dev/null +++ b/admin/v1.23/pre-upgrade.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# Migrate addons and network values from local kubeadm-values.yaml on controllers into CM +# - remove secrets from addons +# - enable cilium + +# Create kubeadm-values CM if not available +kubectl get cm -n kube-system kubeadm-values || \ +kubectl create configmap -n kube-system kubeadm-values + +kubectl get cm -n kube-system kubezero-values || \ +kubectl create configmap -n kube-system kubezero-values + +# tweak local kubeadm for upgrade later on +yq eval -i '.global.clusterName = strenv(CLUSTERNAME) | + .global.highAvailable = env(HIGHAVAILABLE)' \ + ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml + +# extract addons +yq e '.addons | + del .clusterBackup.repository | + del .clusterBackup.password | + .clusterBackup.image.tag = strenv(KUBE_VERSION) | + {"addons": .}' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml > $WORKDIR/addons-values.yaml + +# extract network +yq e '.network | + .cilium.enabled = true | + .multus.defaultNetworks = ["cilium"] | + .cilium.cluster.name = strenv(CLUSTERNAME) | + {"network": .}' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml > $WORKDIR/network-values.yaml + +# get current argo cd values +kubectl get application kubezero -n argocd -o yaml | yq .spec.source.helm.values > ${WORKDIR}/argo-values.yaml + +# merge all into new CM +yq ea '. as $item ireduce ({}; . 
* $item ) | + .global.clusterName = strenv(CLUSTERNAME) | + .global.highAvailable = env(HIGHAVAILABLE)' $WORKDIR/addons-values.yaml ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-values.yaml + + +# Update kubezero-values CM +kubectl get cm -n kube-system kubezero-values -o=yaml | \ + yq e '.data."values.yaml" |= load_str("/tmp/kubezero/kubezero-values.yaml")' | \ + kubectl replace -f - + + +kubezero_chart_version=$(yq .version /charts/kubezero/Chart.yaml) + +# update argo app, create new from scratch as Argo is really picky being patched +# autosync DISABLED !!! + +cat > $WORKDIR/kube-argo.yaml <= 1.20.0" diff --git a/charts/kubezero-addons/README.md.gotmpl b/charts/kubezero-addons/README.md.gotmpl index f7ff463..2eed5f4 100644 --- a/charts/kubezero-addons/README.md.gotmpl +++ b/charts/kubezero-addons/README.md.gotmpl @@ -20,4 +20,9 @@ ## AWS Neuron Device plugin for [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/) - [Inf1 instances](https://aws.amazon.com/ec2/instance-types/inf1/) +## Nvidia + +## Cluster AutoScaler +- https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md + {{ template "chart.valuesSection" . }} diff --git a/charts/kubezero-addons/templates/device-plugins/aws-neuron-ds.yaml b/charts/kubezero-addons/templates/device-plugins/aws-neuron-ds.yaml index ddb90a4..ae092e9 100644 --- a/charts/kubezero-addons/templates/device-plugins/aws-neuron-ds.yaml +++ b/charts/kubezero-addons/templates/device-plugins/aws-neuron-ds.yaml @@ -17,8 +17,6 @@ spec: spec: serviceAccount: neuron-device-plugin tolerations: - - key: CriticalAddonsOnly - operator: Exists - key: aws.amazon.com/neuron operator: Exists effect: NoSchedule diff --git a/charts/kubezero-addons/values.yaml b/charts/kubezero-addons/values.yaml index 92dab88..a8a6d1c 100644 --- a/charts/kubezero-addons/values.yaml +++ b/charts/kubezero-addons/values.yaml @@ -83,6 +83,33 @@ awsNeuron: name: public.ecr.aws/neuron/neuron-device-plugin tag: 1.9.3.0 +nvidia-device-plugin: + enabled: false + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: kubezero-workergroup + effect: NoSchedule + operator: Exists + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "node.kubernetes.io/instance-type" + operator: In + values: + - g5.xlarge + - g5.2xlarge + - g5.4xlarge + - g5.8xlarge + - g5.12xlarge + - g5.16xlarge + - g5.24xlarge + - g5.48xlarge + cluster-autoscaler: enabled: false diff --git a/charts/kubezero-ci/Chart.yaml b/charts/kubezero-ci/Chart.yaml index 7adebfb..e309a0c 100644 --- a/charts/kubezero-ci/Chart.yaml +++ b/charts/kubezero-ci/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: kubezero-ci description: KubeZero umbrella chart for all things CI type: application -version: 0.5.12 +version: 0.5.13 home: https://kubezero.com icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png keywords: @@ -26,7 +26,7 @@ dependencies: repository: https://dl.gitea.io/charts/ condition: gitea.enabled - name: jenkins - version: 4.1.16 + version: 4.2.1 repository: https://charts.jenkins.io condition: jenkins.enabled - name: trivy diff --git a/charts/kubezero-ci/values.yaml b/charts/kubezero-ci/values.yaml index 8a56261..91dff8d 100644 --- a/charts/kubezero-ci/values.yaml +++ b/charts/kubezero-ci/values.yaml @@ -114,15 +114,15 @@ jenkins: numToKeepStr: "10" installPlugins: - - kubernetes:3670.v6ca_059233222 + - kubernetes:3704.va_08f0206b_95e - 
workflow-aggregator:581.v0c46fa_697ffd - - git:4.11.4 + - git:4.11.5 - configuration-as-code:1512.vb_79d418d5fc8 - antisamy-markup-formatter:2.7 - prometheus:2.0.11 - htmlpublisher:1.30 - - build-discarder:60.v1747b0eb632a - - dark-theme:185.v276b_5a_8966a_e + - build-discarder:139.v05696a_7fe240 + - dark-theme:245.vb_a_2b_b_010ea_96 - kubernetes-credentials-provider:1.196.va_55f5e31e3c2 serviceAccountAgent: diff --git a/charts/kubezero-istio/Chart.yaml b/charts/kubezero-istio/Chart.yaml index f78c86f..2901a6f 100644 --- a/charts/kubezero-istio/Chart.yaml +++ b/charts/kubezero-istio/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: kubezero-istio description: KubeZero Umbrella Chart for Istio type: application -version: 0.8.2 +version: 0.8.3 home: https://kubezero.com icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png keywords: diff --git a/charts/kubezero-istio/values.yaml b/charts/kubezero-istio/values.yaml index 20e93f6..c61ec4a 100644 --- a/charts/kubezero-istio/values.yaml +++ b/charts/kubezero-istio/values.yaml @@ -1,6 +1,6 @@ global: # hub: docker.io/istio - tag: 1.14.3-distroless + tag: 1.14.4-distroless logAsJson: true diff --git a/charts/kubezero-network/Chart.yaml b/charts/kubezero-network/Chart.yaml index 0694542..68262fe 100644 --- a/charts/kubezero-network/Chart.yaml +++ b/charts/kubezero-network/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: kubezero-network description: KubeZero umbrella chart for all things network type: application -version: 0.3.2 +version: 0.3.3 home: https://kubezero.com icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png keywords: @@ -23,7 +23,7 @@ dependencies: repository: https://helm.cilium.io/ condition: cilium.enabled - name: metallb - version: 0.13.4 + version: 0.13.5 repository: https://metallb.github.io/metallb condition: metallb.enabled - name: calico diff --git a/charts/kubezero-network/values.yaml b/charts/kubezero-network/values.yaml index e4fc552..8ecf439 100644 --- a/charts/kubezero-network/values.yaml +++ b/charts/kubezero-network/values.yaml @@ -1,7 +1,5 @@ metallb: enabled: false - psp: - create: false controller: tolerations: diff --git a/charts/kubezero/Chart.yaml b/charts/kubezero/Chart.yaml index b55c061..7f7bb2e 100644 --- a/charts/kubezero/Chart.yaml +++ b/charts/kubezero/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: kubezero description: KubeZero - Root App of Apps chart type: application -version: 1.23.10-1 +version: 1.23.10-2 home: https://kubezero.com icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png keywords: diff --git a/charts/kubezero/templates/addons.yaml b/charts/kubezero/templates/addons.yaml index bf7bd92..00a665b 100644 --- a/charts/kubezero/templates/addons.yaml +++ b/charts/kubezero/templates/addons.yaml @@ -28,6 +28,11 @@ awsNeuron: {{- toYaml . | nindent 2 }} {{- end }} +{{- with index .Values "addons" "nvidia-device-plugin" }} +nvidia-device-plugin: + {{- toYaml . | nindent 2 }} +{{- end }} + {{- with index .Values "addons" "external-dns" }} external-dns: {{- toYaml . | nindent 2 }} diff --git a/charts/kubezero/templates/network.yaml b/charts/kubezero/templates/network.yaml index 6a444ed..21316c1 100644 --- a/charts/kubezero/templates/network.yaml +++ b/charts/kubezero/templates/network.yaml @@ -17,6 +17,12 @@ cilium: {{- define "network-argo" }} + # Metallb + ignoreDifferences: + - group: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + jsonPointers: + - /spec/conversion/webhook/clientConfig/caBundle {{- end }} {{ include "kubezero-app.app" . 
}}
diff --git a/charts/kubezero/values.yaml b/charts/kubezero/values.yaml
index 4e3c97c..4a13dce 100644
--- a/charts/kubezero/values.yaml
+++ b/charts/kubezero/values.yaml
@@ -8,12 +8,12 @@ HighAvailableControlplane: false
 
 addons:
   enabled: true
-  targetRevision: 0.6.1
+  targetRevision: 0.6.2
 
 network:
   enabled: true
   retain: true
-  targetRevision: 0.3.2
+  targetRevision: 0.3.3
 
 cert-manager:
   enabled: false
@@ -31,7 +31,7 @@ storage:
 istio:
   enabled: false
   namespace: istio-system
-  targetRevision: 0.8.2
+  targetRevision: 0.8.3
 
 istio-ingress:
   enabled: false
diff --git a/docs/v1.23.md b/docs/v1.23.md
new file mode 100644
index 0000000..2fb2bef
--- /dev/null
+++ b/docs/v1.23.md
@@ -0,0 +1,116 @@
+# KubeZero 1.23
+
+## What's new - Major themes
+
+- updated Inf1 Neuron drivers incl. node auto-taints
+- support for Nvidia g5 instances incl. the whole toolchain up to device drivers, auto node taints
+- ExtendedResourceToleration AdmissionController enabled to auto-tolerate Inf1 and Nvidia pods
+- Cluster-Autoscaler
+
+
+
+### Alpine - Custom AMIs
+Starting with 1.22, all KubeZero nodes boot using custom AMIs. These AMIs will be provided and shared by Zero Downtime for all customers. As always, all sources incl. the build pipeline are freely available [here](https://git.zero-downtime.net/ZeroDownTime/alpine-zdt-images).
+
+This eliminates *ALL* dependencies at boot time other than container registries. Gone are the days when Ubuntu, SuSE or GitHub decided to ruin your morning coffee.
+
+KubeZero migrates from Ubuntu 20.04 LTS to [Alpine v3.15](https://www.alpinelinux.org/releases/) as its base OS.
+#### Highlights:
+- minimal attack surface by removing all unnecessary bloat,
+like all things SystemD, Ubuntu's snap, etc.
+- reduced root file system size from 8GB to 2GB
+- minimal memory consumption of about 12MB fully booted
+
+  *Minimal* fully booted instance incl. SSH and Monit:
+
+  | | Ubuntu | Alpine |
+  |-|--------|--------|
+  | Memory used | 60MB | 12MB |
+  | RootFS used | 1.1GB | 330MB |
+  | RootFS encrypted | no | yes |
+  | Kernel | 5.11 | 5.15 |
+  | Init | Systemd | OpenRC |
+  | AMI / EBS size | 8GB | 1GB |
+  | Boot time | ~120s | ~45s |
+
+- Encrypted AMIs:
+This closes the last gaps you might have in achieving *full encryption at rest* for every volume within a default KubeZero deployment.
+
+### Etcd
+On AWS a new dedicated GP3 EBS volume gets provisioned per controller and is used as persistent etcd storage. These volumes persist for the lifetime of the cluster and are reused by future controller nodes in each AZ.
+This ensures no data loss during upgrade or restore situations of single-controller clusters. The hourly backup on S3 will still be used as a fallback / disaster recovery option in case the file system gets corrupted, etc.
+
+
+### DNS
+The [external-dns](https://github.com/kubernetes-sigs/external-dns) controller got integrated and is used to provide DNS-based load balancing for the apiserver itself. This allows highly available control planes on AWS as well as on bare metal in combination with various DNS providers.
+
+Further usage of this controller to automate DNS-related configuration, like Ingress etc., is planned for upcoming releases.
+
+### Container runtime
+CRI-O now uses crun rather than runc, which reduces the memory overhead *per pod* from 16M to 4M; details at [crun intro](https://www.redhat.com/sysadmin/introduction-crun).
+
+With 1.22 and the switch to crun, support for [CgroupV2](https://www.kernel.org/doc/Documentation/cgroup-v2.txt) has been enabled.
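A quick way to verify the crun / cgroup v2 setup described above on a migrated node — a sketch; the `crio.conf` path is the upstream default and may differ on the Alpine-based nodes:

```bash
# cgroup v2 is active when the unified hierarchy is mounted (expected output: cgroup2fs)
stat -fc %T /sys/fs/cgroup

# check which OCI runtime cri-o is configured to use (expect crun)
grep default_runtime /etc/crio/crio.conf
```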
+
+### AWS Neuron INF support
+Initial support for [Inf1 instances](https://aws.amazon.com/ec2/instance-types/inf1/), part of [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/).
+
+Workers automatically load the custom kernel module on these instance types and expose the `/dev/neuron*` devices.
+
+## Version upgrades
+- Istio to 1.13.3 using the new Helm [gateway charts](https://istio.io/latest/docs/setup/additional-setup/gateway/)
+- Logging: ECK operator upgraded from 1.6 to 2.1, fluent-bit 1.9.3
+- Metrics: Prometheus and all Grafana charts to latest to match V1.22
+- ArgoCD to V2.2.5
+- AWS EBS/EFS CSI drivers to latest versions
+- cert-manager to V1.8
+- aws-termination-handler to 1.16
+- aws-iam-authenticator to 0.5.7, required for >1.22, which allows using the latest version on the client side again
+
+## Misc
+- new metrics and dashboards for openEBS LVM CSI drivers
+- new node label `node.kubernetes.io/instance-type` for all nodes containing the EC2 instance type
+- kubelet root moved to `/var/lib/containers` to ensure ephemeral storage is allocated from the configurable volume rather than the root fs of the worker
+
+
+# Upgrade
+`(No, really, you MUST read this before you upgrade)`
+
+- Ensure your Kube context points to the correct cluster !
+- Ensure any usage of Kiam has been migrated to OIDC providers, as any remaining Kiam components will be deleted as part of the upgrade
+
+1. Migrate ArgoCD KubeZero config:
+  `cat | ./releases/v1.22/migrate_argo.py` and adjust if needed and replace the original. Do NOT commit yet !
+
+2. Upgrade `logging` and `metrics` modules
+- `kubectl get crd elasticsearches.elasticsearch.k8s.elastic.co && kubectl replace -f https://download.elastic.co/downloads/eck/2.1.0/crds.yaml` CRDs for logging
+- `./bootstrap.sh apply logging ` logging module to support the new OS coming with 1.22
+- `./bootstrap.sh crds metrics ` CRDs for metrics
+- `./bootstrap.sh apply metrics ` to get new exporters in place to support 1.22
+
+3. Trigger the cluster upgrade:
+`./releases/v1.22/upgrade_cluster.sh`
+
+4. Upgrade CFN stacks for the control plane and all worker groups
+Change the Kubernetes version in the controller config from `1.21.9` to `1.22.8`
+
+5. Reboot controller(s) one by one
+Wait each time for the controller to join and all pods to be running.
+Might take a while ...
+
+6. Launch a new set of workers, at least enough to host the new Istio Ingress gateways due to kernel requirements,
+e.g. by doubling `desired` for each worker ASG.
+
+7. Upgrade via bootstrap.sh
+As the changes around Istio are substantial in this release, we need to upgrade some parts step by step to prevent service outages, especially for private-ingress.
+
+- `./bootstrap.sh crds all ` to deploy all new CRDs first
+- `./bootstrap.sh apply cert-manager ` to update cert-manager, required for Istio
+- `./bootstrap.sh apply istio ` to update the Istio control plane
+- `./bootstrap.sh apply istio-private-ingress ` to deploy the new private-ingress gateways first
+- `./bootstrap.sh apply istio-ingress ` to update the public ingress and also remove the 1.21 private-ingress gateways
+
+8. Finalize via ArgoCD
+git add / commit / push `` and watch ArgoCD do its work.
+
+9. Drain old workers
+Drain one by one and reset each ASG to its initial "desired" value.
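For step 9, draining the remaining old workers can be scripted roughly like this — a sketch; the node selection and the timeout are assumptions to adapt to the actual worker labels:

```bash
# Drain old workers one at a time, then scale each ASG back to its initial "desired" value
OLD_NODES=$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' -o name)

for node in $OLD_NODES; do
  kubectl drain "$node" --ignore-daemonsets --delete-emptydir-data --timeout=300s
  # verify workloads have rescheduled and are healthy before moving on to the next node
done
```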
diff --git a/scripts/exec_each_node.sh b/scripts/exec_each_node.sh
index 61d9e7b..509351e 100755
--- a/scripts/exec_each_node.sh
+++ b/scripts/exec_each_node.sh
@@ -4,5 +4,5 @@ NODES=$(kubectl get nodes -o json | jq -rc .items[].status.addresses[0].address)
 
 for n in $NODES; do
   >&2 echo "Node: $n"
-  ssh -q $n "$@"
+  ssh -q alpine@$n "$@"
 done
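With the switch to the `alpine` login user, a typical smoke test across all nodes after the upgrade could look like this (illustrative invocation only):

```bash
# Run a command on every node via SSH as the alpine user,
# e.g. confirm the expected 5.15 kernel from the new Alpine-based AMIs
./scripts/exec_each_node.sh "uname -r"
```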