From e9eeb8be8d9b730bef1714c14171341e7e31bce8 Mon Sep 17 00:00:00 2001
From: Stefan Reimer
Date: Mon, 14 Nov 2022 14:14:11 +0100
Subject: [PATCH] feat: add delete_module to admin, various cleanups

---
 admin/kubezero.sh                   | 18 ++++++++++++-
 admin/libhelm.sh                    | 11 ++++----
 admin/upgrade_cluster.sh            |  4 +++
 charts/kubezero-network/README.md   |  9 ++++++-
 charts/kubezero-network/values.yaml | 19 ++++++++++---
 charts/kubezero/README.md           |  2 +-
 charts/kubezero/values.yaml         |  2 +-
 docs/v1.24.md                       | 42 +++++++++++++----------------
 8 files changed, 70 insertions(+), 37 deletions(-)

diff --git a/admin/kubezero.sh b/admin/kubezero.sh
index d7845ea..0892388 100755
--- a/admin/kubezero.sh
+++ b/admin/kubezero.sh
@@ -323,6 +323,21 @@ apply_module() {
 }
 
 
+delete_module() {
+  MODULES=$1
+
+  get_kubezero_values
+
+  # Always use embedded kubezero chart
+  helm template $CHARTS/kubezero -f $WORKDIR/kubezero-values.yaml --version ~$KUBE_VERSION --devel --output-dir $WORKDIR
+
+  for t in $MODULES; do
+    _helm delete $t
+  done
+
+  echo "Deleted KubeZero modules: $MODULES. Potential CRDs must be removed manually."
+}
+
 # backup etcd + /etc/kubernetes/pki
 backup() {
   # Display all ENVs, careful this exposes the password !
@@ -377,7 +392,8 @@ for t in $@; do
     bootstrap) control_plane_node bootstrap;;
     join) control_plane_node join;;
     restore) control_plane_node restore;;
-    apply_*) apply_module ${t##apply_};;
+    apply_*) apply_module "${t##apply_}";;
+    delete_*) delete_module "${t##delete_}";;
     backup) backup;;
     debug_shell) debug_shell;;
     *) echo "Unknown command: '$t'";;
diff --git a/admin/libhelm.sh b/admin/libhelm.sh
index cc2291d..94a29f8 100644
--- a/admin/libhelm.sh
+++ b/admin/libhelm.sh
@@ -108,6 +108,7 @@ function _crds() {
 
   # Only apply if there are actually any crds
   if [ -s $WORKDIR/crds.yaml ]; then
+    [ -n "$DEBUG" ] && cat $WORKDIR/crds.yaml
     kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
   fi
 }
@@ -115,7 +116,7 @@ function _crds() {
 
 # helm template | kubectl apply -f -
 # confine to one namespace if possible
-function apply() {
+function render() {
   helm template $(chart_location $chart) -n $namespace --name-template $module $targetRevision --skip-crds -f $WORKDIR/values.yaml $API_VERSIONS --kube-version $KUBE_VERSION $@ \
     | python3 -c '
 #!/usr/bin/python3
@@ -128,8 +129,6 @@ for manifest in yaml.safe_load_all(sys.stdin):
     manifest["metadata"]["namespace"] = sys.argv[1]
   print("---")
   print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml
-
-  kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
 }
 
 
@@ -164,13 +163,15 @@ function _helm() {
     # Optional pre hook
     declare -F ${module}-pre && ${module}-pre
 
-    apply
+    render
+    kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
 
     # Optional post hook
    declare -F ${module}-post && ${module}-post
 
   elif [ $action == "delete" ]; then
-    apply
+    render
+    kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?
 
     # Delete dedicated namespace if not kube-system
     [ -n "$DELETE_NS" ] && delete_ns $namespace
diff --git a/admin/upgrade_cluster.sh b/admin/upgrade_cluster.sh
index f957b0c..3219753 100755
--- a/admin/upgrade_cluster.sh
+++ b/admin/upgrade_cluster.sh
@@ -124,6 +124,10 @@ spec:
         node-role.kubernetes.io/control-plane: ""
       tolerations:
       - key: node-role.kubernetes.io/master
+        operator: Exists
+        effect: NoSchedule
+      - key: node-role.kubernetes.io/control-plane
+        operator: Exists
         effect: NoSchedule
       restartPolicy: Never
 EOF
diff --git a/charts/kubezero-network/README.md b/charts/kubezero-network/README.md
index b31a42b..6893893 100644
--- a/charts/kubezero-network/README.md
+++ b/charts/kubezero-network/README.md
@@ -26,16 +26,22 @@ Kubernetes: `>= 1.24.0`
 
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
-| cilium.bpf.hostLegacyRouting | bool | `true` |  |
 | cilium.cgroup.autoMount.enabled | bool | `false` |  |
 | cilium.cgroup.hostRoot | string | `"/sys/fs/cgroup"` |  |
 | cilium.cluster.id | int | `240` |  |
 | cilium.cluster.name | string | `"default"` |  |
 | cilium.cni.binPath | string | `"/usr/libexec/cni"` |  |
+| cilium.cni.exclusive | bool | `false` |  |
 | cilium.cni.logFile | string | `"/var/log/cilium-cni.log"` |  |
 | cilium.containerRuntime.integration | string | `"crio"` |  |
 | cilium.enabled | bool | `false` |  |
 | cilium.hubble.enabled | bool | `false` |  |
+| cilium.hubble.relay.enabled | bool | `false` |  |
+| cilium.hubble.tls.auto.certManagerIssuerRef.group | string | `"cert-manager.io"` |  |
+| cilium.hubble.tls.auto.certManagerIssuerRef.kind | string | `"ClusterIssuer"` |  |
+| cilium.hubble.tls.auto.certManagerIssuerRef.name | string | `"kubezero-local-ca-issuer"` |  |
+| cilium.hubble.tls.auto.method | string | `"cert-manager"` |  |
+| cilium.hubble.ui.enabled | bool | `false` |  |
 | cilium.ipam.operator.clusterPoolIPv4PodCIDRList[0] | string | `"10.240.0.0/16"` |  |
 | cilium.l7Proxy | bool | `false` |  |
 | cilium.operator.nodeSelector."node-role.kubernetes.io/control-plane" | string | `""` |  |
@@ -58,4 +64,5 @@ Kubernetes: `>= 1.24.0`
 | multus.clusterNetwork | string | `"cilium"` |  |
 | multus.defaultNetworks | list | `[]` |  |
 | multus.enabled | bool | `false` |  |
+| multus.readinessindicatorfile | string | `"/etc/cni/net.d/05-cilium.conf"` |  |
 | multus.tag | string | `"v3.9.2"` |  |
diff --git a/charts/kubezero-network/values.yaml b/charts/kubezero-network/values.yaml
index 4a9d325..019dd04 100644
--- a/charts/kubezero-network/values.yaml
+++ b/charts/kubezero-network/values.yaml
@@ -40,8 +40,8 @@ cilium:
     #-- Ensure this is false if multus is enabled
     exclusive: false
 
-  bpf:
-    hostLegacyRouting: true
+  #bpf:
+  #  hostLegacyRouting: true
     # tproxy: false
 
   cluster:
@@ -57,10 +57,10 @@ cilium:
         - 10.240.0.0/16
 
   # Keep it simple for now
-  # nodePort:
-  #   enabled: true
   l7Proxy: false
 
+  #rollOutCiliumPods: true
+
   cgroup:
     autoMount:
       enabled: false
@@ -84,3 +84,14 @@ cilium:
 
   hubble:
     enabled: false
+    relay:
+      enabled: false
+    ui:
+      enabled: false
+    tls:
+      auto:
+        method: cert-manager
+        certManagerIssuerRef:
+          group: cert-manager.io
+          kind: ClusterIssuer
+          name: kubezero-local-ca-issuer
diff --git a/charts/kubezero/README.md b/charts/kubezero/README.md
index b4bd099..ab20915 100644
--- a/charts/kubezero/README.md
+++ b/charts/kubezero/README.md
@@ -72,7 +72,7 @@ Kubernetes: `>= 1.24.0`
 | storage.aws-ebs-csi-driver.enabled | bool | `false` |  |
 | storage.aws-efs-csi-driver.enabled | bool | `false` |  |
 | storage.enabled | bool | `false` |  |
-| storage.targetRevision | string | `"0.7.3"` |  |
+| storage.targetRevision | string | `"0.7.4"` |  |
 
 ----------------------------------------------
 Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)
diff --git a/charts/kubezero/values.yaml b/charts/kubezero/values.yaml
index a2b7b8d..a10f008 100644
--- a/charts/kubezero/values.yaml
+++ b/charts/kubezero/values.yaml
@@ -36,7 +36,7 @@ cert-manager:
 
 storage:
   enabled: false
-  targetRevision: 0.7.3
+  targetRevision: 0.7.4
   aws-ebs-csi-driver:
     enabled: false
   aws-efs-csi-driver:
diff --git a/docs/v1.24.md b/docs/v1.24.md
index ff43b45..75471dd 100644
--- a/docs/v1.24.md
+++ b/docs/v1.24.md
@@ -2,47 +2,41 @@
 
 ## TODO
 
-### FeatureGates
-- PodAndContainerStatsFromCRI
-- DelegateFSGroupToCSIDriver
-
 ## What's new - Major themes
-
-- Cilium added as second CNI to prepare full migration to Cilium with 1.24 upgrade
-- support for Nvidia g5 instances incl. pre-installed kernel drivers, cudo toolchain and CRI intergration
-- updated inf1 neuron drivers
-- ExtendedResourceToleration AdmissionController and auto-taints allowing Neuron and Nvidia pods ONLY to be scheduled on dedicated workers
-- full Cluster-Autoscaler integration
+- Cilium is now the default CNI, calico got removed
+- cluster-autoscaler is enabled by default on AWS
 
 ## Version upgrades
-- Istio to 1.14.4
-- Logging: ECK operator to 2.4, fluent-bit 1.9.8
-- Metrics: Prometheus and all Grafana charts to latest to match V1.23
-- ArgoCD to V2.4 ( access to pod via shell disabled by default )
-- AWS EBS/EFS CSI drivers to latest versions
-- cert-manager to V1.9.1
+- cilium
+- metallb
+- nvidia-device-plugin
+- aws-node-termination-handler
+- aws-ebs-csi-driver
+- aws-efs-csi-driver
+
+### FeatureGates
+- PodAndContainerStatsFromCRI
+- DelegateFSGroupToCSIDriver
 
 # Upgrade
 `(No, really, you MUST read this before you upgrade)`
 
 - Ensure your Kube context points to the correct cluster !
 
-1. Enable `containerProxy` for NAT instances and upgrade NAT instance using the new V2 Pulumi stacks
+1. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
 
-2. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
+2. Upgrade CFN stacks for the control plane and all worker groups
 
-3. Upgrade CFN stacks for the control plane and all worker groups
-
-4. Trigger fully-automated cluster upgrade:
+3. Trigger fully-automated cluster upgrade:
   `./admin/upgrade_cluster.sh `
 
-5. Reboot controller(s) one by one
+4. Reboot controller(s) one by one
   Wait each time for controller to join and all pods running.
   Might take a while ...
 
-6. Launch new set of workers eg. by doubling `desired` for each worker ASG
+5. Launch new set of workers eg. by doubling `desired` for each worker ASG
   once new workers are ready, cordon and drain all old workers
   The cluster-autoscaler will remove the old workers automatically after about 10min !
 
-7. If all looks good, commit the ArgoApp resouce for Kubezero, before re-enabling ArgoCD itself.
+6. If all looks good, commit the ArgoApp resouce for Kubezero, before re-enabling ArgoCD itself.
   git add / commit / push ``
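For reference, a minimal usage sketch of the delete_* verb wired up in admin/kubezero.sh above. It assumes the script is invoked the usual way from the admin environment on a controller, and the module names "argocd" and "network" are purely illustrative:

    # Remove the rendered manifests of a single KubeZero module via the new verb
    ./admin/kubezero.sh delete_argocd

    # The dispatcher loops over its arguments, so verbs can be chained in one call
    ./admin/kubezero.sh apply_network delete_argocd

As delete_module() itself warns, CRDs installed by a module are not touched; any leftover CRDs have to be removed manually.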