From e9eeb8be8d9b730bef1714c14171341e7e31bce8 Mon Sep 17 00:00:00 2001
From: Stefan Reimer
Date: Mon, 14 Nov 2022 14:14:11 +0100
Subject: [PATCH] feat: add delete_module to admin, various cleanups

---
 admin/kubezero.sh                   | 18 ++++++++++++-
 admin/libhelm.sh                    | 11 ++++----
 admin/upgrade_cluster.sh            |  4 +++
 charts/kubezero-network/README.md   |  9 ++++++-
 charts/kubezero-network/values.yaml | 19 ++++++++++---
 charts/kubezero/README.md           |  2 +-
 charts/kubezero/values.yaml         |  2 +-
 docs/v1.24.md                       | 42 +++++++++++++----------------
 8 files changed, 70 insertions(+), 37 deletions(-)

diff --git a/admin/kubezero.sh b/admin/kubezero.sh
index d7845ea..0892388 100755
--- a/admin/kubezero.sh
+++ b/admin/kubezero.sh
@@ -323,6 +323,21 @@ apply_module() {
 }
 
 
+delete_module() {
+  MODULES=$1
+
+  get_kubezero_values
+
+  # Always use embedded kubezero chart
+  helm template $CHARTS/kubezero -f $WORKDIR/kubezero-values.yaml --version ~$KUBE_VERSION --devel --output-dir $WORKDIR
+
+  for t in $MODULES; do
+    _helm delete $t
+  done
+
+  echo "Deleted KubeZero modules: $MODULES. Potential CRDs must be removed manually."
+}
+
 # backup etcd + /etc/kubernetes/pki
 backup() {
   # Display all ENVs, careful this exposes the password !
@@ -377,7 +392,8 @@ for t in $@; do
     bootstrap) control_plane_node bootstrap;;
     join) control_plane_node join;;
     restore) control_plane_node restore;;
-    apply_*) apply_module ${t##apply_};;
+    apply_*) apply_module "${t##apply_}";;
+    delete_*) delete_module "${t##delete_}";;
     backup) backup;;
     debug_shell) debug_shell;;
     *) echo "Unknown command: '$t'";;
diff --git a/admin/libhelm.sh b/admin/libhelm.sh
index cc2291d..94a29f8 100644
--- a/admin/libhelm.sh
+++ b/admin/libhelm.sh
@@ -108,6 +108,7 @@ function _crds() {
 
   # Only apply if there are actually any crds
   if [ -s $WORKDIR/crds.yaml ]; then
+    [ -n "$DEBUG" ] && cat $WORKDIR/crds.yaml
     kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
   fi
 }
@@ -115,7 +116,7 @@ function _crds() {
 
 # helm template | kubectl apply -f -
 # confine to one namespace if possible
-function apply() {
+function render() {
   helm template $(chart_location $chart) -n $namespace --name-template $module $targetRevision --skip-crds -f $WORKDIR/values.yaml $API_VERSIONS --kube-version $KUBE_VERSION $@ \
     | python3 -c '
 #!/usr/bin/python3
@@ -128,8 +129,6 @@ for manifest in yaml.safe_load_all(sys.stdin):
     manifest["metadata"]["namespace"] = sys.argv[1]
   print("---")
   print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml
-
-  kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
 }
 
 
@@ -164,13 +163,15 @@ function _helm() {
     # Optional pre hook
     declare -F ${module}-pre && ${module}-pre
 
-    apply
+    render
+    kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
 
     # Optional post hook
    declare -F ${module}-post && ${module}-post
 
   elif [ $action == "delete" ]; then
-    apply
+    render
+    kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?
 
     # Delete dedicated namespace if not kube-system
     [ -n "$DELETE_NS" ] && delete_ns $namespace
diff --git a/admin/upgrade_cluster.sh b/admin/upgrade_cluster.sh
index f957b0c..3219753 100755
--- a/admin/upgrade_cluster.sh
+++ b/admin/upgrade_cluster.sh
@@ -124,6 +124,10 @@ spec:
         node-role.kubernetes.io/control-plane: ""
       tolerations:
       - key: node-role.kubernetes.io/master
+        operator: Exists
+        effect: NoSchedule
+      - key: node-role.kubernetes.io/control-plane
+        operator: Exists
         effect: NoSchedule
       restartPolicy: Never
 EOF
diff --git a/charts/kubezero-network/README.md b/charts/kubezero-network/README.md
index b31a42b..6893893 100644
--- a/charts/kubezero-network/README.md
+++ b/charts/kubezero-network/README.md
@@ -26,16 +26,22 @@ Kubernetes: `>= 1.24.0`
 
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
-| cilium.bpf.hostLegacyRouting | bool | `true` |  |
 | cilium.cgroup.autoMount.enabled | bool | `false` |  |
 | cilium.cgroup.hostRoot | string | `"/sys/fs/cgroup"` |  |
 | cilium.cluster.id | int | `240` |  |
 | cilium.cluster.name | string | `"default"` |  |
 | cilium.cni.binPath | string | `"/usr/libexec/cni"` |  |
+| cilium.cni.exclusive | bool | `false` |  |
 | cilium.cni.logFile | string | `"/var/log/cilium-cni.log"` |  |
 | cilium.containerRuntime.integration | string | `"crio"` |  |
 | cilium.enabled | bool | `false` |  |
 | cilium.hubble.enabled | bool | `false` |  |
+| cilium.hubble.relay.enabled | bool | `false` |  |
+| cilium.hubble.tls.auto.certManagerIssuerRef.group | string | `"cert-manager.io"` |  |
+| cilium.hubble.tls.auto.certManagerIssuerRef.kind | string | `"ClusterIssuer"` |  |
+| cilium.hubble.tls.auto.certManagerIssuerRef.name | string | `"kubezero-local-ca-issuer"` |  |
+| cilium.hubble.tls.auto.method | string | `"cert-manager"` |  |
+| cilium.hubble.ui.enabled | bool | `false` |  |
 | cilium.ipam.operator.clusterPoolIPv4PodCIDRList[0] | string | `"10.240.0.0/16"` |  |
 | cilium.l7Proxy | bool | `false` |  |
 | cilium.operator.nodeSelector."node-role.kubernetes.io/control-plane" | string | `""` |  |
@@ -58,4 +64,5 @@ Kubernetes: `>= 1.24.0`
 | multus.clusterNetwork | string | `"cilium"` |  |
 | multus.defaultNetworks | list | `[]` |  |
 | multus.enabled | bool | `false` |  |
+| multus.readinessindicatorfile | string | `"/etc/cni/net.d/05-cilium.conf"` |  |
 | multus.tag | string | `"v3.9.2"` |  |
diff --git a/charts/kubezero-network/values.yaml b/charts/kubezero-network/values.yaml
index 4a9d325..019dd04 100644
--- a/charts/kubezero-network/values.yaml
+++ b/charts/kubezero-network/values.yaml
@@ -40,8 +40,8 @@ cilium:
     #-- Ensure this is false if multus is enabled
     exclusive: false
 
-  bpf:
-    hostLegacyRouting: true
+  #bpf:
+  #  hostLegacyRouting: true
     # tproxy: false
 
   cluster:
@@ -57,10 +57,10 @@ cilium:
         - 10.240.0.0/16
 
   # Keep it simple for now
-  # nodePort:
-  #   enabled: true
   l7Proxy: false
 
+  #rollOutCiliumPods: true
+
   cgroup:
     autoMount:
       enabled: false
@@ -84,3 +84,14 @@ cilium:
 
   hubble:
     enabled: false
+    relay:
+      enabled: false
+    ui:
+      enabled: false
+    tls:
+      auto:
+        method: cert-manager
+        certManagerIssuerRef:
+          group: cert-manager.io
+          kind: ClusterIssuer
+          name: kubezero-local-ca-issuer
diff --git a/charts/kubezero/README.md b/charts/kubezero/README.md
index b4bd099..ab20915 100644
--- a/charts/kubezero/README.md
+++ b/charts/kubezero/README.md
@@ -72,7 +72,7 @@ Kubernetes: `>= 1.24.0`
 | storage.aws-ebs-csi-driver.enabled | bool | `false` |  |
 | storage.aws-efs-csi-driver.enabled | bool | `false` |  |
 | storage.enabled | bool | `false` |  |
-| storage.targetRevision | string | `"0.7.3"` |  |
+| storage.targetRevision | string | `"0.7.4"` |  |
 
 ----------------------------------------------
 Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)
diff --git a/charts/kubezero/values.yaml b/charts/kubezero/values.yaml
index a2b7b8d..a10f008 100644
--- a/charts/kubezero/values.yaml
+++ b/charts/kubezero/values.yaml
@@ -36,7 +36,7 @@ cert-manager:
 
 storage:
   enabled: false
-  targetRevision: 0.7.3
+  targetRevision: 0.7.4
   aws-ebs-csi-driver:
     enabled: false
   aws-efs-csi-driver:
diff --git a/docs/v1.24.md b/docs/v1.24.md
index ff43b45..75471dd 100644
--- a/docs/v1.24.md
+++ b/docs/v1.24.md
@@ -2,47 +2,41 @@
 
 ## TODO
 
-### FeatureGates
-- PodAndContainerStatsFromCRI
-- DelegateFSGroupToCSIDriver
-
 ## What's new - Major themes
-
-- Cilium added as second CNI to prepare full migration to Cilium with 1.24 upgrade
-- support for Nvidia g5 instances incl. pre-installed kernel drivers, cudo toolchain and CRI intergration
-- updated inf1 neuron drivers
-- ExtendedResourceToleration AdmissionController and auto-taints allowing Neuron and Nvidia pods ONLY to be scheduled on dedicated workers
-- full Cluster-Autoscaler integration
+- Cilium is now the default CNI, calico got removed
+- cluster-autoscaler is enabled by default on AWS
 
 ## Version upgrades
-- Istio to 1.14.4
-- Logging: ECK operator to 2.4, fluent-bit 1.9.8
-- Metrics: Prometheus and all Grafana charts to latest to match V1.23
-- ArgoCD to V2.4 ( access to pod via shell disabled by default )
-- AWS EBS/EFS CSI drivers to latest versions
-- cert-manager to V1.9.1
+- cilium
+- metallb
+- nvidia-device-plugin
+- aws-node-termination-handler
+- aws-ebs-csi-driver
+- aws-efs-csi-driver
+
+### FeatureGates
+- PodAndContainerStatsFromCRI
+- DelegateFSGroupToCSIDriver
 
 # Upgrade
 `(No, really, you MUST read this before you upgrade)`
 
 - Ensure your Kube context points to the correct cluster !
 
-1. Enable `containerProxy` for NAT instances and upgrade NAT instance using the new V2 Pulumi stacks
+1. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
 
-2. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
+2. Upgrade CFN stacks for the control plane and all worker groups
 
-3. Upgrade CFN stacks for the control plane and all worker groups
-
-4. Trigger fully-automated cluster upgrade:
+3. Trigger fully-automated cluster upgrade:
   `./admin/upgrade_cluster.sh `
 
-5. Reboot controller(s) one by one
+4. Reboot controller(s) one by one
   Wait each time for controller to join and all pods running.
   Might take a while ...
 
-6. Launch new set of workers eg. by doubling `desired` for each worker ASG
+5. Launch new set of workers eg. by doubling `desired` for each worker ASG
   once new workers are ready, cordon and drain all old workers
   The cluster-autoscaler will remove the old workers automatically after about 10min !
 
-7. If all looks good, commit the ArgoApp resouce for Kubezero, before re-enabling ArgoCD itself.
+6. If all looks good, commit the ArgoApp resouce for Kubezero, before re-enabling ArgoCD itself.
   git add / commit / push ``
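For reference, a minimal usage sketch of the delete_* verb wired up in admin/kubezero.sh above. It assumes the script is invoked the usual way from the admin environment on a controller, and the module names "argocd" and "network" are purely illustrative:

    # Remove the rendered manifests of a single KubeZero module via the new verb
    ./admin/kubezero.sh delete_argocd

    # The dispatcher loops over its arguments, so verbs can be chained in one call
    ./admin/kubezero.sh apply_network delete_argocd

As delete_module() itself warns, CRDs installed by a module are not touched; any leftover CRDs have to be removed manually.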