feat: add delete_module to admin, various cleanups

Stefan Reimer 2022-11-14 14:14:11 +01:00
parent f8588d8938
commit e9eeb8be8d
8 changed files with 70 additions and 37 deletions

View File

@@ -323,6 +323,21 @@ apply_module() {
 }

+delete_module() {
+  MODULES=$1
+
+  get_kubezero_values
+
+  # Always use embedded kubezero chart
+  helm template $CHARTS/kubezero -f $WORKDIR/kubezero-values.yaml --version ~$KUBE_VERSION --devel --output-dir $WORKDIR
+
+  for t in $MODULES; do
+    _helm delete $t
+  done
+
+  echo "Deleted KubeZero modules: $MODULES. Potential CRDs must be removed manually."
+}

 # backup etcd + /etc/kubernetes/pki
 backup() {
   # Display all ENVs, careful this exposes the password !
@@ -377,7 +392,8 @@ for t in $@; do
   bootstrap) control_plane_node bootstrap;;
   join) control_plane_node join;;
   restore) control_plane_node restore;;
-  apply_*) apply_module ${t##apply_};;
+  apply_*) apply_module "${t##apply_}";;
+  delete_*) delete_module "${t##delete_}";;
   backup) backup;;
   debug_shell) debug_shell;;
   *) echo "Unknown command: '$t'";;
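
A minimal usage sketch for the new delete path, assuming the script above is the KubeZero admin entrypoint invoked as `kubezero.sh` and that `argocd`, `logging` and `metrics` are installed modules (script path and module names are illustrative):

    # Each delete_<module> argument renders the embedded kubezero chart
    # and runs `_helm delete` for that module
    ./kubezero.sh delete_argocd

    # Several modules can be removed in one invocation
    ./kubezero.sh delete_logging delete_metrics

As the final echo notes, any CRDs installed by a module are left in place and have to be removed manually.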

View File

@@ -108,6 +108,7 @@ function _crds() {
   # Only apply if there are actually any crds
   if [ -s $WORKDIR/crds.yaml ]; then
+    [ -n "$DEBUG" ] && cat $WORKDIR/crds.yaml
     kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
   fi
 }

@@ -115,7 +116,7 @@ function _crds() {
 # helm template | kubectl apply -f -
 # confine to one namespace if possible
-function apply() {
+function render() {
   helm template $(chart_location $chart) -n $namespace --name-template $module $targetRevision --skip-crds -f $WORKDIR/values.yaml $API_VERSIONS --kube-version $KUBE_VERSION $@ \
     | python3 -c '
 #!/usr/bin/python3
@@ -128,8 +129,6 @@ for manifest in yaml.safe_load_all(sys.stdin):
       manifest["metadata"]["namespace"] = sys.argv[1]
     print("---")
     print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml
-
-  kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
 }

@@ -164,13 +163,15 @@ function _helm() {
     # Optional pre hook
     declare -F ${module}-pre && ${module}-pre

-    apply
+    render
+    kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?

     # Optional post hook
     declare -F ${module}-post && ${module}-post

   elif [ $action == "delete" ]; then
-    apply
+    render
+    kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?

     # Delete dedicated namespace if not kube-system
     [ -n "$DELETE_NS" ] && delete_ns $namespace

View File

@@ -124,6 +124,10 @@ spec:
         node-role.kubernetes.io/control-plane: ""
       tolerations:
       - key: node-role.kubernetes.io/master
+        operator: Exists
+        effect: NoSchedule
+      - key: node-role.kubernetes.io/control-plane
+        operator: Exists
         effect: NoSchedule
       restartPolicy: Never
 EOF
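
The second toleration covers control-plane nodes that carry the newer `node-role.kubernetes.io/control-plane` taint instead of, or alongside, the legacy `node-role.kubernetes.io/master` taint. An illustrative one-liner to check which taint keys the control-plane nodes actually carry:

    kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints[*].key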

View File

@@ -26,16 +26,22 @@ Kubernetes: `>= 1.24.0`

 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
-| cilium.bpf.hostLegacyRouting | bool | `true` | |
 | cilium.cgroup.autoMount.enabled | bool | `false` | |
 | cilium.cgroup.hostRoot | string | `"/sys/fs/cgroup"` | |
 | cilium.cluster.id | int | `240` | |
 | cilium.cluster.name | string | `"default"` | |
 | cilium.cni.binPath | string | `"/usr/libexec/cni"` | |
+| cilium.cni.exclusive | bool | `false` | |
 | cilium.cni.logFile | string | `"/var/log/cilium-cni.log"` | |
 | cilium.containerRuntime.integration | string | `"crio"` | |
 | cilium.enabled | bool | `false` | |
 | cilium.hubble.enabled | bool | `false` | |
+| cilium.hubble.relay.enabled | bool | `false` | |
+| cilium.hubble.tls.auto.certManagerIssuerRef.group | string | `"cert-manager.io"` | |
+| cilium.hubble.tls.auto.certManagerIssuerRef.kind | string | `"ClusterIssuer"` | |
+| cilium.hubble.tls.auto.certManagerIssuerRef.name | string | `"kubezero-local-ca-issuer"` | |
+| cilium.hubble.tls.auto.method | string | `"cert-manager"` | |
+| cilium.hubble.ui.enabled | bool | `false` | |
 | cilium.ipam.operator.clusterPoolIPv4PodCIDRList[0] | string | `"10.240.0.0/16"` | |
 | cilium.l7Proxy | bool | `false` | |
 | cilium.operator.nodeSelector."node-role.kubernetes.io/control-plane" | string | `""` | |
@@ -58,4 +64,5 @@ Kubernetes: `>= 1.24.0`
 | multus.clusterNetwork | string | `"cilium"` | |
 | multus.defaultNetworks | list | `[]` | |
 | multus.enabled | bool | `false` | |
+| multus.readinessindicatorfile | string | `"/etc/cni/net.d/05-cilium.conf"` | |
 | multus.tag | string | `"v3.9.2"` | |

View File

@@ -40,8 +40,8 @@ cilium:
     #-- Ensure this is false if multus is enabled
     exclusive: false

-  bpf:
-    hostLegacyRouting: true
+  #bpf:
+  #  hostLegacyRouting: true
     # tproxy: false

   cluster:
@@ -57,10 +57,10 @@ cilium:
       - 10.240.0.0/16

   # Keep it simple for now
-  # nodePort:
-  #   enabled: true
   l7Proxy: false
+  #rollOutCiliumPods: true

   cgroup:
     autoMount:
       enabled: false
@@ -84,3 +84,14 @@ cilium:
   hubble:
     enabled: false
+    relay:
+      enabled: false
+    ui:
+      enabled: false
+    tls:
+      auto:
+        method: cert-manager
+        certManagerIssuerRef:
+          group: cert-manager.io
+          kind: ClusterIssuer
+          name: kubezero-local-ca-issuer
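
Hubble stays disabled by default; the new keys only pre-wire Hubble TLS to cert-manager via the `kubezero-local-ca-issuer` ClusterIssuer. A hedged way to preview what enabling it would render locally, assuming the chart lives at `charts/kubezero-network` (chart path and release name are assumptions):

    # may require: helm dependency update charts/kubezero-network
    helm template kubezero-network charts/kubezero-network \
      --set cilium.enabled=true \
      --set cilium.hubble.enabled=true \
      --set cilium.hubble.relay.enabled=true \
      --set cilium.hubble.ui.enabled=true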

View File

@@ -72,7 +72,7 @@ Kubernetes: `>= 1.24.0`
 | storage.aws-ebs-csi-driver.enabled | bool | `false` | |
 | storage.aws-efs-csi-driver.enabled | bool | `false` | |
 | storage.enabled | bool | `false` | |
-| storage.targetRevision | string | `"0.7.3"` | |
+| storage.targetRevision | string | `"0.7.4"` | |

 ----------------------------------------------
 Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)

View File

@@ -36,7 +36,7 @@ cert-manager:
 storage:
   enabled: false
-  targetRevision: 0.7.3
+  targetRevision: 0.7.4
   aws-ebs-csi-driver:
     enabled: false
   aws-efs-csi-driver:

View File

@@ -2,47 +2,41 @@

 ## TODO

-### FeatureGates
-- PodAndContainerStatsFromCRI
-- DelegateFSGroupToCSIDriver
-
 ## What's new - Major themes

-- Cilium added as second CNI to prepare full migration to Cilium with 1.24 upgrade
-- support for Nvidia g5 instances incl. pre-installed kernel drivers, cudo toolchain and CRI intergration
-- updated inf1 neuron drivers
-- ExtendedResourceToleration AdmissionController and auto-taints allowing Neuron and Nvidia pods ONLY to be scheduled on dedicated workers
-- full Cluster-Autoscaler integration
+- Cilium is now the default CNI, calico got removed
+- cluster-autoscaler is enabled by default on AWS

 ## Version upgrades

-- Istio to 1.14.4
-- Logging: ECK operator to 2.4, fluent-bit 1.9.8
-- Metrics: Prometheus and all Grafana charts to latest to match V1.23
-- ArgoCD to V2.4 ( access to pod via shell disabled by default )
-- AWS EBS/EFS CSI drivers to latest versions
-- cert-manager to V1.9.1
+- cilium
+- metallb
+- nvidia-device-plugin
+- aws-node-termination-handler
+- aws-ebs-csi-driver
+- aws-efs-csi-driver
+
+### FeatureGates
+- PodAndContainerStatsFromCRI
+- DelegateFSGroupToCSIDriver

 # Upgrade

 `(No, really, you MUST read this before you upgrade)`

 - Ensure your Kube context points to the correct cluster !

-1. Enable `containerProxy` for NAT instances and upgrade NAT instance using the new V2 Pulumi stacks
-2. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
-3. Upgrade CFN stacks for the control plane and all worker groups
-4. Trigger fully-automated cluster upgrade:
+1. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
+2. Upgrade CFN stacks for the control plane and all worker groups
+3. Trigger fully-automated cluster upgrade:
   `./admin/upgrade_cluster.sh <path to the argocd app kubezero yaml for THIS cluster>`

-5. Reboot controller(s) one by one
+4. Reboot controller(s) one by one
   Wait each time for controller to join and all pods running.
   Might take a while ...

-6. Launch new set of workers eg. by doubling `desired` for each worker ASG
+5. Launch new set of workers eg. by doubling `desired` for each worker ASG
   once new workers are ready, cordon and drain all old workers
   The cluster-autoscaler will remove the old workers automatically after about 10min !

-7. If all looks good, commit the ArgoApp resouce for Kubezero, before re-enabling ArgoCD itself.
+6. If all looks good, commit the ArgoApp resource for Kubezero, before re-enabling ArgoCD itself.
   git add / commit / push `<cluster/env/kubezero/application.yaml>`
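
Step 5 of the upgraded instructions (scaling out the worker ASGs, then cordoning and draining the old nodes) could look roughly like the following sketch; the ASG name, the desired capacity and the node-name filter are placeholders:

    # Double the desired capacity of one worker ASG (placeholder name/value)
    aws autoscaling set-desired-capacity --auto-scaling-group-name <worker-asg> --desired-capacity 6

    # Once the new workers are Ready, cordon and drain each old worker
    for node in $(kubectl get nodes -o name | grep <old-worker-filter>); do
      kubectl cordon $node
      kubectl drain $node --ignore-daemonsets --delete-emptydir-data
    done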