fix: rc round of fixes for the upgrade flow of 1.23

Stefan Reimer 2022-09-23 12:54:27 +02:00
parent ab97b8f1ca
commit f540172694
8 changed files with 74 additions and 41 deletions

View File

@@ -166,7 +166,7 @@ control_plane_node() {
else
# restore latest backup
retry 10 60 30 restic restore latest --no-lock -t / --tag $KUBE_VERSION_MINOR
retry 10 60 30 restic restore latest --no-lock -t / #Review: Use latest no matter what for now: --tag $KUBE_VERSION_MINOR
# Make last etcd snapshot available
cp ${WORKDIR}/etcd_snapshot ${HOSTFS}/etc/kubernetes
@@ -382,8 +382,7 @@ for t in $@; do
bootstrap) control_plane_node bootstrap;;
join) control_plane_node join;;
restore) control_plane_node restore;;
apply_network) apply_module network;;
apply_addons) apply_module addons;;
apply_*) apply_module ${t##apply_};;
backup) backup;;
debug_shell) debug_shell;;
*) echo "Unknown command: '$t'";;

View File

@@ -55,6 +55,21 @@ function enable_argo() {
}
function cntFailedPods() {
NS=$1
NR=$(kubectl get pods -n $NS --field-selector="status.phase!=Succeeded,status.phase!=Running" -o custom-columns="POD:metadata.name" -o json | jq '.items | length')
echo $NR
}
function waitSystemPodsRunning() {
while true; do
[ "$(cntFailedPods kube-system)" -eq 0 ] && break
sleep 3
done
}
function argo_app_synced() {
APP=$1
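
The new cntFailedPods/waitSystemPodsRunning helpers gate the flow on kube-system being healthy: any pod that is neither Running nor Succeeded counts as failed. A rough standalone equivalent for checking by hand (a sketch, not part of this commit):

kubectl get pods -n kube-system \
  --field-selector="status.phase!=Succeeded,status.phase!=Running" \
  -o json | jq '.items | length'
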
@@ -93,7 +108,7 @@ function _crds() {
# Only apply if there are actually any crds
if [ -s $WORKDIR/crds.yaml ]; then
kubectl apply -f $WORKDIR/crds.yaml --server-side
kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
fi
}
@@ -114,7 +129,7 @@ for manifest in yaml.safe_load_all(sys.stdin):
print("---")
print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml
kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?
kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
}
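
Both the CRD apply and the rendered helm manifests now use server-side apply with --force-conflicts, so fields owned by a previous (client-side) applier are taken over instead of aborting the upgrade. To see which manager currently owns the conflicting fields, something like the following works (a sketch; the CRD name is only an example):

kubectl get crd ciliumnetworkpolicies.cilium.io -o yaml --show-managed-fields | yq '.metadata.managedFields'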

View File

@@ -31,6 +31,9 @@ spec:
labels:
name: kubezero-all-nodes-upgrade
spec:
hostNetwork: true
hostIPC: true
hostPID: true
tolerations:
- key: node-role.kubernetes.io/master
operator: Exists
@@ -43,7 +46,10 @@ spec:
volumeMounts:
- name: host
mountPath: /host
- name: hostproc
mountPath: /hostproc
securityContext:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
containers:
@@ -55,6 +61,10 @@ spec:
hostPath:
path: /
type: Directory
- name: hostproc
hostPath:
path: /proc
type: Directory
EOF
kubectl rollout status daemonset -n kube-system kubezero-all-nodes-upgrade --timeout 300s
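
The node-upgrade DaemonSet now runs with hostPID/hostIPC/hostNetwork and mounts the host's /proc at /hostproc, so the upgrade command (see the all_nodes_upgrade call below) can act inside the host's mount namespace via PID 1 rather than on the /host bind mount. The underlying pattern, roughly:

# enter the host's mount namespace through PID 1's namespace handle and remount shared
nsenter --mount=/hostproc/1/ns/mnt -- mount --make-shared /sys/fs/cgroup
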
@@ -122,14 +132,18 @@ EOF
kubectl delete pod kubezero-upgrade -n kube-system
}
echo "Checking that all pods in kube-system are running ..."
waitSystemPodsRunning
argo_used && disable_argo
all_nodes_upgrade "mount --make-shared /host/sys/fs/cgroup; mount --make-shared /host/sys;"
all_nodes_upgrade "nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys/fs/cgroup; mount --make-shared /sys; sleep 3;"
control_plane_upgrade kubeadm_upgrade
echo "Adjust kubezero-values as needed: (eg. set cilium cluster id etc):"
kubectl edit cm kubezero-values -n kube-system
echo "Adjust kubezero values as needed: (eg. set cilium cluster id and ensure no IP space overlap !!):"
argo_used && kubectl edit app kubezero -n argocd || kubectl edit cm kubezero-values -n kube-system
# Remove multus DS due to label changes, if this fails:
# kubezero-network $ helm template . --set multus.enabled=true | kubectl apply -f -
@@ -141,16 +155,25 @@ kubectl delete daemonset metrics-prometheus-node-exporter -n monitoring || true
# AWS EBS CSI driver change their fsGroupPolicy
kubectl delete CSIDriver ebs.csi.aws.com || true
control_plane_upgrade "apply_network, apply_addons"
control_plane_upgrade "apply_network, apply_addons, apply_storage"
kubectl rollout restart daemonset/calico-node -n kube-system
kubectl rollout restart daemonset/cilium -n kube-system
kubectl rollout restart daemonset/kube-multus-ds -n kube-system
argo_used && enable_argo
echo "Checking that all pods in kube-system are running ..."
waitSystemPodsRunning
echo "Applying remaining KubeZero modules..."
control_plane_upgrade "apply_cert-manager, apply_istio, apply_istio-ingress, apply_istio-private-ingress, apply_logging, apply_metrics, apply_argocd"
# Final step is to commit the new argocd kubezero app
kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > /tmp/new-kubezero-argoapp.yaml
kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | del(.operation) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > $ARGO_APP
echo "Please commit $ARGO_APP as the updated kubezero/application.yaml for your cluster."
echo "Then head over to ArgoCD for this cluster and sync all KubeZero modules to apply remaining upgrades."
echo "<Return> to continue and re-enable ArgoCD:"
read
argo_used && enable_argo
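
The exported Application now also strips .operation and lands in $ARGO_APP rather than a hard-coded /tmp path, and ArgoCD is only re-enabled after the operator confirms. A quick sanity check of the generated file before committing it (a sketch; assumes $ARGO_APP points at the generated YAML):

yq e '.spec.source.helm.values' "$ARGO_APP" | head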

View File

@@ -1,7 +1,6 @@
#!/bin/bash
# Migrate addons and network values from local kubeadm-values.yaml on controllers into CM
# - remove secrets from addons
# - enable cilium
# Create empty CM if it doesn't exist yet
@@ -16,13 +15,6 @@ yq eval -i '.global.clusterName = strenv(CLUSTERNAME) |
.global.highAvailable = env(HIGHAVAILABLE)' \
${HOSTFS}/etc/kubernetes/kubeadm-values.yaml
# extract addons
yq e '.addons |
del .clusterBackup.repository |
del .clusterBackup.password |
.clusterBackup.image.tag = strenv(KUBE_VERSION) |
{"addons": .}' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml > $WORKDIR/addons-values.yaml
# extract network
yq e '.network |
.cilium.enabled = true |
@@ -35,10 +27,11 @@ yq e '.network |
# get current argo cd values
kubectl get application kubezero -n argocd -o yaml | yq '.spec.source.helm.values' > ${WORKDIR}/argo-values.yaml
# merge all into new CM
# merge all into new CM and set new minimal addons
yq ea '. as $item ireduce ({}; . * $item ) |
.global.clusterName = strenv(CLUSTERNAME) |
.global.highAvailable = env(HIGHAVAILABLE)' $WORKDIR/addons-values.yaml ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml
.global.highAvailable = env(HIGHAVAILABLE) |
.addons.clusterBackup.image.tag = "v1.23" ' ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml
# tumble new config through migrate.py
cat $WORKDIR/kubezero-pre-values.yaml | migrate_argo_values.py > $WORKDIR/kubezero-values.yaml
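
With the separate addons extraction gone, only the network and argo values are merged and the clusterBackup image tag is pinned to v1.23 directly. The merge relies on yq's ireduce deep-merge; a minimal standalone sketch of that behaviour (file names are only illustrative):

printf 'network:\n  cilium:\n    enabled: true\n' > /tmp/a.yaml
printf 'network:\n  multus:\n    enabled: true\n' > /tmp/b.yaml
yq ea '. as $item ireduce ({}; . * $item )' /tmp/a.yaml /tmp/b.yaml
# -> network: carries both cilium.enabled and multus.enabled after the deep merge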

View File

@@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero-network
description: KubeZero umbrella chart for all things network
type: application
version: 0.3.3
version: 0.3.4
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:

View File

@@ -37,27 +37,27 @@ cilium:
binPath: "/usr/libexec/cni"
#-- Ensure this is false if multus is enabled
exclusive: false
# chainingMode: generic-veth
chainingMode: generic-veth
# Until we switch to Cilium
#bpf:
# hostLegacyRouting: true
bpf:
hostLegacyRouting: true
# tproxy: false
# enableIPv4Masquerade: false
# enableIdentityMark: false
policyEnforcementMode: "audit"
policyEnforcementMode: "never"
cluster:
# This should match the second octet of clusterPoolIPv4PodCIDRList
# to prevent IP space overlap and easy tracking
id: 1
# use 240 as default, less likely to clash with 1, do NOT use 244 used by calico until 1.25
id: 240
name: default
ipam:
operator:
clusterPoolIPv4PodCIDRList:
- 10.1.0.0/16
- 10.240.0.0/16
# Should be handled by multus
nodePort:
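
As the comments above spell out, cluster.id is meant to mirror the second octet of clusterPoolIPv4PodCIDRList (240 for 10.240.0.0/16) so per-cluster pod ranges stay distinct and easy to trace. A trivial sketch of that relationship:

CIDR="10.240.0.0/16"
echo "cluster.id should be $(echo "$CIDR" | cut -d. -f2)"   # -> 240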

View File

@@ -79,26 +79,29 @@ external-dns:
value: "0"
{{- end }}
{{- with index .Values "addons" "cluster-autoscaler" }}
cluster-autoscaler:
{{- toYaml . | nindent 2 }}
autoDiscovery:
clusterName: {{ $.Values.global.clusterName }}
enabled: {{ default "false" (index .Values "addons" "cluster-autoscaler" "enabled") }}
{{- with $.Values.metrics }}
{{- with omit (index .Values "addons" "cluster-autoscaler") "enabled" }}
{{- toYaml . | nindent 2 }}
{{- end }}
autoDiscovery:
clusterName: {{ .Values.global.clusterName }}
{{- with .Values.metrics }}
serviceMonitor:
enabled: {{ .enabled }}
prometheusRule:
enabled: {{ .enabled }}
{{- end }}
{{- if .Values.global.aws }}
# AWS
{{- if $.Values.global.aws }}
awsRegion: {{ $.Values.global.aws.region }}
awsRegion: {{ .Values.global.aws.region }}
extraEnv:
AWS_ROLE_ARN: "arn:aws:iam::{{ $.Values.global.aws.accountId }}:role/{{ $.Values.global.aws.region }}.{{ $.Values.global.clusterName }}.clusterAutoScaler"
AWS_ROLE_ARN: "arn:aws:iam::{{ .Values.global.aws.accountId }}:role/{{ .Values.global.aws.region }}.{{ .Values.global.clusterName }}.clusterAutoScaler"
AWS_WEB_IDENTITY_TOKEN_FILE: "/var/run/secrets/sts.amazonaws.com/serviceaccount/token"
AWS_STS_REGIONAL_ENDPOINTS: "regional"
extraVolumes:
@@ -115,8 +118,6 @@ cluster-autoscaler:
readOnly: true
{{- end }}
{{- end }}
{{- with .Values.addons.fuseDevicePlugin }}
fuseDevicePlugin:
{{- toYaml . | nindent 2 }}

View File

@@ -19,11 +19,13 @@ addons:
enabled: false
aws-node-termination-handler:
enabled: false
cluster-autoscaler:
enabled: false
network:
enabled: true
retain: true
targetRevision: 0.3.3
targetRevision: 0.3.4
cilium:
cluster: {}