From f540172694932dd26a587eab72684612f2b562b9 Mon Sep 17 00:00:00 2001
From: Stefan Reimer
Date: Fri, 23 Sep 2022 12:54:27 +0200
Subject: [PATCH] fix: rc round of fixes for the upgrade flow of 1.23

---
 admin/kubezero.sh                     |  5 ++--
 admin/libhelm.sh                      | 19 +++++++++++++--
 admin/upgrade_cluster.sh              | 35 ++++++++++++++++++++++-----
 admin/v1.23/pre-upgrade.sh            | 13 +++-------
 charts/kubezero-network/Chart.yaml    |  2 +-
 charts/kubezero-network/values.yaml   | 14 +++++------
 charts/kubezero/templates/addons.yaml | 23 +++++++++---------
 charts/kubezero/values.yaml           |  4 ++-
 8 files changed, 74 insertions(+), 41 deletions(-)

diff --git a/admin/kubezero.sh b/admin/kubezero.sh
index 789fba9..9e40051 100755
--- a/admin/kubezero.sh
+++ b/admin/kubezero.sh
@@ -166,7 +166,7 @@ control_plane_node() {
 
   else
     # restore latest backup
-    retry 10 60 30 restic restore latest --no-lock -t / --tag $KUBE_VERSION_MINOR
+    retry 10 60 30 restic restore latest --no-lock -t / # Review: use latest no matter what for now: --tag $KUBE_VERSION_MINOR
 
     # Make last etcd snapshot available
     cp ${WORKDIR}/etcd_snapshot ${HOSTFS}/etc/kubernetes
@@ -382,8 +382,7 @@ for t in $@; do
     bootstrap) control_plane_node bootstrap;;
     join) control_plane_node join;;
     restore) control_plane_node restore;;
-    apply_network) apply_module network;;
-    apply_addons) apply_module addons;;
+    apply_*) apply_module ${t##apply_};;
     backup) backup;;
     debug_shell) debug_shell;;
     *) echo "Unknown command: '$t'";;
diff --git a/admin/libhelm.sh b/admin/libhelm.sh
index c1f743c..56d5702 100644
--- a/admin/libhelm.sh
+++ b/admin/libhelm.sh
@@ -55,6 +55,21 @@ function enable_argo() {
 }
 
 
+function cntFailedPods() {
+  NS=$1
+
+  NR=$(kubectl get pods -n $NS --field-selector="status.phase!=Succeeded,status.phase!=Running" -o custom-columns="POD:metadata.name" -o json | jq '.items | length')
+  echo $NR
+}
+
+
+function waitSystemPodsRunning() {
+  while true; do
+    [ "$(cntFailedPods kube-system)" -eq 0 ] && break
+    sleep 3
+  done
+}
+
 function argo_app_synced() {
   APP=$1
 
@@ -93,7 +108,7 @@ function _crds() {
 
   # Only apply if there are actually any crds
   if [ -s $WORKDIR/crds.yaml ]; then
-    kubectl apply -f $WORKDIR/crds.yaml --server-side
+    kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
   fi
 }
 
@@ -114,7 +129,7 @@ for manifest in yaml.safe_load_all(sys.stdin):
     print("---")
     print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml
 
-  kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?
+  kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
 }
 
diff --git a/admin/upgrade_cluster.sh b/admin/upgrade_cluster.sh
index 918894d..c3796c8 100755
--- a/admin/upgrade_cluster.sh
+++ b/admin/upgrade_cluster.sh
@@ -31,6 +31,9 @@ spec:
       labels:
         name: kubezero-all-nodes-upgrade
     spec:
+      hostNetwork: true
+      hostIPC: true
+      hostPID: true
       tolerations:
       - key: node-role.kubernetes.io/master
         operator: Exists
@@ -43,7 +46,10 @@ spec:
         volumeMounts:
         - name: host
          mountPath: /host
+        - name: hostproc
+          mountPath: /hostproc
         securityContext:
+          privileged: true
           capabilities:
             add: ["SYS_ADMIN"]
       containers:
@@ -55,6 +61,10 @@ spec:
         hostPath:
           path: /
           type: Directory
+      - name: hostproc
+        hostPath:
+          path: /proc
+          type: Directory
 EOF
 
   kubectl rollout status daemonset -n kube-system kubezero-all-nodes-upgrade --timeout 300s
@@ -122,14 +132,18 @@ EOF
   kubectl delete pod kubezero-upgrade -n kube-system
 }
 
+
+echo "Checking that all pods in kube-system are running ..."
+waitSystemPodsRunning
+
 argo_used && disable_argo
 
-all_nodes_upgrade "mount --make-shared /host/sys/fs/cgroup; mount --make-shared /host/sys;"
+all_nodes_upgrade "nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys/fs/cgroup; mount --make-shared /sys; sleep 3;"
 
 control_plane_upgrade kubeadm_upgrade
 
-echo "Adjust kubezero-values as needed: (eg. set cilium cluster id etc):"
-kubectl edit cm kubezero-values -n kube-system
+echo "Adjust kubezero values as needed (e.g. set the cilium cluster id and ensure no IP space overlap!):"
+argo_used && kubectl edit app kubezero -n argocd || kubectl edit cm kubezero-values -n kube-system
 
 # Remove multus DS due to label changes, if this fails:
 # kubezero-network $ helm template . --set multus.enabled=true | kubectl apply -f -
@@ -141,16 +155,25 @@ kubectl delete daemonset metrics-prometheus-node-exporter -n monitoring || true
 
 # AWS EBS CSI driver change their fsGroupPolicy
 kubectl delete CSIDriver ebs.csi.aws.com || true
 
-control_plane_upgrade "apply_network, apply_addons"
+control_plane_upgrade "apply_network, apply_addons, apply_storage"
 
 kubectl rollout restart daemonset/calico-node -n kube-system
 kubectl rollout restart daemonset/cilium -n kube-system
 kubectl rollout restart daemonset/kube-multus-ds -n kube-system
 
-argo_used && enable_argo
+echo "Checking that all pods in kube-system are running ..."
+waitSystemPodsRunning
+
+echo "Applying remaining KubeZero modules..."
+control_plane_upgrade "apply_cert-manager, apply_istio, apply_istio-ingress, apply_istio-private-ingress, apply_logging, apply_metrics, apply_argocd"
 
 # Final step is to commit the new argocd kubezero app
-kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > /tmp/new-kubezero-argoapp.yaml
+kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | del(.operation) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > $ARGO_APP
 
 echo "Please commit $ARGO_APP as the updated kubezero/application.yaml for your cluster."
 echo "Then head over to ArgoCD for this cluster and sync all KubeZero modules to apply remaining upgrades."
+
+echo "<Return> to continue and re-enable ArgoCD:"
+read
+
+argo_used && enable_argo
diff --git a/admin/v1.23/pre-upgrade.sh b/admin/v1.23/pre-upgrade.sh
index 24da186..d21b815 100644
--- a/admin/v1.23/pre-upgrade.sh
+++ b/admin/v1.23/pre-upgrade.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
 # Migrate addons and network values from local kubeadm-values.yaml on controllers into CM
-# - remove secrets from addons
 # - enable cilium
 
 # Create empty CM if not exists yet
@@ -16,13 +15,6 @@
 yq eval -i '.global.clusterName = strenv(CLUSTERNAME) |
   .global.highAvailable = env(HIGHAVAILABLE)' \
   ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml
 
-# extract addons
-yq e '.addons |
-  del .clusterBackup.repository |
-  del .clusterBackup.password |
-  .clusterBackup.image.tag = strenv(KUBE_VERSION) |
-  {"addons": .}' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml > $WORKDIR/addons-values.yaml
-
 # extract network
 yq e '.network |
   .cilium.enabled = true |
@@ -35,10 +27,11 @@ yq e '.network |
 # get current argo cd values
 kubectl get application kubezero -n argocd -o yaml | yq '.spec.source.helm.values' > ${WORKDIR}/argo-values.yaml
 
-# merge all into new CM
+# merge all into new CM and set new minimal addons
 yq ea '. as $item ireduce ({}; . * $item ) |
   .global.clusterName = strenv(CLUSTERNAME) |
-  .global.highAvailable = env(HIGHAVAILABLE)' $WORKDIR/addons-values.yaml ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml
+  .global.highAvailable = env(HIGHAVAILABLE) |
+  .addons.clusterBackup.image.tag = "v1.23"' ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml
 
 # tumble new config through migrate.py
 cat $WORKDIR/kubezero-pre-values.yaml | migrate_argo_values.py > $WORKDIR/kubezero-values.yaml
diff --git a/charts/kubezero-network/Chart.yaml b/charts/kubezero-network/Chart.yaml
index 3179bf9..6e16ac1 100644
--- a/charts/kubezero-network/Chart.yaml
+++ b/charts/kubezero-network/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-network
 description: KubeZero umbrella chart for all things network
 type: application
-version: 0.3.3
+version: 0.3.4
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:
diff --git a/charts/kubezero-network/values.yaml b/charts/kubezero-network/values.yaml
index 8ecf439..0d85cc6 100644
--- a/charts/kubezero-network/values.yaml
+++ b/charts/kubezero-network/values.yaml
@@ -37,27 +37,27 @@ cilium:
       binPath: "/usr/libexec/cni"
       #-- Ensure this is false if multus is enabled
       exclusive: false
-    # chainingMode: generic-veth
+    chainingMode: generic-veth
 
-  # Until we switch to Cilium
-  #bpf:
-  #  hostLegacyRouting: true
+  bpf:
+    hostLegacyRouting: true
   # tproxy: false
 
   # enableIPv4Masquerade: false
   # enableIdentityMark: false
-  policyEnforcementMode: "audit"
+  policyEnforcementMode: "never"
 
   cluster:
     # This should match the second octet of clusterPoolIPv4PodCIDRList
     # to prevent IP space overlap and easy tracking
-    id: 1
+    # Use 240 as default: less likely to clash than 1; do NOT use 244 (used by calico) until 1.25
+    id: 240
     name: default
 
   ipam:
     operator:
       clusterPoolIPv4PodCIDRList:
-        - 10.1.0.0/16
+        - 10.240.0.0/16
 
   # Should be handled by multus
   nodePort:
diff --git a/charts/kubezero/templates/addons.yaml b/charts/kubezero/templates/addons.yaml
index 2f1299e..124e2d1 100644
--- a/charts/kubezero/templates/addons.yaml
+++ b/charts/kubezero/templates/addons.yaml
@@ -79,26 +79,29 @@ external-dns:
         value: "0"
   {{- end }}
 
-
-{{- with index .Values "addons" "cluster-autoscaler" }}
 cluster-autoscaler:
-  {{- toYaml . | nindent 2 }}
-  autoDiscovery:
-    clusterName: {{ $.Values.global.clusterName }}
+  enabled: {{ default "false" (index .Values "addons" "cluster-autoscaler" "enabled") }}
 
-  {{- with $.Values.metrics }}
+  {{- with omit (index .Values "addons" "cluster-autoscaler") "enabled" }}
+  {{- toYaml . | nindent 2 }}
+  {{- end }}
+
+  autoDiscovery:
+    clusterName: {{ .Values.global.clusterName }}
+
+  {{- with .Values.metrics }}
   serviceMonitor:
     enabled: {{ .enabled }}
   prometheusRule:
     enabled: {{ .enabled }}
   {{- end }}
 
+  {{- if .Values.global.aws }}
   # AWS
-  {{- if $.Values.global.aws }}
-  awsRegion: {{ $.Values.global.aws.region }}
+  awsRegion: {{ .Values.global.aws.region }}
 
   extraEnv:
-    AWS_ROLE_ARN: "arn:aws:iam::{{ $.Values.global.aws.accountId }}:role/{{ $.Values.global.aws.region }}.{{ $.Values.global.clusterName }}.clusterAutoScaler"
+    AWS_ROLE_ARN: "arn:aws:iam::{{ .Values.global.aws.accountId }}:role/{{ .Values.global.aws.region }}.{{ .Values.global.clusterName }}.clusterAutoScaler"
     AWS_WEB_IDENTITY_TOKEN_FILE: "/var/run/secrets/sts.amazonaws.com/serviceaccount/token"
     AWS_STS_REGIONAL_ENDPOINTS: "regional"
   extraVolumes:
@@ -115,8 +118,6 @@ cluster-autoscaler:
       readOnly: true
   {{- end }}
 
-{{- end }}
-
 {{- with .Values.addons.fuseDevicePlugin }}
 fuseDevicePlugin:
   {{- toYaml . | nindent 2 }}
diff --git a/charts/kubezero/values.yaml b/charts/kubezero/values.yaml
index f00941b..70d8637 100644
--- a/charts/kubezero/values.yaml
+++ b/charts/kubezero/values.yaml
@@ -19,11 +19,13 @@ addons:
     enabled: false
   aws-node-termination-handler:
     enabled: false
+  cluster-autoscaler:
+    enabled: false
 
 network:
   enabled: true
   retain: true
-  targetRevision: 0.3.3
+  targetRevision: 0.3.4
   cilium:
     cluster: {}
 
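
Review note: the new cntFailedPods()/waitSystemPodsRunning() helpers in libhelm.sh gate each
upgrade phase on a healthy kube-system namespace. A minimal standalone sketch of the same
check, assuming only that kubectl and jq are on the PATH and a cluster is reachable:

    # Count pods in a namespace that are neither Running nor Succeeded.
    # The --field-selector filters server-side; jq counts what remains.
    cntFailedPods() {
      kubectl get pods -n "$1" \
        --field-selector="status.phase!=Succeeded,status.phase!=Running" \
        -o json | jq '.items | length'
    }

    # Poll every 3 seconds until kube-system reports no such pods.
    until [ "$(cntFailedPods kube-system)" -eq 0 ]; do sleep 3; done

Pods in Pending, Failed or Unknown phase all match the selector, so the loop also waits out
pods that are still scheduling, not just crashed ones.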
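
Review note: all_nodes_upgrade previously ran mount --make-shared against the /host bind
mount, which may only change propagation inside the container's own mount namespace. The
replacement enters the host's mount namespace through the new /hostproc hostPath, which is
why the DaemonSet above gains hostPID and privileged. A sketch of the mechanism, using the
same paths as the DaemonSet:

    # /hostproc is the host's /proc, so /hostproc/1/ns/mnt is the mount
    # namespace of the host's PID 1. nsenter -m<file> joins that namespace,
    # making the propagation change land on the real host mounts rather
    # than on the container's view of them.
    nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys/fs/cgroup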
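
Review note: pre-upgrade.sh now seeds .addons.clusterBackup.image.tag directly instead of
carrying the old addons extraction along. The merge itself relies on yq (mikefarah, v4)
eval-all with ireduce, which deep-merges the given files left to right so later files win on
conflicting keys. A toy illustration with two hypothetical files a.yaml and b.yaml:

    # a.yaml contains: global: {clusterName: old}, network: {enabled: true}
    # b.yaml contains: global: {clusterName: new}
    yq ea '. as $item ireduce ({}; . * $item)' a.yaml b.yaml
    # result: global.clusterName is "new", network.enabled stays true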