From 6b868907af2fb3cf1b0ddcdf027e25854d51b8f3 Mon Sep 17 00:00:00 2001 From: Stefan Reimer Date: Fri, 3 Dec 2021 22:13:40 +0100 Subject: [PATCH] Feat: more tweaks for 1.21 upgrade --- containers/admin/upgrade-cluster.yaml | 43 ------- .../52-aws-iam-authenticator-mappings.yaml | 7 +- containers/admin/v1.21/kubezero.sh | 69 +++++++--- containers/admin/v1.21/upgrade_121.sh | 121 ++++++++++++++++++ scripts/upgrade_120.sh | 119 ----------------- 5 files changed, 172 insertions(+), 187 deletions(-) delete mode 100644 containers/admin/upgrade-cluster.yaml create mode 100755 containers/admin/v1.21/upgrade_121.sh delete mode 100755 scripts/upgrade_120.sh diff --git a/containers/admin/upgrade-cluster.yaml b/containers/admin/upgrade-cluster.yaml deleted file mode 100644 index 9af82232..00000000 --- a/containers/admin/upgrade-cluster.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: kubezero-upgrade-v1.21.7 - namespace: kube-system -spec: - backoffLimit: 1 - template: - spec: - hostNetwork: true - #hostIPC: true - #hostPID: true - containers: - - name: kubezero-admin - image: public.ecr.aws/zero-downtime/kubezero-admin:v1.21.7 - imagePullPolicy: Always - command: ["kubezero.sh"] - args: - - upgrade - env: - - name: DEBUG - value: "1" - volumeMounts: - - name: host - mountPath: /host - - name: workdir - mountPath: /tmp - securityContext: - capabilities: - add: ["SYS_CHROOT"] - volumes: - - name: host - hostPath: - path: / - type: Directory - - name: workdir - emptyDir: {} - nodeSelector: - node-role.kubernetes.io/master: "" - tolerations: - - key: node-role.kubernetes.io/master - effect: NoSchedule - restartPolicy: Never diff --git a/containers/admin/v1.21/kubeadm/templates/resources/52-aws-iam-authenticator-mappings.yaml b/containers/admin/v1.21/kubeadm/templates/resources/52-aws-iam-authenticator-mappings.yaml index 0ff2b1a9..31ef6e35 100644 --- a/containers/admin/v1.21/kubeadm/templates/resources/52-aws-iam-authenticator-mappings.yaml +++ b/containers/admin/v1.21/kubeadm/templates/resources/52-aws-iam-authenticator-mappings.yaml @@ -1,5 +1,4 @@ {{- if .Values.api.awsIamAuth.enabled }} -# Controller role for consistency, similar to kubeadm admin.conf apiVersion: iamauthenticator.k8s.aws/v1alpha1 kind: IAMIdentityMapping metadata: @@ -8,11 +7,9 @@ spec: arn: {{ .Values.api.awsIamAuth.workerNodeRole }} username: system:node:{{ "{{" }}EC2PrivateDNSName{{ "}}" }} groups: - # For now use masters, define properly with 1.20 - - system:masters - - system:nodes - - system:bootstrappers + - system:bootstrappers:kubeadm:default-node-token --- + # Admin Role for remote access apiVersion: iamauthenticator.k8s.aws/v1alpha1 kind: IAMIdentityMapping diff --git a/containers/admin/v1.21/kubezero.sh b/containers/admin/v1.21/kubezero.sh index c6ab54e7..3e9fd313 100755 --- a/containers/admin/v1.21/kubezero.sh +++ b/containers/admin/v1.21/kubezero.sh @@ -3,6 +3,7 @@ set -e WORKDIR=/tmp/kubezero HOSTFS=/host +VERSION=v1.21 export KUBECONFIG="${HOSTFS}/root/.kube/config" @@ -138,27 +139,26 @@ if [ "$1" == 'upgrade' ]; then ###################### helm repo add kubezero https://cdn.zero-downtime.net/charts/ - # if Calico, install multus to prepare migration - kubectl get ds calico-node -n kube-system && \ - helm template kubezero/kubezero-network --version 0.1.0 --include-crds --namespace kube-system --kube-version $KUBE_VERSION --name-template network \ - --set multus.enabled=true \ - | kubectl apply -f - $LOG + # network + yq eval '.network // ""' ${HOSTFS}/etc/kubernetes/kubezero.yaml 
> _values.yaml + helm template kubezero/kubezero-network --version 0.1.0 --include-crds --namespace kube-system --name-template network \ + -f _values.yaml --kube-version $KUBE_VERSION | kubectl apply -f - $LOG - # migrate backup - if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then - _repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.sh) - helm template kubezero/kubezero-addons --version 0.2.1 --include-crds --namespace kube-system --kube-version $KUBE_VERSION --name-template addons \ - --set clusterBackup.enabled=true \ - --set clusterBackup.repository="${_repo##*=}" \ - --set clusterBackup.password="$(cat ${HOSTFS}/etc/kubernetes/clusterBackup.passphrase)" \ - | kubectl apply -f - $LOG - fi + # addons + yq eval '.addons // ""' ${HOSTFS}/etc/kubernetes/kubezero.yaml > _values.yaml + helm template kubezero/kubezero-addons --version 0.2.2 --include-crds --namespace kube-system --name-template addons \ + -f _values.yaml --kube-version $KUBE_VERSION | kubectl apply -f - $LOG ###################### + # Execute cluster backup to allow new controllers to join + kubectl create job backup-cluster-now --from=cronjob/kubezero-backup -n kube-system + + # That might take a while as the backup pod needs the CNIs to come online etc. + retry 10 30 40 kubectl wait --for=condition=complete job/backup-cluster-now -n kube-system && kubectl delete job backup-cluster-now -n kube-system # Cleanup after kubeadm on the host - rm -rf /etc/kubernetes/tmp + rm -rf ${HOSTFS}/etc/kubernetes/tmp echo "Successfully upgraded cluster." @@ -169,6 +169,30 @@ if [ "$1" == 'upgrade' ]; then # Removed: # - update oidc do we need that ? +elif [[ "$1" == 'node-upgrade' ]]; then + + echo "Starting node upgrade ..." + + if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then + mv ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled + echo "Disabled old cluster backup OS cronjob" + fi + + echo "Migrating kubezero.yaml" + + export restic_repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled | sed -e 's/.*=//' | sed -e 's/"//g') + export restic_pw="$(cat ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase)" + export REGION=$(kubectl get node $NODE_NAME -o yaml | yq eval '.metadata.labels."topology.kubernetes.io/region"' -) + + # enable backup and awsIamAuth. multus, match other reorg + yq -Mi e '.api.awsIamAuth.enabled = "true" | .api.awsIamAuth.workerNodeRole = .workerNodeRole | .api.awsIamAuth.kubeAdminRole = .kubeAdminRole + | .api.serviceAccountIssuer = .serviceAccountIssuer | .api.apiAudiences = "istio-ca,sts.amazonaws.com" + | .network.multus.enabled = "true" + | .addons.clusterBackup.enabled = "true" | .addons.clusterBackup.repository = strenv(restic_repo) | .addons.clusterBackup.password = strenv(restic_pw) + | .addons.clusterBackup.extraEnv[0].name = "AWS_DEFAULT_REGION" | .addons.clusterBackup.extraEnv[0].value = strenv(REGION) + ' ${HOSTFS}/etc/kubernetes/kubezero.yaml + + echo "All done." 
elif [[ "$1" =~ "^(bootstrap|recover|join)$" ]]; then @@ -223,8 +247,8 @@ elif [[ "$1" =~ "^(bootstrap|recover|join)$" ]]; then yq eval -M ".clusters[0].cluster.certificate-authority-data = \"$(cat ${HOSTFS}/etc/kubernetes/pki/ca.crt | base64 -w0)\"" ${WORKDIR}/kubeadm/templates/admin-aws-iam.yaml > ${HOSTFS}/etc/kubernetes/admin-aws-iam.yaml fi - # Install some basics on bootstrap - if [[ "$1" =~ "^(bootstrap)$" ]]; then + # Install some basics on bootstrap and join for 1.21.7 to get new modules in place + if [[ "$1" =~ "^(bootstrap|join|recover)$" ]]; then helm repo add kubezero https://cdn.zero-downtime.net/charts/ # network @@ -234,7 +258,7 @@ elif [[ "$1" =~ "^(bootstrap|recover|join)$" ]]; then # addons yq eval '.addons // ""' ${HOSTFS}/etc/kubernetes/kubezero.yaml > _values.yaml - helm template kubezero/kubezero-addons --version 0.2.1 --include-crds --namespace kube-system --name-template addons \ + helm template kubezero/kubezero-addons --version 0.2.2 --include-crds --namespace kube-system --name-template addons \ -f _values.yaml --kube-version $KUBE_VERSION | kubectl apply -f - $LOG fi @@ -263,15 +287,20 @@ elif [ "$1" == 'backup' ]; then # Backup via restic restic snapshots || restic init - restic backup ${WORKDIR} -H $CLUSTERNAME + restic backup ${WORKDIR} -H $CLUSTERNAME --tag $VERSION echo "Backup complete" + + # Remove all previous + restic forget --keep-tag $VERSION --prune + restic forget --keep-hourly 24 --keep-daily ${RESTIC_RETENTION:-7} --prune + elif [ "$1" == 'restore' ]; then mkdir -p ${WORKDIR} - restic restore latest --no-lock -t / + restic restore latest --no-lock -t / --tag $VERSION # Make last etcd snapshot available cp ${WORKDIR}/etcd_snapshot ${HOSTFS}/etc/kubernetes diff --git a/containers/admin/v1.21/upgrade_121.sh b/containers/admin/v1.21/upgrade_121.sh new file mode 100755 index 00000000..ee23afcc --- /dev/null +++ b/containers/admin/v1.21/upgrade_121.sh @@ -0,0 +1,121 @@ +#!/bin/bash +set -eux + +# unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth +unset AWS_DEFAULT_PROFILE + +nodes=$(kubectl get nodes -l node-role.kubernetes.io/control-plane -o json | jq .items[].metadata.name -r) + +for node in $nodes; do + echo "Deploying node upgrade job on $node..." + + cat <<'EOF' | sed -e "s/__node__/$node/g" | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: kubezero-upgrade-node-__node__ + namespace: kube-system + labels: + app: kubezero-upgrade-node +spec: + hostNetwork: true + containers: + - name: kubezero-admin + image: public.ecr.aws/zero-downtime/kubezero-admin:v1.21.7 + imagePullPolicy: Always + command: ["kubezero.sh"] + args: + - node-upgrade + env: + - name: DEBUG + value: "1" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: host + mountPath: /host + - name: workdir + mountPath: /tmp + securityContext: + capabilities: + add: ["SYS_CHROOT"] + volumes: + - name: host + hostPath: + path: / + type: Directory + - name: workdir + emptyDir: {} + nodeSelector: + kubernetes.io/hostname: __node__ + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + restartPolicy: Never +EOF + kubectl wait pod kubezero-upgrade-node-$node -n kube-system --timeout 120s --for=condition=initialized + while true; do + kubectl logs kubezero-upgrade-node-$node -n kube-system -f && break + sleep 3 + done + kubectl delete pod kubezero-upgrade-node-$node -n kube-system +done + +echo "Deploying cluster upgrade job ..." 
+ +cat <<'EOF' | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: kubezero-upgrade-v121 + namespace: kube-system + labels: + app: kubezero-upgrade +spec: + hostNetwork: true + #hostIPC: true + #hostPID: true + containers: + - name: kubezero-admin + image: public.ecr.aws/zero-downtime/kubezero-admin:v1.21.7 + imagePullPolicy: Always + command: ["kubezero.sh"] + args: + - upgrade + env: + - name: DEBUG + value: "1" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: host + mountPath: /host + - name: workdir + mountPath: /tmp + securityContext: + capabilities: + add: ["SYS_CHROOT"] + volumes: + - name: host + hostPath: + path: / + type: Directory + - name: workdir + emptyDir: {} + nodeSelector: + node-role.kubernetes.io/master: "" + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + restartPolicy: Never +EOF + +kubectl wait pod kubezero-upgrade-v121 -n kube-system --timeout 120s --for=condition=initialized +while true; do + kubectl logs kubezero-upgrade-v121 -n kube-system -f && break + sleep 3 +done diff --git a/scripts/upgrade_120.sh b/scripts/upgrade_120.sh deleted file mode 100755 index f0a5cc83..00000000 --- a/scripts/upgrade_120.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/bash -set -e - -# Get all kube-control-plane ASGs in the current account and region -asgs=$(aws autoscaling describe-auto-scaling-groups --output json | jq .AutoScalingGroups[].AutoScalingGroupName -r | grep kube-control-plane) - -for asg in $asgs; do - hooks=$(aws autoscaling describe-lifecycle-hooks --auto-scaling-group-name $asg --output json | jq '.LifecycleHooks[] | select (.LifecycleTransition=="autoscaling:EC2_INSTANCE_TERMINATING") | .LifecycleHookName' -r) - - for hook in $hooks; do - echo "Delete Lifecycle hook $hook of ASG $asg ? to abort" - read - aws autoscaling delete-lifecycle-hook --lifecycle-hook-name $hook --auto-scaling-group-name $asg - done -done - -# unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth -unset AWS_DEFAULT_PROFILE - -nodes=$(kubectl get nodes -l node-role.kubernetes.io/master -o json | jq .items[].metadata.name -r) - -for node in $nodes; do - echo "Deploying upgrade job on $node..." 
- - cat <<'EOF' > _job.yaml -apiVersion: batch/v1 -kind: Job -metadata: - name: kubezero-upgrade - namespace: kube-system -spec: - template: - spec: - hostNetwork: true - hostIPC: true - hostPID: true - containers: - - name: busybox - image: busybox - command: - - /bin/sh - - -c - - | - cat <<'EOF' > /host/tmp/upgrade.sh - #!/bin/bash -ex - for l in $(cat /etc/environment); do - export $l - done - my_ip=$(ec2metadata --local-ipv4) - my_id=$(ec2metadata --instance-id) - clusterName=$(yq r /etc/kubezero/kubezero.yaml clusterName) - my_asg=$(aws ec2 describe-tags --filters "Name=resource-id,Values=${my_id}" --output json | jq '.Tags[] | select(.Key=="aws:cloudformation:logical-id") | .Value' -r) - - [ $my_asg == "KubeControlAsgAZ1" ] && nodename="etcd0-$clusterName" - [ $my_asg == "KubeControlAsgAZ2" ] && nodename="etcd1-$clusterName" - [ $my_asg == "KubeControlAsgAZ3" ] && nodename="etcd2-$clusterName" - zone_name=$(hostname -d) - - zone_id=$(aws route53 list-hosted-zones --query 'HostedZones[?Name==`'"$zone_name"'.`].Id' --output text | cut --delimiter="/" --fields=3) - cat < route53.json - { "Changes": [ { "Action": "UPSERT", "ResourceRecordSet": { "Name": "${nodename}.${zone_name}", "Type": "A", "TTL": 30, "ResourceRecords": [ { "Value": "$my_ip" } ] } } ] } - EOF2 - - echo "Updating DNS entry for $nodename to $my_ip" - aws route53 change-resource-record-sets --hosted-zone-id $zone_id --change-batch file://route53.json - - echo "Adding additional control shutdown commands" - if [ ! -f /usr/local/sbin/drain_delete_node.sh ]; then - cat < /usr/local/sbin/drain_delete_node.sh - #!/bin/bash -ex - export LC_TYPE=en_US.UTF-8 - export KUBECONFIG=/root/.kube/config - kubeadm reset phase update-cluster-status - kubeadm reset phase remove-etcd-member - EOF3 - chmod +x /usr/local/sbin/drain_delete_node.sh - sed -e 's,/usr/local/sbin/backup_control_plane.sh&,/usr/local/sbin/drain_delete_node.sh,' -i /usr/local/sbin/cloudbender_shutdown.sh - fi - - echo "Patching ClusterConfig to re-create new etcd server certificates" - yq w /etc/kubezero/kubeadm/templates/ClusterConfiguration.yaml etcd.local.serverCertSANs[+] $nodename > /etc/kubernetes/kubeadm-recert.yaml - yq w -i /etc/kubernetes/kubeadm-recert.yaml etcd.local.serverCertSANs[+] $nodename.$zone_name - rm -f /etc/kubernetes/pki/etcd/server.* - kubeadm init phase certs etcd-server --config=/etc/kubernetes/kubeadm-recert.yaml 2>/dev/null - kill -s HUP $(ps -e | grep etcd | awk '{print $1}') - echo "Waiting for etcd to accept connections again...might take 30s or more" - while true; do - etcdctl member list -w simple 1>/dev/null 2>&1 && break || true - sleep 3 - done - EOF - chmod +x /host/tmp/upgrade.sh - chroot /host bash -c /tmp/upgrade.sh - volumeMounts: - - name: host - mountPath: /host - securityContext: - privileged: true - volumes: - - name: host - hostPath: - path: / - type: Directory - nodeSelector: - kubernetes.io/hostname: __node__ - tolerations: - - key: node-role.kubernetes.io/master - effect: NoSchedule - restartPolicy: Never -EOF - - # set controller node - sed -i -e "s/__node__/$node/" _job.yaml - - kubectl apply -f _job.yaml - kubectl wait -n kube-system --timeout 300s --for=condition=complete job/kubezero-upgrade - kubectl delete -f _job.yaml - rm -f _job.yaml -done
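
Note on the `retry` helper used in the upgrade path above: kubezero.sh calls `retry 10 30 40 kubectl wait --for=condition=complete job/backup-cluster-now -n kube-system`, but the helper itself is not part of this patch. Below is a minimal sketch of such a helper, assuming the three numeric arguments are max attempts, seconds to sleep between attempts, and a per-attempt timeout in seconds; the actual implementation shipped in the kubezero-admin image may differ.

    #!/bin/bash
    # Sketch of a retry helper compatible with call sites like:
    #   retry 10 30 40 kubectl wait --for=condition=complete job/backup-cluster-now -n kube-system
    # Assumption: arguments are <attempts> <sleep-seconds> <per-attempt-timeout-seconds> <command...>;
    # the real helper in the admin image may use different semantics.
    retry() {
      local attempts=$1 sleep_s=$2 timeout_s=$3
      shift 3

      local i
      for ((i = 1; i <= attempts; i++)); do
        # Run the command under a per-attempt timeout; return on first success
        timeout "$timeout_s" "$@" && return 0
        echo "Attempt $i/$attempts failed: $*" >&2
        sleep "$sleep_s"
      done
      return 1
    }

With that in place, upgrade_121.sh is run from any host whose current kubeconfig context has cluster-admin access (with AWS_DEFAULT_PROFILE unset, as the script does, so aws-iam-authenticator is not broken): it iterates over the control-plane nodes, runs a node-upgrade pod on each, then launches the cluster-wide upgrade pod, streaming each pod's logs until completion.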