Feat: more tweaks for 1.21 upgrade
This commit is contained in:
parent
ecaa0e17b7
commit
6b868907af
@ -1,43 +0,0 @@
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: kubezero-upgrade-v1.21.7
|
||||
namespace: kube-system
|
||||
spec:
|
||||
backoffLimit: 1
|
||||
template:
|
||||
spec:
|
||||
hostNetwork: true
|
||||
#hostIPC: true
|
||||
#hostPID: true
|
||||
containers:
|
||||
- name: kubezero-admin
|
||||
image: public.ecr.aws/zero-downtime/kubezero-admin:v1.21.7
|
||||
imagePullPolicy: Always
|
||||
command: ["kubezero.sh"]
|
||||
args:
|
||||
- upgrade
|
||||
env:
|
||||
- name: DEBUG
|
||||
value: "1"
|
||||
volumeMounts:
|
||||
- name: host
|
||||
mountPath: /host
|
||||
- name: workdir
|
||||
mountPath: /tmp
|
||||
securityContext:
|
||||
capabilities:
|
||||
add: ["SYS_CHROOT"]
|
||||
volumes:
|
||||
- name: host
|
||||
hostPath:
|
||||
path: /
|
||||
type: Directory
|
||||
- name: workdir
|
||||
emptyDir: {}
|
||||
nodeSelector:
|
||||
node-role.kubernetes.io/master: ""
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/master
|
||||
effect: NoSchedule
|
||||
restartPolicy: Never
|
@ -1,5 +1,4 @@
|
||||
{{- if .Values.api.awsIamAuth.enabled }}
|
||||
# Controller role for consistency, similar to kubeadm admin.conf
|
||||
apiVersion: iamauthenticator.k8s.aws/v1alpha1
|
||||
kind: IAMIdentityMapping
|
||||
metadata:
|
||||
@ -8,11 +7,9 @@ spec:
|
||||
arn: {{ .Values.api.awsIamAuth.workerNodeRole }}
|
||||
username: system:node:{{ "{{" }}EC2PrivateDNSName{{ "}}" }}
|
||||
groups:
|
||||
# For now use masters, define properly with 1.20
|
||||
- system:masters
|
||||
- system:nodes
|
||||
- system:bootstrappers
|
||||
- system:bootstrappers:kubeadm:default-node-token
|
||||
---
|
||||
|
||||
# Admin Role for remote access
|
||||
apiVersion: iamauthenticator.k8s.aws/v1alpha1
|
||||
kind: IAMIdentityMapping
|
||||
|
@ -3,6 +3,7 @@ set -e
|
||||
|
||||
WORKDIR=/tmp/kubezero
|
||||
HOSTFS=/host
|
||||
VERSION=v1.21
|
||||
|
||||
export KUBECONFIG="${HOSTFS}/root/.kube/config"
|
||||
|
||||
@ -138,27 +139,26 @@ if [ "$1" == 'upgrade' ]; then
|
||||
######################
|
||||
helm repo add kubezero https://cdn.zero-downtime.net/charts/
|
||||
|
||||
# if Calico, install multus to prepare migration
|
||||
kubectl get ds calico-node -n kube-system && \
|
||||
helm template kubezero/kubezero-network --version 0.1.0 --include-crds --namespace kube-system --kube-version $KUBE_VERSION --name-template network \
|
||||
--set multus.enabled=true \
|
||||
| kubectl apply -f - $LOG
|
||||
# network
|
||||
yq eval '.network // ""' ${HOSTFS}/etc/kubernetes/kubezero.yaml > _values.yaml
|
||||
helm template kubezero/kubezero-network --version 0.1.0 --include-crds --namespace kube-system --name-template network \
|
||||
-f _values.yaml --kube-version $KUBE_VERSION | kubectl apply -f - $LOG
|
||||
|
||||
# migrate backup
|
||||
if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then
|
||||
_repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.sh)
|
||||
helm template kubezero/kubezero-addons --version 0.2.1 --include-crds --namespace kube-system --kube-version $KUBE_VERSION --name-template addons \
|
||||
--set clusterBackup.enabled=true \
|
||||
--set clusterBackup.repository="${_repo##*=}" \
|
||||
--set clusterBackup.password="$(cat ${HOSTFS}/etc/kubernetes/clusterBackup.passphrase)" \
|
||||
| kubectl apply -f - $LOG
|
||||
fi
|
||||
# addons
|
||||
yq eval '.addons // ""' ${HOSTFS}/etc/kubernetes/kubezero.yaml > _values.yaml
|
||||
helm template kubezero/kubezero-addons --version 0.2.2 --include-crds --namespace kube-system --name-template addons \
|
||||
-f _values.yaml --kube-version $KUBE_VERSION | kubectl apply -f - $LOG
|
||||
|
||||
######################
|
||||
|
||||
# Execute cluster backup to allow new controllers to join
|
||||
kubectl create job backup-cluster-now --from=cronjob/kubezero-backup -n kube-system
|
||||
|
||||
# That might take a while as the backup pod needs the CNIs to come online etc.
|
||||
retry 10 30 40 kubectl wait --for=condition=complete job/backup-cluster-now -n kube-system && kubectl delete job backup-cluster-now -n kube-system
|
||||
|
||||
# Cleanup after kubeadm on the host
|
||||
rm -rf /etc/kubernetes/tmp
|
||||
rm -rf ${HOSTFS}/etc/kubernetes/tmp
|
||||
|
||||
echo "Successfully upgraded cluster."
|
||||
|
||||
@ -169,6 +169,30 @@ if [ "$1" == 'upgrade' ]; then
|
||||
# Removed:
|
||||
# - update OIDC: do we still need that?
|
||||
|
||||
elif [[ "$1" == 'node-upgrade' ]]; then
|
||||
|
||||
echo "Starting node upgrade ..."
|
||||
|
||||
if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then
|
||||
mv ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled
|
||||
echo "Disabled old cluster backup OS cronjob"
|
||||
fi
|
||||
|
||||
echo "Migrating kubezero.yaml"
|
||||
|
||||
export restic_repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled | sed -e 's/.*=//' | sed -e 's/"//g')
|
||||
export restic_pw="$(cat ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase)"
|
||||
export REGION=$(kubectl get node $NODE_NAME -o yaml | yq eval '.metadata.labels."topology.kubernetes.io/region"' -)
|
||||
|
||||
# Enable cluster backup, awsIamAuth and multus, and copy existing top-level settings into the reorganized values layout
|
||||
yq -Mi e '.api.awsIamAuth.enabled = "true" | .api.awsIamAuth.workerNodeRole = .workerNodeRole | .api.awsIamAuth.kubeAdminRole = .kubeAdminRole
|
||||
| .api.serviceAccountIssuer = .serviceAccountIssuer | .api.apiAudiences = "istio-ca,sts.amazonaws.com"
|
||||
| .network.multus.enabled = "true"
|
||||
| .addons.clusterBackup.enabled = "true" | .addons.clusterBackup.repository = strenv(restic_repo) | .addons.clusterBackup.password = strenv(restic_pw)
|
||||
| .addons.clusterBackup.extraEnv[0].name = "AWS_DEFAULT_REGION" | .addons.clusterBackup.extraEnv[0].value = strenv(REGION)
|
||||
' ${HOSTFS}/etc/kubernetes/kubezero.yaml
|
||||
|
||||
echo "All done."
|
||||
|
||||
elif [[ "$1" =~ "^(bootstrap|recover|join)$" ]]; then
|
||||
|
||||
@ -223,8 +247,8 @@ elif [[ "$1" =~ "^(bootstrap|recover|join)$" ]]; then
|
||||
yq eval -M ".clusters[0].cluster.certificate-authority-data = \"$(cat ${HOSTFS}/etc/kubernetes/pki/ca.crt | base64 -w0)\"" ${WORKDIR}/kubeadm/templates/admin-aws-iam.yaml > ${HOSTFS}/etc/kubernetes/admin-aws-iam.yaml
|
||||
fi
|
||||
|
||||
# Install some basics on bootstrap
|
||||
if [[ "$1" =~ "^(bootstrap)$" ]]; then
|
||||
# Install some basics on bootstrap, join and recover for 1.21.7 to get new modules in place
|
||||
if [[ "$1" =~ "^(bootstrap|join|recover)$" ]]; then
|
||||
helm repo add kubezero https://cdn.zero-downtime.net/charts/
|
||||
|
||||
# network
|
||||
@ -234,7 +258,7 @@ elif [[ "$1" =~ "^(bootstrap|recover|join)$" ]]; then
|
||||
|
||||
# addons
|
||||
yq eval '.addons // ""' ${HOSTFS}/etc/kubernetes/kubezero.yaml > _values.yaml
|
||||
helm template kubezero/kubezero-addons --version 0.2.1 --include-crds --namespace kube-system --name-template addons \
|
||||
helm template kubezero/kubezero-addons --version 0.2.2 --include-crds --namespace kube-system --name-template addons \
|
||||
-f _values.yaml --kube-version $KUBE_VERSION | kubectl apply -f - $LOG
|
||||
fi
|
||||
|
||||
@ -263,15 +287,20 @@ elif [ "$1" == 'backup' ]; then
|
||||
|
||||
# Backup via restic
|
||||
restic snapshots || restic init
|
||||
restic backup ${WORKDIR} -H $CLUSTERNAME
|
||||
restic backup ${WORKDIR} -H $CLUSTERNAME --tag $VERSION
|
||||
|
||||
echo "Backup complete"
|
||||
|
||||
# Remove all snapshots that do not carry the current version tag
|
||||
restic forget --keep-tag $VERSION --prune
|
||||
|
||||
restic forget --keep-hourly 24 --keep-daily ${RESTIC_RETENTION:-7} --prune
|
||||
|
||||
|
||||
elif [ "$1" == 'restore' ]; then
|
||||
mkdir -p ${WORKDIR}
|
||||
|
||||
restic restore latest --no-lock -t /
|
||||
restic restore latest --no-lock -t / --tag $VERSION
|
||||
|
||||
# Make last etcd snapshot available
|
||||
cp ${WORKDIR}/etcd_snapshot ${HOSTFS}/etc/kubernetes
|
||||
|
121
containers/admin/v1.21/upgrade_121.sh
Executable file
121
containers/admin/v1.21/upgrade_121.sh
Executable file
@ -0,0 +1,121 @@
|
||||
#!/bin/bash
#
# Drives the KubeZero v1.21 upgrade through kubectl:
#   1. runs a one-shot "node-upgrade" pod on every control-plane node,
#      one node at a time
#   2. runs a single "upgrade" pod on a master node
# Every pod executes kubezero.sh from the kubezero-admin image with the host
# root filesystem mounted at /host.
#
# Requires kubectl (configured for the target cluster) and jq.
set -eux

# unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth
unset AWS_DEFAULT_PROFILE

readonly NAMESPACE="kube-system"
readonly ADMIN_IMAGE="public.ecr.aws/zero-downtime/kubezero-admin:v1.21.7"

#######################################
# Print the manifest of a one-shot kubezero-admin pod.
# Arguments:
#   $1 - pod name
#   $2 - value of the "app" label
#   $3 - kubezero.sh sub-command the pod runs
#   $4 - a single "key: value" nodeSelector entry
# Outputs: the Pod manifest on stdout
#######################################
admin_pod_manifest() {
  local pod_name=$1 app_label=$2 action=$3 node_selector=$4

  cat <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: ${pod_name}
  namespace: ${NAMESPACE}
  labels:
    app: ${app_label}
spec:
  hostNetwork: true
  containers:
  - name: kubezero-admin
    image: ${ADMIN_IMAGE}
    imagePullPolicy: Always
    command: ["kubezero.sh"]
    args:
    - ${action}
    env:
    - name: DEBUG
      value: "1"
    - name: NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    volumeMounts:
    - name: host
      mountPath: /host
    - name: workdir
      mountPath: /tmp
    securityContext:
      capabilities:
        add: ["SYS_CHROOT"]
  volumes:
  - name: host
    hostPath:
      path: /
      type: Directory
  - name: workdir
    emptyDir: {}
  nodeSelector:
    ${node_selector}
  tolerations:
  - key: node-role.kubernetes.io/master
    effect: NoSchedule
  restartPolicy: Never
EOF
}

#######################################
# Deploy a kubezero-admin pod, stream its logs and verify that it succeeded.
# Arguments: same as admin_pod_manifest
# Returns:   0 if the pod reached phase Succeeded, 1 if it failed or did not
#            reach a terminal phase in time
#######################################
run_admin_pod() {
  local pod_name=$1
  local phase="" attempt

  # A completed pod left over from an earlier run would turn "kubectl apply"
  # into a no-op and replay stale logs, so remove it first.
  kubectl delete pod "$pod_name" -n "$NAMESPACE" --ignore-not-found

  admin_pod_manifest "$@" | kubectl apply -f -
  kubectl wait pod "$pod_name" -n "$NAMESPACE" --timeout 120s --for=condition=initialized

  # "kubectl logs" fails while the container is still being created; retry
  # until the log stream could be followed to its end.
  until kubectl logs "$pod_name" -n "$NAMESPACE" -f; do
    sleep 3
  done

  # "kubectl logs -f" exits 0 regardless of the container's exit code, so the
  # pod phase is the only reliable success signal. The phase may lag slightly
  # behind the end of the log stream, hence the short polling loop.
  for ((attempt = 0; attempt < 20; attempt++)); do
    phase=$(kubectl get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}')
    case "$phase" in
      Succeeded)
        return 0
        ;;
      Failed)
        echo "Pod $pod_name failed, aborting upgrade." >&2
        return 1
        ;;
    esac
    sleep 3
  done

  echo "Timed out waiting for pod $pod_name to finish (last phase: $phase)." >&2
  return 1
}

nodes=$(kubectl get nodes -l node-role.kubernetes.io/control-plane -o json | jq -r '.items[].metadata.name')

# Word splitting is intended here: Kubernetes node names contain no whitespace.
for node in $nodes; do
  echo "Deploying node upgrade job on $node..."

  run_admin_pod "kubezero-upgrade-node-$node" kubezero-upgrade-node node-upgrade \
    "kubernetes.io/hostname: $node"

  # Only reached on success; a failed pod is kept for inspection and is
  # cleaned up by the next run.
  kubectl delete pod "kubezero-upgrade-node-$node" -n "$NAMESPACE"
done

echo "Deploying cluster upgrade job ..."

run_admin_pod kubezero-upgrade-v121 kubezero-upgrade upgrade \
  'node-role.kubernetes.io/master: ""'
|
@ -1,119 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Get all kube-control-plane ASGs in the current account and region
|
||||
asgs=$(aws autoscaling describe-auto-scaling-groups --output json | jq .AutoScalingGroups[].AutoScalingGroupName -r | grep kube-control-plane)
|
||||
|
||||
for asg in $asgs; do
|
||||
hooks=$(aws autoscaling describe-lifecycle-hooks --auto-scaling-group-name $asg --output json | jq '.LifecycleHooks[] | select (.LifecycleTransition=="autoscaling:EC2_INSTANCE_TERMINATING") | .LifecycleHookName' -r)
|
||||
|
||||
for hook in $hooks; do
|
||||
echo "Delete Lifecycle hook $hook of ASG $asg ? <Ctrl+C> to abort"
|
||||
read
|
||||
aws autoscaling delete-lifecycle-hook --lifecycle-hook-name $hook --auto-scaling-group-name $asg
|
||||
done
|
||||
done
|
||||
|
||||
# unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth
|
||||
unset AWS_DEFAULT_PROFILE
|
||||
|
||||
nodes=$(kubectl get nodes -l node-role.kubernetes.io/master -o json | jq .items[].metadata.name -r)
|
||||
|
||||
for node in $nodes; do
|
||||
echo "Deploying upgrade job on $node..."
|
||||
|
||||
cat <<'EOF' > _job.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: kubezero-upgrade
|
||||
namespace: kube-system
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
hostNetwork: true
|
||||
hostIPC: true
|
||||
hostPID: true
|
||||
containers:
|
||||
- name: busybox
|
||||
image: busybox
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
cat <<'EOF' > /host/tmp/upgrade.sh
|
||||
#!/bin/bash -ex
|
||||
for l in $(cat /etc/environment); do
|
||||
export $l
|
||||
done
|
||||
my_ip=$(ec2metadata --local-ipv4)
|
||||
my_id=$(ec2metadata --instance-id)
|
||||
clusterName=$(yq r /etc/kubezero/kubezero.yaml clusterName)
|
||||
my_asg=$(aws ec2 describe-tags --filters "Name=resource-id,Values=${my_id}" --output json | jq '.Tags[] | select(.Key=="aws:cloudformation:logical-id") | .Value' -r)
|
||||
|
||||
[ $my_asg == "KubeControlAsgAZ1" ] && nodename="etcd0-$clusterName"
|
||||
[ $my_asg == "KubeControlAsgAZ2" ] && nodename="etcd1-$clusterName"
|
||||
[ $my_asg == "KubeControlAsgAZ3" ] && nodename="etcd2-$clusterName"
|
||||
zone_name=$(hostname -d)
|
||||
|
||||
zone_id=$(aws route53 list-hosted-zones --query 'HostedZones[?Name==`'"$zone_name"'.`].Id' --output text | cut --delimiter="/" --fields=3)
|
||||
cat <<EOF2 > route53.json
|
||||
{ "Changes": [ { "Action": "UPSERT", "ResourceRecordSet": { "Name": "${nodename}.${zone_name}", "Type": "A", "TTL": 30, "ResourceRecords": [ { "Value": "$my_ip" } ] } } ] }
|
||||
EOF2
|
||||
|
||||
echo "Updating DNS entry for $nodename to $my_ip"
|
||||
aws route53 change-resource-record-sets --hosted-zone-id $zone_id --change-batch file://route53.json
|
||||
|
||||
echo "Adding additional control shutdown commands"
|
||||
if [ ! -f /usr/local/sbin/drain_delete_node.sh ]; then
|
||||
cat <<EOF3 > /usr/local/sbin/drain_delete_node.sh
|
||||
#!/bin/bash -ex
|
||||
export LC_TYPE=en_US.UTF-8
|
||||
export KUBECONFIG=/root/.kube/config
|
||||
kubeadm reset phase update-cluster-status
|
||||
kubeadm reset phase remove-etcd-member
|
||||
EOF3
|
||||
chmod +x /usr/local/sbin/drain_delete_node.sh
|
||||
sed -e 's,/usr/local/sbin/backup_control_plane.sh&,/usr/local/sbin/drain_delete_node.sh,' -i /usr/local/sbin/cloudbender_shutdown.sh
|
||||
fi
|
||||
|
||||
echo "Patching ClusterConfig to re-create new etcd server certificates"
|
||||
yq w /etc/kubezero/kubeadm/templates/ClusterConfiguration.yaml etcd.local.serverCertSANs[+] $nodename > /etc/kubernetes/kubeadm-recert.yaml
|
||||
yq w -i /etc/kubernetes/kubeadm-recert.yaml etcd.local.serverCertSANs[+] $nodename.$zone_name
|
||||
rm -f /etc/kubernetes/pki/etcd/server.*
|
||||
kubeadm init phase certs etcd-server --config=/etc/kubernetes/kubeadm-recert.yaml 2>/dev/null
|
||||
kill -s HUP $(ps -e | grep etcd | awk '{print $1}')
|
||||
echo "Waiting for etcd to accept connections again...might take 30s or more"
|
||||
while true; do
|
||||
etcdctl member list -w simple 1>/dev/null 2>&1 && break || true
|
||||
sleep 3
|
||||
done
|
||||
EOF
|
||||
chmod +x /host/tmp/upgrade.sh
|
||||
chroot /host bash -c /tmp/upgrade.sh
|
||||
volumeMounts:
|
||||
- name: host
|
||||
mountPath: /host
|
||||
securityContext:
|
||||
privileged: true
|
||||
volumes:
|
||||
- name: host
|
||||
hostPath:
|
||||
path: /
|
||||
type: Directory
|
||||
nodeSelector:
|
||||
kubernetes.io/hostname: __node__
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/master
|
||||
effect: NoSchedule
|
||||
restartPolicy: Never
|
||||
EOF
|
||||
|
||||
# set controller node
|
||||
sed -i -e "s/__node__/$node/" _job.yaml
|
||||
|
||||
kubectl apply -f _job.yaml
|
||||
kubectl wait -n kube-system --timeout 300s --for=condition=complete job/kubezero-upgrade
|
||||
kubectl delete -f _job.yaml
|
||||
rm -f _job.yaml
|
||||
done
|
Loading…
Reference in New Issue
Block a user