Improve etcd upgrade flow

This commit is contained in:
Stefan Reimer 2023-05-03 17:33:04 +00:00
parent 1633187ac5
commit 6145802c38
2 changed files with 12 additions and 22 deletions

View File

@ -214,30 +214,16 @@ control_plane_node() {
sleep 3
done
# if we are NOT member already, flush etcd to be able to join
# see if we are a former member
MY_ID=$(etcdctl member list --endpoints=$etcd_endpoints | grep $ETCD_NODENAME | awk '{print $1}' | sed -e 's/,$//')
[ -n "$MY_ID" ] && retry 12 5 5 etcdctl member remove $MY_ID --endpoints=$etcd_endpoints
# Failsafe / etcd on ephmeral: we were a member but our dataset is missing
# -> remove former self so we can re-join
if [ -n "$MY_ID" -a ! -d ${HOSTFS}/var/lib/etcd/member ]; then
# Remove former self first
[ -n "$MY_ID" ] && retry 12 5 5 etcdctl member remove $MY_ID --endpoints=$etcd_endpoints
MY_ID=""
fi
# flush etcd data directory as joining with previous store seems flaky, especially during etcd version upgrades
rm -rf ${HOSTFS}/var/lib/etcd/member
if [ -z "$MY_ID" ]; then
# flush etcd data directory from restore
rm -rf ${HOSTFS}/var/lib/etcd/member
# Announce new etcd member and capture ETCD_INITIAL_CLUSTER, retry needed in case another node joining causes temp quorum loss
ETCD_ENVS=$(retry 12 5 5 etcdctl member add $ETCD_NODENAME --peer-urls="https://${ETCD_NODENAME}:2380" --endpoints=$etcd_endpoints)
export $(echo "$ETCD_ENVS" | grep ETCD_INITIAL_CLUSTER= | sed -e 's/"//g')
else
# build initial_cluster string from running cluster
_cluster=$(etcdctl member list --endpoints=$etcd_endpoints -w json | jq -r '.members[] | "\(.name)=\(.peerURLs[]),"')
export ETCD_INITIAL_CLUSTER=$(echo ${_cluster%%,} | sed -e 's/ //g')
fi
# Announce new etcd member and capture ETCD_INITIAL_CLUSTER, retry needed in case another node joining causes temp quorum loss
ETCD_ENVS=$(retry 12 5 5 etcdctl member add $ETCD_NODENAME --peer-urls="https://${ETCD_NODENAME}:2380" --endpoints=$etcd_endpoints)
export $(echo "$ETCD_ENVS" | grep ETCD_INITIAL_CLUSTER= | sed -e 's/"//g')
# Patch kubeadm-values.yaml and re-render to get etcd manifest patched
yq eval -i '.etcd.state = "existing"

View File

@ -170,7 +170,11 @@ waitSystemPodsRunning
echo "Applying remaining KubeZero modules..."
control_plane_upgrade "apply_cert-manager, apply_istio, apply_istio-ingress, apply_istio-private-ingress, apply_logging, apply_metrics, apply_argocd" backup
control_plane_upgrade "apply_cert-manager, apply_istio, apply_istio-ingress, apply_istio-private-ingress, apply_logging, apply_metrics, apply_argocd"
# Trigger backup of upgraded cluster state
kubectl create job --from=cronjob/kubezero-backup kubezero-backup-$VERSION -n kube-system
kubectl wait --for=condition=complete job/kubezero-backup-$VERSION && kubectl delete job kubezero-backup-$VERSION -n kube-system
# Final step is to commit the new argocd kubezero app
kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | del(.operation) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > $ARGO_APP