From 573dd3ec3ef04bc991833126bca3e6f1237c500b Mon Sep 17 00:00:00 2001
From: Stefan Reimer
Date: Wed, 13 Nov 2024 14:35:50 +0000
Subject: [PATCH] fix: minor upgrade flow fixes

---
 admin/kubezero.sh        |  2 +-
 admin/upgrade_cluster.sh | 14 ++++++++++----
 docs/nvidia-test.yaml    |  7 ++++++-
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/admin/kubezero.sh b/admin/kubezero.sh
index 1b0a68e3..cd41be02 100755
--- a/admin/kubezero.sh
+++ b/admin/kubezero.sh
@@ -64,7 +64,7 @@ render_kubeadm() {
     cat ${WORKDIR}/kubeadm/templates/${f}Configuration.yaml >> ${HOSTFS}/etc/kubernetes/kubeadm.yaml
   done
 
-  if [[ "$phase" =~ ^(bootstrap|restore)$ ]]; then
+  if [[ "$phase" =~ ^(bootstrap|join|restore)$ ]]; then
     cat ${WORKDIR}/kubeadm/templates/InitConfiguration.yaml >> ${HOSTFS}/etc/kubernetes/kubeadm.yaml
   fi
 
diff --git a/admin/upgrade_cluster.sh b/admin/upgrade_cluster.sh
index 0904bd57..b0b0f1f1 100755
--- a/admin/upgrade_cluster.sh
+++ b/admin/upgrade_cluster.sh
@@ -19,6 +19,11 @@ echo "Checking that all pods in kube-system are running ..."
 
 [ "$ARGOCD" == "True" ] && disable_argo
 
+# 1.30 fix for the missing kubeadm socket annotations
+for c in $(kubectl get nodes -l "node-role.kubernetes.io/control-plane=" | grep v1.29 | awk {'print $1}'); do
+  kubectl annotate node $c 'kubeadm.alpha.kubernetes.io/cri-socket=unix:///var/run/crio/crio.sock'
+done
+
 control_plane_upgrade kubeadm_upgrade
 
 echo "Control plane upgraded, to continue"
@@ -33,13 +38,14 @@ kubectl delete runtimeclass crio || true
 
 # upgrade modules
 #
-# Preload cilium images to running nodes
-all_nodes_upgrade "chroot /host crictl pull quay.io/cilium/cilium:v1.16.3"
+# Preload cilium images to running nodes, disabled till 1.31
+# all_nodes_upgrade "chroot /host crictl pull quay.io/cilium/cilium:v1.16.3; chroot /host crictl pull ghcr.io/k8snetworkplumbingwg/multus-cni:v3.9.3"
 
 control_plane_upgrade "apply_network, apply_addons, apply_storage, apply_operators"
 
-echo "Checking that all pods in kube-system are running ..."
-waitSystemPodsRunning
+# Disabled during 1.30 due to nvidia runtime deadlock
+#echo "Checking that all pods in kube-system are running ..."
+#waitSystemPodsRunning
 
 echo "Applying remaining KubeZero modules..."
 
diff --git a/docs/nvidia-test.yaml b/docs/nvidia-test.yaml
index 87000152..0c7e3f11 100644
--- a/docs/nvidia-test.yaml
+++ b/docs/nvidia-test.yaml
@@ -6,11 +6,12 @@ metadata:
 spec:
   runtimeClassName: nvidia
   containers:
+  #- image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0
   - image: nvidia/cuda:12.5.1-base-ubuntu22.04
     command:
       - "bash"
       - "-c"
-      - "sleep 3600"
+      - "nvidia-smi; sleep 3600"
     imagePullPolicy: IfNotPresent
     name: nvidia-test
     resources:
@@ -18,3 +19,7 @@ spec:
       nvidia.com/gpu: 1
     requests:
       memory: 1024Mi
+  tolerations:
+  - effect: NoSchedule
+    key: kubezero-workergroup
+    operator: Exists