fix: rc round of fixes for the upgrade flow of 1.23

Stefan Reimer 2022-09-23 12:54:27 +02:00
parent 048b113b1e
commit c06da7b814
8 changed files with 74 additions and 41 deletions

View File

@@ -166,7 +166,7 @@ control_plane_node() {
else
# restore latest backup
retry 10 60 30 restic restore latest --no-lock -t / --tag $KUBE_VERSION_MINOR
retry 10 60 30 restic restore latest --no-lock -t / #Review: Use latest no matter what for now: --tag $KUBE_VERSION_MINOR
# Make last etcd snapshot available
cp ${WORKDIR}/etcd_snapshot ${HOSTFS}/etc/kubernetes
@@ -382,8 +382,7 @@ for t in $@; do
bootstrap) control_plane_node bootstrap;;
join) control_plane_node join;;
restore) control_plane_node restore;;
apply_network) apply_module network;;
apply_addons) apply_module addons;;
apply_*) apply_module ${t##apply_};;
backup) backup;;
debug_shell) debug_shell;;
*) echo "Unknown command: '$t'";;
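The new generic apply_* case relies on bash prefix stripping to turn any apply_<module> command into a module name; a minimal standalone sketch of that expansion (values are illustrative):

t=apply_storage
echo "${t##apply_}"        # prints: storage
t=apply_istio-ingress
echo "${t##apply_}"        # prints: istio-ingress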

View File

@@ -55,6 +55,21 @@ function enable_argo() {
}
function cntFailedPods() {
NS=$1
NR=$(kubectl get pods -n $NS --field-selector="status.phase!=Succeeded,status.phase!=Running" -o custom-columns="POD:metadata.name" -o json | jq '.items | length')
echo $NR
}
function waitSystemPodsRunning() {
while true; do
[ "$(cntFailedPods kube-system)" -eq 0 ] && break
sleep 3
done
}
function argo_app_synced() {
APP=$1
@@ -93,7 +108,7 @@ function _crds() {
# Only apply if there are actually any crds
if [ -s $WORKDIR/crds.yaml ]; then
kubectl apply -f $WORKDIR/crds.yaml --server-side
kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
fi
}
@@ -114,7 +129,7 @@ for manifest in yaml.safe_load_all(sys.stdin):
print("---")
print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml
kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?
kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
}
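For context on the added flags: server-side apply assigns field ownership to a field manager, and --force-conflicts takes fields over even when another manager (for example a previous client-side apply or helm) still owns them, instead of failing with a conflict. A minimal sketch of inspecting the resulting ownership (the deployment name is hypothetical):

# same call as in _crds() above
kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
# list which field managers own an object afterwards (object name is hypothetical)
kubectl get deployment some-addon -n kube-system -o yaml --show-managed-fields | yq '.metadata.managedFields[].manager'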

View File

@@ -31,6 +31,9 @@ spec:
labels:
name: kubezero-all-nodes-upgrade
spec:
hostNetwork: true
hostIPC: true
hostPID: true
tolerations:
- key: node-role.kubernetes.io/master
operator: Exists
@@ -43,7 +46,10 @@ spec:
volumeMounts:
- name: host
mountPath: /host
- name: hostproc
mountPath: /hostproc
securityContext:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
containers:
@@ -55,6 +61,10 @@ spec:
hostPath:
path: /
type: Directory
- name: hostproc
hostPath:
path: /proc
type: Directory
EOF
kubectl rollout status daemonset -n kube-system kubezero-all-nodes-upgrade --timeout 300s
@@ -122,14 +132,18 @@ EOF
kubectl delete pod kubezero-upgrade -n kube-system
}
echo "Checking that all pods in kube-system are running ..."
waitSystemPodsRunning
argo_used && disable_argo
all_nodes_upgrade "mount --make-shared /host/sys/fs/cgroup; mount --make-shared /host/sys;"
all_nodes_upgrade "nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys/fs/cgroup; mount --make-shared /sys; sleep 3;"
control_plane_upgrade kubeadm_upgrade
echo "Adjust kubezero-values as needed: (eg. set cilium cluster id etc):"
kubectl edit cm kubezero-values -n kube-system
echo "Adjust kubezero values as needed: (eg. set cilium cluster id and ensure no IP space overlap !!):"
argo_used && kubectl edit app kubezero -n argocd || kubectl edit cm kubezero-values -n kube-system
# Remove multus DS due to label changes, if this fails:
# kubezero-network $ helm template . --set multus.enabled=true | kubectl apply -f -
@@ -141,16 +155,25 @@ kubectl delete daemonset metrics-prometheus-node-exporter -n monitoring || true
# AWS EBS CSI driver change their fsGroupPolicy
kubectl delete CSIDriver ebs.csi.aws.com || true
control_plane_upgrade "apply_network, apply_addons"
control_plane_upgrade "apply_network, apply_addons, apply_storage"
kubectl rollout restart daemonset/calico-node -n kube-system
kubectl rollout restart daemonset/cilium -n kube-system
kubectl rollout restart daemonset/kube-multus-ds -n kube-system
argo_used && enable_argo
echo "Checking that all pods in kube-system are running ..."
waitSystemPodsRunning
echo "Applying remaining KubeZero modules..."
control_plane_upgrade "apply_cert-manager, apply_istio, apply_istio-ingress, apply_istio-private-ingress, apply_logging, apply_metrics, apply_argocd"
# Final step is to commit the new argocd kubezero app
kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > /tmp/new-kubezero-argoapp.yaml
kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | del(.operation) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > $ARGO_APP
echo "Please commit $ARGO_APP as the updated kubezero/application.yaml for your cluster."
echo "Then head over to ArgoCD for this cluster and sync all KubeZero modules to apply remaining upgrades."
echo "<Return> to continue and re-enable ArgoCD:"
read
argo_used && enable_argo
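A quick sanity check of the exported application before committing it can catch yq mishaps early; a minimal sketch, assuming $ARGO_APP still points at the file written above:

# confirm the embedded helm values are still parseable YAML and list their top-level keys
yq '.spec.source.helm.values' $ARGO_APP | yq 'keys'
# optionally preview how the exported app differs from the live object (non-zero exit if a diff exists)
kubectl diff -f $ARGO_APP || true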

View File

@@ -1,7 +1,6 @@
#!/bin/bash
# Migrate addons and network values from local kubeadm-values.yaml on controllers into CM
# - remove secrets from addons
# - enable cilium
# Create empty CM if not exists yet
@@ -16,13 +15,6 @@ yq eval -i '.global.clusterName = strenv(CLUSTERNAME) |
.global.highAvailable = env(HIGHAVAILABLE)' \
${HOSTFS}/etc/kubernetes/kubeadm-values.yaml
# extract addons
yq e '.addons |
del .clusterBackup.repository |
del .clusterBackup.password |
.clusterBackup.image.tag = strenv(KUBE_VERSION) |
{"addons": .}' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml > $WORKDIR/addons-values.yaml
# extract network
yq e '.network |
.cilium.enabled = true |
@@ -35,10 +27,11 @@ yq e '.network |
# get current argo cd values
kubectl get application kubezero -n argocd -o yaml | yq '.spec.source.helm.values' > ${WORKDIR}/argo-values.yaml
# merge all into new CM
# merge all into new CM and set new minimal addons
yq ea '. as $item ireduce ({}; . * $item ) |
.global.clusterName = strenv(CLUSTERNAME) |
.global.highAvailable = env(HIGHAVAILABLE)' $WORKDIR/addons-values.yaml ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml
.global.highAvailable = env(HIGHAVAILABLE) |
.addons.clusterBackup.image.tag = "v1.23" ' ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml
# tumble new config through migrate.py
cat $WORKDIR/kubezero-pre-values.yaml | migrate_argo_values.py > $WORKDIR/kubezero-values.yaml
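The ireduce expression above deep-merges its input documents left to right, with later files winning on conflicting scalars; a tiny standalone sketch (a.yaml and b.yaml are illustrative):

echo 'global: {clusterName: old}
network: {cilium: {enabled: false}}' > a.yaml
echo 'network: {cilium: {enabled: true}}' > b.yaml
# keeps global.clusterName from a.yaml, network.cilium.enabled becomes true from b.yaml
yq ea '. as $item ireduce ({}; . * $item)' a.yaml b.yaml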

View File

@@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero-network
description: KubeZero umbrella chart for all things network
type: application
version: 0.3.3
version: 0.3.4
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:

View File

@@ -37,27 +37,27 @@ cilium:
binPath: "/usr/libexec/cni"
#-- Ensure this is false if multus is enabled
exclusive: false
# chainingMode: generic-veth
chainingMode: generic-veth
# Until we switch to Cilium
#bpf:
# hostLegacyRouting: true
bpf:
hostLegacyRouting: true
# tproxy: false
# enableIPv4Masquerade: false
# enableIdentityMark: false
policyEnforcementMode: "audit"
policyEnforcementMode: "never"
cluster:
# This should match the second octet of clusterPoolIPv4PodCIDRList
# to prevent IP space overlap and easy tracking
id: 1
# use 240 as default, less likely to clash with 1, do NOT use 244 used by calico until 1.25
id: 240
name: default
ipam:
operator:
clusterPoolIPv4PodCIDRList:
- 10.1.0.0/16
- 10.240.0.0/16
# Should be handled by multus
nodePort:
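Since the comment ties the cluster id to the second octet of the pod CIDR, a small check against the rendered values can guard against drift; a sketch only, with values.yaml standing in for this chart's values file:

id=$(yq '.cilium.cluster.id' values.yaml)
octet=$(yq '.cilium.ipam.operator.clusterPoolIPv4PodCIDRList[0]' values.yaml | cut -d. -f2)
# e.g. id=240 must match the 240 in 10.240.0.0/16
[ "$id" = "$octet" ] || echo "cluster.id ($id) does not match pod CIDR second octet ($octet)"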

View File

@@ -79,26 +79,29 @@ external-dns:
value: "0"
{{- end }}
{{- with index .Values "addons" "cluster-autoscaler" }}
cluster-autoscaler:
{{- toYaml . | nindent 2 }}
autoDiscovery:
clusterName: {{ $.Values.global.clusterName }}
enabled: {{ default "false" (index .Values "addons" "cluster-autoscaler" "enabled") }}
{{- with $.Values.metrics }}
{{- with omit (index .Values "addons" "cluster-autoscaler") "enabled" }}
{{- toYaml . | nindent 2 }}
{{- end }}
autoDiscovery:
clusterName: {{ .Values.global.clusterName }}
{{- with .Values.metrics }}
serviceMonitor:
enabled: {{ .enabled }}
prometheusRule:
enabled: {{ .enabled }}
{{- end }}
{{- if .Values.global.aws }}
# AWS
{{- if $.Values.global.aws }}
awsRegion: {{ $.Values.global.aws.region }}
awsRegion: {{ .Values.global.aws.region }}
extraEnv:
AWS_ROLE_ARN: "arn:aws:iam::{{ $.Values.global.aws.accountId }}:role/{{ $.Values.global.aws.region }}.{{ $.Values.global.clusterName }}.clusterAutoScaler"
AWS_ROLE_ARN: "arn:aws:iam::{{ .Values.global.aws.accountId }}:role/{{ .Values.global.aws.region }}.{{ .Values.global.clusterName }}.clusterAutoScaler"
AWS_WEB_IDENTITY_TOKEN_FILE: "/var/run/secrets/sts.amazonaws.com/serviceaccount/token"
AWS_STS_REGIONAL_ENDPOINTS: "regional"
extraVolumes:
@@ -115,8 +118,6 @@ cluster-autoscaler:
readOnly: true
{{- end }}
{{- end }}
{{- with .Values.addons.fuseDevicePlugin }}
fuseDevicePlugin:
{{- toYaml . | nindent 2 }}

View File

@@ -19,11 +19,13 @@ addons:
enabled: false
aws-node-termination-handler:
enabled: false
cluster-autoscaler:
enabled: false
network:
enabled: true
retain: true
targetRevision: 0.3.3
targetRevision: 0.3.4
cilium:
cluster: {}