fix: rc round of fixes for the upgrade flow of 1.23

This commit is contained in:
Stefan Reimer 2022-09-23 12:54:27 +02:00
parent 048b113b1e
commit c06da7b814
8 changed files with 74 additions and 41 deletions

View File

@@ -166,7 +166,7 @@ control_plane_node() {
   else
     # restore latest backup
-    retry 10 60 30 restic restore latest --no-lock -t / --tag $KUBE_VERSION_MINOR
+    retry 10 60 30 restic restore latest --no-lock -t / #Review: Use latest no matter what for now: --tag $KUBE_VERSION_MINOR

     # Make last etcd snapshot available
     cp ${WORKDIR}/etcd_snapshot ${HOSTFS}/etc/kubernetes
@@ -382,8 +382,7 @@ for t in $@; do
   bootstrap) control_plane_node bootstrap;;
   join) control_plane_node join;;
   restore) control_plane_node restore;;
-  apply_network) apply_module network;;
-  apply_addons) apply_module addons;;
+  apply_*) apply_module ${t##apply_};;
   backup) backup;;
   debug_shell) debug_shell;;
   *) echo "Unknown command: '$t'";;

View File

@@ -55,6 +55,21 @@ function enable_argo() {
 }

+function cntFailedPods() {
+  NS=$1
+
+  NR=$(kubectl get pods -n $NS --field-selector="status.phase!=Succeeded,status.phase!=Running" -o custom-columns="POD:metadata.name" -o json | jq '.items | length')
+  echo $NR
+}
+
+function waitSystemPodsRunning() {
+  while true; do
+    [ "$(cntFailedPods kube-system)" -eq 0 ] && break
+    sleep 3
+  done
+}

 function argo_app_synced() {
   APP=$1
@@ -93,7 +108,7 @@ function _crds() {
   # Only apply if there are actually any crds
   if [ -s $WORKDIR/crds.yaml ]; then
-    kubectl apply -f $WORKDIR/crds.yaml --server-side
+    kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
   fi
 }
@@ -114,7 +129,7 @@ for manifest in yaml.safe_load_all(sys.stdin):
   print("---")
   print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml

-  kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?
+  kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
 }
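Note: both the CRD and manifest apply paths now use server-side apply and force field ownership; a minimal standalone sketch of the flags (manifest path is illustrative):

# Server-side apply lets the API server perform the merge and track field
# managers; --force-conflicts takes over fields still owned by a previous
# manager (e.g. an earlier client-side apply) instead of failing.
kubectl apply -f /tmp/example.yaml --server-side --force-conflicts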

View File

@@ -31,6 +31,9 @@ spec:
       labels:
         name: kubezero-all-nodes-upgrade
     spec:
+      hostNetwork: true
+      hostIPC: true
+      hostPID: true
       tolerations:
       - key: node-role.kubernetes.io/master
         operator: Exists
@@ -43,7 +46,10 @@ spec:
         volumeMounts:
         - name: host
           mountPath: /host
+        - name: hostproc
+          mountPath: /hostproc
         securityContext:
+          privileged: true
           capabilities:
             add: ["SYS_ADMIN"]
       containers:
@@ -55,6 +61,10 @@ spec:
         hostPath:
           path: /
           type: Directory
+      - name: hostproc
+        hostPath:
+          path: /proc
+          type: Directory
 EOF

   kubectl rollout status daemonset -n kube-system kubezero-all-nodes-upgrade --timeout 300s
@@ -122,14 +132,18 @@ EOF
   kubectl delete pod kubezero-upgrade -n kube-system
 }

+echo "Checking that all pods in kube-system are running ..."
+waitSystemPodsRunning
+
 argo_used && disable_argo

-all_nodes_upgrade "mount --make-shared /host/sys/fs/cgroup; mount --make-shared /host/sys;"
+all_nodes_upgrade "nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys/fs/cgroup; mount --make-shared /sys; sleep 3;"

 control_plane_upgrade kubeadm_upgrade

-echo "Adjust kubezero-values as needed: (eg. set cilium cluster id etc):"
-kubectl edit cm kubezero-values -n kube-system
+echo "Adjust kubezero values as needed: (eg. set cilium cluster id and ensure no IP space overlap !!):"
+argo_used && kubectl edit app kubezero -n argocd || kubectl edit cm kubezero-values -n kube-system

 # Remove multus DS due to label changes, if this fails:
 # kubezero-network $ helm template . --set multus.enabled=true | kubectl apply -f -
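Note: with hostPID, the privileged securityContext and the /proc hostPath added above, the all-nodes pod can run the mount commands in the host's own mount namespace via PID 1; a simplified sketch of that pattern (not the exact command string used by the script):

# PID 1 as seen through /hostproc is the host's init process because the pod
# shares the host PID namespace; -m selects its mount namespace, so the
# propagation change applies to the host itself, not just the container.
nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys/fs/cgroup
nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys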
@@ -141,16 +155,25 @@ kubectl delete daemonset metrics-prometheus-node-exporter -n monitoring || true
 # AWS EBS CSI driver change their fsGroupPolicy
 kubectl delete CSIDriver ebs.csi.aws.com || true

-control_plane_upgrade "apply_network, apply_addons"
+control_plane_upgrade "apply_network, apply_addons, apply_storage"

 kubectl rollout restart daemonset/calico-node -n kube-system
 kubectl rollout restart daemonset/cilium -n kube-system
 kubectl rollout restart daemonset/kube-multus-ds -n kube-system

-argo_used && enable_argo
+echo "Checking that all pods in kube-system are running ..."
+waitSystemPodsRunning
+
+echo "Applying remaining KubeZero modules..."
+
+control_plane_upgrade "apply_cert-manager, apply_istio, apply_istio-ingress, apply_istio-private-ingress, apply_logging, apply_metrics, apply_argocd"

 # Final step is to commit the new argocd kubezero app
-kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > /tmp/new-kubezero-argoapp.yaml
+kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | del(.operation) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > $ARGO_APP

 echo "Please commit $ARGO_APP as the updated kubezero/application.yaml for your cluster."
 echo "Then head over to ArgoCD for this cluster and sync all KubeZero modules to apply remaining upgrades."
+echo "<Return> to continue and re-enable ArgoCD:"
+read
+
+argo_used && enable_argo

View File

@@ -1,7 +1,6 @@
 #!/bin/bash

 # Migrate addons and network values from local kubeadm-values.yaml on controllers into CM
-# - remove secrets from addons
 # - enable cilium

 # Create emtpy CM if not exists yet
@@ -16,13 +15,6 @@ yq eval -i '.global.clusterName = strenv(CLUSTERNAME) |
   .global.highAvailable = env(HIGHAVAILABLE)' \
   ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml

-# extract addons
-yq e '.addons |
-  del .clusterBackup.repository |
-  del .clusterBackup.password |
-  .clusterBackup.image.tag = strenv(KUBE_VERSION) |
-  {"addons": .}' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml > $WORKDIR/addons-values.yaml

 # extract network
 yq e '.network |
   .cilium.enabled = true |
@@ -35,10 +27,11 @@ yq e '.network |
 # get current argo cd values
 kubectl get application kubezero -n argocd -o yaml | yq '.spec.source.helm.values' > ${WORKDIR}/argo-values.yaml

-# merge all into new CM
+# merge all into new CM and set new minimal addons
 yq ea '. as $item ireduce ({}; . * $item ) |
   .global.clusterName = strenv(CLUSTERNAME) |
-  .global.highAvailable = env(HIGHAVAILABLE)' $WORKDIR/addons-values.yaml ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml
+  .global.highAvailable = env(HIGHAVAILABLE) |
+  .addons.clusterBackup.image.tag = "v1.23" ' ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml

 # tumble new config through migrate.py
 cat $WORKDIR/kubezero-pre-values.yaml | migrate_argo_values.py > $WORKDIR/kubezero-values.yaml
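Note: the merge step uses yq's multi-document `ireduce` to deep-merge the value files, with later files winning on conflicting keys; a minimal sketch of the same idiom with throwaway files:

# Deep-merge two YAML files into one document; keys from later files override
# earlier ones, everything else is combined.
cat > /tmp/a.yaml <<'EOT'
global:
  clusterName: placeholder
EOT
cat > /tmp/b.yaml <<'EOT'
global:
  highAvailable: true
EOT
yq ea '. as $item ireduce ({}; . * $item )' /tmp/a.yaml /tmp/b.yaml
# -> global: { clusterName: placeholder, highAvailable: true }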

View File

@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-network
 description: KubeZero umbrella chart for all things network
 type: application
-version: 0.3.3
+version: 0.3.4
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:

View File

@@ -37,27 +37,27 @@ cilium:
     binPath: "/usr/libexec/cni"
     #-- Ensure this is false if multus is enabled
     exclusive: false
-    # chainingMode: generic-veth
+    chainingMode: generic-veth

-  # Until we switch to Cilium
-  #bpf:
-  #  hostLegacyRouting: true
+  bpf:
+    hostLegacyRouting: true
   # tproxy: false
   # enableIPv4Masquerade: false
   # enableIdentityMark: false

-  policyEnforcementMode: "audit"
+  policyEnforcementMode: "never"

   cluster:
     # This should match the second octet of clusterPoolIPv4PodCIDRList
     # to prevent IP space overlap and easy tracking
-    id: 1
+    # use 240 as default, less likely to clash with 1, do NOT use 244 used by calico until 1.25
+    id: 240
     name: default

   ipam:
     operator:
       clusterPoolIPv4PodCIDRList:
-        - 10.1.0.0/16
+        - 10.240.0.0/16

   # Should be handled by multus
   nodePort:
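Note: as the comment says, the Cilium cluster id is meant to mirror the second octet of the pod CIDR; a small helper sketch showing the pairing (values are per-cluster choices, not chart defaults):

# Keep cluster.id and the pod CIDR's second octet in lockstep,
# e.g. id 240 -> 10.240.0.0/16, id 241 -> 10.241.0.0/16.
CLUSTER_ID=240
POD_CIDR="10.${CLUSTER_ID}.0.0/16"
echo "cilium cluster.id=${CLUSTER_ID}  clusterPoolIPv4PodCIDRList=[${POD_CIDR}]"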

View File

@@ -79,26 +79,29 @@ external-dns:
       value: "0"
   {{- end }}

-{{- with index .Values "addons" "cluster-autoscaler" }}
 cluster-autoscaler:
-  {{- toYaml . | nindent 2 }}
-
-  autoDiscovery:
-    clusterName: {{ $.Values.global.clusterName }}
-
-  {{- with $.Values.metrics }}
+  enabled: {{ default "false" (index .Values "addons" "cluster-autoscaler" "enabled") }}
+
+  {{- with omit (index .Values "addons" "cluster-autoscaler") "enabled" }}
+  {{- toYaml . | nindent 2 }}
+  {{- end }}
+
+  autoDiscovery:
+    clusterName: {{ .Values.global.clusterName }}
+
+  {{- with .Values.metrics }}
   serviceMonitor:
     enabled: {{ .enabled }}
   prometheusRule:
     enabled: {{ .enabled }}
   {{- end }}

+  {{- if .Values.global.aws }}
   # AWS
-  {{- if $.Values.global.aws }}
-  awsRegion: {{ $.Values.global.aws.region }}
+  awsRegion: {{ .Values.global.aws.region }}

   extraEnv:
-    AWS_ROLE_ARN: "arn:aws:iam::{{ $.Values.global.aws.accountId }}:role/{{ $.Values.global.aws.region }}.{{ $.Values.global.clusterName }}.clusterAutoScaler"
+    AWS_ROLE_ARN: "arn:aws:iam::{{ .Values.global.aws.accountId }}:role/{{ .Values.global.aws.region }}.{{ .Values.global.clusterName }}.clusterAutoScaler"
     AWS_WEB_IDENTITY_TOKEN_FILE: "/var/run/secrets/sts.amazonaws.com/serviceaccount/token"
     AWS_STS_REGIONAL_ENDPOINTS: "regional"
   extraVolumes:
@@ -115,8 +118,6 @@ cluster-autoscaler:
       readOnly: true
   {{- end }}

-{{- end }}
-
 {{- with .Values.addons.fuseDevicePlugin }}
 fuseDevicePlugin:
   {{- toYaml . | nindent 2 }}
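Note: the template now always renders a cluster-autoscaler block, with `default` gating `enabled` and `omit ... "enabled"` passing any remaining per-cluster overrides through `toYaml`. One way to eyeball the rendered result locally (chart path and values are illustrative):

# Render the chart from its directory and inspect the generated
# cluster-autoscaler section; without overrides it should come out disabled.
helm template . --set global.clusterName=example | grep -A3 'cluster-autoscaler:'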

View File

@@ -19,11 +19,13 @@ addons:
     enabled: false
   aws-node-termination-handler:
     enabled: false
+  cluster-autoscaler:
+    enabled: false

 network:
   enabled: true
   retain: true
-  targetRevision: 0.3.3
+  targetRevision: 0.3.4
   cilium:
     cluster: {}