fix: rc round of fixes for the 1.23 upgrade flow
This commit is contained in:
parent 048b113b1e
commit c06da7b814
@@ -166,7 +166,7 @@ control_plane_node() {
   else
     # restore latest backup
-    retry 10 60 30 restic restore latest --no-lock -t / --tag $KUBE_VERSION_MINOR
+    retry 10 60 30 restic restore latest --no-lock -t / #Review: Use latest no matter what for now: --tag $KUBE_VERSION_MINOR
 
     # Make last etcd snapshot available
     cp ${WORKDIR}/etcd_snapshot ${HOSTFS}/etc/kubernetes
@@ -382,8 +382,7 @@ for t in $@; do
   bootstrap) control_plane_node bootstrap;;
   join) control_plane_node join;;
   restore) control_plane_node restore;;
-  apply_network) apply_module network;;
-  apply_addons) apply_module addons;;
+  apply_*) apply_module ${t##apply_};;
   backup) backup;;
   debug_shell) debug_shell;;
   *) echo "Unknown command: '$t'";;
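Note: the new catch-all `apply_*)` branch derives the module name from the task via the bash parameter expansion `${t##apply_}`, which strips the leading `apply_` prefix. A minimal standalone sketch of that expansion (hypothetical task names):

  # ${var##pattern} removes the longest match of pattern from the front of $var
  for t in apply_network apply_addons apply_storage; do
    echo "task=$t -> module=${t##apply_}"
  done
  # task=apply_network -> module=network
  # task=apply_addons -> module=addons
  # task=apply_storage -> module=storage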
@@ -55,6 +55,21 @@ function enable_argo() {
 }
 
 
+function cntFailedPods() {
+  NS=$1
+
+  NR=$(kubectl get pods -n $NS --field-selector="status.phase!=Succeeded,status.phase!=Running" -o custom-columns="POD:metadata.name" -o json | jq '.items | length')
+  echo $NR
+}
+
+
+function waitSystemPodsRunning() {
+  while true; do
+    [ "$(cntFailedPods kube-system)" -eq 0 ] && break
+    sleep 3
+  done
+}
+
 function argo_app_synced() {
   APP=$1
 
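Note: the two new helpers gate the upgrade on a healthy kube-system: cntFailedPods counts pods that are neither Running nor Succeeded, and waitSystemPodsRunning polls until that count drops to zero. A rough standalone equivalent of the check (assumes kubectl and jq on the PATH):

  # count kube-system pods stuck outside Running/Succeeded
  kubectl get pods -n kube-system \
    --field-selector="status.phase!=Succeeded,status.phase!=Running" \
    -o json | jq '.items | length'
  # usage of the helper inside the script:
  #   [ "$(cntFailedPods kube-system)" -eq 0 ] && echo "kube-system healthy"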
@@ -93,7 +108,7 @@ function _crds() {
 
   # Only apply if there are actually any crds
   if [ -s $WORKDIR/crds.yaml ]; then
-    kubectl apply -f $WORKDIR/crds.yaml --server-side
+    kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
   fi
 }
@@ -114,7 +129,7 @@ for manifest in yaml.safe_load_all(sys.stdin):
     print("---")
     print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml
 
-  kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?
+  kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
 }
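Note: both apply paths now use server-side apply and take ownership of conflicting fields. A minimal sketch of the pattern outside the script (hypothetical manifest path):

  # Server-side apply lets the API server compute the merge and track field
  # ownership; --force-conflicts takes over fields still owned by another
  # field manager (e.g. an earlier client-side apply).
  kubectl apply --server-side --force-conflicts -f ./crds.yaml
  # inspect the resulting field managers on an object (hypothetical CRD name):
  kubectl get crd somecrd.example.com -o yaml --show-managed-fields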
@@ -31,6 +31,9 @@ spec:
       labels:
         name: kubezero-all-nodes-upgrade
     spec:
+      hostNetwork: true
+      hostIPC: true
+      hostPID: true
       tolerations:
       - key: node-role.kubernetes.io/master
         operator: Exists
@@ -43,7 +46,10 @@ spec:
         volumeMounts:
         - name: host
           mountPath: /host
+        - name: hostproc
+          mountPath: /hostproc
         securityContext:
           privileged: true
           capabilities:
             add: ["SYS_ADMIN"]
       containers:
@@ -55,6 +61,10 @@ spec:
         hostPath:
           path: /
           type: Directory
+      - name: hostproc
+        hostPath:
+          path: /proc
+          type: Directory
 EOF
 
 kubectl rollout status daemonset -n kube-system kubezero-all-nodes-upgrade --timeout 300s
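Note: the hostIPC/hostPID settings and the new hostproc volume above exist so the upgrade DaemonSet can enter the host's mount namespace through PID 1 instead of manipulating bind-mounted /host paths; the matching nsenter call appears in the next hunk. A rough sketch of the mechanism, assuming a privileged pod with hostPID and the host's /proc mounted at /hostproc:

  # /hostproc/1 is the host's init process (the pod shares the host PID
  # namespace); -m selects its mount namespace, so the mount commands act on
  # the real host mounts rather than the container's view under /host.
  nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys/fs/cgroup
  nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys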
@@ -122,14 +132,18 @@ EOF
 kubectl delete pod kubezero-upgrade -n kube-system
 }
 
+echo "Checking that all pods in kube-system are running ..."
+waitSystemPodsRunning
+
 argo_used && disable_argo
 
-all_nodes_upgrade "mount --make-shared /host/sys/fs/cgroup; mount --make-shared /host/sys;"
+all_nodes_upgrade "nsenter -m/hostproc/1/ns/mnt mount --make-shared /sys/fs/cgroup; mount --make-shared /sys; sleep 3;"
 
 control_plane_upgrade kubeadm_upgrade
 
-echo "Adjust kubezero-values as needed: (eg. set cilium cluster id etc):"
-kubectl edit cm kubezero-values -n kube-system
+echo "Adjust kubezero values as needed: (eg. set cilium cluster id and ensure no IP space overlap !!):"
+argo_used && kubectl edit app kubezero -n argocd || kubectl edit cm kubezero-values -n kube-system
 
 # Remove multus DS due to label changes, if this fails:
 # kubezero-network $ helm template . --set multus.enabled=true | kubectl apply -f -
@@ -141,16 +155,25 @@ kubectl delete daemonset metrics-prometheus-node-exporter -n monitoring || true
 # AWS EBS CSI driver change their fsGroupPolicy
 kubectl delete CSIDriver ebs.csi.aws.com || true
 
-control_plane_upgrade "apply_network, apply_addons"
+control_plane_upgrade "apply_network, apply_addons, apply_storage"
 
 kubectl rollout restart daemonset/calico-node -n kube-system
 kubectl rollout restart daemonset/cilium -n kube-system
 kubectl rollout restart daemonset/kube-multus-ds -n kube-system
 
-argo_used && enable_argo
+echo "Checking that all pods in kube-system are running ..."
+waitSystemPodsRunning
+
+echo "Applying remaining KubeZero modules..."
+control_plane_upgrade "apply_cert-manager, apply_istio, apply_istio-ingress, apply_istio-private-ingress, apply_logging, apply_metrics, apply_argocd"
+
 # Final step is to commit the new argocd kubezero app
-kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > /tmp/new-kubezero-argoapp.yaml
+kubectl get app kubezero -n argocd -o yaml | yq 'del(.status) | del(.metadata) | del(.operation) | .metadata.name="kubezero" | .metadata.namespace="argocd"' | yq 'sort_keys(..) | .spec.source.helm.values |= (from_yaml | to_yaml)' > $ARGO_APP
 
+echo "Please commit $ARGO_APP as the updated kubezero/application.yaml for your cluster."
+echo "Then head over to ArgoCD for this cluster and sync all KubeZero modules to apply remaining upgrades."
+
+echo "<Return> to continue and re-enable ArgoCD:"
+read
+
+argo_used && enable_argo
@@ -1,7 +1,6 @@
 #!/bin/bash
 
 # Migrate addons and network values from local kubeadm-values.yaml on controllers into CM
-# - remove secrets from addons
 # - enable cilium
 
 # Create empty CM if not exists yet
@@ -16,13 +15,6 @@ yq eval -i '.global.clusterName = strenv(CLUSTERNAME) |
   .global.highAvailable = env(HIGHAVAILABLE)' \
   ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml
 
-# extract addons
-yq e '.addons |
-  del .clusterBackup.repository |
-  del .clusterBackup.password |
-  .clusterBackup.image.tag = strenv(KUBE_VERSION) |
-  {"addons": .}' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml > $WORKDIR/addons-values.yaml
-
 # extract network
 yq e '.network |
   .cilium.enabled = true |
@@ -35,10 +27,11 @@ yq e '.network |
 # get current argo cd values
 kubectl get application kubezero -n argocd -o yaml | yq '.spec.source.helm.values' > ${WORKDIR}/argo-values.yaml
 
-# merge all into new CM
+# merge all into new CM and set new minimal addons
 yq ea '. as $item ireduce ({}; . * $item ) |
   .global.clusterName = strenv(CLUSTERNAME) |
-  .global.highAvailable = env(HIGHAVAILABLE)' $WORKDIR/addons-values.yaml ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml
+  .global.highAvailable = env(HIGHAVAILABLE) |
+  .addons.clusterBackup.image.tag = "v1.23" ' ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-pre-values.yaml
 
 # tumble new config through migrate.py
 cat $WORKDIR/kubezero-pre-values.yaml | migrate_argo_values.py > $WORKDIR/kubezero-values.yaml
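Note: `yq ea '. as $item ireduce ({}; . * $item)'` deep-merges all input documents left to right, with later files taking precedence; the addons file is dropped from the merge and the minimal addons keys are now set inline instead. A small standalone illustration (yq v4, hypothetical files a.yaml and b.yaml):

  # a.yaml: {global: {clusterName: one}, network: {cilium: {enabled: false}}}
  # b.yaml: {global: {highAvailable: true}, network: {cilium: {enabled: true}}}
  yq ea '. as $item ireduce ({}; . * $item)' a.yaml b.yaml
  # => global:
  #      clusterName: one
  #      highAvailable: true
  #    network:
  #      cilium:
  #        enabled: true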
@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-network
 description: KubeZero umbrella chart for all things network
 type: application
-version: 0.3.3
+version: 0.3.4
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:
@@ -37,27 +37,27 @@ cilium:
     binPath: "/usr/libexec/cni"
     #-- Ensure this is false if multus is enabled
     exclusive: false
-    # chainingMode: generic-veth
+    chainingMode: generic-veth
 
   # Until we switch to Cilium
-  #bpf:
-  #  hostLegacyRouting: true
+  bpf:
+    hostLegacyRouting: true
   # tproxy: false
 
   # enableIPv4Masquerade: false
   # enableIdentityMark: false
-  policyEnforcementMode: "audit"
+  policyEnforcementMode: "never"
 
   cluster:
     # This should match the second octet of clusterPoolIPv4PodCIDRList
     # to prevent IP space overlap and easy tracking
-    id: 1
+    # use 240 as default, less likely to clash with 1, do NOT use 244 used by calico until 1.25
+    id: 240
     name: default
 
   ipam:
     operator:
       clusterPoolIPv4PodCIDRList:
-        - 10.1.0.0/16
+        - 10.240.0.0/16
 
   # Should be handled by multus
   nodePort:
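Note: as the comment says, `cluster.id` should mirror the second octet of `clusterPoolIPv4PodCIDRList` so each cluster's pod IP space stays distinct and easy to trace. An illustrative per-cluster override (hypothetical values file, not part of this chart):

  cilium:
    cluster:
      id: 2                      # matches the second octet below
      name: cluster2
    ipam:
      operator:
        clusterPoolIPv4PodCIDRList:
          - 10.2.0.0/16          # avoid 10.240.0.0/16 (default) and 10.244.0.0/16 (calico)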
@@ -79,26 +79,29 @@ external-dns:
     value: "0"
 {{- end }}
 
-{{- with index .Values "addons" "cluster-autoscaler" }}
 cluster-autoscaler:
-  {{- toYaml . | nindent 2 }}
-  autoDiscovery:
-    clusterName: {{ $.Values.global.clusterName }}
+  enabled: {{ default "false" (index .Values "addons" "cluster-autoscaler" "enabled") }}
 
-  {{- with $.Values.metrics }}
+{{- with omit (index .Values "addons" "cluster-autoscaler") "enabled" }}
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+  autoDiscovery:
+    clusterName: {{ .Values.global.clusterName }}
+
+{{- with .Values.metrics }}
   serviceMonitor:
     enabled: {{ .enabled }}
   prometheusRule:
     enabled: {{ .enabled }}
 {{- end }}
 
-{{- if $.Values.global.aws }}
-  awsRegion: {{ $.Values.global.aws.region }}
+# AWS
+{{- if .Values.global.aws }}
+  awsRegion: {{ .Values.global.aws.region }}
 
   extraEnv:
-    AWS_ROLE_ARN: "arn:aws:iam::{{ $.Values.global.aws.accountId }}:role/{{ $.Values.global.aws.region }}.{{ $.Values.global.clusterName }}.clusterAutoScaler"
+    AWS_ROLE_ARN: "arn:aws:iam::{{ .Values.global.aws.accountId }}:role/{{ .Values.global.aws.region }}.{{ .Values.global.clusterName }}.clusterAutoScaler"
     AWS_WEB_IDENTITY_TOKEN_FILE: "/var/run/secrets/sts.amazonaws.com/serviceaccount/token"
     AWS_STS_REGIONAL_ENDPOINTS: "regional"
   extraVolumes:
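Note: the reworked block derives `enabled` explicitly and passes everything else through `omit`, so the `enabled` flag never leaks into the rendered cluster-autoscaler values. A rough illustration of the Sprig/Helm behaviour being relied on (hypothetical input values, not the full chart template):

  {{- /* given addons.cluster-autoscaler: {enabled: true, extraArgs: {scan-interval: 30s}} in values */}}
  cluster-autoscaler:
    enabled: {{ default "false" (index .Values "addons" "cluster-autoscaler" "enabled") }}
  {{- with omit (index .Values "addons" "cluster-autoscaler") "enabled" }}
    {{- toYaml . | nindent 2 }}
  {{- end }}
  # renders roughly as:
  #   cluster-autoscaler:
  #     enabled: true
  #     extraArgs:
  #       scan-interval: 30s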
@@ -115,8 +118,6 @@ cluster-autoscaler:
     readOnly: true
 {{- end }}
 
-{{- end }}
-
 {{- with .Values.addons.fuseDevicePlugin }}
 fuseDevicePlugin:
   {{- toYaml . | nindent 2 }}
@@ -19,11 +19,13 @@ addons:
     enabled: false
   aws-node-termination-handler:
     enabled: false
+  cluster-autoscaler:
+    enabled: false
 
 network:
   enabled: true
   retain: true
-  targetRevision: 0.3.3
+  targetRevision: 0.3.4
   cilium:
+    cluster: {}
 