fix: argo-repo server on AWS start cmd, upgrade flow tweaks, docs

This commit is contained in:
Stefan Reimer 2025-07-01 14:10:48 +01:00
parent 5a93d1be68
commit 0179c335a0
3 changed files with 19 additions and 9 deletions

View File

@ -27,9 +27,6 @@ control_plane_phase1() {
# upgrade modules
admin_job "apply_kubezero, apply_network"
echo "Checking that all pods in kube-system are running ..."
waitSystemPodsRunning
}
control_plane_phase2() {
@ -55,20 +52,32 @@ backup() {
done
}
# Health gate used by the upgrade flow: prints a status line, asks kubectl for
# the rollout status of the cilium DaemonSet in kube-system, then runs
# waitSystemPodsRunning. The function's exit status is that of the last call.
control_plane_healthy() {
  printf '%s\n' "Checking control-plane is healthy..."
  kubectl -n kube-system rollout status ds cilium
  waitSystemPodsRunning
}
#####
# Main upgrade flow: runs the steps below in order and pauses twice for operator confirmation.
# NOTE(review): control_plane_healthy (called right after) also runs waitSystemPodsRunning,
# so this explicit check looks redundant — confirm whether it is still wanted.
echo "Checking that all pods in kube-system are running ..."
waitSystemPodsRunning
control_plane_healthy
# disable_argo is only called when ARGOCD is exactly the string "true".
[ "$ARGOCD" == "true" ] && disable_argo
control_plane_phase1
# Re-run the health gate after phase 1, before prompting the operator.
control_plane_healthy
echo "Replace controller nodes first to not stall out V1.31 controllers ... <return> to continue"
# Block until the operator presses <return>.
read -r
cluster_modules
backup
# NOTE(review): the next two prompts differ only by a comma — presumably only one is intended; confirm.
echo "Once ALL nodes, incl. workers, ALL, are running on $KUBE_VERSION, <return> to continue"
echo "Once ALL nodes, incl. workers, ALL, are running on $KUBE_VERSION <return> to continue"
# Block until the operator presses <return>.
read -r
control_plane_phase2

View File

@ -45,7 +45,7 @@ argo-cd:
- name: cmp-kubezero-git-sync
image: '{{ "{{" }} default .Values.global.image.repository .Values.repoServer.image.repository {{ "}}" }}:{{ "{{" }} default (include "argo-cd.defaultTag" .) .Values.repoServer.image.tag {{ "}}" }}'
imagePullPolicy: '{{ "{{" }} default .Values.global.image.imagePullPolicy .Values.repoServer.image.imagePullPolicy {{ "}}" }}'
command: ["/var/run/argocd/argocd-cmp-server"]
command: ["/usr/local/bin/argocd-cmp-server"]
env:
{{- include "aws-iam-env" (merge (dict "roleName" "argocd-repo-server") .) | nindent 10 }}
volumeMounts:

View File

@ -6,12 +6,13 @@
- ArgoCD upgraded to V3 incl. various performance optimizations, e.g. ServerSideApply
## Features and fixes
- enabled PSI metrics to get better insight on root cause of performance issues
- considerable performance improvements under load for the control-plane nodes
- enabled PSI cgroups metrics to get better insight into the root cause of performance issues
## Version upgrades
- cilium 1.17.4
- istio 1.26.1
- ArgoCD 3.0.5
- ArgoCD 3.0.9
- Prometheus 3.4.1 / Grafana 12.0.1
- Nvidia container toolkit 1.17.8, drivers 570.148.08, Cuda 12.8
- AWS Neuron 2.21.37