From 127db98358c5033f3c5a5f8ed73771f929e2f125 Mon Sep 17 00:00:00 2001 From: Stefan Reimer Date: Fri, 25 Jun 2021 18:00:38 +0200 Subject: [PATCH] chore: update docs and scripts --- docs/SA_OIDC.yaml | 15 +++ docs/Upgrade-2.18.md | 56 --------- docs/Upgrade-2.20.md | 21 ++-- docs/aws_sidecar.md | 22 ++++ docs/etcd_hackery.md | 4 +- ...ion.sh => deprecated_setClusterVersion.sh} | 0 scripts/upgrade_120.sh | 117 ++++++++++++++++++ 7 files changed, 170 insertions(+), 65 deletions(-) create mode 100644 docs/SA_OIDC.yaml delete mode 100644 docs/Upgrade-2.18.md create mode 100644 docs/aws_sidecar.md rename scripts/{setClusterVersion.sh => deprecated_setClusterVersion.sh} (100%) mode change 100755 => 100644 create mode 100755 scripts/upgrade_120.sh diff --git a/docs/SA_OIDC.yaml b/docs/SA_OIDC.yaml new file mode 100644 index 00000000..ba0c857b --- /dev/null +++ b/docs/SA_OIDC.yaml @@ -0,0 +1,15 @@ +# Service Account Tokens + +## Federation with AWS IAM + +### Discovery +- public S3 location for openid and jwks config files +- synchronized from the api-server to S3 during version upgrades +service-account-issuer: `arn:aws:s3:::${ConfigBucketName}/k8s/${ClusterName}` +api-audiences: `sts.amazonaws.com` + +## Projection + +## Resources +- https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/#service-account-token-volume-projection +- https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/#service-account-issuer-discovery diff --git a/docs/Upgrade-2.18.md b/docs/Upgrade-2.18.md deleted file mode 100644 index bc9fd1d8..00000000 --- a/docs/Upgrade-2.18.md +++ /dev/null @@ -1,56 +0,0 @@ -# Upgrade to KubeZero V2.18.0 (Argoless) - -## (optional) Upgrade control plane nodes / worker nodes -- Set kube version in the controller config to eg. `1.18` -- Update kube-controller and worker stacks with latest CFN code - -- terminate controller node(s) -- once all controller nodes successfully upgraded replace worker nodes in a rolling fashion via. drain / terminate / rinse-repeat - -## ArgoCD -- disable all auto-sync and "prune" features to prevent that eg. namespaces from previous apps get removed - - either remove auto-sync from old values.yaml and run deploy one last time, trigger kubezero sync ! - - or disable manual via Argo UI starting with Kubezero app itself - -- uninstall argo helm chart: -`helm uninstall kubezero -n argocd` - -- remove all "argocd.argoproj.io/instance" labels from namespaces to prevent namespace removal later on: - `./scripts/remove_argo_ns.sh` - -## KubeZero - Part 1 -- migrate values.yaml to new structure, adapt as needed - & update new central kubezero location in git and merge cluster configs - -- upgrade all CRDs: - `./bootstrap.sh crds all clusters/$CLUSTER ../../../kubezero/charts` - -- upgrade first components: - `./bootstrap.sh deploy calico,cert-manager,kiam,aws-ebs-csi-driver,aws-efs-csi-driver clusters/$CLUSTER ../../../kubezero/charts` - -## Istio -Due to changes of the ingress namespace resource the upgrade will cause a brief (~3-5 min) ingress service interruption ! - - - delete istio operators, to remove all pieces, remove operator itself - `./scripts/delete_istio_17.sh` - - deploy istio and istio-ingress via bootstrap.sh - `./bootstrap.sh deploy istio,istio-ingress clusters/$CLUSTER ../../../kubezero/charts` - - patch all VirtualServices via script to new namespace - `./scripts/patch_vs.sh` - -Ingress service interruption ends. - -## KubeZero - Part 2 - -- push kubezero & cluster config to git - -- if logging incl. 
ElasticSearch is installed remove the old ECK operator first:
-  `./scripts/remove_old_eck.sh`
-
-- upgrade all remaining components and install new ArgoCD:
-  `./bootstrap.sh deploy all clusters/$CLUSTER ../../../kubezero/charts`
-
-## Verification / Tests
-- verify argocd incl. kubezero app
-- verify all argo apps status
-- verify all the things
diff --git a/docs/Upgrade-2.20.md b/docs/Upgrade-2.20.md
index b6107876..14fd510f 100644
--- a/docs/Upgrade-2.20.md
+++ b/docs/Upgrade-2.20.md
@@ -2,16 +2,23 @@ # CloudBender
 
 ## Changes
-- controller node names are now strictly tied to the AZ they are in: AZ1 -> controller00, AZ2 -> controller01 etc. to prevent controller03 from happening in case AWS launches new instances before the old ones are actually terminated
+### Single node control plane
+- Control
 
 ## Upgrade
-- Set Kubernetes version in the controller config to eg. `1.20`
-- Update controller and worker stacks with latest CFN code
+- Set the exact Kubernetes version in the controller config, e.g. `v1.20.2`
+- Configure your AWS CLI profile as well as your kubectl context to point at the cluster you want to upgrade.
+- Verify your config ... (see the pre-flight check sketched below)
-- Upgrade requires careful replacement in case existing control planes are shuffled otherwise: ( this might reduce the number of online controllers temporarily to 1 ! )
-  - manually set controller ASG to Min/Max 0 for the ASG currently hosting controller00
-  - terminate controller node in AZ1 which will return as controller00
-  - replace controller01 and 02 in similar fashion
+- Run `./scripts/upgrade_120.sh`
+- Update the CFN stack for kube-control-plane
+
+### Single node control plane
+- The control plane will be upgraded automatically and the controller node replaced as part of the CFN update
+
+### Clustered control plane
+- Replace controller instances one by one, in no particular order
+- Once the upgraded 1.20 control plane is confirmed to be working as expected, update the clustered control plane CFN stack once more with `LBType: none` to remove the AWS NLB fronting the Kubernetes API, which is no longer required.
 
 - replace worker nodes in a rolling fashion via drain / terminate / rinse-repeat
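Editor's note, not part of the patch: the "configure / verify" bullets above are terse, so here is a minimal pre-flight sketch of what they could look like before running `./scripts/upgrade_120.sh`. The profile, region and context names are placeholders, not values taken from this repository.

```bash
# Hypothetical pre-flight check; profile, region and context names are examples only.
export AWS_PROFILE=kube-admin           # assumption: an admin profile for the cluster's AWS account
export AWS_DEFAULT_REGION=eu-central-1  # assumption: the region the cluster runs in

aws sts get-caller-identity             # confirm which AWS account/role the CLI will act as
kubectl config use-context my-cluster   # assumption: kubeconfig context of the cluster to upgrade
kubectl get nodes -o wide               # confirm you are talking to the intended control plane
```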
diff --git a/docs/aws_sidecar.md b/docs/aws_sidecar.md
new file mode 100644
index 00000000..7f75e1a0
--- /dev/null
+++ b/docs/aws_sidecar.md
@@ -0,0 +1,22 @@
+```
+  - args:
+    - -c
+    - sleep 3600
+    command:
+    - /bin/sh
+    env:
+    - name: AWS_ROLE_ARN
+      value:
+    - name: AWS_WEB_IDENTITY_TOKEN_FILE
+      value: /var/run/secrets/sts.amazonaws.com/serviceaccount/token
+    image: amazon/aws-cli
+    imagePullPolicy: Always
+    name: busybox
+    resources: {}
+    terminationMessagePath: /dev/termination-log
+    terminationMessagePolicy: File
+    volumeMounts:
+    - mountPath: /var/run/secrets/sts.amazonaws.com/serviceaccount/
+      name: aws-token
+      readOnly: true
+```
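Editor's note, not part of the patch: the sidecar container above mounts a volume named `aws-token` that the snippet does not define. Assuming the standard Kubernetes service-account token volume projection referenced in `docs/SA_OIDC.yaml`, the corresponding pod-level volume would look roughly like this sketch; the audience and expiry values are assumptions derived from the `sts.amazonaws.com` paths above.

```yaml
# Sketch of the pod-level volume backing the `aws-token` volumeMount shown above.
volumes:
- name: aws-token
  projected:
    sources:
    - serviceAccountToken:
        audience: sts.amazonaws.com   # matches the api-audiences noted in SA_OIDC.yaml
        expirationSeconds: 86400      # assumption: 24h token lifetime
        path: token                   # yields .../serviceaccount/token, as set in AWS_WEB_IDENTITY_TOKEN_FILE
```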
diff --git a/docs/etcd_hackery.md b/docs/etcd_hackery.md
index 0551229a..773bcadf 100644
--- a/docs/etcd_hackery.md
+++ b/docs/etcd_hackery.md
@@ -4,8 +4,8 @@ https://github.com/jpbetz/auger
 
 ## Change a PV object
 
-etcdctl get /registry/persistentvolumes/services-dsny-cache | auger decode > pv.yaml
+etcdctl get /registry/persistentvolumes/my_pv | auger decode > pv.yaml
 
 vi pv.yaml
 
-cat pv.yaml | auger encode | etcdctl put /registry/persistentvolumes/services-dsny-cache
+cat pv.yaml | auger encode | etcdctl put /registry/persistentvolumes/my_pv
diff --git a/scripts/setClusterVersion.sh b/scripts/deprecated_setClusterVersion.sh
old mode 100755
new mode 100644
similarity index 100%
rename from scripts/setClusterVersion.sh
rename to scripts/deprecated_setClusterVersion.sh
diff --git a/scripts/upgrade_120.sh b/scripts/upgrade_120.sh
new file mode 100755
index 00000000..eef12c90
--- /dev/null
+++ b/scripts/upgrade_120.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+set -e
+
+# Get all kube-control-plane ASGs in the current account and region
+asgs=$(aws autoscaling describe-auto-scaling-groups --output json | jq .AutoScalingGroups[].AutoScalingGroupName -r | grep kube-control-plane)
+
+# Remove the EC2_INSTANCE_TERMINATING lifecycle hooks so controllers can be replaced without delay
+for asg in $asgs; do
+  hooks=$(aws autoscaling describe-lifecycle-hooks --auto-scaling-group-name $asg --output json | jq '.LifecycleHooks[] | select (.LifecycleTransition=="autoscaling:EC2_INSTANCE_TERMINATING") | .LifecycleHookName' -r)
+
+  for hook in $hooks; do
+    echo "Delete Lifecycle hook $hook of ASG $asg ? <Ctrl+C> to abort"
+    read
+    aws autoscaling delete-lifecycle-hook --lifecycle-hook-name $hook --auto-scaling-group-name $asg
+  done
+done
+
+# unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth
+unset AWS_DEFAULT_PROFILE
+
+nodes=$(kubectl get nodes -l node-role.kubernetes.io/master -o json | jq .items[].metadata.name -r)
+
+for node in $nodes; do
+  echo "Deploying upgrade job on $node..."
+
+  cat <<'EOF' > _job.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: kubezero-upgrade
+  namespace: kube-system
+spec:
+  template:
+    spec:
+      hostNetwork: true
+      hostIPC: true
+      hostPID: true
+      containers:
+      - name: busybox
+        image: busybox
+        command:
+        - /bin/sh
+        - -c
+        - |
+          cat <<'EOF' > /host/tmp/upgrade.sh
+          #!/bin/bash -ex
+          for l in $(cat /etc/environment); do
+            export $l
+          done
+          my_ip=$(ec2metadata --local-ipv4)
+          my_id=$(ec2metadata --instance-id)
+          clusterName=$(yq r /etc/kubezero/kubezero.yaml clusterName)
+          my_asg=$(aws ec2 describe-tags --filters "Name=resource-id,Values=${my_id}" --output json | jq '.Tags[] | select(.Key=="aws:cloudformation:logical-id") | .Value' -r)
+
+          [ $my_asg == "KubeControlAsgAZ1" ] && nodename="etcd0-$clusterName"
+          [ $my_asg == "KubeControlAsgAZ2" ] && nodename="etcd1-$clusterName"
+          [ $my_asg == "KubeControlAsgAZ3" ] && nodename="etcd2-$clusterName"
+          zone_name=$(hostname -d)
+
+          zone_id=$(aws route53 list-hosted-zones --query 'HostedZones[?Name==`'"$zone_name"'.`].Id' --output text | cut --delimiter="/" --fields=3)
+          cat <<EOF2 > route53.json
+          { "Changes": [ { "Action": "UPSERT", "ResourceRecordSet": { "Name": "${nodename}.${zone_name}", "Type": "A", "TTL": 30, "ResourceRecords": [ { "Value": "$my_ip" } ] } } ] }
+          EOF2
+
+          echo "Updating DNS entry for $nodename to $my_ip"
+          aws route53 change-resource-record-sets --hosted-zone-id $zone_id --change-batch file://route53.json
+
+          echo "Adding additional control shutdown commands"
+          if [ ! -f /usr/local/sbin/drain_delete_node.sh ]; then
+            cat <<EOF3 > /usr/local/sbin/drain_delete_node.sh
+          #!/bin/bash -ex
+          export LC_TYPE=en_US.UTF-8
+          export KUBECONFIG=/root/.kube/config
+          kubeadm reset phase update-cluster-status
+          kubeadm reset phase remove-etcd-member
+          EOF3
+            chmod +x /usr/local/sbin/drain_delete_node.sh
+            sed -e 's,/usr/local/sbin/backup_control_plane.sh&,/usr/local/sbin/drain_delete_node.sh,' -i /usr/local/sbin/cloudbender_shutdown.sh
+          fi
+
+          echo "Patching ClusterConfig to re-create new etcd server certificates"
+          yq w /etc/kubezero/kubeadm/templates/ClusterConfiguration.yaml etcd.local.serverCertSANs[+] $nodename > /etc/kubernetes/kubeadm-recert.yaml
+          rm -f /etc/kubernetes/pki/etcd/server.*
+          kubeadm init phase certs etcd-server --config=/etc/kubernetes/kubeadm-recert.yaml 2>/dev/null
+          kill -s HUP $(ps -e | grep etcd | awk '{print $1}')
+          echo "Waiting for etcd to accept connections again...might take 30s or more"
+          while true; do
+            etcdctl member list -w simple 1>/dev/null 2>&1 && break || true
+            sleep 3
+          done
+          EOF
+          chmod +x /host/tmp/upgrade.sh
+          chroot /host bash -c /tmp/upgrade.sh
+        volumeMounts:
+        - name: host
+          mountPath: /host
+        securityContext:
+          privileged: true
+      volumes:
+      - name: host
+        hostPath:
+          path: /
+          type: Directory
+      nodeSelector:
+        kubernetes.io/hostname: __node__
+      tolerations:
+      - key: node-role.kubernetes.io/master
+        effect: NoSchedule
+      restartPolicy: Never
+EOF
+
+  # set controller node
+  sed -i -e "s/__node__/$node/" _job.yaml
+
+  kubectl apply -f _job.yaml
+  kubectl wait -n kube-system --timeout 300s --for=condition=complete job/kubezero-upgrade
+  kubectl delete -f _job.yaml
+done
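Editor's note, not part of the patch: after the job has completed on every controller node, a few spot checks along the lines of the script's own steps can confirm the result. The hostnames follow the `etcdN-$clusterName.$zone` pattern the script writes to Route53; `CLUSTER` and `ZONE` below are placeholders for your own values.

```bash
# Hypothetical post-run checks; CLUSTER and ZONE are placeholders.
CLUSTER=mycluster
ZONE=example.com

# The per-AZ etcd DNS record should now resolve to the controller's private IP
dig +short etcd0-${CLUSTER}.${ZONE}

# List the control plane nodes and their reported versions
kubectl get nodes -l node-role.kubernetes.io/master -o wide

# The kubezero-upgrade job should be gone again (the script deletes it after completion)
kubectl get jobs -n kube-system
```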