From 23912192ade0b4c6a98f9ba082fde2cd91f2d540 Mon Sep 17 00:00:00 2001
From: Stefan Reimer <stefan@zero-downtime.net>
Date: Fri, 25 Jun 2021 18:00:38 +0200
Subject: [PATCH] chore: update docs and scripts

---
 docs/SA_OIDC.yaml                             |  15 +++
 docs/Upgrade-2.18.md                          |  56 ---------
 docs/Upgrade-2.20.md                          |  21 ++--
 docs/aws_sidecar.md                           |  22 ++++
 docs/etcd_hackery.md                          |   4 +-
 ...ion.sh => deprecated_setClusterVersion.sh} |   0
 scripts/upgrade_120.sh                        | 117 ++++++++++++++++++
 7 files changed, 170 insertions(+), 65 deletions(-)
 create mode 100644 docs/SA_OIDC.yaml
 delete mode 100644 docs/Upgrade-2.18.md
 create mode 100644 docs/aws_sidecar.md
 rename scripts/{setClusterVersion.sh => deprecated_setClusterVersion.sh} (100%)
 mode change 100755 => 100644
 create mode 100755 scripts/upgrade_120.sh

diff --git a/docs/SA_OIDC.yaml b/docs/SA_OIDC.yaml
new file mode 100644
index 0000000..ba0c857
--- /dev/null
+++ b/docs/SA_OIDC.yaml
@@ -0,0 +1,15 @@
+# Service Account Tokens
+
+## Federation with AWS IAM
+
+### Discovery
+- public S3 location for openid and jwks config files
+- synchronized from the api-server to S3 during version upgrades  
+service-account-issuer: `arn:aws:s3:::${ConfigBucketName}/k8s/${ClusterName}`  
+api-audiences: `sts.amazonaws.com`
+
+## Projection
+
+## Resources
+- https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/#service-account-token-volume-projection
+- https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/#service-account-issuer-discovery
diff --git a/docs/Upgrade-2.18.md b/docs/Upgrade-2.18.md
deleted file mode 100644
index bc9fd1d..0000000
--- a/docs/Upgrade-2.18.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Upgrade to KubeZero V2.18.0 (Argoless)
-
-## (optional) Upgrade control plane nodes / worker nodes
-- Set kube version in the controller config to eg. `1.18`  
-- Update kube-controller and worker stacks with latest CFN code
-
-- terminate controller node(s)
-- once all controller nodes successfully upgraded replace worker nodes in a rolling fashion via. drain / terminate / rinse-repeat
-
-## ArgoCD
-- disable all auto-sync and "prune" features to prevent that eg. namespaces from previous apps get removed
-  - either remove auto-sync from old values.yaml and run deploy one last time, trigger kubezero sync !
-  - or disable manual via Argo UI starting with Kubezero app itself
-
-- uninstall argo helm chart:  
-`helm uninstall kubezero -n argocd`
-
-- remove all "argocd.argoproj.io/instance" labels from namespaces to prevent namespace removal later on:  
-  `./scripts/remove_argo_ns.sh`
-
-## KubeZero - Part 1
-- migrate values.yaml to new structure, adapt as needed
-  & update new central kubezero location in git and merge cluster configs
-
-- upgrade all CRDs:  
-  `./bootstrap.sh crds all clusters/$CLUSTER ../../../kubezero/charts`
-
-- upgrade first components:  
-  `./bootstrap.sh deploy calico,cert-manager,kiam,aws-ebs-csi-driver,aws-efs-csi-driver clusters/$CLUSTER ../../../kubezero/charts`
-
-## Istio
-Due to changes of the ingress namespace resource the upgrade will cause a brief (~3-5 min) ingress service interruption !  
-
-  - delete istio operators, to remove all pieces, remove operator itself  
-   `./scripts/delete_istio_17.sh`
-  - deploy istio and istio-ingress via bootstrap.sh  
-  `./bootstrap.sh deploy istio,istio-ingress clusters/$CLUSTER ../../../kubezero/charts`
-  - patch all VirtualServices via script to new namespace  
-  `./scripts/patch_vs.sh`
-
-Ingress service interruption ends.
-
-## KubeZero - Part 2
-
-- push kubezero & cluster config to git
-
-- if logging incl. ElasticSearch is installed remove the old ECK operator first:  
-  `./scripts/remove_old_eck.sh`
-
-- upgrade all remaining components and install new ArgoCD:  
-  `./bootstrap.sh deploy all clusters/$CLUSTER ../../../kubezero/charts`
-
-## Verification / Tests
-- verify argocd incl. kubezero app
-- verify all argo apps status
-- verify all the things
diff --git a/docs/Upgrade-2.20.md b/docs/Upgrade-2.20.md
index b610787..14fd510 100644
--- a/docs/Upgrade-2.20.md
+++ b/docs/Upgrade-2.20.md
@@ -2,16 +2,23 @@
 
 # CloudBender
 ## Changes
-- controller node names are now strictly tight to the AZ they are in: AZ1 -> controller00, AZ2 -> controller01 etc. to prevent controller03 from happening in case AWS launches new instances before the old ones are actually terminated 
+### Single node control plane
+- Control
  
 ## Upgrade
-- Set Kubernetes version in the controller config to eg. `1.20`  
-- Update controller and worker stacks with latest CFN code
+- Set the exact Kubernetes version you want in the controller config, e.g. `v1.20.2`  
+- configure your AWS CLI profile as well as your kubectl context to point to the cluster you want to upgrade.
+- verify your config ...
 
-- Upgrade requires careful replacement in case existing control planes are shuffled otherwise: ( this might reduce the number of online controllers temporarily to 1 ! )
-  - manually set controller ASG to Min/Maz 0 for the ASG currently hosting controller00
-  - terminate controller node in AZ1 which will return as controller00
-  - replace controller01 and 02 in similar fashion
+- run `./scripts/upgrade_120.sh`
+- update the CFN stack for kube-control-plane
+
+### Single node control plane
+- will automatically be upgraded and the controller node replaced as part of the CFN update
+
+### Clustered control plane
+- replace controller instances one by one in no particular order
+- once you have confirmed that the upgraded 1.20 control plane is working as expected, update the clustered control plane CFN stack once more with `LBType: none` to remove the AWS NLB fronting the Kubernetes API, which is no longer required.
 
 - replace worker nodes in a rolling fashion via. drain / terminate / rinse-repeat
 
diff --git a/docs/aws_sidecar.md b/docs/aws_sidecar.md
new file mode 100644
index 0000000..7f75e1a
--- /dev/null
+++ b/docs/aws_sidecar.md
@@ -0,0 +1,22 @@
+```
+      - args:
+        - -c
+        - sleep 3600
+        command:
+        - /bin/sh
+        env:
+        - name: AWS_ROLE_ARN
+          value: <ARN>
+        - name: AWS_WEB_IDENTITY_TOKEN_FILE
+          value: /var/run/secrets/sts.amazonaws.com/serviceaccount/token
+        image: amazon/aws-cli
+        imagePullPolicy: Always
+        name: busybox
+        resources: {}
+        terminationMessagePath: /dev/termination-log
+        terminationMessagePolicy: File
+        volumeMounts:
+        - mountPath: /var/run/secrets/sts.amazonaws.com/serviceaccount/
+          name: aws-token
+          readOnly: true
+```
diff --git a/docs/etcd_hackery.md b/docs/etcd_hackery.md
index 0551229..773bcad 100644
--- a/docs/etcd_hackery.md
+++ b/docs/etcd_hackery.md
@@ -4,8 +4,8 @@
 https://github.com/jpbetz/auger
 
 ## Change a PV object
-etcdctl get /registry/persistentvolumes/services-dsny-cache  | auger decode > pv.yaml
+etcdctl get /registry/persistentvolumes/my_pv  | auger decode > pv.yaml
 
 vi pv.yaml
 
-cat pv.yaml | auger encode | etcdctl put /registry/persistentvolumes/services-dsny-cache
+cat pv.yaml | auger encode | etcdctl put /registry/persistentvolumes/my_pv
diff --git a/scripts/setClusterVersion.sh b/scripts/deprecated_setClusterVersion.sh
old mode 100755
new mode 100644
similarity index 100%
rename from scripts/setClusterVersion.sh
rename to scripts/deprecated_setClusterVersion.sh
diff --git a/scripts/upgrade_120.sh b/scripts/upgrade_120.sh
new file mode 100755
index 0000000..eef12c9
--- /dev/null
+++ b/scripts/upgrade_120.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+# Prepares a running KubeZero control plane for the upgrade to Kubernetes 1.20.
+#
+# 1. Offers to delete the EC2_INSTANCE_TERMINATING lifecycle hooks of every
+#    kube-control-plane ASG in the current AWS account and region.
+# 2. Runs a privileged one-off Job on each control plane node which updates the
+#    etcd DNS record, extends the shutdown script and re-issues the etcd server
+#    certificate.
+#
+# Requires aws, jq and kubectl, configured for the cluster to upgrade.
+set -euo pipefail
+
+die() {
+  echo "$*" >&2
+  exit 1
+}
+
+# Get all kube-control-plane ASGs in the current account and region.
+# grep exits non-zero without a match; report that instead of dying silently.
+asgs=$(aws autoscaling describe-auto-scaling-groups --output json \
+  | jq -r '.AutoScalingGroups[].AutoScalingGroupName' \
+  | grep kube-control-plane) \
+  || die "No kube-control-plane ASG found, check your AWS profile and region."
+
+for asg in $asgs; do
+  hooks=$(aws autoscaling describe-lifecycle-hooks --auto-scaling-group-name "$asg" --output json \
+    | jq -r '.LifecycleHooks[] | select (.LifecycleTransition=="autoscaling:EC2_INSTANCE_TERMINATING") | .LifecycleHookName')
+
+  for hook in $hooks; do
+    echo "Delete Lifecycle hook $hook of ASG $asg ? <Ctrl+C> to abort"
+    read -r
+    aws autoscaling delete-lifecycle-hook --lifecycle-hook-name "$hook" --auto-scaling-group-name "$asg"
+  done
+done
+
+# unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth
+unset AWS_DEFAULT_PROFILE
+
+nodes=$(kubectl get nodes -l node-role.kubernetes.io/master -o json | jq -r '.items[].metadata.name')
+[[ -n "$nodes" ]] || die "No control plane nodes found, check your kubectl context."
+
+# Rendered Job manifest; rewritten for every node and removed on exit.
+job_file=$(mktemp)
+trap 'rm -f -- "$job_file"' EXIT
+
+for node in $nodes; do
+  echo "Deploying upgrade job on $node..."
+
+  # The quoted delimiter keeps the manifest literal; only the __node__
+  # placeholder is replaced, pinning the Job to the current node.
+  sed -e "s/__node__/${node}/" <<'EOF' > "$job_file"
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: kubezero-upgrade
+  namespace: kube-system
+spec:
+  template:
+    spec:
+      hostNetwork: true
+      hostIPC: true
+      hostPID: true
+      containers:
+      - name: busybox
+        image: busybox
+        command:
+        - /bin/sh
+        - -c
+        - |
+          cat <<'EOF' > /host/tmp/upgrade.sh
+          #!/bin/bash -ex
+          for l in $(cat /etc/environment); do
+            export "$l"
+          done
+          my_ip=$(ec2metadata --local-ipv4)
+          my_id=$(ec2metadata --instance-id)
+          clusterName=$(yq r /etc/kubezero/kubezero.yaml clusterName)
+          my_asg=$(aws ec2 describe-tags --filters "Name=resource-id,Values=${my_id}" --output json | jq '.Tags[] | select(.Key=="aws:cloudformation:logical-id") | .Value' -r)
+
+          # Derive the etcd member name from the ASG's CloudFormation logical id
+          case "$my_asg" in
+            KubeControlAsgAZ1) nodename="etcd0-$clusterName" ;;
+            KubeControlAsgAZ2) nodename="etcd1-$clusterName" ;;
+            KubeControlAsgAZ3) nodename="etcd2-$clusterName" ;;
+            *) echo "Unexpected control plane ASG '$my_asg', cannot derive the etcd node name" >&2; exit 1 ;;
+          esac
+          zone_name=$(hostname -d)
+
+          zone_id=$(aws route53 list-hosted-zones --query 'HostedZones[?Name==`'"$zone_name"'.`].Id' --output text | cut --delimiter="/" --fields=3)
+          cat <<EOF2 > route53.json
+          { "Changes": [ { "Action": "UPSERT", "ResourceRecordSet": { "Name": "${nodename}.${zone_name}", "Type": "A", "TTL": 30, "ResourceRecords": [ { "Value": "$my_ip" } ] } } ] }
+          EOF2
+
+          echo "Updating DNS entry for $nodename to $my_ip"
+          aws route53 change-resource-record-sets --hosted-zone-id "$zone_id" --change-batch file://route53.json
+
+          echo "Adding additional control shutdown commands"
+          if [ ! -f /usr/local/sbin/drain_delete_node.sh ]; then
+            cat <<EOF3 > /usr/local/sbin/drain_delete_node.sh
+          #!/bin/bash -ex
+          export LC_CTYPE=en_US.UTF-8
+          export KUBECONFIG=/root/.kube/config
+          kubeadm reset phase update-cluster-status
+          kubeadm reset phase remove-etcd-member
+          EOF3
+            chmod +x /usr/local/sbin/drain_delete_node.sh
+            sed -e 's,/usr/local/sbin/backup_control_plane.sh&,/usr/local/sbin/drain_delete_node.sh,' -i /usr/local/sbin/cloudbender_shutdown.sh
+          fi
+
+          echo "Patching ClusterConfig to re-create new etcd server certificates"
+          yq w /etc/kubezero/kubeadm/templates/ClusterConfiguration.yaml 'etcd.local.serverCertSANs[+]' "$nodename" > /etc/kubernetes/kubeadm-recert.yaml
+          rm -f /etc/kubernetes/pki/etcd/server.*
+          kubeadm init phase certs etcd-server --config=/etc/kubernetes/kubeadm-recert.yaml 2>/dev/null
+          kill -s HUP $(ps -e | grep etcd | awk '{print $1}')
+          echo "Waiting for etcd to accept connections again...might take 30s or more"
+          until etcdctl member list -w simple 1>/dev/null 2>&1; do
+            sleep 3
+          done
+          EOF
+          chmod +x /host/tmp/upgrade.sh
+          chroot /host bash -c /tmp/upgrade.sh
+        volumeMounts:
+        - name: host
+          mountPath: /host
+        securityContext:
+          privileged: true
+      volumes:
+      - name: host
+        hostPath:
+          path: /
+          type: Directory
+      nodeSelector:
+        kubernetes.io/hostname: __node__
+      tolerations:
+      - key: node-role.kubernetes.io/master
+        effect: NoSchedule
+      restartPolicy: Never
+EOF
+
+  kubectl apply -f "$job_file"
+  if ! kubectl wait -n kube-system --timeout 300s --for=condition=complete job/kubezero-upgrade; then
+    # Surface the pod output and remove the failed Job before aborting so it
+    # does not linger in the cluster.
+    echo "Upgrade job on $node did not complete, job logs:" >&2
+    kubectl logs -n kube-system job/kubezero-upgrade >&2 || true
+    kubectl delete -f "$job_file" || true
+    exit 1
+  fi
+  kubectl delete -f "$job_file"
+done