chore: update docs and scripts
parent 0dc9e1d2a5, commit 127db98358

docs/SA_OIDC.yaml (new file, 15 lines)
@@ -0,0 +1,15 @@
# Service Account Tokens

## Federation with AWS IAM

### Discovery
- public S3 location for openid and jwks config files
- synchronized from the api-server to S3 during version upgrades
service-account-issuer: `arn:aws:s3:::${ConfigBucketName}/k8s/${ClusterName}`
api-audiences: `sts.amazonaws.com`

## Projection
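Tokens with the `sts.amazonaws.com` audience are mounted into pods via service account token volume projection. A minimal sketch of such a projected volume, reusing the `aws-token` volume name and mount path from docs/aws_sidecar.md (the 24h `expirationSeconds` is an assumption, pick per your rotation policy):

```
# sketch: projected service account token for the sts.amazonaws.com audience
volumes:
- name: aws-token
  projected:
    sources:
    - serviceAccountToken:
        path: token
        audience: sts.amazonaws.com
        expirationSeconds: 86400   # assumption
```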

## Resources
- https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/#service-account-token-volume-projection
- https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/#service-account-issuer-discovery

@@ -1,56 +0,0 @@ (file removed)
# Upgrade to KubeZero V2.18.0 (Argoless)

## (optional) Upgrade control plane nodes / worker nodes
- Set the kube version in the controller config to e.g. `1.18`
- Update kube-controller and worker stacks with the latest CFN code

- terminate controller node(s)
- once all controller nodes have been upgraded successfully, replace worker nodes in a rolling fashion via drain / terminate / rinse-repeat

## ArgoCD
- disable all auto-sync and "prune" features to prevent e.g. namespaces of previous apps from being removed
  - either remove auto-sync from the old values.yaml and run deploy one last time, then trigger a kubezero sync!
  - or disable it manually via the Argo UI, starting with the KubeZero app itself (see the sketch below)
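A hedged sketch of disabling auto-sync from the CLI instead of the UI; the app name `kubezero` and the `argocd` namespace come from the commands further down and may differ in your setup:

```
# sketch: turn off automated sync & prune before the migration
argocd app set kubezero --sync-policy none

# or remove the syncPolicy directly on the Application resource
# (the json remove op fails if no syncPolicy is set)
kubectl -n argocd patch application kubezero --type json \
  -p '[{"op": "remove", "path": "/spec/syncPolicy"}]'
```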

- uninstall the argo helm chart:
`helm uninstall kubezero -n argocd`

- remove all "argocd.argoproj.io/instance" labels from namespaces to prevent namespace removal later on (see the note below):
`./scripts/remove_argo_ns.sh`
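For reference, removing such a label by hand looks roughly like this (a generic kubectl sketch, not the contents of the script):

```
# sketch: the trailing dash removes the label from the namespace
kubectl label namespace <namespace> argocd.argoproj.io/instance-
```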

## KubeZero - Part 1
- migrate values.yaml to the new structure, adapt as needed
  and update the new central kubezero location in git, then merge the cluster configs

- upgrade all CRDs:
`./bootstrap.sh crds all clusters/$CLUSTER ../../../kubezero/charts`

- upgrade the first components:
`./bootstrap.sh deploy calico,cert-manager,kiam,aws-ebs-csi-driver,aws-efs-csi-driver clusters/$CLUSTER ../../../kubezero/charts`

## Istio
Due to changes of the ingress namespace resources, the upgrade will cause a brief (~3-5 min) ingress service interruption!

- delete the Istio operators to remove all their pieces, then remove the operator itself:
`./scripts/delete_istio_17.sh`
- deploy istio and istio-ingress via bootstrap.sh:
`./bootstrap.sh deploy istio,istio-ingress clusters/$CLUSTER ../../../kubezero/charts`
- patch all VirtualServices to the new namespace via script:
`./scripts/patch_vs.sh`

The ingress service interruption ends here.

## KubeZero - Part 2

- push the kubezero & cluster config to git

- if logging incl. ElasticSearch is installed, remove the old ECK operator first:
`./scripts/remove_old_eck.sh`

- upgrade all remaining components and install the new ArgoCD:
`./bootstrap.sh deploy all clusters/$CLUSTER ../../../kubezero/charts`

## Verification / Tests
- verify argocd incl. the kubezero app
- verify the status of all argo apps
- verify all the things

@@ -2,16 +2,23 @@

# CloudBender

## Changes
- controller node names are now strictly tied to the AZ they are in: AZ1 -> controller00, AZ2 -> controller01, etc. This prevents a controller03 from appearing in case AWS launches new instances before the old ones are actually terminated

### Single node control plane
- Control

## Upgrade
- Set the Kubernetes version in the controller config to e.g. `1.20`
- Update the controller and worker stacks with the latest CFN code
- Set the specific desired Kubernetes version in the controller config to e.g. `v1.20.2`
- configure your AWS CLI profile as well as your kubectl context to point at the cluster you want to upgrade
- verify your config ...

- The upgrade requires a careful replacement sequence if the existing control plane nodes would otherwise get shuffled (this might temporarily reduce the number of online controllers to 1!):
  - manually set the controller ASG currently hosting controller00 to Min/Max 0 (see the example after this list)
  - terminate the controller node in AZ1, which will come back as controller00
  - replace controller01 and controller02 in a similar fashion
  - run `./scripts/upgrade_120.sh`
  - update the CFN stack for kube-control-plane
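A sketch of the first replacement step using the AWS CLI; `<asg-name>` is a placeholder for the kube-control-plane ASG that currently hosts controller00:

```
# sketch: scale the ASG hosting controller00 down to zero before terminating the node
aws autoscaling update-auto-scaling-group \
  --auto-scaling-group-name <asg-name> \
  --min-size 0 --max-size 0 --desired-capacity 0
```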

### Single node control plane
- will automatically be upgraded and the controller node replaced as part of the CFN update

### Clustered control plane
- replace the controller instances one by one, in no particular order
- once the upgraded 1.20 control plane is confirmed to be working as expected, update the clustered control plane CFN stack once more with `LBType: none` to remove the AWS NLB fronting the Kubernetes API, which is no longer required

- replace worker nodes in a rolling fashion via drain / terminate / rinse-repeat (see the sketch below)
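One rolling-replacement iteration could look like this; node and instance IDs are placeholders and the exact drain flags depend on your kubectl version:

```
# sketch: drain one worker, terminate it, wait for the ASG replacement to become Ready, repeat
kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data
aws ec2 terminate-instances --instance-ids <instance-id>
kubectl get nodes -w
```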

docs/aws_sidecar.md (new file, 22 lines)
@@ -0,0 +1,22 @@
```
- args:
  - -c
  - sleep 3600
  command:
  - /bin/sh
  env:
  - name: AWS_ROLE_ARN
    value: <ARN>
  - name: AWS_WEB_IDENTITY_TOKEN_FILE
    value: /var/run/secrets/sts.amazonaws.com/serviceaccount/token
  image: amazon/aws-cli
  imagePullPolicy: Always
  name: busybox
  resources: {}
  terminationMessagePath: /dev/termination-log
  terminationMessagePolicy: File
  volumeMounts:
  - mountPath: /var/run/secrets/sts.amazonaws.com/serviceaccount/
    name: aws-token
    readOnly: true
```
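The `aws-token` volume mounted above is the projected service account token set up per docs/SA_OIDC.yaml; the AWS CLI and SDKs pick up `AWS_ROLE_ARN` and `AWS_WEB_IDENTITY_TOKEN_FILE` automatically. A quick way to check the federation from inside the sidecar (pod name is a placeholder):

```
# sketch: should return the assumed role rather than an error
kubectl exec -it <pod> -c busybox -- aws sts get-caller-identity
```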

@@ -4,8 +4,8 @@
https://github.com/jpbetz/auger

## Change a PV object
-etcdctl get /registry/persistentvolumes/services-dsny-cache | auger decode > pv.yaml
+etcdctl get /registry/persistentvolumes/my_pv | auger decode > pv.yaml

vi pv.yaml

-cat pv.yaml | auger encode | etcdctl put /registry/persistentvolumes/services-dsny-cache
+cat pv.yaml | auger encode | etcdctl put /registry/persistentvolumes/my_pv
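Note that etcdctl typically needs the v3 API and the etcd client certificates configured; a hedged sketch using the standard kubeadm certificate locations (adjust paths to your layout):

```
export ETCDCTL_API=3
export ETCDCTL_ENDPOINTS=https://127.0.0.1:2379
export ETCDCTL_CACERT=/etc/kubernetes/pki/etcd/ca.crt
export ETCDCTL_CERT=/etc/kubernetes/pki/etcd/server.crt
export ETCDCTL_KEY=/etc/kubernetes/pki/etcd/server.key
etcdctl get /registry/persistentvolumes/my_pv | auger decode > pv.yaml
```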

scripts/setClusterVersion.sh → scripts/deprecated_setClusterVersion.sh (Executable file → Normal file, 0 changes)

scripts/upgrade_120.sh (new Executable file, 117 lines)
@@ -0,0 +1,117 @@
#!/bin/bash
set -e

# Get all kube-control-plane ASGs in the current account and region
asgs=$(aws autoscaling describe-auto-scaling-groups --output json | jq .AutoScalingGroups[].AutoScalingGroupName -r | grep kube-control-plane)

for asg in $asgs; do
  hooks=$(aws autoscaling describe-lifecycle-hooks --auto-scaling-group-name $asg --output json | jq '.LifecycleHooks[] | select (.LifecycleTransition=="autoscaling:EC2_INSTANCE_TERMINATING") | .LifecycleHookName' -r)

  for hook in $hooks; do
    echo "Delete Lifecycle hook $hook of ASG $asg ? <Ctrl+C> to abort"
    read
    aws autoscaling delete-lifecycle-hook --lifecycle-hook-name $hook --auto-scaling-group-name $asg
  done
done

# unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth
unset AWS_DEFAULT_PROFILE

nodes=$(kubectl get nodes -l node-role.kubernetes.io/master -o json | jq .items[].metadata.name -r)

for node in $nodes; do
  echo "Deploying upgrade job on $node..."

  cat <<'EOF' > _job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: kubezero-upgrade
  namespace: kube-system
spec:
  template:
    spec:
      hostNetwork: true
      hostIPC: true
      hostPID: true
      containers:
      - name: busybox
        image: busybox
        command:
        - /bin/sh
        - -c
        - |
          cat <<'EOF' > /host/tmp/upgrade.sh
          #!/bin/bash -ex
          for l in $(cat /etc/environment); do
            export $l
          done
          my_ip=$(ec2metadata --local-ipv4)
          my_id=$(ec2metadata --instance-id)
          clusterName=$(yq r /etc/kubezero/kubezero.yaml clusterName)
          my_asg=$(aws ec2 describe-tags --filters "Name=resource-id,Values=${my_id}" --output json | jq '.Tags[] | select(.Key=="aws:cloudformation:logical-id") | .Value' -r)

          [ $my_asg == "KubeControlAsgAZ1" ] && nodename="etcd0-$clusterName"
          [ $my_asg == "KubeControlAsgAZ2" ] && nodename="etcd1-$clusterName"
          [ $my_asg == "KubeControlAsgAZ3" ] && nodename="etcd2-$clusterName"
          zone_name=$(hostname -d)

          zone_id=$(aws route53 list-hosted-zones --query 'HostedZones[?Name==`'"$zone_name"'.`].Id' --output text | cut --delimiter="/" --fields=3)
          cat <<EOF2 > route53.json
          { "Changes": [ { "Action": "UPSERT", "ResourceRecordSet": { "Name": "${nodename}.${zone_name}", "Type": "A", "TTL": 30, "ResourceRecords": [ { "Value": "$my_ip" } ] } } ] }
          EOF2

          echo "Updating DNS entry for $nodename to $my_ip"
          aws route53 change-resource-record-sets --hosted-zone-id $zone_id --change-batch file://route53.json

          echo "Adding additional control shutdown commands"
          if [ ! -f /usr/local/sbin/drain_delete_node.sh ]; then
            cat <<EOF3 > /usr/local/sbin/drain_delete_node.sh
          #!/bin/bash -ex
          export LC_TYPE=en_US.UTF-8
          export KUBECONFIG=/root/.kube/config
          kubeadm reset phase update-cluster-status
          kubeadm reset phase remove-etcd-member
          EOF3
            chmod +x /usr/local/sbin/drain_delete_node.sh
            sed -e 's,/usr/local/sbin/backup_control_plane.sh&,/usr/local/sbin/drain_delete_node.sh,' -i /usr/local/sbin/cloudbender_shutdown.sh
          fi

          echo "Patching ClusterConfig to re-create new etcd server certificates"
          yq w /etc/kubezero/kubeadm/templates/ClusterConfiguration.yaml etcd.local.serverCertSANs[+] $nodename > /etc/kubernetes/kubeadm-recert.yaml
          rm -f /etc/kubernetes/pki/etcd/server.*
          kubeadm init phase certs etcd-server --config=/etc/kubernetes/kubeadm-recert.yaml 2>/dev/null
          kill -s HUP $(ps -e | grep etcd | awk '{print $1}')
          echo "Waiting for etcd to accept connections again...might take 30s or more"
          while true; do
            etcdctl member list -w simple 1>/dev/null 2>&1 && break || true
            sleep 3
          done
          EOF
          chmod +x /host/tmp/upgrade.sh
          chroot /host bash -c /tmp/upgrade.sh
        volumeMounts:
        - name: host
          mountPath: /host
        securityContext:
          privileged: true
      volumes:
      - name: host
        hostPath:
          path: /
          type: Directory
      nodeSelector:
        kubernetes.io/hostname: __node__
      tolerations:
      - key: node-role.kubernetes.io/master
        effect: NoSchedule
      restartPolicy: Never
EOF

  # set controller node
  sed -i -e "s/__node__/$node/" _job.yaml

  kubectl apply -f _job.yaml
  kubectl wait -n kube-system --timeout 300s --for=condition=complete job/kubezero-upgrade
  kubectl delete -f _job.yaml
done