From b37aa965f7b50510ad69ffd26cb53a3b11d4cc3f Mon Sep 17 00:00:00 2001
From: Stefan Reimer
Date: Fri, 17 Dec 2021 11:57:23 +0100
Subject: [PATCH] feat: improved admin debug_shell, add etcd defrag to backup

---
 containers/admin/Dockerfile                | 2 +-
 containers/admin/Makefile                  | 2 +-
 containers/admin/v1.21/kubeadm/Chart.yaml  | 2 +-
 containers/admin/v1.21/kubeadm/README.md   | 6 +-
 containers/admin/v1.21/kubeadm/values.yaml | 4 +-
 containers/admin/v1.21/kubezero.sh         | 80 +++++++++++++---------
 containers/admin/v1.21/upgrade_121.sh      | 36 +++++-----
 7 files changed, 76 insertions(+), 56 deletions(-)

diff --git a/containers/admin/Dockerfile b/containers/admin/Dockerfile
index 0041c37..248c106 100644
--- a/containers/admin/Dockerfile
+++ b/containers/admin/Dockerfile
@@ -5,7 +5,7 @@ FROM alpine:${ALPINE_VERSION}
 ARG KUBE_VERSION
 
 RUN cd /etc/apk/keys && \
-    wget "https://cdn.zero-downtime.net/alpine/stefan@zero-downtime.net-6183d596.rsa.pub" && \
+    wget "https://cdn.zero-downtime.net/alpine/stefan@zero-downtime.net-61bb6bfb.rsa.pub" && \
     echo "@kubezero https://cdn.zero-downtime.net/alpine/v${KUBE_VERSION}/kubezero" >> /etc/apk/repositories && \
     echo "@testing http://dl-cdn.alpinelinux.org/alpine/edge/testing" >> /etc/apk/repositories && \
     apk upgrade -U -a && \
diff --git a/containers/admin/Makefile b/containers/admin/Makefile
index 04ad919..448d839 100644
--- a/containers/admin/Makefile
+++ b/containers/admin/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= 1.21.7
+VERSION ?= 1.21.8
 ALPINE_VERSION ?= 3.15
 REGISTRY := public.ecr.aws/zero-downtime
 REPOSITORY := kubezero-admin
diff --git a/containers/admin/v1.21/kubeadm/Chart.yaml b/containers/admin/v1.21/kubeadm/Chart.yaml
index e528637..f63eb02 100644
--- a/containers/admin/v1.21/kubeadm/Chart.yaml
+++ b/containers/admin/v1.21/kubeadm/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubeadm
 description: KubeZero Kubeadm cluster config
 type: application
-version: 1.21.7
+version: 1.21.8
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:
diff --git a/containers/admin/v1.21/kubeadm/README.md b/containers/admin/v1.21/kubeadm/README.md
index 613538a..1e88545 100644
--- a/containers/admin/v1.21/kubeadm/README.md
+++ b/containers/admin/v1.21/kubeadm/README.md
@@ -1,6 +1,6 @@
 # kubeadm
 
-![Version: 1.21.7](https://img.shields.io/badge/Version-1.21.7-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
+![Version: 1.21.8](https://img.shields.io/badge/Version-1.21.8-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
 
 KubeZero Kubeadm cluster config
 
@@ -30,7 +30,7 @@ Kubernetes: `>= 1.20.0`
 | api.awsIamAuth.kubeAdminRole | string | `"arn:aws:iam::000000000000:role/KubernetesNode"` |  |
 | api.awsIamAuth.workerNodeRole | string | `"arn:aws:iam::000000000000:role/KubernetesNode"` |  |
 | api.endpoint | string | `"kube-api.changeme.org:6443"` |  |
-| api.etcdServers | string | `"https://localhost:2379"` |  |
+| api.etcdServers | string | `"https://etcd:2379"` |  |
 | api.extraArgs | object | `{}` |  |
 | api.listenPort | int | `6443` |  |
 | api.oidcEndpoint | string | `""` | s3://${CFN[ConfigBucket]}/k8s/$CLUSTERNAME |
@@ -46,7 +46,7 @@ Kubernetes: `>= 1.20.0`
 | network.cilium.enabled | bool | `false` |  |
 | network.multus.enabled | bool | `false` |  |
 | network.multus.tag | string | `"v3.8"` |  |
-| nodeName | string | `"localhost"` | set to $HOSTNAME |
+| nodeName | string | `"kubezero-node"` | set to $HOSTNAME |
`"kubezero-node"` | set to $HOSTNAME | | protectKernelDefaults | bool | `true` | | | systemd | bool | `true` | Set to false for openrc, eg. on Gentoo or Alpine | diff --git a/containers/admin/v1.21/kubeadm/values.yaml b/containers/admin/v1.21/kubeadm/values.yaml index 2086ce5..40718de 100644 --- a/containers/admin/v1.21/kubeadm/values.yaml +++ b/containers/admin/v1.21/kubeadm/values.yaml @@ -1,6 +1,6 @@ clusterName: pleasechangeme # -- set to $HOSTNAME -nodeName: localhost +nodeName: kubezero-node domain: changeme.org # -- Needs to be set to primary node IP @@ -9,7 +9,7 @@ listenAddress: 0.0.0.0 api: endpoint: kube-api.changeme.org:6443 listenPort: 6443 - etcdServers: "https://localhost:2379" + etcdServers: "https://etcd:2379" extraArgs: {} # -- https://s3.${REGION}.amazonaws.com/${CFN[ConfigBucket]}/k8s/$CLUSTERNAME serviceAccountIssuer: "" diff --git a/containers/admin/v1.21/kubezero.sh b/containers/admin/v1.21/kubezero.sh index ddc6ce0..03399f5 100755 --- a/containers/admin/v1.21/kubezero.sh +++ b/containers/admin/v1.21/kubezero.sh @@ -1,9 +1,15 @@ #!/bin/sh set -e -WORKDIR=/tmp/kubezero -HOSTFS=/host -VERSION=v1.21 +if [ -n "$DEBUG" ]; then + set -x + LOG="--v=5" +fi + +# Export vars to ease use in debug_shell etc +export WORKDIR=/tmp/kubezero +export HOSTFS=/host +export VERSION=v1.21 export KUBECONFIG="${HOSTFS}/root/.kube/config" @@ -13,11 +19,6 @@ export ETCDCTL_CACERT=${HOSTFS}/etc/kubernetes/pki/etcd/ca.crt export ETCDCTL_CERT=${HOSTFS}/etc/kubernetes/pki/apiserver-etcd-client.crt export ETCDCTL_KEY=${HOSTFS}/etc/kubernetes/pki/apiserver-etcd-client.key -if [ -n "$DEBUG" ]; then - set -x - LOG="--v=5" -fi - # Generic retry utility retry() { local tries=$1 @@ -60,12 +61,12 @@ render_kubeadm() { parse_kubezero() { [ -f ${HOSTFS}/etc/kubernetes/kubezero.yaml ] || { echo "Missing /etc/kubernetes/kubezero.yaml!"; exit 1; } - KUBE_VERSION=$(kubeadm version -o yaml | yq eval .clientVersion.gitVersion -) - CLUSTERNAME=$(yq eval '.clusterName' ${HOSTFS}/etc/kubernetes/kubezero.yaml) - ETCD_NODENAME=$(yq eval '.etcd.nodeName' ${HOSTFS}/etc/kubernetes/kubezero.yaml) + export KUBE_VERSION=$(kubeadm version -o yaml | yq eval .clientVersion.gitVersion -) + export CLUSTERNAME=$(yq eval '.clusterName' ${HOSTFS}/etc/kubernetes/kubezero.yaml) + export ETCD_NODENAME=$(yq eval '.etcd.nodeName' ${HOSTFS}/etc/kubernetes/kubezero.yaml) - AWS_IAM_AUTH=$(yq eval '.api.awsIamAuth.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml) - AWS_NTH=$(yq eval '.addons.aws-node-termination-handler.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml) + export AWS_IAM_AUTH=$(yq eval '.api.awsIamAuth.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml) + export AWS_NTH=$(yq eval '.addons.aws-node-termination-handler.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml) } @@ -173,28 +174,35 @@ if [ "$1" == 'upgrade' ]; then # - update oidc do we need that ? elif [[ "$1" == 'node-upgrade' ]]; then - echo "Starting node upgrade ..." 
-  if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then
-    mv ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled
-    echo "Disabled old cluster backup OS cronjob"
-  fi
-
   echo "Migrating kubezero.yaml"
+  yq -i eval '.api.etcdServers = .api.allEtcdEndpoints | .network.multus.enabled = "true"' ${HOSTFS}/etc/kubernetes/kubezero.yaml
 
-  export restic_repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled | sed -e 's/.*=//' | sed -e 's/"//g')
-  export restic_pw="$(cat ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase)"
-  export REGION=$(kubectl get node $NODE_NAME -o yaml | yq eval '.metadata.labels."topology.kubernetes.io/region"' -)
+  # AWS
+  if [ -f ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase ]; then
+    if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then
+      mv ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled
+      echo "Disabled old cluster backup OS cronjob"
+    fi
 
-  # enable backup and awsIamAuth. multus, match other reorg
-  yq -Mi e '.api.awsIamAuth.enabled = "true" | .api.awsIamAuth.workerNodeRole = .workerNodeRole | .api.awsIamAuth.kubeAdminRole = .kubeAdminRole
-    | .api.serviceAccountIssuer = .serviceAccountIssuer | .api.apiAudiences = "istio-ca,sts.amazonaws.com"
-    | .api.etcdServers = .api.allEtcdEndpoints
-    | .network.multus.enabled = "true"
-    | .addons.clusterBackup.enabled = "true" | .addons.clusterBackup.repository = strenv(restic_repo) | .addons.clusterBackup.password = strenv(restic_pw)
-    | .addons.clusterBackup.extraEnv[0].name = "AWS_DEFAULT_REGION" | .addons.clusterBackup.extraEnv[0].value = strenv(REGION)
-    ' ${HOSTFS}/etc/kubernetes/kubezero.yaml
+    # enable backup and awsIamAuth & multus
+    yq -i eval '
+      .api.awsIamAuth.enabled = "true" | .api.awsIamAuth.workerNodeRole = .workerNodeRole | .api.awsIamAuth.kubeAdminRole = .kubeAdminRole
+      | .api.serviceAccountIssuer = .serviceAccountIssuer | .api.apiAudiences = "istio-ca,sts.amazonaws.com"
+      ' ${HOSTFS}/etc/kubernetes/kubezero.yaml
+
+    export restic_repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled | sed -e 's/.*=//' | sed -e 's/"//g')
+    export restic_pw="$(cat ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase)"
+    export REGION=$(kubectl get node $NODE_NAME -o yaml | yq eval '.metadata.labels."topology.kubernetes.io/region"' -)
+
+    if [ -n "$restic_repo" ]; then
+      yq -i eval '
+        .addons.clusterBackup.enabled = "true" | .addons.clusterBackup.repository = strenv(restic_repo) | .addons.clusterBackup.password = strenv(restic_pw)
+        | .addons.clusterBackup.extraEnv[0].name = "AWS_DEFAULT_REGION" | .addons.clusterBackup.extraEnv[0].value = strenv(REGION)
+        ' ${HOSTFS}/etc/kubernetes/kubezero.yaml
+    fi
+  fi
 
   echo "All done."
 
@@ -284,7 +292,7 @@ elif [[ "$1" =~ "^(bootstrap|recover|join)$" ]]; then
     yq eval -M ".clusters[0].cluster.certificate-authority-data = \"$(cat ${HOSTFS}/etc/kubernetes/pki/ca.crt | base64 -w0)\"" ${WORKDIR}/kubeadm/templates/admin-aws-iam.yaml > ${HOSTFS}/etc/kubernetes/admin-aws-iam.yaml
   fi
 
-  # Install some basics on bootstrap and join for 1.21.7 to get new modules in place
+  # Install some basics on bootstrap and join for 1.21 to get new modules in place
   if [[ "$1" =~ "^(bootstrap|join|recover)$" ]]; then
     helm repo add kubezero https://cdn.zero-downtime.net/charts/
 
@@ -322,11 +330,15 @@ elif [ "$1" == 'backup' ]; then
 
   echo "Backup complete"
 
-  # Remove all previous
+  # Remove backups from previous versions
   restic forget --keep-tag $VERSION --prune
 
+  # Regular retention
   restic forget --keep-hourly 24 --keep-daily ${RESTIC_RETENTION:-7} --prune
 
+  # Defrag etcd backend
+  etcdctl --endpoints=https://${ETCD_NODENAME}:2379 defrag
+
 elif [ "$1" == 'restore' ]; then
 
   mkdir -p ${WORKDIR}
@@ -342,6 +354,10 @@ elif [ "$1" == 'restore' ]; then
   # Always use kubeadm kubectl config to never run into chicken egg with custom auth hooks
   cp ${WORKDIR}/admin.conf ${HOSTFS}/root/.kube/config
 
+elif [ "$1" == 'debug_shell' ]; then
+  echo "Entering debug shell"
+  /bin/sh
+
 else
   echo "Unknown command!"
   exit 1
diff --git a/containers/admin/v1.21/upgrade_121.sh b/containers/admin/v1.21/upgrade_121.sh
index ee23afc..1d95a9f 100755
--- a/containers/admin/v1.21/upgrade_121.sh
+++ b/containers/admin/v1.21/upgrade_121.sh
@@ -1,5 +1,8 @@
-#!/bin/bash
-set -eux
+#!/bin/bash -e
+
+VERSION="v1.21.8"
+
+[ -n "$DEBUG" ] && DEBUG=1
 
 # unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth
 unset AWS_DEFAULT_PROFILE
@@ -9,11 +12,11 @@
 nodes=$(kubectl get nodes -l node-role.kubernetes.io/control-plane -o json | jq .items[].metadata.name -r)
 
 for node in $nodes; do
   echo "Deploying node upgrade job on $node..."
-  cat <<'EOF' | sed -e "s/__node__/$node/g" | kubectl apply -f -
+  cat </dev/null
   while true; do
-    kubectl logs kubezero-upgrade-node-$node -n kube-system -f && break
+    kubectl logs kubezero-upgrade-${VERSION//.}-node-$node -n kube-system -f 2>/dev/null && break
     sleep 3
   done
-  kubectl delete pod kubezero-upgrade-node-$node -n kube-system
+  kubectl delete pod kubezero-upgrade-${VERSION//.}-node-$node -n kube-system
 done
 
 echo "Deploying cluster upgrade job ..."
-cat <<'EOF' | kubectl apply -f -
+cat </dev/null
 while true; do
-  kubectl logs kubezero-upgrade-v121 -n kube-system -f && break
+  kubectl logs kubezero-upgrade-${VERSION//.} -n kube-system -f 2>/dev/null && break
   sleep 3
 done
+kubectl delete pod kubezero-upgrade-${VERSION//.} -n kube-system
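
Usage note (not part of the patch itself): the renamed upgrade pods rely on bash's pattern-deletion expansion, and the new defrag step reuses the etcdctl certificate variables that kubezero.sh already exports; ETCD_NODENAME is set by parse_kubezero() from kubezero.yaml. A minimal sketch of both, runnable in a plain bash shell, with the node name and etcd endpoint below being placeholders rather than values from this repo:

    # ${VERSION//.} deletes every "." from VERSION, so v1.21.8 becomes v1218
    VERSION="v1.21.8"
    node="worker1"                                    # placeholder node name
    echo "kubezero-upgrade-${VERSION//.}-node-$node"  # prints kubezero-upgrade-v1218-node-worker1

    # The backup task's defrag step shown standalone; etcdctl v3 additionally needs
    # ETCDCTL_API=3 and the ETCDCTL_CACERT/CERT/KEY variables exported in kubezero.sh.
    export ETCDCTL_API=3
    ETCD_NODENAME="etcd0.example.com"                 # placeholder; parse_kubezero() exports the real value
    etcdctl --endpoints=https://${ETCD_NODENAME}:2379 defrag

Running defrag right after the restic prune keeps the etcd backend compact after snapshots and removals, at the cost of a brief pause on that member.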