feat: improved admin debug_shell, add etcd defrag to backup

This commit is contained in:
Stefan Reimer 2021-12-17 11:57:23 +01:00
parent 7bade805ee
commit c58feb9882
7 changed files with 76 additions and 56 deletions

View File

@ -5,7 +5,7 @@ FROM alpine:${ALPINE_VERSION}
ARG KUBE_VERSION ARG KUBE_VERSION
RUN cd /etc/apk/keys && \ RUN cd /etc/apk/keys && \
wget "https://cdn.zero-downtime.net/alpine/stefan@zero-downtime.net-6183d596.rsa.pub" && \ wget "https://cdn.zero-downtime.net/alpine/stefan@zero-downtime.net-61bb6bfb.rsa.pub" && \
echo "@kubezero https://cdn.zero-downtime.net/alpine/v${KUBE_VERSION}/kubezero" >> /etc/apk/repositories && \ echo "@kubezero https://cdn.zero-downtime.net/alpine/v${KUBE_VERSION}/kubezero" >> /etc/apk/repositories && \
echo "@testing http://dl-cdn.alpinelinux.org/alpine/edge/testing" >> /etc/apk/repositories && \ echo "@testing http://dl-cdn.alpinelinux.org/alpine/edge/testing" >> /etc/apk/repositories && \
apk upgrade -U -a && \ apk upgrade -U -a && \

View File

@ -1,4 +1,4 @@
VERSION ?= 1.21.7 VERSION ?= 1.21.8
ALPINE_VERSION ?= 3.15 ALPINE_VERSION ?= 3.15
REGISTRY := public.ecr.aws/zero-downtime REGISTRY := public.ecr.aws/zero-downtime
REPOSITORY := kubezero-admin REPOSITORY := kubezero-admin

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubeadm name: kubeadm
description: KubeZero Kubeadm cluster config description: KubeZero Kubeadm cluster config
type: application type: application
version: 1.21.7 version: 1.21.8
home: https://kubezero.com home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords: keywords:

View File

@ -1,6 +1,6 @@
# kubeadm # kubeadm
![Version: 1.21.7](https://img.shields.io/badge/Version-1.21.7-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![Version: 1.21.8](https://img.shields.io/badge/Version-1.21.8-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
KubeZero Kubeadm cluster config KubeZero Kubeadm cluster config
@ -30,7 +30,7 @@ Kubernetes: `>= 1.20.0`
| api.awsIamAuth.kubeAdminRole | string | `"arn:aws:iam::000000000000:role/KubernetesNode"` | | | api.awsIamAuth.kubeAdminRole | string | `"arn:aws:iam::000000000000:role/KubernetesNode"` | |
| api.awsIamAuth.workerNodeRole | string | `"arn:aws:iam::000000000000:role/KubernetesNode"` | | | api.awsIamAuth.workerNodeRole | string | `"arn:aws:iam::000000000000:role/KubernetesNode"` | |
| api.endpoint | string | `"kube-api.changeme.org:6443"` | | | api.endpoint | string | `"kube-api.changeme.org:6443"` | |
| api.etcdServers | string | `"https://localhost:2379"` | | | api.etcdServers | string | `"https://etcd:2379"` | |
| api.extraArgs | object | `{}` | | | api.extraArgs | object | `{}` | |
| api.listenPort | int | `6443` | | | api.listenPort | int | `6443` | |
| api.oidcEndpoint | string | `""` | s3://${CFN[ConfigBucket]}/k8s/$CLUSTERNAME | | api.oidcEndpoint | string | `""` | s3://${CFN[ConfigBucket]}/k8s/$CLUSTERNAME |
@ -46,7 +46,7 @@ Kubernetes: `>= 1.20.0`
| network.cilium.enabled | bool | `false` | | | network.cilium.enabled | bool | `false` | |
| network.multus.enabled | bool | `false` | | | network.multus.enabled | bool | `false` | |
| network.multus.tag | string | `"v3.8"` | | | network.multus.tag | string | `"v3.8"` | |
| nodeName | string | `"localhost"` | set to $HOSTNAME | | nodeName | string | `"kubezero-node"` | set to $HOSTNAME |
| protectKernelDefaults | bool | `true` | | | protectKernelDefaults | bool | `true` | |
| systemd | bool | `true` | Set to false for openrc, eg. on Gentoo or Alpine | | systemd | bool | `true` | Set to false for openrc, eg. on Gentoo or Alpine |

View File

@ -1,6 +1,6 @@
clusterName: pleasechangeme clusterName: pleasechangeme
# -- set to $HOSTNAME # -- set to $HOSTNAME
nodeName: localhost nodeName: kubezero-node
domain: changeme.org domain: changeme.org
# -- Needs to be set to primary node IP # -- Needs to be set to primary node IP
@ -9,7 +9,7 @@ listenAddress: 0.0.0.0
api: api:
endpoint: kube-api.changeme.org:6443 endpoint: kube-api.changeme.org:6443
listenPort: 6443 listenPort: 6443
etcdServers: "https://localhost:2379" etcdServers: "https://etcd:2379"
extraArgs: {} extraArgs: {}
# -- https://s3.${REGION}.amazonaws.com/${CFN[ConfigBucket]}/k8s/$CLUSTERNAME # -- https://s3.${REGION}.amazonaws.com/${CFN[ConfigBucket]}/k8s/$CLUSTERNAME
serviceAccountIssuer: "" serviceAccountIssuer: ""

View File

@ -1,9 +1,15 @@
#!/bin/sh #!/bin/sh
set -e set -e
WORKDIR=/tmp/kubezero if [ -n "$DEBUG" ]; then
HOSTFS=/host set -x
VERSION=v1.21 LOG="--v=5"
fi
# Export vars to ease use in debug_shell etc
export WORKDIR=/tmp/kubezero
export HOSTFS=/host
export VERSION=v1.21
export KUBECONFIG="${HOSTFS}/root/.kube/config" export KUBECONFIG="${HOSTFS}/root/.kube/config"
@ -13,11 +19,6 @@ export ETCDCTL_CACERT=${HOSTFS}/etc/kubernetes/pki/etcd/ca.crt
export ETCDCTL_CERT=${HOSTFS}/etc/kubernetes/pki/apiserver-etcd-client.crt export ETCDCTL_CERT=${HOSTFS}/etc/kubernetes/pki/apiserver-etcd-client.crt
export ETCDCTL_KEY=${HOSTFS}/etc/kubernetes/pki/apiserver-etcd-client.key export ETCDCTL_KEY=${HOSTFS}/etc/kubernetes/pki/apiserver-etcd-client.key
if [ -n "$DEBUG" ]; then
set -x
LOG="--v=5"
fi
# Generic retry utility # Generic retry utility
retry() { retry() {
local tries=$1 local tries=$1
@ -60,12 +61,12 @@ render_kubeadm() {
parse_kubezero() { parse_kubezero() {
[ -f ${HOSTFS}/etc/kubernetes/kubezero.yaml ] || { echo "Missing /etc/kubernetes/kubezero.yaml!"; exit 1; } [ -f ${HOSTFS}/etc/kubernetes/kubezero.yaml ] || { echo "Missing /etc/kubernetes/kubezero.yaml!"; exit 1; }
KUBE_VERSION=$(kubeadm version -o yaml | yq eval .clientVersion.gitVersion -) export KUBE_VERSION=$(kubeadm version -o yaml | yq eval .clientVersion.gitVersion -)
CLUSTERNAME=$(yq eval '.clusterName' ${HOSTFS}/etc/kubernetes/kubezero.yaml) export CLUSTERNAME=$(yq eval '.clusterName' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
ETCD_NODENAME=$(yq eval '.etcd.nodeName' ${HOSTFS}/etc/kubernetes/kubezero.yaml) export ETCD_NODENAME=$(yq eval '.etcd.nodeName' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
AWS_IAM_AUTH=$(yq eval '.api.awsIamAuth.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml) export AWS_IAM_AUTH=$(yq eval '.api.awsIamAuth.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
AWS_NTH=$(yq eval '.addons.aws-node-termination-handler.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml) export AWS_NTH=$(yq eval '.addons.aws-node-termination-handler.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
} }
@ -173,28 +174,35 @@ if [ "$1" == 'upgrade' ]; then
# - update oidc do we need that ? # - update oidc do we need that ?
elif [[ "$1" == 'node-upgrade' ]]; then elif [[ "$1" == 'node-upgrade' ]]; then
echo "Starting node upgrade ..." echo "Starting node upgrade ..."
echo "Migrating kubezero.yaml"
yq -i eval '.api.etcdServers = .api.allEtcdEndpoints | .network.multus.enabled = "true"' ${HOSTFS}/etc/kubernetes/kubezero.yaml
# AWS
if [ -f ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase ]; then
if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then
mv ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled mv ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled
echo "Disabled old cluster backup OS cronjob" echo "Disabled old cluster backup OS cronjob"
fi fi
echo "Migrating kubezero.yaml" # enable backup and awsIamAuth & multus
yq -i eval '
.api.awsIamAuth.enabled = "true" | .api.awsIamAuth.workerNodeRole = .workerNodeRole | .api.awsIamAuth.kubeAdminRole = .kubeAdminRole
| .api.serviceAccountIssuer = .serviceAccountIssuer | .api.apiAudiences = "istio-ca,sts.amazonaws.com"
' ${HOSTFS}/etc/kubernetes/kubezero.yaml
export restic_repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled | sed -e 's/.*=//' | sed -e 's/"//g') export restic_repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled | sed -e 's/.*=//' | sed -e 's/"//g')
export restic_pw="$(cat ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase)" export restic_pw="$(cat ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase)"
export REGION=$(kubectl get node $NODE_NAME -o yaml | yq eval '.metadata.labels."topology.kubernetes.io/region"' -) export REGION=$(kubectl get node $NODE_NAME -o yaml | yq eval '.metadata.labels."topology.kubernetes.io/region"' -)
# enable backup and awsIamAuth. multus, match other reorg if [ -n "$restic_repo" ]; then
yq -Mi e '.api.awsIamAuth.enabled = "true" | .api.awsIamAuth.workerNodeRole = .workerNodeRole | .api.awsIamAuth.kubeAdminRole = .kubeAdminRole yq -i eval '
| .api.serviceAccountIssuer = .serviceAccountIssuer | .api.apiAudiences = "istio-ca,sts.amazonaws.com" .addons.clusterBackup.enabled = "true" | .addons.clusterBackup.repository = strenv(restic_repo) | .addons.clusterBackup.password = strenv(restic_pw)
| .api.etcdServers = .api.allEtcdEndpoints
| .network.multus.enabled = "true"
| .addons.clusterBackup.enabled = "true" | .addons.clusterBackup.repository = strenv(restic_repo) | .addons.clusterBackup.password = strenv(restic_pw)
| .addons.clusterBackup.extraEnv[0].name = "AWS_DEFAULT_REGION" | .addons.clusterBackup.extraEnv[0].value = strenv(REGION) | .addons.clusterBackup.extraEnv[0].name = "AWS_DEFAULT_REGION" | .addons.clusterBackup.extraEnv[0].value = strenv(REGION)
' ${HOSTFS}/etc/kubernetes/kubezero.yaml ' ${HOSTFS}/etc/kubernetes/kubezero.yaml
fi
fi
echo "All done." echo "All done."
@ -284,7 +292,7 @@ elif [[ "$1" =~ "^(bootstrap|recover|join)$" ]]; then
yq eval -M ".clusters[0].cluster.certificate-authority-data = \"$(cat ${HOSTFS}/etc/kubernetes/pki/ca.crt | base64 -w0)\"" ${WORKDIR}/kubeadm/templates/admin-aws-iam.yaml > ${HOSTFS}/etc/kubernetes/admin-aws-iam.yaml yq eval -M ".clusters[0].cluster.certificate-authority-data = \"$(cat ${HOSTFS}/etc/kubernetes/pki/ca.crt | base64 -w0)\"" ${WORKDIR}/kubeadm/templates/admin-aws-iam.yaml > ${HOSTFS}/etc/kubernetes/admin-aws-iam.yaml
fi fi
# Install some basics on bootstrap and join for 1.21.7 to get new modules in place # Install some basics on bootstrap and join for 1.21 to get new modules in place
if [[ "$1" =~ "^(bootstrap|join|recover)$" ]]; then if [[ "$1" =~ "^(bootstrap|join|recover)$" ]]; then
helm repo add kubezero https://cdn.zero-downtime.net/charts/ helm repo add kubezero https://cdn.zero-downtime.net/charts/
@ -322,11 +330,15 @@ elif [ "$1" == 'backup' ]; then
echo "Backup complete" echo "Backup complete"
# Remove all previous # Remove backups from previous versions
restic forget --keep-tag $VERSION --prune restic forget --keep-tag $VERSION --prune
# Regular retention
restic forget --keep-hourly 24 --keep-daily ${RESTIC_RETENTION:-7} --prune restic forget --keep-hourly 24 --keep-daily ${RESTIC_RETENTION:-7} --prune
# Defrag etcd backend
etcdctl --endpoints=https://${ETCD_NODENAME}:2379 defrag
elif [ "$1" == 'restore' ]; then elif [ "$1" == 'restore' ]; then
mkdir -p ${WORKDIR} mkdir -p ${WORKDIR}
@ -342,6 +354,10 @@ elif [ "$1" == 'restore' ]; then
# Always use kubeadm kubectl config to never run into chicken egg with custom auth hooks # Always use kubeadm kubectl config to never run into chicken egg with custom auth hooks
cp ${WORKDIR}/admin.conf ${HOSTFS}/root/.kube/config cp ${WORKDIR}/admin.conf ${HOSTFS}/root/.kube/config
elif [ "$1" == 'debug_shell' ]; then
echo "Entering debug shell"
/bin/sh
else else
echo "Unknown command!" echo "Unknown command!"
exit 1 exit 1

View File

@ -1,5 +1,8 @@
#!/bin/bash #!/bin/bash -e
set -eux
VERSION="v1.21.8"
[ -n "$DEBUG" ] && DEBUG=1
# unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth # unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth
unset AWS_DEFAULT_PROFILE unset AWS_DEFAULT_PROFILE
@ -9,11 +12,11 @@ nodes=$(kubectl get nodes -l node-role.kubernetes.io/control-plane -o json | jq
for node in $nodes; do for node in $nodes; do
echo "Deploying node upgrade job on $node..." echo "Deploying node upgrade job on $node..."
cat <<'EOF' | sed -e "s/__node__/$node/g" | kubectl apply -f - cat <<EOF | sed -e "s/__node__/$node/g" | kubectl apply -f -
apiVersion: v1 apiVersion: v1
kind: Pod kind: Pod
metadata: metadata:
name: kubezero-upgrade-node-__node__ name: kubezero-upgrade-${VERSION//.}-node-__node__
namespace: kube-system namespace: kube-system
labels: labels:
app: kubezero-upgrade-node app: kubezero-upgrade-node
@ -21,14 +24,14 @@ spec:
hostNetwork: true hostNetwork: true
containers: containers:
- name: kubezero-admin - name: kubezero-admin
image: public.ecr.aws/zero-downtime/kubezero-admin:v1.21.7 image: public.ecr.aws/zero-downtime/kubezero-admin:${VERSION}
imagePullPolicy: Always imagePullPolicy: Always
command: ["kubezero.sh"] command: ["kubezero.sh"]
args: args:
- node-upgrade - node-upgrade
env: env:
- name: DEBUG - name: DEBUG
value: "1" value: "$DEBUG"
- name: NODE_NAME - name: NODE_NAME
valueFrom: valueFrom:
fieldRef: fieldRef:
@ -55,21 +58,21 @@ spec:
effect: NoSchedule effect: NoSchedule
restartPolicy: Never restartPolicy: Never
EOF EOF
kubectl wait pod kubezero-upgrade-node-$node -n kube-system --timeout 120s --for=condition=initialized kubectl wait pod kubezero-upgrade-${VERSION//.}-node-$node -n kube-system --timeout 120s --for=condition=initialized 2>/dev/null
while true; do while true; do
kubectl logs kubezero-upgrade-node-$node -n kube-system -f && break kubectl logs kubezero-upgrade-${VERSION//.}-node-$node -n kube-system -f 2>/dev/null && break
sleep 3 sleep 3
done done
kubectl delete pod kubezero-upgrade-node-$node -n kube-system kubectl delete pod kubezero-upgrade-${VERSION//.}-node-$node -n kube-system
done done
echo "Deploying cluster upgrade job ..." echo "Deploying cluster upgrade job ..."
cat <<'EOF' | kubectl apply -f - cat <<EOF | kubectl apply -f -
apiVersion: v1 apiVersion: v1
kind: Pod kind: Pod
metadata: metadata:
name: kubezero-upgrade-v121 name: kubezero-upgrade-${VERSION//.}
namespace: kube-system namespace: kube-system
labels: labels:
app: kubezero-upgrade app: kubezero-upgrade
@ -79,14 +82,14 @@ spec:
#hostPID: true #hostPID: true
containers: containers:
- name: kubezero-admin - name: kubezero-admin
image: public.ecr.aws/zero-downtime/kubezero-admin:v1.21.7 image: public.ecr.aws/zero-downtime/kubezero-admin:${VERSION}
imagePullPolicy: Always imagePullPolicy: Always
command: ["kubezero.sh"] command: ["kubezero.sh"]
args: args:
- upgrade - upgrade
env: env:
- name: DEBUG - name: DEBUG
value: "1" value: "$DEBUG"
- name: NODE_NAME - name: NODE_NAME
valueFrom: valueFrom:
fieldRef: fieldRef:
@ -107,15 +110,16 @@ spec:
- name: workdir - name: workdir
emptyDir: {} emptyDir: {}
nodeSelector: nodeSelector:
node-role.kubernetes.io/master: "" node-role.kubernetes.io/control-plane: ""
tolerations: tolerations:
- key: node-role.kubernetes.io/master - key: node-role.kubernetes.io/master
effect: NoSchedule effect: NoSchedule
restartPolicy: Never restartPolicy: Never
EOF EOF
kubectl wait pod kubezero-upgrade-v121 -n kube-system --timeout 120s --for=condition=initialized kubectl wait pod kubezero-upgrade-${VERSION//.} -n kube-system --timeout 120s --for=condition=initialized 2>/dev/null
while true; do while true; do
kubectl logs kubezero-upgrade-v121 -n kube-system -f && break kubectl logs kubezero-upgrade-${VERSION//.} -n kube-system -f 2>/dev/null && break
sleep 3 sleep 3
done done
kubectl delete pod kubezero-upgrade-${VERSION//.} -n kube-system