feat: improve admin debug_shell, add etcd defrag to backup

This commit is contained in:
Stefan Reimer 2021-12-17 11:57:23 +01:00
parent d1864912ea
commit b37aa965f7
7 changed files with 76 additions and 56 deletions

View File

@ -5,7 +5,7 @@ FROM alpine:${ALPINE_VERSION}
ARG KUBE_VERSION
RUN cd /etc/apk/keys && \
wget "https://cdn.zero-downtime.net/alpine/stefan@zero-downtime.net-6183d596.rsa.pub" && \
wget "https://cdn.zero-downtime.net/alpine/stefan@zero-downtime.net-61bb6bfb.rsa.pub" && \
echo "@kubezero https://cdn.zero-downtime.net/alpine/v${KUBE_VERSION}/kubezero" >> /etc/apk/repositories && \
echo "@testing http://dl-cdn.alpinelinux.org/alpine/edge/testing" >> /etc/apk/repositories && \
apk upgrade -U -a && \

View File

@ -1,4 +1,4 @@
VERSION ?= 1.21.7
VERSION ?= 1.21.8
ALPINE_VERSION ?= 3.15
REGISTRY := public.ecr.aws/zero-downtime
REPOSITORY := kubezero-admin

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubeadm
description: KubeZero Kubeadm cluster config
type: application
version: 1.21.7
version: 1.21.8
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:

View File

@ -1,6 +1,6 @@
# kubeadm
![Version: 1.21.7](https://img.shields.io/badge/Version-1.21.7-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
![Version: 1.21.8](https://img.shields.io/badge/Version-1.21.8-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
KubeZero Kubeadm cluster config
@ -30,7 +30,7 @@ Kubernetes: `>= 1.20.0`
| api.awsIamAuth.kubeAdminRole | string | `"arn:aws:iam::000000000000:role/KubernetesNode"` | |
| api.awsIamAuth.workerNodeRole | string | `"arn:aws:iam::000000000000:role/KubernetesNode"` | |
| api.endpoint | string | `"kube-api.changeme.org:6443"` | |
| api.etcdServers | string | `"https://localhost:2379"` | |
| api.etcdServers | string | `"https://etcd:2379"` | |
| api.extraArgs | object | `{}` | |
| api.listenPort | int | `6443` | |
| api.oidcEndpoint | string | `""` | s3://${CFN[ConfigBucket]}/k8s/$CLUSTERNAME |
@ -46,7 +46,7 @@ Kubernetes: `>= 1.20.0`
| network.cilium.enabled | bool | `false` | |
| network.multus.enabled | bool | `false` | |
| network.multus.tag | string | `"v3.8"` | |
| nodeName | string | `"localhost"` | set to $HOSTNAME |
| nodeName | string | `"kubezero-node"` | set to $HOSTNAME |
| protectKernelDefaults | bool | `true` | |
| systemd | bool | `true` | Set to false for openrc, eg. on Gentoo or Alpine |

View File

@ -1,6 +1,6 @@
clusterName: pleasechangeme
# -- set to $HOSTNAME
nodeName: localhost
nodeName: kubezero-node
domain: changeme.org
# -- Needs to be set to primary node IP
@ -9,7 +9,7 @@ listenAddress: 0.0.0.0
api:
endpoint: kube-api.changeme.org:6443
listenPort: 6443
etcdServers: "https://localhost:2379"
etcdServers: "https://etcd:2379"
extraArgs: {}
# -- https://s3.${REGION}.amazonaws.com/${CFN[ConfigBucket]}/k8s/$CLUSTERNAME
serviceAccountIssuer: ""

View File

@ -1,9 +1,15 @@
#!/bin/sh
set -e
WORKDIR=/tmp/kubezero
HOSTFS=/host
VERSION=v1.21
if [ -n "$DEBUG" ]; then
set -x
LOG="--v=5"
fi
# Export vars to ease use in debug_shell etc
export WORKDIR=/tmp/kubezero
export HOSTFS=/host
export VERSION=v1.21
export KUBECONFIG="${HOSTFS}/root/.kube/config"
@ -13,11 +19,6 @@ export ETCDCTL_CACERT=${HOSTFS}/etc/kubernetes/pki/etcd/ca.crt
export ETCDCTL_CERT=${HOSTFS}/etc/kubernetes/pki/apiserver-etcd-client.crt
export ETCDCTL_KEY=${HOSTFS}/etc/kubernetes/pki/apiserver-etcd-client.key
if [ -n "$DEBUG" ]; then
set -x
LOG="--v=5"
fi
# Generic retry utility
retry() {
local tries=$1
@ -60,12 +61,12 @@ render_kubeadm() {
parse_kubezero() {
[ -f ${HOSTFS}/etc/kubernetes/kubezero.yaml ] || { echo "Missing /etc/kubernetes/kubezero.yaml!"; exit 1; }
KUBE_VERSION=$(kubeadm version -o yaml | yq eval .clientVersion.gitVersion -)
CLUSTERNAME=$(yq eval '.clusterName' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
ETCD_NODENAME=$(yq eval '.etcd.nodeName' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
export KUBE_VERSION=$(kubeadm version -o yaml | yq eval .clientVersion.gitVersion -)
export CLUSTERNAME=$(yq eval '.clusterName' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
export ETCD_NODENAME=$(yq eval '.etcd.nodeName' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
AWS_IAM_AUTH=$(yq eval '.api.awsIamAuth.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
AWS_NTH=$(yq eval '.addons.aws-node-termination-handler.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
export AWS_IAM_AUTH=$(yq eval '.api.awsIamAuth.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
export AWS_NTH=$(yq eval '.addons.aws-node-termination-handler.enabled' ${HOSTFS}/etc/kubernetes/kubezero.yaml)
}
@ -173,28 +174,35 @@ if [ "$1" == 'upgrade' ]; then
# - update oidc do we need that ?
elif [[ "$1" == 'node-upgrade' ]]; then
echo "Starting node upgrade ..."
if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then
mv ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled
echo "Disabled old cluster backup OS cronjob"
fi
echo "Migrating kubezero.yaml"
yq -i eval '.api.etcdServers = .api.allEtcdEndpoints | .network.multus.enabled = "true"' ${HOSTFS}/etc/kubernetes/kubezero.yaml
export restic_repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled | sed -e 's/.*=//' | sed -e 's/"//g')
export restic_pw="$(cat ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase)"
export REGION=$(kubectl get node $NODE_NAME -o yaml | yq eval '.metadata.labels."topology.kubernetes.io/region"' -)
# AWS
if [ -f ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase ]; then
if [ -f ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ]; then
mv ${HOSTFS}/usr/local/sbin/backup_control_plane.sh ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled
echo "Disabled old cluster backup OS cronjob"
fi
# enable backup and awsIamAuth. multus, match other reorg
yq -Mi e '.api.awsIamAuth.enabled = "true" | .api.awsIamAuth.workerNodeRole = .workerNodeRole | .api.awsIamAuth.kubeAdminRole = .kubeAdminRole
| .api.serviceAccountIssuer = .serviceAccountIssuer | .api.apiAudiences = "istio-ca,sts.amazonaws.com"
| .api.etcdServers = .api.allEtcdEndpoints
| .network.multus.enabled = "true"
| .addons.clusterBackup.enabled = "true" | .addons.clusterBackup.repository = strenv(restic_repo) | .addons.clusterBackup.password = strenv(restic_pw)
| .addons.clusterBackup.extraEnv[0].name = "AWS_DEFAULT_REGION" | .addons.clusterBackup.extraEnv[0].value = strenv(REGION)
' ${HOSTFS}/etc/kubernetes/kubezero.yaml
# enable backup and awsIamAuth & multus
yq -i eval '
.api.awsIamAuth.enabled = "true" | .api.awsIamAuth.workerNodeRole = .workerNodeRole | .api.awsIamAuth.kubeAdminRole = .kubeAdminRole
| .api.serviceAccountIssuer = .serviceAccountIssuer | .api.apiAudiences = "istio-ca,sts.amazonaws.com"
' ${HOSTFS}/etc/kubernetes/kubezero.yaml
export restic_repo=$(grep "export RESTIC_REPOSITORY" ${HOSTFS}/usr/local/sbin/backup_control_plane.disabled | sed -e 's/.*=//' | sed -e 's/"//g')
export restic_pw="$(cat ${HOSTFS}/etc/cloudbender/clusterBackup.passphrase)"
export REGION=$(kubectl get node $NODE_NAME -o yaml | yq eval '.metadata.labels."topology.kubernetes.io/region"' -)
if [ -n "$restic_repo" ]; then
yq -i eval '
.addons.clusterBackup.enabled = "true" | .addons.clusterBackup.repository = strenv(restic_repo) | .addons.clusterBackup.password = strenv(restic_pw)
| .addons.clusterBackup.extraEnv[0].name = "AWS_DEFAULT_REGION" | .addons.clusterBackup.extraEnv[0].value = strenv(REGION)
' ${HOSTFS}/etc/kubernetes/kubezero.yaml
fi
fi
echo "All done."
@ -284,7 +292,7 @@ elif [[ "$1" =~ "^(bootstrap|recover|join)$" ]]; then
yq eval -M ".clusters[0].cluster.certificate-authority-data = \"$(cat ${HOSTFS}/etc/kubernetes/pki/ca.crt | base64 -w0)\"" ${WORKDIR}/kubeadm/templates/admin-aws-iam.yaml > ${HOSTFS}/etc/kubernetes/admin-aws-iam.yaml
fi
# Install some basics on bootstrap and join for 1.21.7 to get new modules in place
# Install some basics on bootstrap and join for 1.21 to get new modules in place
if [[ "$1" =~ "^(bootstrap|join|recover)$" ]]; then
helm repo add kubezero https://cdn.zero-downtime.net/charts/
@ -322,11 +330,15 @@ elif [ "$1" == 'backup' ]; then
echo "Backup complete"
# Remove all previous
# Remove backups from previous versions
restic forget --keep-tag $VERSION --prune
# Regular retention
restic forget --keep-hourly 24 --keep-daily ${RESTIC_RETENTION:-7} --prune
# Defrag etcd backend
etcdctl --endpoints=https://${ETCD_NODENAME}:2379 defrag
elif [ "$1" == 'restore' ]; then
mkdir -p ${WORKDIR}
@ -342,6 +354,10 @@ elif [ "$1" == 'restore' ]; then
# Always use kubeadm kubectl config to never run into chicken egg with custom auth hooks
cp ${WORKDIR}/admin.conf ${HOSTFS}/root/.kube/config
elif [ "$1" == 'debug_shell' ]; then
echo "Entering debug shell"
/bin/sh
else
echo "Unknown command!"
exit 1

View File

@ -1,5 +1,8 @@
#!/bin/bash
set -eux
#!/bin/bash -e
VERSION="v1.21.8"
[ -n "$DEBUG" ] && DEBUG=1
# unset any AWS_DEFAULT_PROFILE as it will break aws-iam-auth
unset AWS_DEFAULT_PROFILE
@ -9,11 +12,11 @@ nodes=$(kubectl get nodes -l node-role.kubernetes.io/control-plane -o json | jq
for node in $nodes; do
echo "Deploying node upgrade job on $node..."
cat <<'EOF' | sed -e "s/__node__/$node/g" | kubectl apply -f -
cat <<EOF | sed -e "s/__node__/$node/g" | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
name: kubezero-upgrade-node-__node__
name: kubezero-upgrade-${VERSION//.}-node-__node__
namespace: kube-system
labels:
app: kubezero-upgrade-node
@ -21,14 +24,14 @@ spec:
hostNetwork: true
containers:
- name: kubezero-admin
image: public.ecr.aws/zero-downtime/kubezero-admin:v1.21.7
image: public.ecr.aws/zero-downtime/kubezero-admin:${VERSION}
imagePullPolicy: Always
command: ["kubezero.sh"]
args:
- node-upgrade
env:
- name: DEBUG
value: "1"
value: "$DEBUG"
- name: NODE_NAME
valueFrom:
fieldRef:
@ -55,21 +58,21 @@ spec:
effect: NoSchedule
restartPolicy: Never
EOF
kubectl wait pod kubezero-upgrade-node-$node -n kube-system --timeout 120s --for=condition=initialized
kubectl wait pod kubezero-upgrade-${VERSION//.}-node-$node -n kube-system --timeout 120s --for=condition=initialized 2>/dev/null
while true; do
kubectl logs kubezero-upgrade-node-$node -n kube-system -f && break
kubectl logs kubezero-upgrade-${VERSION//.}-node-$node -n kube-system -f 2>/dev/null && break
sleep 3
done
kubectl delete pod kubezero-upgrade-node-$node -n kube-system
kubectl delete pod kubezero-upgrade-${VERSION//.}-node-$node -n kube-system
done
echo "Deploying cluster upgrade job ..."
cat <<'EOF' | kubectl apply -f -
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
name: kubezero-upgrade-v121
name: kubezero-upgrade-${VERSION//.}
namespace: kube-system
labels:
app: kubezero-upgrade
@ -79,14 +82,14 @@ spec:
#hostPID: true
containers:
- name: kubezero-admin
image: public.ecr.aws/zero-downtime/kubezero-admin:v1.21.7
image: public.ecr.aws/zero-downtime/kubezero-admin:${VERSION}
imagePullPolicy: Always
command: ["kubezero.sh"]
args:
- upgrade
env:
- name: DEBUG
value: "1"
value: "$DEBUG"
- name: NODE_NAME
valueFrom:
fieldRef:
@ -107,15 +110,16 @@ spec:
- name: workdir
emptyDir: {}
nodeSelector:
node-role.kubernetes.io/master: ""
node-role.kubernetes.io/control-plane: ""
tolerations:
- key: node-role.kubernetes.io/master
effect: NoSchedule
restartPolicy: Never
EOF
kubectl wait pod kubezero-upgrade-v121 -n kube-system --timeout 120s --for=condition=initialized
kubectl wait pod kubezero-upgrade-${VERSION//.} -n kube-system --timeout 120s --for=condition=initialized 2>/dev/null
while true; do
kubectl logs kubezero-upgrade-v121 -n kube-system -f && break
kubectl logs kubezero-upgrade-${VERSION//.} -n kube-system -f 2>/dev/null && break
sleep 3
done
kubectl delete pod kubezero-upgrade-${VERSION//.} -n kube-system