feat: latest 1.23 cluster upgrade config

Stefan Reimer 2022-09-14 19:08:14 +02:00
parent 9f389d5c13
commit a49b139e54
24 changed files with 408 additions and 112 deletions

@@ -25,9 +25,12 @@ RUN cd /etc/apk/keys && \
     restic@testing \
     helm@testing

-RUN helm repo add kubezero https://cdn.zero-downtime.net/charts
+RUN helm repo add kubezero https://cdn.zero-downtime.net/charts && \
+    mkdir -p /var/lib/kubezero

-ADD admin/kubezero.sh admin/libhelm.sh admin/migrate_argo.py /usr/bin
+ADD admin/libhelm.sh admin/v${KUBE_VERSION}/* /var/lib/kubezero
+ADD admin/kubezero.sh admin/libhelm.sh /usr/bin

 ADD charts/kubeadm /charts/kubeadm
 ADD charts/kubezero /charts/kubezero

admin/README.md (new file, +12)

@@ -0,0 +1,12 @@
# Cluster upgrade flow

## During 1.23 upgrade
- create the new kubezero-values CM, if it does not exist yet, by merging parts of the legacy /etc/kubernetes/kubeadm-values.yaml with any existing values from the kubezero ArgoCD app values (see the sketch below)

## General flow
- No ArgoCD -> use the kubezero-values CM
- ArgoCD -> update the kubezero-values CM with the current values from the ArgoCD app values
- Apply any upgrades / migrations
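A minimal sketch of that merge step, assuming `yq` v4 and `kubectl`, an ArgoCD Application named `kubezero` in the `argocd` namespace, and the legacy values file on the controller. Paths, file names and the exact merge strategy are illustrative, not the authoritative implementation (the real logic lives in the upgrade hooks):

```bash
#!/bin/bash
WORKDIR=/tmp/kubezero
mkdir -p $WORKDIR

# take the addons/network sections from the legacy on-host config
yq e '{"addons": .addons, "network": .network}' \
  /etc/kubernetes/kubeadm-values.yaml > $WORKDIR/legacy.yaml

# pull current values from the ArgoCD app, if one exists
kubectl get application kubezero -n argocd -o yaml 2>/dev/null | \
  yq e '.spec.source.helm.values' > $WORKDIR/argo.yaml || true

# deep-merge both, ArgoCD values winning, and store the result as the kubezero-values CM
yq ea '. as $item ireduce ({}; . * $item)' \
  $WORKDIR/legacy.yaml $WORKDIR/argo.yaml > $WORKDIR/kubezero-values.yaml

kubectl create configmap -n kube-system kubezero-values \
  --from-file=values.yaml=$WORKDIR/kubezero-values.yaml \
  --dry-run=client -o yaml | kubectl apply -f -
```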

@@ -6,7 +6,7 @@ if [ -n "$DEBUG" ]; then
 fi

 # include helm lib
-. libhelm.sh
+. /var/lib/kubezero/libhelm.sh

 # Export vars to ease use in debug_shell etc
 export WORKDIR=/tmp/kubezero
@@ -70,11 +70,11 @@ parse_kubezero() {
     [ -f ${HOSTFS}/etc/kubernetes/kubezero.yaml ] && cp ${HOSTFS}/etc/kubernetes/kubezero.yaml ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml
   fi

-  export CLUSTERNAME=$(yq eval '.global.clusterName' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)
-  export HIGHAVAILABLE=$(yq eval '.global.highAvailable // "false"' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)
+  export CLUSTERNAME=$(yq eval '.global.clusterName // .clusterName' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)
+  export HIGHAVAILABLE=$(yq eval '.global.highAvailable // .highAvailable // "false"' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)
   export ETCD_NODENAME=$(yq eval '.etcd.nodeName' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)
   export NODENAME=$(yq eval '.nodeName' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)
-  export PROVIDER_ID=$(yq eval '.providerID' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)
+  export PROVIDER_ID=$(yq eval '.providerID // ""' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)
   export AWS_IAM_AUTH=$(yq eval '.api.awsIamAuth.enabled // "false"' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)

   # From here on bail out, allows debug_shell even in error cases
@@ -120,33 +120,9 @@ post_kubeadm() {
 }

-cluster_upgrade() {
-  ### PRE 1.23 specific
-  #####################
-
-  # Migrate addons and network values from local kubeadm-values.yaml on controllers into CM
-  # - remove secrets from addons
-  # - enable cilium
-  if [[ $PROVIDER_ID =~ ^aws ]]; then
-    REGION=$(echo $PROVIDER_ID | sed -e 's,aws:///,,' -e 's,/.*,,' -e 's/\w$//')
-  fi
-
-  kubectl get cm -n kube-system kubezero-values || \
-    kubectl create configmap -n kube-system kubezero-values \
-      --from-literal values.yaml="$(yq e 'del .addons.clusterBackup.repository | del .addons.clusterBackup.password | \
-        .addons.clusterBackup.image.tag =strenv(KUBE_VERSION) | \
-        .network.cilium.enabled = true | .network.multus.defaultNetworks = ["cilium"] | \
-        .network.cilium.cluster.name = strenv(CLUSTERNAME) | \
-        .global.clusterName = strenv(CLUSTERNAME) | \
-        .global.highAvailable = strenv(HIGHAVAILABLE) | \
-        .global.aws.region = strenv(REGION)' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml)"
-
-  # Create kubeadm-values CM if not available
-  kubectl get cm -n kube-system kubeadm-values || \
-    kubectl create configmap -n kube-system kubeadm-values
-
-  #####################
+upgrade_cluster() {
+  # pre upgrade hook
+  [ -f /var/lib/kubezero/pre-upgrade.sh ] && . /var/lib/kubezero/pre-upgrade.sh

   render_kubeadm
@@ -162,10 +138,8 @@ cluster_upgrade() {
     cp ${HOSTFS}/etc/kubernetes/admin.conf ${HOSTFS}/root/.kube/config
   fi

-  ### POST 1.23 specific
-  #####################
-
-  ######################
+  # post upgrade hook
+  [ -f /var/lib/kubezero/post-upgrade.sh ] && . /var/lib/kubezero/post-upgrade.sh

   # Cleanup after kubeadm on the host
   rm -rf ${HOSTFS}/etc/kubernetes/tmp
@@ -181,7 +155,7 @@ cluster_upgrade() {
 }

-node_upgrade() {
+upgrade_node() {
   echo "Starting node upgrade ..."
   echo "All done."
@@ -194,7 +168,7 @@ control_plane_node() {
   render_kubeadm

   # Ensure clean slate if bootstrap, restore PKI otherwise
-  if [[ "$CMD" =~ "^(bootstrap)$" ]]; then
+  if [[ "$CMD" =~ ^(bootstrap)$ ]]; then
     rm -rf ${HOSTFS}/var/lib/etcd/member
   else
@@ -211,7 +185,7 @@ control_plane_node() {
   cp ${WORKDIR}/admin.conf ${HOSTFS}/root/.kube/config

   # Only restore etcd data during "restore" and none exists already
-  if [[ "$CMD" =~ "^(restore)$" ]]; then
+  if [[ "$CMD" =~ ^(restore)$ ]]; then
     if [ ! -d ${HOSTFS}/var/lib/etcd/member ]; then
       etcdctl snapshot restore ${HOSTFS}/etc/kubernetes/etcd_snapshot \
         --name $ETCD_NODENAME \
@@ -238,7 +212,7 @@ control_plane_node() {
   _kubeadm init phase preflight
   _kubeadm init phase kubeconfig all

-  if [[ "$CMD" =~ "^(join)$" ]]; then
+  if [[ "$CMD" =~ ^(join)$ ]]; then
     # Delete any former self in case forseti did not delete yet
     kubectl delete node ${NODENAME} --wait=true || true
     # Wait for all pods to be deleted otherwise we end up with stale pods eg. kube-proxy and all goes to ....
@@ -297,7 +271,7 @@ control_plane_node() {
   retry 0 5 30 kubectl cluster-info --request-timeout 3 >/dev/null

   # Update providerID as underlying VM changed during restore
-  if [[ "$CMD" =~ "^(restore)$" ]]; then
+  if [[ "$CMD" =~ ^(restore)$ ]]; then
     if [ -n "$PROVIDER_ID" ]; then
       etcdhelper \
         -cacert ${HOSTFS}/etc/kubernetes/pki/etcd/ca.crt \
@@ -308,7 +282,7 @@ control_plane_node() {
     fi
   fi

-  if [[ "$CMD" =~ "^(bootstrap|restore)$" ]]; then
+  if [[ "$CMD" =~ ^(bootstrap|restore)$ ]]; then
     _kubeadm init phase upload-config all
     _kubeadm init phase upload-certs --skip-certificate-key-print
@@ -319,7 +293,7 @@ control_plane_node() {
   _kubeadm init phase mark-control-plane
   _kubeadm init phase kubelet-finalize all

-  if [[ "$CMD" =~ "^(bootstrap|restore)$" ]]; then
+  if [[ "$CMD" =~ ^(bootstrap|restore)$ ]]; then
     _kubeadm init phase addon all
   fi
@@ -343,14 +317,10 @@ control_plane_node() {
 apply_module() {
   MODULES=$1

-  kubectl get configmap -n kube-system kubezero-values -o yaml | yq '.data."values.yaml"' > $WORKDIR/_values.yaml
+  get_kubezero_values

   # Always use embedded kubezero chart
-  helm template $CHARTS/kubezero -f $WORKDIR/_values.yaml --version ~$KUBE_VERSION --devel --output-dir $WORKDIR
+  helm template $CHARTS/kubezero -f $WORKDIR/kubezero-values.yaml --version ~$KUBE_VERSION --devel --output-dir $WORKDIR
-
-  # Resolve all the all enabled modules
-  [ -z "$MODULES" ] && MODULES="$(ls ${WORKDIR}/kubezero/templates | sed -e 's/.yaml//g')"

   # CRDs first
   for t in $MODULES; do
@@ -415,8 +385,8 @@ parse_kubezero
 # Execute tasks
 for t in $@; do
   case "$t" in
-    cluster_upgrade) cluster_upgrade;;
-    node_upgrade) node_upgrade;;
+    upgrade_cluster) upgrade_cluster;;
+    upgrade_node) upgrade_node;;
     bootstrap) control_plane_node bootstrap;;
     join) control_plane_node join;;
     restore) control_plane_node restore;;

admin/libhelm.sh (Executable file → Normal file, +50)

@@ -20,6 +20,53 @@ function chart_location() {
 }

+function argo_used() {
+  kubectl get application kubezero -n argocd && rc=$? || rc=$?
+  return $rc
+}
+
+# get kubezero-values from ArgoCD if available or use in-cluster CM without Argo
+function get_kubezero_values() {
+  argo_used && \
+    { kubectl get application kubezero -n argocd -o yaml | yq .spec.source.helm.values > ${WORKDIR}/kubezero-values.yaml; } || \
+    { kubectl get configmap -n kube-system kubezero-values -o yaml | yq '.data."values.yaml"' > ${WORKDIR}/kubezero-values.yaml ;}
+}
+
+function disable_argo() {
+  cat > _argoapp_patch.yaml <<EOF
+spec:
+  syncWindows:
+  - kind: deny
+    schedule: '0 * * * *'
+    duration: 24h
+    namespaces:
+    - '*'
+EOF
+  kubectl patch appproject kubezero -n argocd --patch-file _argoapp_patch.yaml --type=merge && rm _argoapp_patch.yaml
+}
+
+function enable_argo() {
+  kubectl patch appproject kubezero -n argocd --type json -p='[{"op": "remove", "path": "/spec/syncWindows"}]' || true
+}
+
+function argo_app_synced() {
+  APP=$1
+
+  # Ensure we are synced otherwise bail out
+  status=$(kubectl get application $APP -n argocd -o yaml | yq .status.sync.status)
+  if [ "$status" != "Synced" ]; then
+    echo "ArgoCD Application $APP not 'Synced'!"
+    return 1
+  fi
+
+  return 0
+}
+
 # make sure namespace exists prior to calling helm as the create-namespace options doesn't work
 function create_ns() {
   local namespace=$1
@@ -83,6 +130,9 @@ function _helm() {
     yq eval '.spec.source.helm.values' $WORKDIR/kubezero/templates/${module}.yaml > $WORKDIR/values.yaml

+    echo "using values for $module: "
+    cat $WORKDIR/values.yaml
+
     if [ $action == "crds" ]; then
       # Allow custom CRD handling
       declare -F ${module}-crds && ${module}-crds || _crds


@@ -2,13 +2,44 @@
 import sys
 import argparse
 import io
-import ruamel.yaml
+import yaml

-yaml = ruamel.yaml.YAML()
-yaml.preserve_quotes = True
-yaml.explicit_start = True
-yaml.indent(mapping=2, sequence=4, offset=2)
+DEFAULT_VERSION = "1.23.10-2"
+
+
+def migrate(values):
+    """Actual changes here"""
+
+    # migrate ClusterName to clusterName
+    if "ClusterName" in values:
+        values["clusterName"] = values["ClusterName"]
+        values.pop("ClusterName")
+
+    # Create new clusterwide cloudprovider data if possible
+    try:
+        if values["cert-manager"]["clusterIssuer"]["solvers"][0]["dns01"]["route53"]["regions"]:
+            if "global" not in values:
+                values["global"] = {}
+            if "aws" not in values["global"]:
+                values["global"]["aws"] = {}
+            values["global"]["aws"]["region"] = values["cert-manager"]["clusterIssuer"]["solvers"][0]["dns01"]["route53"]["region"]
+    except KeyError:
+        pass
+
+    return values
+
+
+def str_presenter(dumper, data):
+    if len(data.splitlines()) > 1:  # check for multiline string
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+    return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+
+yaml.add_representer(str, str_presenter)
+# to use with safe_dump:
+yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

 def rec_sort(d):
@@ -31,14 +62,17 @@ def rec_sort(d):
     if isinstance(d, list):
         for idx, elem in enumerate(d):
             d[idx] = rec_sort(elem)
     return d

-parser = argparse.ArgumentParser(description="Migrate ArgoCD Kubezero values to new cluster config")
+parser = argparse.ArgumentParser(
+    description="Migrate ArgoCD Kubezero values to new cluster config"
+)
 parser.add_argument(
     "--version",
     dest="version",
-    default="1.23.10",
+    default=DEFAULT_VERSION,
     action="store",
     required=False,
     help="Update KubeZero version",
@@ -46,37 +80,25 @@ parser.add_argument(
 args = parser.parse_args()

-application = yaml.load(sys.stdin)
+application = yaml.safe_load(sys.stdin)

 # Set version from cmd line
 if args.version:
     application["spec"]["source"]["targetRevision"] = args.version

 # Extract Helm values
-values = yaml.load(application["spec"]["source"]["helm"]["values"])
+values = yaml.safe_load(application["spec"]["source"]["helm"]["values"])

-### Do your thing
-
-# migrate ClusterName to clusterName
-if "ClusterName" in values:
-    values["clusterName"] = values["ClusterName"]
-    values.pop("ClusterName")
-
-# Create new clusterwide cloudprovider data if possible
-try:
-    if values["cert-manager"]["clusterIssuer"]["solvers"][0]["dns01"]["route53"]["regions"]:
-        if "aws" not in values:
-            values["aws"] = {}
-        values["aws"]["region"] = values["cert-manager"]["clusterIssuer"]["solvers"][0]["dns01"]["route53"]["region"]
-except KeyError:
-    pass
-
-### End
-
 # Merge new values
 buffer = io.StringIO()
-yaml.dump(rec_sort(values), buffer)
+yaml.safe_dump(
+    rec_sort(migrate(values)),
+    buffer,
+    default_flow_style=False,
+    indent=2,
+    sort_keys=False,
+)
 application["spec"]["source"]["helm"]["values"] = buffer.getvalue()

-# Dump final yaml
-yaml.dump(application, sys.stdout)
+# Output new Application resource
+yaml.dump(application, sys.stdout, default_flow_style=False)


@@ -1,6 +1,9 @@
 #!/bin/bash -e

-VERSION="v1.23.10-1"
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+. $SCRIPT_DIR/libhelm.sh
+
+VERSION="v1.23.10-2"

 [ -n "$DEBUG" ] && set -x
@@ -120,19 +123,21 @@ EOF
   kubectl delete pod kubezero-upgrade-${VERSION//.} -n kube-system
 }

-all_nodes_upgrade "mount --make-shared /host/sys/fs/cgroup; mount --make-shared /host/sys;"
+argo_used && disable_argo

-control_plane_upgrade cluster_upgrade
+#all_nodes_upgrade "mount --make-shared /host/sys/fs/cgroup; mount --make-shared /host/sys;"

-echo "Adjust kubezero-values CM !!"
-read
+control_plane_upgrade upgrade_cluster
+
+#echo "Adjust kubezero-values CM !!"
+#read

 #kubectl delete ds kube-multus-ds -n kube-system
 control_plane_upgrade "apply_network, apply_addons"

-exit 0
-
-kubectl rollout restart daemonset/calico-node -n kube-system
-kubectl rollout restart daemonset/cilium -n kube-system
-kubectl rollout restart daemonset/kube-multus-ds -n kube-system
+#kubectl rollout restart daemonset/calico-node -n kube-system
+#kubectl rollout restart daemonset/cilium -n kube-system
+#kubectl rollout restart daemonset/kube-multus-ds -n kube-system
+
+argo_used && enable_argo


@@ -0,0 +1,72 @@
#!/bin/bash

# Migrate addons and network values from local kubeadm-values.yaml on controllers into CM
# - remove secrets from addons
# - enable cilium

# Create kubeadm-values CM if not available
kubectl get cm -n kube-system kubeadm-values || \
  kubectl create configmap -n kube-system kubeadm-values

kubectl get cm -n kube-system kubezero-values || \
  kubectl create configmap -n kube-system kubezero-values

# tweak local kubeadm for upgrade later on
yq eval -i '.global.clusterName = strenv(CLUSTERNAME) |
  .global.highAvailable = env(HIGHAVAILABLE)' \
  ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml

# extract addons
yq e '.addons |
  del .clusterBackup.repository |
  del .clusterBackup.password |
  .clusterBackup.image.tag = strenv(KUBE_VERSION) |
  {"addons": .}' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml > $WORKDIR/addons-values.yaml

# extract network
yq e '.network |
  .cilium.enabled = true |
  .multus.defaultNetworks = ["cilium"] |
  .cilium.cluster.name = strenv(CLUSTERNAME) |
  {"network": .}' ${HOSTFS}/etc/kubernetes/kubeadm-values.yaml > $WORKDIR/network-values.yaml

# get current argo cd values
kubectl get application kubezero -n argocd -o yaml | yq .spec.source.helm.values > ${WORKDIR}/argo-values.yaml

# merge all into new CM
yq ea '. as $item ireduce ({}; . * $item ) |
  .global.clusterName = strenv(CLUSTERNAME) |
  .global.highAvailable = env(HIGHAVAILABLE)' $WORKDIR/addons-values.yaml ${WORKDIR}/network-values.yaml $WORKDIR/argo-values.yaml > $WORKDIR/kubezero-values.yaml

# Update kubezero-values CM
kubectl get cm -n kube-system kubezero-values -o=yaml | \
  yq e '.data."values.yaml" |= load_str("/tmp/kubezero/kubezero-values.yaml")' | \
  kubectl replace -f -

kubezero_chart_version=$(yq .version /charts/kubezero/Chart.yaml)

# update argo app, create new from scratch as Argo is really picky being patched
# autosync DISABLED !!!
cat > $WORKDIR/kube-argo.yaml <<EOF
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: kubezero
  namespace: argocd
spec:
  project: kubezero
  source:
    repoURL: https://cdn.zero-downtime.net/charts
    chart: kubezero
    targetRevision: $kubezero_chart_version
  destination:
    server: https://kubernetes.default.svc
    namespace: argocd
  syncPolicy: {}
EOF
yq eval -i '.spec.source.helm.values |= load_str("/tmp/kubezero/kubezero-values.yaml")' $WORKDIR/kube-argo.yaml

kubectl apply -f $WORKDIR/kube-argo.yaml


@@ -72,7 +72,7 @@ apiServer:
     authentication-token-webhook-config-file: /etc/kubernetes/apiserver/aws-iam-authenticator.yaml
     {{- end }}
     feature-gates: {{ include "kubeadm.featuregates" ( dict "return" "csv" ) | trimSuffix "," | quote }}
-    enable-admission-plugins: DenyServiceExternalIPs,NodeRestriction,EventRateLimit
+    enable-admission-plugins: DenyServiceExternalIPs,NodeRestriction,EventRateLimit,ExtendedResourceToleration
     # {{- if .Values.global.highAvailable }}
     # goaway-chance: ".001"
     # {{- end }}


@@ -2,13 +2,15 @@ apiVersion: v2
 name: kubezero-addons
 description: KubeZero umbrella chart for various optional cluster addons
 type: application
-version: 0.6.1
+version: 0.6.2
 appVersion: v1.23.10
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:
   - kubezero
   - fuse-device-plugin
+  - neuron-device-plugin
+  - nvidia-device-plugin
   - aws-node-termination-handler
   - external-dns
 maintainers:
@@ -27,4 +29,9 @@ dependencies:
     version: 9.21.0
     repository: https://kubernetes.github.io/autoscaler
     condition: cluster-autoscaler.enabled
+  - name: nvidia-device-plugin
+    version: 0.12.2
+    # https://github.com/NVIDIA/k8s-device-plugin
+    repository: https://nvidia.github.io/k8s-device-plugin
+    condition: nvidia-device-plugin.enabled
 kubeVersion: ">= 1.20.0"


@@ -20,4 +20,9 @@
 ## AWS Neuron
 Device plugin for [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/) - [Inf1 instances](https://aws.amazon.com/ec2/instance-types/inf1/)
+
+## Nvidia
+
+## Cluster AutoScaler
+- https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md
+
 {{ template "chart.valuesSection" . }}


@@ -17,8 +17,6 @@ spec:
     spec:
       serviceAccount: neuron-device-plugin
       tolerations:
-        - key: CriticalAddonsOnly
-          operator: Exists
         - key: aws.amazon.com/neuron
           operator: Exists
           effect: NoSchedule


@@ -83,6 +83,33 @@ awsNeuron:
     name: public.ecr.aws/neuron/neuron-device-plugin
     tag: 1.9.3.0

+nvidia-device-plugin:
+  enabled: false
+  tolerations:
+    - key: nvidia.com/gpu
+      operator: Exists
+      effect: NoSchedule
+    - key: kubezero-workergroup
+      effect: NoSchedule
+      operator: Exists
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: "node.kubernetes.io/instance-type"
+                operator: In
+                values:
+                  - g5.xlarge
+                  - g5.2xlarge
+                  - g5.4xlarge
+                  - g5.8xlarge
+                  - g5.12xlarge
+                  - g5.16xlarge
+                  - g5.24xlarge
+                  - g5.48xlarge
+
 cluster-autoscaler:
   enabled: false


@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-ci
 description: KubeZero umbrella chart for all things CI
 type: application
-version: 0.5.12
+version: 0.5.13
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:
@@ -26,7 +26,7 @@ dependencies:
     repository: https://dl.gitea.io/charts/
     condition: gitea.enabled
   - name: jenkins
-    version: 4.1.16
+    version: 4.2.1
     repository: https://charts.jenkins.io
     condition: jenkins.enabled
   - name: trivy


@@ -114,15 +114,15 @@ jenkins:
       numToKeepStr: "10"

   installPlugins:
-    - kubernetes:3670.v6ca_059233222
+    - kubernetes:3704.va_08f0206b_95e
     - workflow-aggregator:581.v0c46fa_697ffd
-    - git:4.11.4
+    - git:4.11.5
     - configuration-as-code:1512.vb_79d418d5fc8
     - antisamy-markup-formatter:2.7
     - prometheus:2.0.11
     - htmlpublisher:1.30
-    - build-discarder:60.v1747b0eb632a
-    - dark-theme:185.v276b_5a_8966a_e
+    - build-discarder:139.v05696a_7fe240
+    - dark-theme:245.vb_a_2b_b_010ea_96
     - kubernetes-credentials-provider:1.196.va_55f5e31e3c2

   serviceAccountAgent:


@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-istio
 description: KubeZero Umbrella Chart for Istio
 type: application
-version: 0.8.2
+version: 0.8.3
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:


@@ -1,6 +1,6 @@
 global:
   # hub: docker.io/istio
-  tag: 1.14.3-distroless
+  tag: 1.14.4-distroless

 logAsJson: true


@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero-network
 description: KubeZero umbrella chart for all things network
 type: application
-version: 0.3.2
+version: 0.3.3
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:
@@ -23,7 +23,7 @@ dependencies:
     repository: https://helm.cilium.io/
     condition: cilium.enabled
   - name: metallb
-    version: 0.13.4
+    version: 0.13.5
     repository: https://metallb.github.io/metallb
     condition: metallb.enabled
   - name: calico


@@ -1,7 +1,5 @@
 metallb:
   enabled: false
-  psp:
-    create: false

   controller:
     tolerations:


@@ -2,7 +2,7 @@ apiVersion: v2
 name: kubezero
 description: KubeZero - Root App of Apps chart
 type: application
-version: 1.23.10-1
+version: 1.23.10-2
 home: https://kubezero.com
 icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
 keywords:


@@ -28,6 +28,11 @@ awsNeuron:
   {{- toYaml . | nindent 2 }}
 {{- end }}

+{{- with index .Values "addons" "nvidia-device-plugin" }}
+nvidia-device-plugin:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
 {{- with index .Values "addons" "external-dns" }}
 external-dns:
   {{- toYaml . | nindent 2 }}


@@ -17,6 +17,12 @@ cilium:

 {{- define "network-argo" }}
+  # Metallb
+  ignoreDifferences:
+    - group: apiextensions.k8s.io/v1
+      kind: CustomResourceDefinition
+      jsonPointers:
+        - /spec/conversion/webhook/clientConfig/caBundle
 {{- end }}

 {{ include "kubezero-app.app" . }}


@@ -8,12 +8,12 @@ HighAvailableControlplane: false

 addons:
   enabled: true
-  targetRevision: 0.6.1
+  targetRevision: 0.6.2

 network:
   enabled: true
   retain: true
-  targetRevision: 0.3.2
+  targetRevision: 0.3.3

 cert-manager:
   enabled: false
@@ -31,7 +31,7 @@ storage:
 istio:
   enabled: false
   namespace: istio-system
-  targetRevision: 0.8.2
+  targetRevision: 0.8.3

 istio-ingress:
   enabled: false

docs/v1.23.md (new file, +116)

@@ -0,0 +1,116 @@
# KubeZero 1.23
## What's new - Major themes
- update inf1 neuron drivers incl. node auto-taints
- support for Nvidia g5 instances, incl. the whole toolchain from device drivers up to automatic node taints
- the ExtendedResourceToleration admission controller is enabled to automatically tolerate Inf1 and Nvidia GPU pods (see the sketch below)
- Cluster-Autoscaler
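To make the ExtendedResourceToleration point concrete, here is a minimal, hypothetical smoke test: the pod only requests the extended resource, and the admission plugin adds the matching toleration (e.g. for an `nvidia.com/gpu:NoSchedule` node taint) at admission time, so no explicit `tolerations` are needed in the manifest. Pod name and image are illustrative, not part of this release:

```bash
kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  containers:
    - name: cuda
      image: nvidia/cuda:11.6.2-base-ubuntu20.04
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF

# the apiserver should have injected the toleration automatically:
kubectl get pod gpu-smoke-test -o yaml | yq .spec.tolerations
```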
### Alpine - Custom AMIs
Starting with 1.22, all KubeZero nodes boot using custom AMIs. These AMIs will be provided and shared by Zero Down Time for all customers. As always, all sources incl. the build pipeline are freely available [here](https://git.zero-downtime.net/ZeroDownTime/alpine-zdt-images).
This eliminates *ALL* dependencies at boot time other than container registries. Gone are the days when Ubuntu, SuSE or Github decided to ruin your morning coffee.
KubeZero migrates from Ubuntu 20.04 LTS to [Alpine v3.15](https://www.alpinelinux.org/releases/) as its base OS.
#### Highlights:
- minimal attack surface by removing all unnecessary bloat, like all things SystemD, Ubuntu's snap, etc
- reduced root file system size from 8GB to 2GB
- minimal memory consumption of about 12MB fully booted
*Minimal* fully booted instance incl. SSH and Monit:
| | Ubuntu | Alpine|
|-|--------|-----|
| Memory used | 60MB | 12 MB |
| RootFS used | 1.1GB | 330 MB |
| RootFS encrypted | no | yes |
| Kernel | 5.11 | 5.15 |
| Init | Systemd | OpenRC |
| AMI / EBS size | 8GB | 1GB |
| Boot time | ~120s | ~45s |
- Encrypted AMIs:
This closes the last gaps you might have in achieving *full encryption at rest* for every volume within a default KubeZero deployment.
### Etcd
On AWS a new dedicated GP3 EBS volume gets provisioned per controller and is used as persistent etcd storage. These volumes persist for the lifetime of the cluster and are reused by future controller nodes in each AZ.
This ensures no data loss during upgrade or restore of single-controller clusters. The hourly backup on S3 will still be used as a fallback / disaster recovery option in case the file system gets corrupted etc.
### DNS
The [external-dns](https://github.com/kubernetes-sigs/external-dns) controller got integrated and is used to provide DNS-based load balancing for the apiserver itself. This allows highly available control planes on AWS as well as bare-metal, in combination with various DNS providers.
Further usage of this controller to automate other DNS-related configuration, like Ingress, is planned for upcoming releases.
### Container runtime
Cri-o now uses crun rather than runc, which reduces the memory overhead *per pod* from 16M to 4M; see the [crun intro](https://www.redhat.com/sysadmin/introduction-crun) for details.
With 1.22 and the switch to crun, support for [CgroupV2](https://www.kernel.org/doc/Documentation/cgroup-v2.txt) has been enabled.
### AWS Neuron INF support
Initial support for [Inf1 instances](https://aws.amazon.com/ec2/instance-types/inf1/) part of [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/).
Workers automatically load the custom kernel module on these instance types and expose the `/dev/neuron*` devices.
## Version upgrades
- Istio to 1.13.3 using the new Helm [gateway charts](https://istio.io/latest/docs/setup/additional-setup/gateway/)
- Logging: ECK operator upgraded from 1.6 to 2.1, fluent-bit 1.9.3
- Metrics: Prometheus and all Grafana charts to latest to match V1.22
- ArgoCD to V2.2.5
- AWS EBS/EFS CSI drivers to latest versions
- cert-manager to V1.8
- aws-termination-handler to 1.16
- aws-iam-authenticator to 0.5.7, required for >1.22 which allows using the latest version on the client side again
## Misc
- new metrics and dashboards for openEBS LVM CSI drivers
- new node label `node.kubernetes.io/instance-type` for all nodes containing the EC2 instance type
- kubelet root moved to `/var/lib/containers` to ensure ephemeral storage is allocated from the configurable volume rather than the root fs of the worker
# Upgrade
`(No, really, you MUST read this before you upgrade)`
- Ensure your Kube context points to the correct cluster !
- Ensure any usage of Kiam has been migrated to OIDC providers as any remaining Kiam components will be deleted as part of the upgrade
1. Migrate ArgoCD KubeZero config:
`cat <cluster/env/kubezero/application.yaml> | ./releases/v1.22/migrate_agro.py` and adjust if needed and replace the original. Do NOT commit yet !
2. Upgrade `logging` and `metrics` module
- `kubectl get crd elasticsearches.elasticsearch.k8s.elastic.co && kubectl replace -f https://download.elastic.co/downloads/eck/2.1.0/crds.yaml` CRDs for logging
- `./bootstrap.sh apply logging <env>` logging module to support new OS coming with 1.22
- `./bootstrap.sh crds metrics <env>` CRDs for metrics
- `./bootstrap.sh apply metrics <env>` to get new exporters in place to support 1.22
3. Trigger the cluster upgrade:
`./release/v1.22/upgrade_cluster.sh`
4. Upgrade CFN stacks for the control plane and all worker groups
Change Kubernetes version in controller config from `1.21.9` to `1.22.8`
5. Reboot controller(s) one by one
Wait each time for controller to join and all pods running.
Might take a while ...
6. Launch a new set of workers, at least enough to host the new Istio ingress gateways due to kernel requirements
E.g. by doubling `desired` for each worker ASG.
7. Upgrade via bootstrap.sh
As the changes around Istio are substantial in this release we need to upgrade some parts step by step to prevent service outages, especially for private-ingress.
- `./bootstrap.sh crds all <env>` to deploy all new CRDs first
- `./bootstrap.sh apply cert-manager <env>` to update cert-manager, required for Istio
- `./bootstrap.sh apply istio <env>` to update the Istio control plane
- `./bootstrap.sh apply istio-private-ingress <env>` to deploy the new private-ingress gateways first
- `./bootstrap.sh apply istio-ingress <env>` to update the public ingress and also remove the 1.21 private-ingress gateways
8. Finalize via ArgoCD
git add / commit / push `<cluster/env/kubezero/application.yaml>` and watch ArgoCD do its work.
9. Drain old workers
Drain one by one and reset each ASG to its initial "desired" value (see the sketch below).
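A minimal sketch of that drain loop, assuming kubectl access to the cluster and the AWS CLI for the ASG reset; node names, the ASG name and the timeout are placeholders:

```bash
#!/bin/bash
# Hypothetical drain helper: pass the names of the old 1.21 workers as arguments.
set -e

for node in "$@"; do
  kubectl cordon "$node"
  kubectl drain "$node" --ignore-daemonsets --delete-emptydir-data --timeout=300s
done

# afterwards scale each worker ASG back to its original size, e.g.:
# aws autoscaling set-desired-capacity --auto-scaling-group-name <worker-asg> --desired-capacity <n>
```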


@@ -4,5 +4,5 @@ NODES=$(kubectl get nodes -o json | jq -rc .items[].status.addresses[0].address)
 for n in $NODES; do
   >&2 echo "Node: $n"
-  ssh -q $n "$@"
+  ssh -q alpine@$n "$@"
 done