feat: add delete_module to admin, various cleanups

Stefan Reimer 2022-11-14 14:14:11 +01:00
parent f8588d8938
commit e9eeb8be8d
8 changed files with 70 additions and 37 deletions


@@ -323,6 +323,21 @@ apply_module() {
}
delete_module() {
MODULES=$1
get_kubezero_values
# Always use embedded kubezero chart
helm template $CHARTS/kubezero -f $WORKDIR/kubezero-values.yaml --version ~$KUBE_VERSION --devel --output-dir $WORKDIR
for t in $MODULES; do
_helm delete $t
done
echo "Deleted KubeZero modules: $MODULES. Potential CRDs must be removed manually."
}
# backup etcd + /etc/kubernetes/pki
backup() {
# Display all ENVs, careful this exposes the password !
@@ -377,7 +392,8 @@ for t in $@; do
bootstrap) control_plane_node bootstrap;;
join) control_plane_node join;;
restore) control_plane_node restore;;
apply_*) apply_module ${t##apply_};;
apply_*) apply_module "${t##apply_}";;
delete_*) delete_module "${t##delete_}";;
backup) backup;;
debug_shell) debug_shell;;
*) echo "Unknown command: '$t'";;


@@ -108,6 +108,7 @@ function _crds() {
# Only apply if there are actually any crds
if [ -s $WORKDIR/crds.yaml ]; then
[ -n "$DEBUG" ] && cat $WORKDIR/crds.yaml
kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
fi
}
@@ -115,7 +116,7 @@ function _crds() {
# helm template | kubectl apply -f -
# confine to one namespace if possible
function apply() {
function render() {
helm template $(chart_location $chart) -n $namespace --name-template $module $targetRevision --skip-crds -f $WORKDIR/values.yaml $API_VERSIONS --kube-version $KUBE_VERSION $@ \
| python3 -c '
#!/usr/bin/python3
@@ -128,8 +129,6 @@ for manifest in yaml.safe_load_all(sys.stdin):
manifest["metadata"]["namespace"] = sys.argv[1]
print("---")
print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml
kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
}
@@ -164,13 +163,15 @@ function _helm() {
# Optional pre hook
declare -F ${module}-pre && ${module}-pre
apply
render
kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
# Optional post hook
declare -F ${module}-post && ${module}-post
elif [ $action == "delete" ]; then
apply
render
kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?
# Delete dedicated namespace if not kube-system
[ -n "$DELETE_NS" ] && delete_ns $namespace


@@ -124,6 +124,10 @@ spec:
node-role.kubernetes.io/control-plane: ""
tolerations:
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
restartPolicy: Never
EOF
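The extra toleration for the `node-role.kubernetes.io/control-plane` taint keeps the pod schedulable on controllers that no longer carry the legacy `master` taint. An illustrative way to check which taint keys the nodes actually carry:

```bash
# List node names alongside their taint keys
kubectl get nodes -o custom-columns='NAME:.metadata.name,TAINTS:.spec.taints[*].key'
```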


@@ -26,16 +26,22 @@ Kubernetes: `>= 1.24.0`
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| cilium.bpf.hostLegacyRouting | bool | `true` | |
| cilium.cgroup.autoMount.enabled | bool | `false` | |
| cilium.cgroup.hostRoot | string | `"/sys/fs/cgroup"` | |
| cilium.cluster.id | int | `240` | |
| cilium.cluster.name | string | `"default"` | |
| cilium.cni.binPath | string | `"/usr/libexec/cni"` | |
| cilium.cni.exclusive | bool | `false` | |
| cilium.cni.logFile | string | `"/var/log/cilium-cni.log"` | |
| cilium.containerRuntime.integration | string | `"crio"` | |
| cilium.enabled | bool | `false` | |
| cilium.hubble.enabled | bool | `false` | |
| cilium.hubble.relay.enabled | bool | `false` | |
| cilium.hubble.tls.auto.certManagerIssuerRef.group | string | `"cert-manager.io"` | |
| cilium.hubble.tls.auto.certManagerIssuerRef.kind | string | `"ClusterIssuer"` | |
| cilium.hubble.tls.auto.certManagerIssuerRef.name | string | `"kubezero-local-ca-issuer"` | |
| cilium.hubble.tls.auto.method | string | `"cert-manager"` | |
| cilium.hubble.ui.enabled | bool | `false` | |
| cilium.ipam.operator.clusterPoolIPv4PodCIDRList[0] | string | `"10.240.0.0/16"` | |
| cilium.l7Proxy | bool | `false` | |
| cilium.operator.nodeSelector."node-role.kubernetes.io/control-plane" | string | `""` | |
@@ -58,4 +64,5 @@ Kubernetes: `>= 1.24.0`
| multus.clusterNetwork | string | `"cilium"` | |
| multus.defaultNetworks | list | `[]` | |
| multus.enabled | bool | `false` | |
| multus.readinessindicatorfile | string | `"/etc/cni/net.d/05-cilium.conf"` | |
| multus.tag | string | `"v3.9.2"` | |
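The keys in this table map directly to Helm values, so individual settings can also be overridden ad hoc when rendering the chart. A hedged example (chart path is illustrative, not taken from this repo's tooling):

```bash
# Render the network chart with cilium and Hubble enabled, overriding the defaults documented above
helm template kubezero-network charts/kubezero-network \
  --set cilium.enabled=true \
  --set cilium.hubble.enabled=true \
  --set cilium.hubble.relay.enabled=true \
  --set cilium.hubble.ui.enabled=true
```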


@@ -40,8 +40,8 @@ cilium:
#-- Ensure this is false if multus is enabled
exclusive: false
bpf:
hostLegacyRouting: true
#bpf:
# hostLegacyRouting: true
# tproxy: false
cluster:
@@ -57,10 +57,10 @@ cilium:
- 10.240.0.0/16
# Keep it simple for now
# nodePort:
# enabled: true
l7Proxy: false
#rollOutCiliumPods: true
cgroup:
autoMount:
enabled: false
@@ -84,3 +84,14 @@ cilium:
hubble:
enabled: false
relay:
enabled: false
ui:
enabled: false
tls:
auto:
method: cert-manager
certManagerIssuerRef:
group: cert-manager.io
kind: ClusterIssuer
name: kubezero-local-ca-issuer
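Hubble's TLS is delegated to cert-manager and references the cluster-wide KubeZero CA issuer, so enabling Hubble presumes that issuer already exists. An illustrative check (requires the cert-manager CRDs to be installed):

```bash
# Confirm the referenced ClusterIssuer exists and is ready
kubectl get clusterissuer kubezero-local-ca-issuer -o wide
```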


@@ -72,7 +72,7 @@ Kubernetes: `>= 1.24.0`
| storage.aws-ebs-csi-driver.enabled | bool | `false` | |
| storage.aws-efs-csi-driver.enabled | bool | `false` | |
| storage.enabled | bool | `false` | |
| storage.targetRevision | string | `"0.7.3"` | |
| storage.targetRevision | string | `"0.7.4"` | |
----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)


@@ -36,7 +36,7 @@ cert-manager:
storage:
enabled: false
targetRevision: 0.7.3
targetRevision: 0.7.4
aws-ebs-csi-driver:
enabled: false
aws-efs-csi-driver:


@@ -2,47 +2,41 @@
## TODO
### FeatureGates
- PodAndContainerStatsFromCRI
- DelegateFSGroupToCSIDriver
## What's new - Major themes
- Cilium added as second CNI to prepare full migration to Cilium with 1.24 upgrade
- support for Nvidia g5 instances incl. pre-installed kernel drivers, CUDA toolchain and CRI integration
- updated inf1 neuron drivers
- ExtendedResourceToleration AdmissionController and auto-taints allowing Neuron and Nvidia pods ONLY to be scheduled on dedicated workers
- full Cluster-Autoscaler integration
- Cilium is now the default CNI, calico got removed
- cluster-autoscaler is enabled by default on AWS
## Version upgrades
- Istio to 1.14.4
- Logging: ECK operator to 2.4, fluent-bit 1.9.8
- Metrics: Prometheus and all Grafana charts to latest to match V1.23
- ArgoCD to V2.4 ( access to pod via shell disabled by default )
- AWS EBS/EFS CSI drivers to latest versions
- cert-manager to V1.9.1
- cilium
- metallb
- nvidia-device-plugin
- aws-node-termination-handler
- aws-ebs-csi-driver
- aws-efs-csi-driver
### FeatureGates
- PodAndContainerStatsFromCRI
- DelegateFSGroupToCSIDriver
# Upgrade
`(No, really, you MUST read this before you upgrade)`
- Ensure your Kube context points to the correct cluster !
1. Enable `containerProxy` for NAT instances and upgrade NAT instance using the new V2 Pulumi stacks
1. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
2. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
2. Upgrade CFN stacks for the control plane and all worker groups
3. Upgrade CFN stacks for the control plane and all worker groups
4. Trigger fully-automated cluster upgrade:
3. Trigger fully-automated cluster upgrade:
`./admin/upgrade_cluster.sh <path to the argocd app kubezero yaml for THIS cluster>`
5. Reboot controller(s) one by one
4. Reboot controller(s) one by one
Wait each time for the controller to join and for all pods to be running.
Might take a while ...
6. Launch new set of workers, e.g. by doubling `desired` for each worker ASG
5. Launch new set of workers, e.g. by doubling `desired` for each worker ASG
once new workers are ready, cordon and drain all old workers
The cluster-autoscaler will remove the old workers automatically after about 10min !
7. If all looks good, commit the ArgoApp resource for Kubezero, before re-enabling ArgoCD itself.
6. If all looks good, commit the ArgoApp resource for Kubezero, before re-enabling ArgoCD itself.
git add / commit / push `<cluster/env/kubezero/application.yaml>`
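For the manual steps above, a hedged sketch of the commands involved (ASG and node names are placeholders, not taken from this repo):

```bash
# Sanity check: make sure kubectl points at the right cluster before doing anything
kubectl config current-context
kubectl cluster-info

# Worker rotation: double the ASG capacity, then retire old nodes once the new ones are Ready
aws autoscaling set-desired-capacity --auto-scaling-group-name <worker-asg> --desired-capacity <2x-current>
kubectl get nodes -o wide                      # wait until the new workers report Ready
kubectl cordon <old-node>                      # repeat for each old worker
kubectl drain <old-node> --ignore-daemonsets --delete-emptydir-data
# cluster-autoscaler scales the ASG back down and removes the drained nodes after ~10min
```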