feat: add delete_module to admin, various cleanups

Stefan Reimer 2022-11-14 14:14:11 +01:00
parent f8588d8938
commit e9eeb8be8d
8 changed files with 70 additions and 37 deletions

View File

@@ -323,6 +323,21 @@ apply_module() {
 }

+delete_module() {
+  MODULES=$1
+
+  get_kubezero_values
+
+  # Always use embedded kubezero chart
+  helm template $CHARTS/kubezero -f $WORKDIR/kubezero-values.yaml --version ~$KUBE_VERSION --devel --output-dir $WORKDIR
+
+  for t in $MODULES; do
+    _helm delete $t
+  done
+
+  echo "Deleted KubeZero modules: $MODULES. Potential CRDs must be removed manually."
+}

 # backup etcd + /etc/kubernetes/pki
 backup() {
   # Display all ENVs, careful this exposes the password !
@@ -377,7 +392,8 @@ for t in $@; do
   bootstrap) control_plane_node bootstrap;;
   join) control_plane_node join;;
   restore) control_plane_node restore;;
-  apply_*) apply_module ${t##apply_};;
+  apply_*) apply_module "${t##apply_}";;
+  delete_*) delete_module "${t##delete_}";;
   backup) backup;;
   debug_shell) debug_shell;;
   *) echo "Unknown command: '$t'";;
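
A minimal usage sketch for the new delete path, assuming the script above is the KubeZero admin entrypoint invoked as `kubezero.sh` and that `argocd`, `logging` and `metrics` are installed modules (script path and module names are illustrative):

    # Each delete_<module> argument renders the embedded kubezero chart
    # and runs `_helm delete` for that module
    ./kubezero.sh delete_argocd

    # Several modules can be removed in one invocation
    ./kubezero.sh delete_logging delete_metrics

As the final echo notes, any CRDs installed by a module are left in place and have to be removed manually.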

View File

@@ -108,6 +108,7 @@ function _crds() {
   # Only apply if there are actually any crds
   if [ -s $WORKDIR/crds.yaml ]; then
+    [ -n "$DEBUG" ] && cat $WORKDIR/crds.yaml
     kubectl apply -f $WORKDIR/crds.yaml --server-side --force-conflicts
   fi
 }

@@ -115,7 +116,7 @@ function _crds() {
 # helm template | kubectl apply -f -
 # confine to one namespace if possible
-function apply() {
+function render() {
   helm template $(chart_location $chart) -n $namespace --name-template $module $targetRevision --skip-crds -f $WORKDIR/values.yaml $API_VERSIONS --kube-version $KUBE_VERSION $@ \
     | python3 -c '
 #!/usr/bin/python3
@@ -128,8 +129,6 @@ for manifest in yaml.safe_load_all(sys.stdin):
       manifest["metadata"]["namespace"] = sys.argv[1]
     print("---")
     print(yaml.dump(manifest))' $namespace > $WORKDIR/helm.yaml
-
-  kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?
 }

@@ -164,13 +163,15 @@ function _helm() {
     # Optional pre hook
     declare -F ${module}-pre && ${module}-pre

-    apply
+    render
+    kubectl $action -f $WORKDIR/helm.yaml --server-side --force-conflicts && rc=$? || rc=$?

     # Optional post hook
     declare -F ${module}-post && ${module}-post

   elif [ $action == "delete" ]; then
-    apply
+    render
+    kubectl $action -f $WORKDIR/helm.yaml && rc=$? || rc=$?

     # Delete dedicated namespace if not kube-system
     [ -n "$DELETE_NS" ] && delete_ns $namespace

View File

@@ -124,6 +124,10 @@ spec:
         node-role.kubernetes.io/control-plane: ""
       tolerations:
       - key: node-role.kubernetes.io/master
+        operator: Exists
+        effect: NoSchedule
+      - key: node-role.kubernetes.io/control-plane
+        operator: Exists
         effect: NoSchedule
       restartPolicy: Never
 EOF
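
The second toleration covers control-plane nodes that carry the newer `node-role.kubernetes.io/control-plane` taint instead of, or alongside, the legacy `node-role.kubernetes.io/master` taint. An illustrative one-liner to check which taint keys the control-plane nodes actually carry:

    kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints[*].key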

View File

@@ -26,16 +26,22 @@ Kubernetes: `>= 1.24.0`

 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
-| cilium.bpf.hostLegacyRouting | bool | `true` | |
 | cilium.cgroup.autoMount.enabled | bool | `false` | |
 | cilium.cgroup.hostRoot | string | `"/sys/fs/cgroup"` | |
 | cilium.cluster.id | int | `240` | |
 | cilium.cluster.name | string | `"default"` | |
 | cilium.cni.binPath | string | `"/usr/libexec/cni"` | |
+| cilium.cni.exclusive | bool | `false` | |
 | cilium.cni.logFile | string | `"/var/log/cilium-cni.log"` | |
 | cilium.containerRuntime.integration | string | `"crio"` | |
 | cilium.enabled | bool | `false` | |
 | cilium.hubble.enabled | bool | `false` | |
+| cilium.hubble.relay.enabled | bool | `false` | |
+| cilium.hubble.tls.auto.certManagerIssuerRef.group | string | `"cert-manager.io"` | |
+| cilium.hubble.tls.auto.certManagerIssuerRef.kind | string | `"ClusterIssuer"` | |
+| cilium.hubble.tls.auto.certManagerIssuerRef.name | string | `"kubezero-local-ca-issuer"` | |
+| cilium.hubble.tls.auto.method | string | `"cert-manager"` | |
+| cilium.hubble.ui.enabled | bool | `false` | |
 | cilium.ipam.operator.clusterPoolIPv4PodCIDRList[0] | string | `"10.240.0.0/16"` | |
 | cilium.l7Proxy | bool | `false` | |
 | cilium.operator.nodeSelector."node-role.kubernetes.io/control-plane" | string | `""` | |
@@ -58,4 +64,5 @@ Kubernetes: `>= 1.24.0`
 | multus.clusterNetwork | string | `"cilium"` | |
 | multus.defaultNetworks | list | `[]` | |
 | multus.enabled | bool | `false` | |
+| multus.readinessindicatorfile | string | `"/etc/cni/net.d/05-cilium.conf"` | |
 | multus.tag | string | `"v3.9.2"` | |

View File

@@ -40,8 +40,8 @@ cilium:
     #-- Ensure this is false if multus is enabled
     exclusive: false

-  bpf:
-    hostLegacyRouting: true
+  #bpf:
+  #  hostLegacyRouting: true
     # tproxy: false

   cluster:
@@ -57,10 +57,10 @@ cilium:
       - 10.240.0.0/16

   # Keep it simple for now
-  # nodePort:
-  #   enabled: true
   l7Proxy: false
+  #rollOutCiliumPods: true

   cgroup:
     autoMount:
       enabled: false
@@ -84,3 +84,14 @@ cilium:
   hubble:
     enabled: false
+    relay:
+      enabled: false
+    ui:
+      enabled: false
+    tls:
+      auto:
+        method: cert-manager
+        certManagerIssuerRef:
+          group: cert-manager.io
+          kind: ClusterIssuer
+          name: kubezero-local-ca-issuer
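
Hubble stays disabled by default; the new keys only pre-wire Hubble TLS to cert-manager via the `kubezero-local-ca-issuer` ClusterIssuer. A hedged way to preview what enabling it would render locally, assuming the chart lives at `charts/kubezero-network` (chart path and release name are assumptions):

    # may require: helm dependency update charts/kubezero-network
    helm template kubezero-network charts/kubezero-network \
      --set cilium.enabled=true \
      --set cilium.hubble.enabled=true \
      --set cilium.hubble.relay.enabled=true \
      --set cilium.hubble.ui.enabled=true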

View File

@@ -72,7 +72,7 @@ Kubernetes: `>= 1.24.0`
 | storage.aws-ebs-csi-driver.enabled | bool | `false` | |
 | storage.aws-efs-csi-driver.enabled | bool | `false` | |
 | storage.enabled | bool | `false` | |
-| storage.targetRevision | string | `"0.7.3"` | |
+| storage.targetRevision | string | `"0.7.4"` | |

 ----------------------------------------------
 Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)

View File

@@ -36,7 +36,7 @@ cert-manager:
 storage:
   enabled: false
-  targetRevision: 0.7.3
+  targetRevision: 0.7.4
   aws-ebs-csi-driver:
     enabled: false
   aws-efs-csi-driver:

View File

@@ -2,47 +2,41 @@

 ## TODO

-### FeatureGates
-- PodAndContainerStatsFromCRI
-- DelegateFSGroupToCSIDriver
-
 ## What's new - Major themes

-- Cilium added as second CNI to prepare full migration to Cilium with 1.24 upgrade
-- support for Nvidia g5 instances incl. pre-installed kernel drivers, cudo toolchain and CRI intergration
-- updated inf1 neuron drivers
-- ExtendedResourceToleration AdmissionController and auto-taints allowing Neuron and Nvidia pods ONLY to be scheduled on dedicated workers
-- full Cluster-Autoscaler integration
+- Cilium is now the default CNI, calico got removed
+- cluster-autoscaler is enabled by default on AWS

 ## Version upgrades

-- Istio to 1.14.4
-- Logging: ECK operator to 2.4, fluent-bit 1.9.8
-- Metrics: Prometheus and all Grafana charts to latest to match V1.23
-- ArgoCD to V2.4 ( access to pod via shell disabled by default )
-- AWS EBS/EFS CSI drivers to latest versions
-- cert-manager to V1.9.1
+- cilium
+- metallb
+- nvidia-device-plugin
+- aws-node-termination-handler
+- aws-ebs-csi-driver
+- aws-efs-csi-driver
+
+### FeatureGates
+- PodAndContainerStatsFromCRI
+- DelegateFSGroupToCSIDriver

 # Upgrade

 `(No, really, you MUST read this before you upgrade)`

 - Ensure your Kube context points to the correct cluster !

-1. Enable `containerProxy` for NAT instances and upgrade NAT instance using the new V2 Pulumi stacks
-2. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
-3. Upgrade CFN stacks for the control plane and all worker groups
-4. Trigger fully-automated cluster upgrade:
+1. Review CFN config for controller and workers ( enable containerProxy, remove legacy version settings etc )
+2. Upgrade CFN stacks for the control plane and all worker groups
+3. Trigger fully-automated cluster upgrade:
   `./admin/upgrade_cluster.sh <path to the argocd app kubezero yaml for THIS cluster>`

-5. Reboot controller(s) one by one
+4. Reboot controller(s) one by one
   Wait each time for controller to join and all pods running.
   Might take a while ...

-6. Launch new set of workers eg. by doubling `desired` for each worker ASG
+5. Launch new set of workers eg. by doubling `desired` for each worker ASG
   once new workers are ready, cordon and drain all old workers
   The cluster-autoscaler will remove the old workers automatically after about 10min !

-7. If all looks good, commit the ArgoApp resouce for Kubezero, before re-enabling ArgoCD itself.
+6. If all looks good, commit the ArgoApp resource for Kubezero, before re-enabling ArgoCD itself.
   git add / commit / push `<cluster/env/kubezero/application.yaml>`
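
Step 5 of the upgraded instructions (scaling out the worker ASGs, then cordoning and draining the old nodes) could look roughly like the following sketch; the ASG name, the desired capacity and the node-name filter are placeholders:

    # Double the desired capacity of one worker ASG (placeholder name/value)
    aws autoscaling set-desired-capacity --auto-scaling-group-name <worker-asg> --desired-capacity 6

    # Once the new workers are Ready, cordon and drain each old worker
    for node in $(kubectl get nodes -o name | grep <old-worker-filter>); do
      kubectl cordon $node
      kubectl drain $node --ignore-daemonsets --delete-emptydir-data
    done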