Compare commits

..

1 Commits

Author SHA1 Message Date
3562ce83f6 chore(deps): update kubezero-metrics-dependencies 2025-01-16 03:05:34 +00:00
6 changed files with 188 additions and 72 deletions

View File

@ -33,11 +33,6 @@ dependencies:
# https://github.com/NVIDIA/k8s-device-plugin
repository: https://nvidia.github.io/k8s-device-plugin
condition: nvidia-device-plugin.enabled
- name: neuron-helm-chart
version: 1.1.1
# https://github.com/aws-neuron/neuron-helm-charts/tree/main/charts/neuron-helm-chart
repository: oci://public.ecr.aws/neuron #/neuron-helm-chart
condition: neuron-helm-chart.enabled
- name: sealed-secrets
version: 2.16.2
repository: https://bitnami-labs.github.io/sealed-secrets

View File

@ -25,7 +25,6 @@ Kubernetes: `>= 1.26.0`
| https://nvidia.github.io/k8s-device-plugin | nvidia-device-plugin | 0.17.0 |
| https://twin.github.io/helm-charts | aws-eks-asg-rolling-update-handler | 1.5.0 |
| oci://public.ecr.aws/aws-ec2/helm | aws-node-termination-handler | 0.24.1 |
| oci://public.ecr.aws/neuron | neuron-helm-chart | 1.1.1 |
# MetalLB
@ -101,11 +100,13 @@ Device plugin for [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/)
| aws-node-termination-handler.tolerations[0].effect | string | `"NoSchedule"` | |
| aws-node-termination-handler.tolerations[0].key | string | `"node-role.kubernetes.io/control-plane"` | |
| aws-node-termination-handler.useProviderId | bool | `true` | |
| awsNeuron.enabled | bool | `false` | |
| awsNeuron.image.name | string | `"public.ecr.aws/neuron/neuron-device-plugin"` | |
| awsNeuron.image.tag | string | `"2.22.4.0"` | |
| cluster-autoscaler.autoDiscovery.clusterName | string | `""` | |
| cluster-autoscaler.awsRegion | string | `"us-west-2"` | |
| cluster-autoscaler.enabled | bool | `false` | |
| cluster-autoscaler.extraArgs.balance-similar-node-groups | bool | `true` | |
| cluster-autoscaler.extraArgs.ignore-daemonsets-utilization | bool | `true` | |
| cluster-autoscaler.extraArgs.ignore-taint | string | `"node.cilium.io/agent-not-ready"` | |
| cluster-autoscaler.extraArgs.scan-interval | string | `"30s"` | |
| cluster-autoscaler.extraArgs.skip-nodes-with-local-storage | bool | `false` | |
@ -140,24 +141,22 @@ Device plugin for [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/)
| fuseDevicePlugin.enabled | bool | `false` | |
| fuseDevicePlugin.image.name | string | `"public.ecr.aws/zero-downtime/fuse-device-plugin"` | |
| fuseDevicePlugin.image.tag | string | `"v1.2.0"` | |
| neuron-helm-chart.devicePlugin.tolerations[0].key | string | `"CriticalAddonsOnly"` | |
| neuron-helm-chart.devicePlugin.tolerations[0].operator | string | `"Exists"` | |
| neuron-helm-chart.devicePlugin.tolerations[1].effect | string | `"NoSchedule"` | |
| neuron-helm-chart.devicePlugin.tolerations[1].key | string | `"aws.amazon.com/neuron"` | |
| neuron-helm-chart.devicePlugin.tolerations[1].operator | string | `"Exists"` | |
| neuron-helm-chart.devicePlugin.tolerations[2].effect | string | `"NoSchedule"` | |
| neuron-helm-chart.devicePlugin.tolerations[2].key | string | `"kubezero-workergroup"` | |
| neuron-helm-chart.devicePlugin.tolerations[2].operator | string | `"Exists"` | |
| neuron-helm-chart.devicePlugin.volumeMounts[0].mountPath | string | `"/var/lib/kubelet/device-plugins"` | |
| neuron-helm-chart.devicePlugin.volumeMounts[0].name | string | `"device-plugin"` | |
| neuron-helm-chart.devicePlugin.volumeMounts[1].mountPath | string | `"/run"` | |
| neuron-helm-chart.devicePlugin.volumeMounts[1].name | string | `"infa-map"` | |
| neuron-helm-chart.devicePlugin.volumes[0].hostPath.path | string | `"/var/lib/kubelet/device-plugins"` | |
| neuron-helm-chart.devicePlugin.volumes[0].name | string | `"device-plugin"` | |
| neuron-helm-chart.devicePlugin.volumes[1].hostPath.path | string | `"/run"` | |
| neuron-helm-chart.devicePlugin.volumes[1].name | string | `"infa-map"` | |
| neuron-helm-chart.enabled | bool | `false` | |
| neuron-helm-chart.npd.enabled | bool | `false` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key | string | `"node.kubernetes.io/instance-type"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator | string | `"In"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0] | string | `"g5.xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[10] | string | `"g4dn.4xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[11] | string | `"g4dn.8xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[12] | string | `"g4dn.12xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[13] | string | `"g4dn.16xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[1] | string | `"g5.2xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[2] | string | `"g5.4xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[3] | string | `"g5.8xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[4] | string | `"g5.12xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[5] | string | `"g5.16xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[6] | string | `"g5.24xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[7] | string | `"g5.48xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[8] | string | `"g4dn.xlarge"` | |
| nvidia-device-plugin.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[9] | string | `"g4dn.2xlarge"` | |
| nvidia-device-plugin.cdi.nvidiaHookPath | string | `"/usr/bin"` | |
| nvidia-device-plugin.deviceDiscoveryStrategy | string | `"nvml"` | |
| nvidia-device-plugin.enabled | bool | `false` | |
@ -168,23 +167,6 @@ Device plugin for [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/)
| nvidia-device-plugin.tolerations[1].effect | string | `"NoSchedule"` | |
| nvidia-device-plugin.tolerations[1].key | string | `"kubezero-workergroup"` | |
| nvidia-device-plugin.tolerations[1].operator | string | `"Exists"` | |
| py-kube-downscaler.enabled | bool | `false` | |
| py-kube-downscaler.excludedNamespaces[0] | string | `"kube-system"` | |
| py-kube-downscaler.excludedNamespaces[1] | string | `"operators"` | |
| py-kube-downscaler.excludedNamespaces[2] | string | `"monitoring"` | |
| py-kube-downscaler.excludedNamespaces[3] | string | `"logging"` | |
| py-kube-downscaler.excludedNamespaces[4] | string | `"telemetry"` | |
| py-kube-downscaler.excludedNamespaces[5] | string | `"istio-system"` | |
| py-kube-downscaler.excludedNamespaces[6] | string | `"istio-ingress"` | |
| py-kube-downscaler.excludedNamespaces[7] | string | `"cert-manager"` | |
| py-kube-downscaler.excludedNamespaces[8] | string | `"argocd"` | |
| py-kube-downscaler.nodeSelector."node-role.kubernetes.io/control-plane" | string | `""` | |
| py-kube-downscaler.resources.limits.cpu | string | `nil` | |
| py-kube-downscaler.resources.limits.memory | string | `"256Mi"` | |
| py-kube-downscaler.resources.requests.cpu | string | `"20m"` | |
| py-kube-downscaler.resources.requests.memory | string | `"48Mi"` | |
| py-kube-downscaler.tolerations[0].effect | string | `"NoSchedule"` | |
| py-kube-downscaler.tolerations[0].key | string | `"node-role.kubernetes.io/control-plane"` | |
| sealed-secrets.enabled | bool | `false` | |
| sealed-secrets.fullnameOverride | string | `"sealed-secrets-controller"` | |
| sealed-secrets.keyrenewperiod | string | `"0"` | |

View File

@ -0,0 +1,80 @@
{{- if .Values.awsNeuron.enabled }}
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: neuron-device-plugin
namespace: kube-system
spec:
selector:
matchLabels:
name: neuron-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: neuron-device-plugin-ds
spec:
serviceAccount: neuron-device-plugin
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: aws.amazon.com/neuron
operator: Exists
effect: NoSchedule
- key: kubezero-workergroup
effect: NoSchedule
operator: Exists
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- inf2.xlarge
- inf2.4xlarge
- inf2.8xlarge
- inf2.24xlarge
- inf2.48xlarge
- trn1.2xlarge
- trn1.32xlarge
- trn1n.32xlarge
containers:
- image: "{{ .Values.awsNeuron.image.name }}:{{ .Values.awsNeuron.image.tag }}"
imagePullPolicy: IfNotPresent
name: neuron-device-plugin
env:
- name: KUBECONFIG
value: /etc/kubernetes/kubelet.conf
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: infa-map
mountPath: /run
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: infa-map
hostPath:
path: /run
{{- end }}

View File

@ -0,0 +1,59 @@
{{- if .Values.awsNeuron.enabled }}
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: neuron-device-plugin
rules:
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- ""
resources:
- pods
verbs:
- update
- patch
- get
- list
- watch
- apiGroups:
- ""
resources:
- nodes/status
verbs:
- patch
- update
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: neuron-device-plugin
namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: neuron-device-plugin
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: neuron-device-plugin
subjects:
- kind: ServiceAccount
name: neuron-device-plugin
namespace: kube-system
{{- end }}

View File

@ -155,35 +155,12 @@ fuseDevicePlugin:
name: public.ecr.aws/zero-downtime/fuse-device-plugin
tag: v1.2.0
neuron-helm-chart:
awsNeuron:
enabled: false
npd:
enabled: false
devicePlugin:
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: aws.amazon.com/neuron
operator: Exists
effect: NoSchedule
- key: kubezero-workergroup
effect: NoSchedule
operator: Exists
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: infa-map
mountPath: /run
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: infa-map
hostPath:
path: /run
image:
name: public.ecr.aws/neuron/neuron-device-plugin
tag: 2.22.4.0
nvidia-device-plugin:
enabled: false
@ -201,6 +178,29 @@ nvidia-device-plugin:
effect: NoSchedule
operator: Exists
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
values:
- g5.xlarge
- g5.2xlarge
- g5.4xlarge
- g5.8xlarge
- g5.12xlarge
- g5.16xlarge
- g5.24xlarge
- g5.48xlarge
- g4dn.xlarge
- g4dn.2xlarge
- g4dn.4xlarge
- g4dn.8xlarge
- g4dn.12xlarge
- g4dn.16xlarge
cluster-autoscaler:
enabled: false

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: kubezero-metrics
description: KubeZero Umbrella Chart for Prometheus, Grafana and Alertmanager as well as all Kubernetes integrations.
type: application
version: 0.10.2
version: 0.10.3
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:
@ -19,14 +19,14 @@ dependencies:
version: ">= 0.1.6"
repository: https://cdn.zero-downtime.net/charts/
- name: kube-prometheus-stack
version: 66.1.1
version: 66.7.1
repository: https://prometheus-community.github.io/helm-charts
- name: prometheus-adapter
version: 4.11.0
repository: https://prometheus-community.github.io/helm-charts
condition: prometheus-adapter.enabled
- name: prometheus-pushgateway
version: 2.15.0
version: 2.17.0
repository: https://prometheus-community.github.io/helm-charts
condition: prometheus-pushgateway.enabled
kubeVersion: ">= 1.26.0"