feat: Add AWS Neuron device plugin, remove ecr-login-renew

This commit is contained in:
Stefan Reimer 2022-05-04 14:24:14 +02:00
parent 5cc74c27ab
commit 6fa6d90d34
8 changed files with 149 additions and 105 deletions

View File

@ -2,14 +2,13 @@ apiVersion: v2
name: kubezero-addons
description: KubeZero umbrella chart for various optional cluster addons
type: application
version: 0.5.0
version: 0.5.1
appVersion: v1.22.8
home: https://kubezero.com
icon: https://cdn.zero-downtime.net/assets/kubezero/logo-small-64.png
keywords:
- kubezero
- fuse-device-plugin
- k8s-ecr-login-renew
- aws-node-termination-handler
- external-dns
maintainers:

View File

@ -25,19 +25,9 @@ Kubernetes: `>= 1.20.0`
# device-plugins
# k8s-ecr-login-renew
## AWS Neuron
Device plugin for [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/) - [Inf1 instances](https://aws.amazon.com/ec2/instance-types/inf1/)
## IAM setup
- Create IAM user for ECR read-only access and attach the following managed policy: `AmazonEC2ContainerRegistryReadOnly`
- Create AWS credentials (access key ID and secret access key) for that IAM user
## Kubernetes secret
Create a secret for ecr-renew that holds the IAM user credentials from the previous step:
`kubectl create secret -n kube-system generic ecr-renew-cred --from-literal=AWS_REGION=<AWS_REGION> --from-literal=AWS_ACCESS_KEY_ID=<AWS_ACCESS_KEY_ID> --from-literal=AWS_SECRET_ACCESS_KEY=<AWS_SECRET_ACCESS_KEY>`
## Resources
- https://github.com/nabsul/k8s-ecr-login-renew
## Values
| Key | Type | Default | Description |
@ -65,6 +55,7 @@ Create secret with the IAM user credential for ecr-renew to use, using the crede
| aws-node-termination-handler.taintNode | bool | `true` | |
| aws-node-termination-handler.tolerations[0].effect | string | `"NoSchedule"` | |
| aws-node-termination-handler.tolerations[0].key | string | `"node-role.kubernetes.io/master"` | |
| awsNeuron.enabled | bool | `false` | |
| clusterBackup.enabled | bool | `false` | |
| clusterBackup.extraEnv | list | `[]` | |
| clusterBackup.image.name | string | `"public.ecr.aws/zero-downtime/kubezero-admin"` | |
@ -96,4 +87,3 @@ Create secret with the IAM user credential for ecr-renew to use, using the crede
| forseti.image.name | string | `"public.ecr.aws/zero-downtime/forseti"` | |
| forseti.image.tag | string | `"v0.1.2"` | |
| fuseDevicePlugin.enabled | bool | `false` | |
| k8sEcrLoginRenew.enabled | bool | `false` | |

View File

@ -17,17 +17,7 @@
# device-plugins
# k8s-ecr-login-renew
## AWS Neuron
Device plugin for [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/) - [Inf1 instances](https://aws.amazon.com/ec2/instance-types/inf1/)
## IAM setup
- Create IAM user for ECR read-only access and attach the following managed policy: `AmazonEC2ContainerRegistryReadOnly`
- Create AWS credentials (access key ID and secret access key) for that IAM user
## Kubernetes secret
Create a secret for ecr-renew that holds the IAM user credentials from the previous step:
`kubectl create secret -n kube-system generic ecr-renew-cred --from-literal=AWS_REGION=<AWS_REGION> --from-literal=AWS_ACCESS_KEY_ID=<AWS_ACCESS_KEY_ID> --from-literal=AWS_SECRET_ACCESS_KEY=<AWS_SECRET_ACCESS_KEY>`
## Resources
- https://github.com/nabsul/k8s-ecr-login-renew
{{ template "chart.valuesSection" . }}

View File

@ -0,0 +1,79 @@
{{- if .Values.awsNeuron.enabled }}
# AWS Neuron device plugin DaemonSet.
# Rendered only when awsNeuron.enabled is true; the node affinity below
# restricts it to nodes whose instance-type label is an Inf1 size.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: neuron-device-plugin
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: neuron-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: neuron-device-plugin-ds
    spec:
      # serviceAccountName replaces the deprecated serviceAccount alias.
      serviceAccountName: neuron-device-plugin
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
        - key: aws.amazon.com/neuron
          operator: Exists
          effect: NoSchedule
      # Pod priority marks the plugin as node-critical. It supersedes the
      # legacy scheduler.alpha.kubernetes.io/critical-pod annotation, which
      # current Kubernetes releases no longer honour.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            # The two terms are ORed: a node matches via either the legacy
            # beta label or the stable label. Keep both value lists identical.
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "beta.kubernetes.io/instance-type"
                    operator: In
                    values:
                      - inf1.xlarge
                      - inf1.2xlarge
                      - inf1.6xlarge
                      - inf1.24xlarge
              - matchExpressions:
                  - key: "node.kubernetes.io/instance-type"
                    operator: In
                    values:
                      - inf1.xlarge
                      - inf1.2xlarge
                      - inf1.6xlarge
                      - inf1.24xlarge
      containers:
        - name: neuron-device-plugin
          image: "{{ .Values.awsNeuron.image.name }}:{{ .Values.awsNeuron.image.tag }}"
          imagePullPolicy: IfNotPresent
          env:
            - name: KUBECONFIG
              value: /etc/kubernetes/kubelet.conf
            # Name of the node this pod runs on, via the downward API.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            # Kubelet device-plugin directory on the host.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: infa-map
              mountPath: /run
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: infa-map
          hostPath:
            path: /run
{{- end }}

View File

@ -0,0 +1,59 @@
{{- if .Values.awsNeuron.enabled }}
# RBAC for the Neuron device plugin: a ClusterRole, the ServiceAccount the
# DaemonSet runs as, and the ClusterRoleBinding tying the two together.
# Rendered only when awsNeuron.enabled is true.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: neuron-device-plugin
rules:
  # Read access to nodes.
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  # Emit events.
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
  # Read and modify pods.
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - update
      - patch
      - get
      - list
      - watch
  # Write to the node status subresource.
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - patch
      - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: neuron-device-plugin
  namespace: kube-system
---
# ClusterRoleBinding is cluster-scoped, so it carries no metadata.namespace;
# only the ServiceAccount subject is namespaced.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: neuron-device-plugin
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: neuron-device-plugin
subjects:
  - kind: ServiceAccount
    name: neuron-device-plugin
    namespace: kube-system
{{- end }}

View File

@ -1,42 +0,0 @@
{{- if .Values.k8sEcrLoginRenew.enabled }}
# CronJob that runs the k8s-ecr-login-renew image on a fixed schedule.
# Rendered only when k8sEcrLoginRenew.enabled is true.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: ecr-renew
  namespace: kube-system
  labels:
    app: ecr-renew
spec:
  # Minute 0 of every sixth hour.
  schedule: '0 */6 * * *'
  failedJobsHistoryLimit: 5
  successfulJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: ecr-renew
          restartPolicy: OnFailure
          containers:
            - name: ecr-renew
              image: nabsul/k8s-ecr-login-renew:v1.4
              env:
                - name: DOCKER_SECRET_NAME
                  value: ecr-login
                # Quoted because a bare * would be parsed as a YAML alias.
                - name: TARGET_NAMESPACE
                  value: '*'
                # Region and credentials are read from the ecr-renew-cred secret.
                - name: AWS_REGION
                  valueFrom:
                    secretKeyRef:
                      name: ecr-renew-cred
                      key: AWS_REGION
                - name: AWS_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: ecr-renew-cred
                      key: AWS_ACCESS_KEY_ID
                - name: AWS_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: ecr-renew-cred
                      key: AWS_SECRET_ACCESS_KEY
{{- end }}

View File

@ -1,35 +0,0 @@
{{- if .Values.k8sEcrLoginRenew.enabled }}
# RBAC for the ecr-renew CronJob: its ServiceAccount, a ClusterRole over
# secrets and namespaces, and the binding between them.
# Rendered only when k8sEcrLoginRenew.enabled is true.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ecr-renew
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: ecr-renew
rules:
  # Manage secrets.
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["create", "update", "get", "delete"]
  # Read namespaces.
  - apiGroups: [""]
    resources: ["namespaces"]
    verbs: ["get", "list"]
---
# ClusterRoleBinding is cluster-scoped, so it carries no metadata.namespace;
# only the ServiceAccount subject is namespaced.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: ecr-renew
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: ecr-renew
subjects:
  - kind: ServiceAccount
    name: ecr-renew
    namespace: kube-system
{{- end }}

View File

@ -70,9 +70,13 @@ aws-node-termination-handler:
fuseDevicePlugin:
enabled: false
k8sEcrLoginRenew:
# AWS Neuron device plugin; off by default.
awsNeuron:
  enabled: false
  # Image reference is assembled by the DaemonSet template as name:tag.
  image:
    name: public.ecr.aws/neuron/neuron-device-plugin
    # Quoted so a version-like tag is never re-typed as a number.
    tag: '1.9.0.0'
external-dns:
enabled: false