feat: first fully working Nvidia support for KubeZero workers

This commit is contained in:
Stefan Reimer 2022-09-06 13:30:04 +02:00
parent 622d12858a
commit f50fa392d4
6 changed files with 167 additions and 52 deletions

View File

@ -2,6 +2,8 @@ BUILDER := v3.16.0
RELEASE := v3.16
PKG := '*'
CF_DIST := E1YFUJXMCXT2RN
# Targets that are commands rather than files; declaring them phony keeps a
# stray file of the same name from silently disabling the target.
.PHONY: all builder aports_update download upload invalidate
all: build
@ -37,6 +39,9 @@ build: packages distfiles work
download:
aws s3 sync s3://zero-downtime-web/cdn/alpine/$(RELEASE)/kubezero/x86_64/ packages/work/x86_64/ --exclude APKINDEX.tar.gz
# Flush the CloudFront cache for everything below /alpine/ on the
# distribution named by CF_DIST.
invalidate:
	aws cloudfront create-invalidation --distribution $(CF_DIST) --paths "/alpine/*"

# Publish the locally built packages to the S3 bucket and then invalidate the
# CDN cache. The invalidation deliberately runs AFTER the upload: invalidating
# first leaves a window in which the CDN can re-cache the old objects before
# the new ones have been written.
# APKINDEX.tar.gz is excluded from the sync and copied separately, after the
# packages, with a one second max-age.
upload:
	aws s3 sync --delete packages/work/x86_64/ s3://zero-downtime-web/cdn/alpine/$(RELEASE)/kubezero/x86_64/ --exclude APKINDEX.tar.gz
	aws s3 cp packages/work/x86_64/APKINDEX.tar.gz s3://zero-downtime-web/cdn/alpine/$(RELEASE)/kubezero/x86_64/ --cache-control max-age=1
	$(MAKE) invalidate

View File

@ -1,58 +1,78 @@
# Contributor: Stefan Reimer <stefan@zero-downtime.net>
# Maintainer: Stefan Reimer <stefan@zero-downtime.net>
pkgname=nvidia-container-toolkit
pkgver=1.10.0
pkgrel=1
pkgdesc="NVIDIA Container toolkit incl. cri hooks"
url="https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html"
arch="x86_64"
license="Apache"
makedepends="xz"
depends="glibc-bin nvidia-drivers"
options="!check !tracedeps"
pkgdesc='NVIDIA container runtime toolkit'
arch='x86_64'
url='https://github.com/NVIDIA/nvidia-container-toolkit'
license='Apache'
_nv_ver="$pkgver"-1
_libcap=2.25-2
_libseccomp=2.3.3-4
makedepends='go bash'
#depends='libnvidia-container-tools>=1.9.0'
options='!lto'
source="${pkgname}-v${pkgver}-${pkgrel}.tar.gz"::"${url}/archive/v${pkgver}.tar.gz"
_srcdir="${srcdir}/${pkgname}-${pkgver}"
source="https://nvidia.github.io/libnvidia-container/stable/debian10/amd64/libnvidia-container1_"$_nv_ver"_amd64.deb
https://nvidia.github.io/libnvidia-container/stable/debian10/amd64/libnvidia-container-tools_"$_nv_ver"_amd64.deb
https://nvidia.github.io/libnvidia-container/stable/debian10/amd64/nvidia-container-toolkit_"$_nv_ver"_amd64.deb
http://deb.debian.org/debian/pool/main/libc/libcap2/libcap2_"$_libcap"_amd64.deb
http://deb.debian.org/debian/pool/main/libs/libseccomp/libseccomp2_"$_libseccomp"_amd64.deb
config.toml
oci-nvidia-hook.json
"
build() {
mkdir bin
export GOPATH="${srcdir}"
export GOBIN="$GOPATH/bin"
go get "./..."
# Patch Nvidia dl loader
sed -i '/RTLD_DEEPBIND/d' vendor/github.com/NVIDIA/go-nvml/pkg/dl/dl.go
go build -v \
-modcacherw \
-buildmode=pie \
-gcflags "all=-trimpath=${PWD}" \
-asmflags "all=-trimpath=${PWD}" \
-ldflags "-s -w -extldflags ${LDFLAGS}" \
-trimpath \
-o bin \
"./..."
#-ldflags " -s -w -extldflags=-Wl,-z,now,-z,relro" \
return 0
}
package() {
install -D -m755 "${_srcdir}/bin/${pkgname}" "${pkgdir}/usr/bin/${pkgname}"
mkdir -p "$pkgdir"/usr/bin "$pkgdir"/usr/glibc-compat/lib/nvidia
cd "${pkgdir}/usr/bin/"
ln -sf "${pkgname}" "nvidia-container-runtime-hook"
cd -
install -D -m644 "${_srcdir}/config/config.toml.centos" "${pkgdir}/etc/nvidia-container-runtime/config.toml"
install -D -m644 "${_srcdir}/oci-nvidia-hook.json" "${pkgdir}/usr/share/containers/oci/hooks.d/00-oci-nvidia-hook.json"
# libnvidia-container1
ar -x "$srcdir"/libnvidia-container1_"$_nv_ver"_amd64.deb && tar xfJ data.tar.xz
mv usr/lib/x86_64-linux-gnu/* "$pkgdir"/usr/glibc-compat/lib/nvidia
rm -rf control.tar.xz data.tar.xz debian-binary usr
install -D -m644 "${_srcdir}/LICENSE" "${pkgdir}/usr/share/licenses/$pkgname/LICENSE"
# libnvidia-container-tools
ar -x "$srcdir"/libnvidia-container-tools_"$_nv_ver"_amd64.deb && tar xfJ data.tar.xz
mv usr/bin/nvidia-container-cli "$pkgdir"/usr/bin
rm -rf control.tar.xz data.tar.xz debian-binary usr
# nvidia-container-toolkit
ar -x "$srcdir"/nvidia-container-toolkit_"$_nv_ver"_amd64.deb && tar xfJ data.tar.xz
mv usr/bin/nvidia* "$pkgdir"/usr/bin
rm -rf control.tar.xz data.tar.xz debian-binary usr
# Add dependencies which also need to be compiled against glibc: libcap, libseccomp
# libcap
ar -x "$srcdir"/libcap2_"$_libcap"_amd64.deb && tar xfJ data.tar.xz
mv lib/x86_64-linux-gnu/libcap.so.* "$pkgdir"/usr/glibc-compat/lib
rm -rf control.tar.xz data.tar.xz debian-binary usr
# libseccomp
ar -x "$srcdir"/libseccomp2_"$_libseccomp"_amd64.deb && tar xfJ data.tar.xz
mv usr/lib/x86_64-linux-gnu/libseccomp.so.* "$pkgdir"/usr/glibc-compat/lib
rm -rf control.tar.xz data.tar.xz debian-binary usr
# Now let's patch the ELF binaries to fix library paths and order
doas apk add patchelf@edge-community
patchelf --remove-rpath "$pkgdir"/usr/bin/nvidia-container-cli
patchelf --remove-rpath "$pkgdir"/usr/glibc-compat/lib/nvidia/libnvidia-container.so.1
# Install our runtime config and oci hook
install -Dm644 "$srcdir"/oci-nvidia-hook.json "$pkgdir"/usr/share/containers/oci/hooks.d/oci-nvidia-hook.json
install -Dm644 config.toml "$pkgdir"/etc/nvidia-container-runtime/config.toml
}
sha512sums='
4f35918811edf7ea0f24d04eba12e4670b039edd0bb29ed4f16b47c4208f8a3d4316042f5ca52a278fedd1dce549070df69807080b1a7bda4a859369ad707bf9 nvidia-container-toolkit-v1.10.0-1.tar.gz
23ba2aec28f01c0037bbb4812ea542589e96f6527cf49468a4a7c54ca05808cf0984a8dfe13ee3455b8c6ae8468c58590f9e1e6996927c710bcf3e545772a356 libnvidia-container1_1.10.0-1_amd64.deb
c5369c832bd91703e6e6e86a4431c2eebb2ddeaadff126174b41ed11e969dc8cc49dcab26b3ac18abb43b466a86ce76908eaa2f5e4109c689a0c3a4fa47548b5 libnvidia-container-tools_1.10.0-1_amd64.deb
3043729bd96dd153db1dc317685167f34da6b9d202134335212fb7d861532a265a59e002c86fff2664c67687f4c8bcc75913c74018930a6c68c0f2044eceacf0 nvidia-container-toolkit_1.10.0-1_amd64.deb
694a3ec64ef3056d5874ff03b889b868c294bccb16506468fdf1c289fe3aaadc2da25a5934de653af9633a5d993d2bb21491d84b3b2e2529e6b31d92c78a2228 libcap2_2.25-2_amd64.deb
5a4eaa96e6e774948889909d618a8ed44a82f649cbba11622dc7b4478098bea006995d5a5a60ca026a57b76ad866d1e2c6caebd154a26eb6bd7e15291b558057 libseccomp2_2.3.3-4_amd64.deb
040ac2e3f58549dc09e5bce0d694e4be2f6aae736014bf0ee90042646562d5f1ef1f5990eb9f2c2a2fdf504587b82f4aa0eb99d04c5d3e407670e4012e3edd4e config.toml
fe02a2749c18876eda344764026c0cabae1134cd9504d92e877000c000c99783ee7d3af16e1981c85a4e7bad0951060b88028d0576aa17bfae7d241838b86fb6 oci-nvidia-hook.json
'

View File

@ -0,0 +1,28 @@
# Runtime configuration for the NVIDIA container toolkit. The
# nvidia-container-toolkit APKBUILD installs this file as
# /etc/nvidia-container-runtime/config.toml.
# NOTE(review): the commented-out keys look like upstream defaults kept for
# reference - confirm against the upstream sample config before relying on them.
disable-require = false
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#accept-nvidia-visible-devices-as-volume-mounts = false
# Settings for the nvidia-container-cli binary.
[nvidia-container-cli]
#root = "/run/nvidia/driver"
#path = "/usr/bin/nvidia-container-cli"
environment = []
# Debug log file. The nvidia-drivers APKBUILD derives its list of libraries
# from the "missing library" entries found in this log.
debug = "/var/log/nvidia-container-toolkit.log"
#ldcache = "/etc/ld.so.cache"
# NOTE(review): presumably the kernel modules are expected to be loaded by the
# host already - confirm.
load-kmods = false
#no-cgroups = false
#user = "root:video"
# Use the ldconfig shipped below /usr/glibc-compat, the same prefix the NVIDIA
# libraries are installed under.
# NOTE(review): the leading "@" presumably selects the host's ldconfig rather
# than one inside the container - confirm against the NVIDIA documentation.
ldconfig = "@/usr/glibc-compat/sbin/ldconfig"
# Settings for the nvidia-container-runtime wrapper.
[nvidia-container-runtime]
debug = "/var/log/nvidia-container-runtime.log"
#log-level = "debug"
# Specify the runtimes to consider. This list is processed in order and the PATH
# searched for matching executables unless the entry is an absolute path.
runtimes = [
"crun",
]
mode = "legacy"
# Only relevant for the "csv" mode; the active mode above is "legacy".
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

View File

@ -0,0 +1,12 @@
{
"version": "1.0.0",
"hook": {
"path": "/usr/bin/nvidia-container-toolkit",
"args": ["nvidia-container-toolkit", "prestart"]
},
"when": {
"always": true,
"commands": ["nvidia.*"]
},
"stages": ["prestart"]
}

View File

@ -0,0 +1,57 @@
# Contributor: Stefan Reimer <stefan@zero-downtime.net>
# Maintainer: Stefan Reimer <stefan@zero-downtime.net>
# Packages the NVIDIA userspace binaries and libraries taken from the upstream
# .run installer; everything is copied from the extracted installer tree, no
# compilation takes place.
pkgname=nvidia-drivers
pkgver=515.65.01
pkgrel=0
pkgdesc="NVIDIA Driver"
url="https://www.nvidia.com/download/index.aspx"
arch="x86_64"
# NOTE(review): this license string matches the one used for the open GPU
# kernel modules package; the files shipped here come from the binary .run
# installer - confirm the license is correct for them.
license="MIT OR GPL-2.0"
makedepends="bash xz"
# The libraries are installed below /usr/glibc-compat, hence glibc-bin.
depends="glibc-bin"
# NOTE(review): presumably disables the test phase, binary stripping and
# automatic dependency tracing for the prebuilt files - confirm against the
# abuild documentation.
options="!check !strip !tracedeps"
# The self-extracting installer is the only source; build() unpacks it.
source="NVIDIA-Linux-x86_64-$pkgver.run::https://download.nvidia.com/XFree86/Linux-x86_64/$pkgver/NVIDIA-Linux-x86_64-$pkgver.run"
# Unpack the NVIDIA self-extracting installer.
# The extraction is run from $srcdir explicitly so that the resulting tree
# always ends up at $srcdir/NVIDIA-Linux-x86_64-$pkgver, which is where
# package() expects to find it, independent of the directory abuild starts
# this function in.
build() {
	cd "$srcdir"
	# NOTE(review): -x / -s are presumably "extract only" and "silent" - confirm
	# against the installer's --help output.
	sh "$srcdir/NVIDIA-Linux-x86_64-$pkgver.run" -x -s
}
# Stage the NVIDIA userspace files from the extracted installer into $pkgdir:
#   - management binaries (nvidia-smi etc.)   -> /usr/sbin
#   - driver libraries and their symlinks      -> /usr/glibc-compat/lib/nvidia
#   - an ld.so.conf.d drop-in naming that directory for the glibc-compat loader
# Libraries and symlink names are looked up in the installer's .manifest file.
package() {
	local libdir="$pkgdir"/usr/glibc-compat/lib/nvidia
	local f lib native natives target links

	mkdir -p "$libdir" "$pkgdir"/usr/sbin

	# tell glibc compat about our libs
	mkdir -p "$pkgdir"/usr/glibc-compat/etc/ld.so.conf.d
	echo "/usr/glibc-compat/lib/nvidia" > "$pkgdir"/usr/glibc-compat/etc/ld.so.conf.d/nvidia.conf

	cd "$srcdir"/NVIDIA-Linux-x86_64-$pkgver

	SBINS="nvidia-smi nvidia-debugdump nvidia-cuda-mps-control nvidia-cuda-mps-server nvidia-persistenced"
	for f in $SBINS; do
		cp "$f" "$pkgdir"/usr/sbin
	done

	# which libs are from debug log at runtime
	# LIBS=$(grep "missing library" /var/log/nvidia-container-toolkit.log | awk '{print $7}' | sort | uniq)
	# cross checked via .manifest for targets and symlinks
	LIBS="libEGL_nvidia.so libGLESv1_CM_nvidia.so libGLESv2_nvidia.so libGLX_nvidia.so libcuda.so libcudadebugger.so libnvcuvid.so libnvidia-allocator.so libnvidia-cbl.so libnvidia-cfg.so libnvidia-compiler.so libnvidia-eglcore.so libnvidia-encode.so libnvidia-fatbinaryloader.so libnvidia-fbc.so libnvidia-glcore.so libnvidia-glsi.so libnvidia-glvkspirv.so libnvidia-ifr.so libnvidia-ml.so libnvidia-ngx.so libnvidia-nscq.so libnvidia-opencl.so libnvidia-opticalflow.so libnvidia-pkcs11.so libnvidia-ptxjitcompiler.so libnvidia-rtcore.so libnvidia-tls.so libnvoptix.so libvdpau_nvidia.so"

	for lib in $LIBS; do
		# Real (non-symlink) native library files for this name. grep -F is
		# used because the dots in the library names are literal characters,
		# not regex wildcards. Names missing from the manifest yield an empty
		# list and are skipped.
		natives=$(grep -F "$lib" .manifest | grep _LIB | grep NATIVE | grep -v SYMLINK | awk '{print $1}')
		for native in $natives; do
			cp "$native" "$libdir"

			# Recreate every native symlink the manifest lists for this file;
			# the 32-bit compat entries are ignored. -f keeps a repeated run
			# or a duplicate manifest entry from aborting the packaging.
			links=$(grep -F "$native" .manifest | grep SYMLINK | grep -v COMPAT32 | awk '{print $1}')
			for target in $links; do
				ln -sf /usr/glibc-compat/lib/nvidia/"$native" "$libdir"/"$target"
			done
		done
	done
}
sha512sums='
5221a4ac071eb39a37a841f19cfe4983286dc35e918956b40604404ef36c122612475df7b9a391a9a70bd60f44e598c8a0e5ec54ccc3e90d51f01e1b2fbe5e33 NVIDIA-Linux-x86_64-515.65.01.run
'

View File

@ -1,6 +1,6 @@
# Contributor: Stefan Reimer <stefan@zero-downtime.net>
# Maintainer: Stefan Reimer <stefan@zero-downtime.net>
pkgname=nvidia-gpu-driver
pkgname=nvidia-open-gpu
pkgver=515.65.01
pkgrel=0
pkgdesc="NVIDIA Linux open GPU kernel modules incl. GSP firmware"
@ -8,10 +8,11 @@ url="https://github.com/NVIDIA/open-gpu-kernel-modules"
arch="x86_64"
license="MIT OR GPL-2.0"
makedepends="bash linux-headers linux-virt-dev xz"
options="!check !strip !tracedeps lib64"
options="!checkp !strip"
source="nvidia-$pkgver.tar.gz::https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/$pkgver.tar.gz
NVIDIA-Linux-x86_64-$pkgver.run::https://download.nvidia.com/XFree86/Linux-x86_64/$pkgver/NVIDIA-Linux-x86_64-$pkgver.run"
builddir="$srcdir/open-gpu-kernel-modules-$pkgver"
build() {
@ -23,7 +24,7 @@ build() {
package() {
KERNEL_VERSION=$(basename $(ls -d /lib/modules/*-virt))
depends="glibc-bin linux-virt=~$(echo $KERNEL_VERSION | sed -e 's/-.*$//')"
depends="linux-virt=~$(echo $KERNEL_VERSION | sed -e 's/-.*$//')"
modules="nvidia.ko \
nvidia-drm.ko \
@ -39,14 +40,6 @@ package() {
# Add gsp firmware see: https://download.nvidia.com/XFree86/Linux-x86_64/$pkgver/README/gsp.html
mkdir -p "$pkgdir"/lib/firmware/nvidia/"$pkgver"
sh "$srcdir"/NVIDIA-Linux-x86_64-$pkgver.run -x -s && cp NVIDIA-Linux-x86_64-$pkgver/firmware/gsp.bin "$pkgdir"/lib/firmware/nvidia/"$pkgver"/gsp.bin
#/bin/bash
# Install basic nvidia-smi inck. libnvidia-ml, libcuda into /lib64 as this is hardcoded in the nvidia-container-toolkit bins
mkdir -p "$pkgdir"/lib64 "$pkgdir"/usr/sbin
cp NVIDIA-Linux-x86_64-$pkgver/libnvidia-ml.so."$pkgver" "$pkgdir"/lib64/libnvidia-ml.so.1
cp NVIDIA-Linux-x86_64-$pkgver/libcuda.so."$pkgver" "$pkgdir"/lib64/libcuda.so.1
cp NVIDIA-Linux-x86_64-$pkgver/nvidia-smi "$pkgdir"/usr/sbin/nvidia-smi
}
sha512sums='