Skip to content
This repository has been archived by the owner on Jan 11, 2023. It is now read-only.

Commit

Permalink
Add dkms to manage nvidia gpu kmod compilation across different linux…
Browse files Browse the repository at this point in the history
… kernels (#3688)
  • Loading branch information
lachie83 authored and jackfrancis committed Aug 17, 2018
1 parent 5a4a560 commit f0eee0d
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 2 deletions.
17 changes: 17 additions & 0 deletions parts/k8s/kubernetesagentcustomdata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,23 @@ write_files:
WantedBy=multi-user.target
{{end}}

{{if IsNSeriesSKU .}}
- path: "/etc/systemd/system/nvidia-modprobe.service"
permissions: "0644"
owner: "root"
content: |
[Unit]
Description=Installs and loads Nvidia GPU kernel module
[Service]
Type=oneshot
RemainAfterExit=true
ExecStartPre=/bin/sh -c "dkms autoinstall --verbose"
ExecStart=/bin/sh -c "nvidia-modprobe -u -c0"
ExecStartPost=/bin/sh -c "sleep 10 && systemctl restart kubelet"
[Install]
WantedBy=multi-user.target
{{end}}

- path: "/etc/kubernetes/certs/ca.crt"
permissions: "0644"
encoding: "base64"
Expand Down
5 changes: 3 additions & 2 deletions pkg/acsengine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
- retrycmd_if_failure_no_stats 180 1 5 curl -fsSL https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list > /tmp/nvidia-docker.list
- cat /tmp/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
- apt_get_update
- retrycmd_if_failure 5 5 300 apt-get install -y linux-headers-$(uname -r) gcc make
- retrycmd_if_failure 5 5 300 apt-get install -y linux-headers-$(uname -r) gcc make dkms
- retrycmd_if_failure 5 5 300 apt-get -o Dpkg::Options::="--force-confold" install -y nvidia-docker2=%s+docker%s nvidia-container-runtime=%s+docker%s
- sudo pkill -SIGHUP dockerd
- mkdir -p %s
Expand All @@ -505,13 +505,14 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
Run nvidia-smi to test the installation, unmount overlayfs and restart kubelet (GPUs are only discovered when kubelet starts)
*/
installScript += fmt.Sprintf(`
- sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s"
- sh nvidia-drivers-%s --silent --accept-license --no-drm --dkms --utility-prefix="%s" --opengl-prefix="%s"
- echo "%s" > /etc/ld.so.conf.d/nvidia.conf
- sudo ldconfig
- umount -l /usr/lib/x86_64-linux-gnu
- nvidia-modprobe -u -c0
- %s/bin/nvidia-smi
- sudo ldconfig
- systemctl enable nvidia-modprobe
- retrycmd_if_failure 5 10 60 systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest)

/* If a new GPU sku becomes available, add a key to this map, but only provide an installation script if you have a confirmation
Expand Down

0 comments on commit f0eee0d

Please sign in to comment.