Skip to content

Commit

Permalink
Merge pull request #25 from rh-mobb/add-nvidia-gpu
Browse files Browse the repository at this point in the history
add nvidia gpu support
  • Loading branch information
paulczar authored Jul 14, 2022
2 parents 97b96a0 + 8ea57f0 commit 3adf4f4
Show file tree
Hide file tree
Showing 9 changed files with 458 additions and 0 deletions.
23 changes: 23 additions & 0 deletions charts/nvidia-gpu/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
9 changes: 9 additions & 0 deletions charts/nvidia-gpu/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: v2
name: nvidia-gpu
description: A Helm chart for Kubernetes
# Chart homepage (top-level field).
home: https://github.com/rh-mobb/helm-charts
maintainers:
  # Valid maintainer keys are name, email and url — `home` is not a
  # maintainer field, so the repository link is given as `url` here.
  - name: paulczar
    url: https://github.com/rh-mobb/helm-charts
type: application
version: 0.1.0
# Quoted so the version is parsed as a string, not a float.
appVersion: "1.16.0"
84 changes: 84 additions & 0 deletions charts/nvidia-gpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Helm Chart to set up NVIDIA GPU nodes


## Prerequisites

* An ARO / ROSA cluster
* Helm CLI

## Prepare Environment

1. Create namespaces

```bash
oc create namespace openshift-nfd
oc create namespace nvidia-gpu-operator
```

1. Add the MOBB chart repository to your Helm

```bash
helm repo add mobb https://rh-mobb.github.io/helm-charts/
```

1. Update your repositories

```bash
helm repo update
```

1. Use the `mobb/operatorhub` chart to deploy the needed operators

```bash
helm upgrade -n nvidia-gpu-operator nvidia-gpu-operator \
mobb/operatorhub --install \
--values https://raw.githubusercontent.com/rh-mobb/helm-charts/main/charts/nvidia-gpu/files/operatorhub.yaml
```

1. Wait until the two operators are running

```bash
watch kubectl get pods -n openshift-nfd
```

```
NAME READY STATUS RESTARTS AGE
nfd-controller-manager-7b66c67bd9-rk98w 2/2 Running 0 47s
```

```bash
watch kubectl get pods -n nvidia-gpu-operator
```

```
kubectl get pods -n nvidia-gpu-operator
NAME READY STATUS RESTARTS AGE
gpu-operator-5d8cb7dd5f-c4ljk 1/1 Running 0 87s
```

## Deploy the Helm Chart

1. Install the `mobb/nvidia-gpu` chart

```bash
helm upgrade --install -n nvidia-gpu-operator nvidia-gpu \
mobb/nvidia-gpu --disable-openapi-validation
```

1. Validate that NFD can see the GPU(s)

```bash
oc describe node | egrep 'Roles|pci-10de' | grep -v master
```

You should see output like:

```
Roles: worker
feature.node.kubernetes.io/pci-10de.present=true
```

1. Verify the GPUs are available on the host

```bash
oc project nvidia-gpu-operator
for i in $(oc get pod -lopenshift.driver-toolkit=true --no-headers |awk '{print $1}'); do echo $i; oc exec -it $i -- nvidia-smi ; echo -e '\n' ; done
```
21 changes: 21 additions & 0 deletions charts/nvidia-gpu/files/operatorhub.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Values for the mobb/operatorhub chart: operators required before the
# nvidia-gpu chart can be installed (see charts/nvidia-gpu/README.md).
subscriptions:
  # NVIDIA GPU operator from the certified-operators catalog.
  - name: gpu-operator-certified
    channel: v1.11
    installPlanApproval: Automatic
    source: certified-operators
    sourceNamespace: openshift-marketplace
    namespace: nvidia-gpu-operator
  # Node Feature Discovery operator from the redhat-operators catalog.
  - name: nfd
    channel: stable
    installPlanApproval: Automatic
    source: redhat-operators
    sourceNamespace: openshift-marketplace
    namespace: openshift-nfd

operatorGroups:
  - name: nvidia-gpu-operator-group
    namespace: nvidia-gpu-operator
    targetNamespace: nvidia-gpu-operator
  - name: openshift-nfd
    namespace: openshift-nfd
    # Explicit null (canonical form, instead of `~`): no target namespace
    # is set for this OperatorGroup.
    targetNamespace: null
Empty file.
62 changes: 62 additions & 0 deletions charts/nvidia-gpu/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
Uses .Values.nameOverride when set, otherwise .Chart.Name; the result is
truncated to 63 characters (Kubernetes resource-name limit) and any
trailing "-" left by the truncation is stripped.
*/}}
{{- define "nvidia-gpu.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
Resolution order:
  1. .Values.fullnameOverride, verbatim, when set;
  2. .Release.Name alone, when it already contains the chart name;
  3. "<release-name>-<chart-name>" otherwise.
Every result is truncated to 63 characters (the DNS-label limit that
applies to many Kubernetes name fields) and stripped of a trailing "-".
*/}}
{{- define "nvidia-gpu.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create the chart name and version as used by the helm.sh/chart label.
"+" (SemVer build-metadata separator) is not permitted in label values,
so it is replaced with "_".
*/}}
{{- define "nvidia-gpu.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels applied to resources rendered by this chart: the selector
labels plus chart, app-version and managed-by metadata.
app.kubernetes.io/version is emitted only when .Chart.AppVersion is set.
*/}}
{{- define "nvidia-gpu.labels" -}}
helm.sh/chart: {{ include "nvidia-gpu.chart" . }}
{{ include "nvidia-gpu.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels: the minimal, stable label set (name + release instance)
suitable for use in immutable selectors.
*/}}
{{- define "nvidia-gpu.selectorLabels" -}}
app.kubernetes.io/name: {{ include "nvidia-gpu.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Create the name of the service account to use.
When .Values.serviceAccount.create is true, defaults to the chart
fullname (overridable via .Values.serviceAccount.name); otherwise falls
back to the "default" service account.
NOTE(review): this helper reads .Values.serviceAccount.* but the chart's
values.yaml appears to be empty, so rendering a template that calls this
helper would fail on the missing key — confirm before using it.
*/}}
{{- define "nvidia-gpu.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "nvidia-gpu.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
47 changes: 47 additions & 0 deletions charts/nvidia-gpu/templates/clusterpolicy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# ClusterPolicy consumed by the NVIDIA GPU operator: enables the driver,
# toolkit, device plugin, DCGM monitoring, MIG manager and validator.
# Indentation reconstructed to the structure documented for the operator.
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
  labels:
    {{- include "nvidia-gpu.labels" . | nindent 4 }}
spec:
  migManager:
    enabled: true
  operator:
    # CRI-O is the container runtime used on OpenShift clusters.
    defaultRuntime: crio
    initContainer: {}
    runtimeClass: nvidia
    deployGFD: true
  dcgm:
    enabled: true
  gfd: {}
  dcgmExporter:
    config:
      name: ''
  driver:
    licensingConfig:
      nlsEnabled: false
      configMapName: ''
    certConfig:
      name: ''
    kernelModuleConfig:
      name: ''
    repoConfig:
      configMapName: ''
    virtualTopology:
      config: ''
    enabled: true
    # Build driver containers with the OpenShift driver toolkit
    # (the README validates via pods labelled openshift.driver-toolkit=true).
    use_ocp_driver_toolkit: true
  devicePlugin: {}
  mig:
    strategy: single
  validator:
    plugin:
      env:
        # Quoted so the value stays a string, not a YAML boolean.
        - name: WITH_WORKLOAD
          value: 'true'
  nodeStatusExporter:
    enabled: true
  daemonsets: {}
  toolkit:
    enabled: true
129 changes: 129 additions & 0 deletions charts/nvidia-gpu/templates/nfd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# NodeFeatureDiscovery instance for the NFD operator: labels nodes with
# hardware features so the GPU operator can target GPU nodes
# (the README checks for the feature.node.kubernetes.io/pci-10de label).
# Indentation reconstructed per the documented NFD operand config layout.
kind: NodeFeatureDiscovery
apiVersion: nfd.openshift.io/v1
metadata:
  name: nfd-instance
  # Deployed into the NFD operator's namespace, not the release namespace.
  namespace: openshift-nfd
  labels:
    {{- include "nvidia-gpu.labels" . | nindent 4 }}
spec:
  # Optional user-defined feature rules; examples left commented out.
  customConfig:
    configData: |
      #    - name: "more.kernel.features"
      #      matchOn:
      #      - loadedKMod: ["example_kmod3"]
      #    - name: "more.features.by.nodename"
      #      value: customValue
      #      matchOn:
      #      - nodename: ["special-.*-node-.*"]
  operand:
    # Operand image pinned by digest.
    image: >-
      registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:07658ef3df4b264b02396e67af813a52ba416b47ab6e1d2d08025a350ccd2b7b
    servicePort: 12000
  workerConfig:
    configData: |
      core:
      #  labelWhiteList:
      #  noPublish: false
        sleepInterval: 60s
      #  sources: [all]
      #  klog:
      #    addDirHeader: false
      #    alsologtostderr: false
      #    logBacktraceAt:
      #    logtostderr: true
      #    skipHeaders: false
      #    stderrthreshold: 2
      #    v: 0
      #    vmodule:
      ##   NOTE: the following options are not dynamically run-time
      ##         configurable and require a nfd-worker restart to take effect
      ##         after being changed
      #    logDir:
      #    logFile:
      #    logFileMaxSize: 1800
      #    skipLogHeaders: false
      sources:
      #  cpu:
      #    cpuid:
      ##     NOTE: whitelist has priority over blacklist
      #      attributeBlacklist:
      #        - "BMI1"
      #        - "BMI2"
      #        - "CLMUL"
      #        - "CMOV"
      #        - "CX16"
      #        - "ERMS"
      #        - "F16C"
      #        - "HTT"
      #        - "LZCNT"
      #        - "MMX"
      #        - "MMXEXT"
      #        - "NX"
      #        - "POPCNT"
      #        - "RDRAND"
      #        - "RDSEED"
      #        - "RDTSCP"
      #        - "SGX"
      #        - "SSE"
      #        - "SSE2"
      #        - "SSE3"
      #        - "SSE4.1"
      #        - "SSE4.2"
      #        - "SSSE3"
      #      attributeWhitelist:
      #  kernel:
      #    kconfigFile: "/path/to/kconfig"
      #    configOpts:
      #      - "NO_HZ"
      #      - "X86"
      #      - "DMI"
        # Label nodes by PCI device class; class 03 covers display
        # controllers (GPUs), labelled by vendor (NVIDIA = 10de).
        pci:
          deviceClassWhitelist:
            - "0200"
            - "03"
            - "12"
          deviceLabelFields:
      #      - "class"
            - "vendor"
      #      - "device"
      #      - "subsystem_vendor"
      #      - "subsystem_device"
      #  usb:
      #    deviceClassWhitelist:
      #      - "0e"
      #      - "ef"
      #      - "fe"
      #      - "ff"
      #    deviceLabelFields:
      #      - "class"
      #      - "vendor"
      #      - "device"
      #  custom:
      #    - name: "my.kernel.feature"
      #      matchOn:
      #        - loadedKMod: ["example_kmod1", "example_kmod2"]
      #    - name: "my.pci.feature"
      #      matchOn:
      #        - pciId:
      #            class: ["0200"]
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #        - pciId:
      #            vendor: ["8086"]
      #            device: ["1000", "1100"]
      #    - name: "my.usb.feature"
      #      matchOn:
      #        - usbId:
      #            class: ["ff"]
      #            vendor: ["03e7"]
      #            device: ["2485"]
      #        - usbId:
      #            class: ["fe"]
      #            vendor: ["1a6e"]
      #            device: ["089a"]
      #    - name: "my.combined.feature"
      #      matchOn:
      #        - pciId:
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #          loadedKMod: ["vendor_kmod1", "vendor_kmod2"]
Loading

0 comments on commit 3adf4f4

Please sign in to comment.