-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #25 from rh-mobb/add-nvidia-gpu
add nvidia gpu support
- Loading branch information
Showing
9 changed files
with
458 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Patterns to ignore when building packages. | ||
# This supports shell glob matching, relative path matching, and | ||
# negation (prefixed with !). Only one pattern per line. | ||
.DS_Store | ||
# Common VCS dirs | ||
.git/ | ||
.gitignore | ||
.bzr/ | ||
.bzrignore | ||
.hg/ | ||
.hgignore | ||
.svn/ | ||
# Common backup files | ||
*.swp | ||
*.bak | ||
*.tmp | ||
*.orig | ||
*~ | ||
# Various IDEs | ||
.project | ||
.idea/ | ||
*.tmproj | ||
.vscode/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
apiVersion: v2 | ||
name: nvidia-gpu | ||
description: A Helm chart for Kubernetes | ||
maintainers: | ||
- name: paulczar | ||
home: https://github.com/rh-mobb/helm-charts | ||
type: application | ||
version: 0.1.0 | ||
appVersion: "1.16.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# Helm Chart to set up NVIDIA GPU nodes | ||
|
||
|
||
## Prerequisites | ||
|
||
* An ARO / ROSA cluster | ||
* Helm CLI | ||
|
||
## Prepare Environment | ||
|
||
1. Create namespaces | ||
|
||
```bash | ||
oc create namespace openshift-nfd | ||
oc create namespace nvidia-gpu-operator | ||
``` | ||
|
||
1. Add the MOBB chart repository to your Helm client | ||
|
||
```bash | ||
helm repo add mobb https://rh-mobb.github.io/helm-charts/ | ||
``` | ||
|
||
1. Update your repositories | ||
|
||
```bash | ||
helm repo update | ||
``` | ||
|
||
1. Use the `mobb/operatorhub` chart to deploy the needed operators | ||
|
||
```bash | ||
helm upgrade -n nvidia-gpu-operator nvidia-gpu-operator \ | ||
mobb/operatorhub --install \ | ||
--values https://raw.githubusercontent.com/rh-mobb/helm-charts/main/charts/nvidia-gpu/files/operatorhub.yaml | ||
``` | ||
|
||
1. Wait until the two operators are running | ||
|
||
```bash | ||
watch kubectl get pods -n openshift-nfd | ||
``` | ||
|
||
``` | ||
NAME READY STATUS RESTARTS AGE | ||
nfd-controller-manager-7b66c67bd9-rk98w 2/2 Running 0 47s | ||
``` | ||
|
||
```bash | ||
watch kubectl get pods -n nvidia-gpu-operator | ||
``` | ||
|
||
``` | ||
kubectl get pods -n nvidia-gpu-operator | ||
NAME READY STATUS RESTARTS AGE | ||
gpu-operator-5d8cb7dd5f-c4ljk 1/1 Running 0 87s | ||
``` | ||
## Deploy the Helm Chart | ||
|
||
1. Install the chart | ||
|
||
```bash | ||
helm upgrade --install -n nvidia-gpu-operator nvidia-gpu \ | ||
mobb/nvidia-gpu --disable-openapi-validation | ||
``` | ||
|
||
1. Validate that NFD can see the GPU(s) | ||
|
||
```bash | ||
oc describe node | egrep 'Roles|pci-10de' | grep -v master | ||
``` | ||
|
||
You should see output like: | ||
|
||
``` | ||
Roles: worker | ||
feature.node.kubernetes.io/pci-10de.present=true | ||
``` | ||
1. Verify the GPUs are available on the host | ||
|
||
```bash | ||
oc project nvidia-gpu-operator | ||
for i in $(oc get pod -lopenshift.driver-toolkit=true --no-headers |awk '{print $1}'); do echo $i; oc exec -it $i -- nvidia-smi ; echo -e '\n' ; done | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
subscriptions: | ||
- name: gpu-operator-certified | ||
channel: v1.11 | ||
installPlanApproval: Automatic | ||
source: certified-operators | ||
sourceNamespace: openshift-marketplace | ||
namespace: nvidia-gpu-operator | ||
- name: nfd | ||
channel: stable | ||
installPlanApproval: Automatic | ||
source: redhat-operators | ||
sourceNamespace: openshift-marketplace | ||
namespace: openshift-nfd | ||
|
||
operatorGroups: | ||
- name: nvidia-gpu-operator-group | ||
namespace: nvidia-gpu-operator | ||
targetNamespace: nvidia-gpu-operator | ||
- name: openshift-nfd | ||
namespace: openshift-nfd | ||
targetNamespace: ~ |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
{{/* | ||
Expand the name of the chart. | ||
Uses .Values.nameOverride when it is non-empty, otherwise falls back to | ||
.Chart.Name. The result is truncated to 63 characters and a trailing "-" | ||
(which truncation may leave behind) is removed. | ||
*/}} | ||
{{- define "nvidia-gpu.name" -}} | ||
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} | ||
{{- end }} | ||
|
||
{{/* | ||
Create a default fully qualified app name. | ||
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). | ||
Resolution order: | ||
  1. .Values.fullnameOverride, when set, is used as-is (after truncation). | ||
  2. Otherwise, if the release name already contains the chart name | ||
     (or .Values.nameOverride when set), the release name alone is used. | ||
  3. Otherwise the result is "<release name>-<chart name or nameOverride>". | ||
In every case the value is cut to 63 characters and a trailing "-" is removed. | ||
*/}} | ||
{{- define "nvidia-gpu.fullname" -}} | ||
{{- if .Values.fullnameOverride }} | ||
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} | ||
{{- else }} | ||
{{- $name := default .Chart.Name .Values.nameOverride }} | ||
{{- if contains $name .Release.Name }} | ||
{{- .Release.Name | trunc 63 | trimSuffix "-" }} | ||
{{- else }} | ||
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} | ||
{{- end }} | ||
{{- end }} | ||
{{- end }} | ||
|
||
{{/* | ||
Create chart name and version as used by the chart label. | ||
Formats "<chart name>-<chart version>", replaces every "+" in the result | ||
with "_", truncates to 63 characters and removes a trailing "-". | ||
*/}} | ||
{{- define "nvidia-gpu.chart" -}} | ||
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} | ||
{{- end }} | ||
|
||
{{/* | ||
Common labels | ||
Emits, in order: the helm.sh/chart label (from "nvidia-gpu.chart"), the | ||
selector labels (from "nvidia-gpu.selectorLabels"), the quoted | ||
app.kubernetes.io/version label only when .Chart.AppVersion is set, and the | ||
app.kubernetes.io/managed-by label taken from .Release.Service. | ||
The manifests in this chart include it under metadata.labels with "nindent 4". | ||
*/}} | ||
{{- define "nvidia-gpu.labels" -}} | ||
helm.sh/chart: {{ include "nvidia-gpu.chart" . }} | ||
{{ include "nvidia-gpu.selectorLabels" . }} | ||
{{- if .Chart.AppVersion }} | ||
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} | ||
{{- end }} | ||
app.kubernetes.io/managed-by: {{ .Release.Service }} | ||
{{- end }} | ||
|
||
{{/* | ||
Selector labels | ||
Emits app.kubernetes.io/name (from "nvidia-gpu.name") and | ||
app.kubernetes.io/instance (the release name). These two labels are also | ||
embedded in "nvidia-gpu.labels". | ||
*/}} | ||
{{- define "nvidia-gpu.selectorLabels" -}} | ||
app.kubernetes.io/name: {{ include "nvidia-gpu.name" . }} | ||
app.kubernetes.io/instance: {{ .Release.Name }} | ||
{{- end }} | ||
|
||
{{/* | ||
Create the name of the service account to use | ||
When .Values.serviceAccount.create is truthy, the name is | ||
.Values.serviceAccount.name, falling back to "nvidia-gpu.fullname". | ||
Otherwise it is .Values.serviceAccount.name, falling back to "default". | ||
NOTE(review): this helper dereferences .Values.serviceAccount; presumably the | ||
chart values must define a serviceAccount map before this helper is included - | ||
TODO confirm against values.yaml. | ||
*/}} | ||
{{- define "nvidia-gpu.serviceAccountName" -}} | ||
{{- if .Values.serviceAccount.create }} | ||
{{- default (include "nvidia-gpu.fullname" .) .Values.serviceAccount.name }} | ||
{{- else }} | ||
{{- default "default" .Values.serviceAccount.name }} | ||
{{- end }} | ||
{{- end }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
apiVersion: nvidia.com/v1 | ||
kind: ClusterPolicy | ||
metadata: | ||
name: gpu-cluster-policy | ||
labels: | ||
{{- include "nvidia-gpu.labels" . | nindent 4 }} | ||
spec: | ||
migManager: | ||
enabled: true | ||
operator: | ||
defaultRuntime: crio | ||
initContainer: {} | ||
runtimeClass: nvidia | ||
deployGFD: true | ||
dcgm: | ||
enabled: true | ||
gfd: {} | ||
dcgmExporter: | ||
config: | ||
name: '' | ||
driver: | ||
licensingConfig: | ||
nlsEnabled: false | ||
configMapName: '' | ||
certConfig: | ||
name: '' | ||
kernelModuleConfig: | ||
name: '' | ||
repoConfig: | ||
configMapName: '' | ||
virtualTopology: | ||
config: '' | ||
enabled: true | ||
use_ocp_driver_toolkit: true | ||
devicePlugin: {} | ||
mig: | ||
strategy: single | ||
validator: | ||
plugin: | ||
env: | ||
- name: WITH_WORKLOAD | ||
value: 'true' | ||
nodeStatusExporter: | ||
enabled: true | ||
daemonsets: {} | ||
toolkit: | ||
enabled: true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
kind: NodeFeatureDiscovery | ||
apiVersion: nfd.openshift.io/v1 | ||
metadata: | ||
name: nfd-instance | ||
namespace: openshift-nfd | ||
labels: | ||
{{- include "nvidia-gpu.labels" . | nindent 4 }} | ||
spec: | ||
customConfig: | ||
configData: | | ||
# - name: "more.kernel.features" | ||
# matchOn: | ||
# - loadedKMod: ["example_kmod3"] | ||
# - name: "more.features.by.nodename" | ||
# value: customValue | ||
# matchOn: | ||
# - nodename: ["special-.*-node-.*"] | ||
operand: | ||
image: >- | ||
registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:07658ef3df4b264b02396e67af813a52ba416b47ab6e1d2d08025a350ccd2b7b | ||
servicePort: 12000 | ||
workerConfig: | ||
configData: | | ||
core: | ||
# labelWhiteList: | ||
# noPublish: false | ||
sleepInterval: 60s | ||
# sources: [all] | ||
# klog: | ||
# addDirHeader: false | ||
# alsologtostderr: false | ||
# logBacktraceAt: | ||
# logtostderr: true | ||
# skipHeaders: false | ||
# stderrthreshold: 2 | ||
# v: 0 | ||
# vmodule: | ||
## NOTE: the following options are not dynamically run-time | ||
## configurable and require a nfd-worker restart to take effect | ||
## after being changed | ||
# logDir: | ||
# logFile: | ||
# logFileMaxSize: 1800 | ||
# skipLogHeaders: false | ||
sources: | ||
# cpu: | ||
# cpuid: | ||
## NOTE: whitelist has priority over blacklist | ||
# attributeBlacklist: | ||
# - "BMI1" | ||
# - "BMI2" | ||
# - "CLMUL" | ||
# - "CMOV" | ||
# - "CX16" | ||
# - "ERMS" | ||
# - "F16C" | ||
# - "HTT" | ||
# - "LZCNT" | ||
# - "MMX" | ||
# - "MMXEXT" | ||
# - "NX" | ||
# - "POPCNT" | ||
# - "RDRAND" | ||
# - "RDSEED" | ||
# - "RDTSCP" | ||
# - "SGX" | ||
# - "SSE" | ||
# - "SSE2" | ||
# - "SSE3" | ||
# - "SSE4.1" | ||
# - "SSE4.2" | ||
# - "SSSE3" | ||
# attributeWhitelist: | ||
# kernel: | ||
# kconfigFile: "/path/to/kconfig" | ||
# configOpts: | ||
# - "NO_HZ" | ||
# - "X86" | ||
# - "DMI" | ||
pci: | ||
deviceClassWhitelist: | ||
- "0200" | ||
- "03" | ||
- "12" | ||
deviceLabelFields: | ||
# - "class" | ||
- "vendor" | ||
# - "device" | ||
# - "subsystem_vendor" | ||
# - "subsystem_device" | ||
# usb: | ||
# deviceClassWhitelist: | ||
# - "0e" | ||
# - "ef" | ||
# - "fe" | ||
# - "ff" | ||
# deviceLabelFields: | ||
# - "class" | ||
# - "vendor" | ||
# - "device" | ||
# custom: | ||
# - name: "my.kernel.feature" | ||
# matchOn: | ||
# - loadedKMod: ["example_kmod1", "example_kmod2"] | ||
# - name: "my.pci.feature" | ||
# matchOn: | ||
# - pciId: | ||
# class: ["0200"] | ||
# vendor: ["15b3"] | ||
# device: ["1014", "1017"] | ||
# - pciId : | ||
# vendor: ["8086"] | ||
# device: ["1000", "1100"] | ||
# - name: "my.usb.feature" | ||
# matchOn: | ||
# - usbId: | ||
# class: ["ff"] | ||
# vendor: ["03e7"] | ||
# device: ["2485"] | ||
# - usbId: | ||
# class: ["fe"] | ||
# vendor: ["1a6e"] | ||
# device: ["089a"] | ||
# - name: "my.combined.feature" | ||
# matchOn: | ||
# - pciId: | ||
# vendor: ["15b3"] | ||
# device: ["1014", "1017"] | ||
# loadedKMod : ["vendor_kmod1", "vendor_kmod2"] |
Oops, something went wrong.