Skip to content

Commit

Permalink
Merge pull request #25 from rh-mobb/add-nvidia-gpu
Browse files Browse the repository at this point in the history
add nvidia gpu support
  • Loading branch information
paulczar authored Jul 14, 2022
2 parents 97b96a0 + 8ea57f0 commit 3adf4f4
Show file tree
Hide file tree
Showing 9 changed files with 458 additions and 0 deletions.
23 changes: 23 additions & 0 deletions charts/nvidia-gpu/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
9 changes: 9 additions & 0 deletions charts/nvidia-gpu/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: v2
name: nvidia-gpu
description: A Helm chart for Kubernetes
# Chart homepage (top-level field).
home: https://github.com/rh-mobb/helm-charts
maintainers:
  # Valid maintainer keys are name, email and url — `home` is not a
  # maintainer field, so the repository link is given as `url` here.
  - name: paulczar
    url: https://github.com/rh-mobb/helm-charts
type: application
version: 0.1.0
# Quoted so the version is parsed as a string, not a float.
appVersion: "1.16.0"
84 changes: 84 additions & 0 deletions charts/nvidia-gpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Helm Chart to set up NVIDIA GPU nodes


## Prerequisites

* An ARO / ROSA cluster
* Helm CLI

## Prepare Environment

1. Create namespaces

```bash
oc create namespace openshift-nfd
oc create namespace nvidia-gpu-operator
```

1. Add the MOBB chart repository to your Helm

```bash
helm repo add mobb https://rh-mobb.github.io/helm-charts/
```

1. Update your repositories

```bash
helm repo update
```

1. Use the `mobb/operatorhub` chart to deploy the needed operators

```bash
helm upgrade -n nvidia-gpu-operator nvidia-gpu-operator \
mobb/operatorhub --install \
--values https://raw.githubusercontent.com/rh-mobb/helm-charts/main/charts/nvidia-gpu/files/operatorhub.yaml
```

1. Wait until the two operators are running

```bash
watch kubectl get pods -n openshift-nfd
```

```
NAME READY STATUS RESTARTS AGE
nfd-controller-manager-7b66c67bd9-rk98w 2/2 Running 0 47s
```

```bash
watch kubectl get pods -n nvidia-gpu-operator
```

```
kubectl get pods -n nvidia-gpu-operator
NAME READY STATUS RESTARTS AGE
gpu-operator-5d8cb7dd5f-c4ljk 1/1 Running 0 87s
```

## Deploy the Helm Chart

1. Install the `mobb/nvidia-gpu` chart

```bash
helm upgrade --install -n nvidia-gpu-operator nvidia-gpu \
mobb/nvidia-gpu --disable-openapi-validation
```

1. Validate that NFD can see the GPU(s)

```bash
oc describe node | egrep 'Roles|pci-10de' | grep -v master
```

You should see output like:

```
Roles: worker
feature.node.kubernetes.io/pci-10de.present=true
```

1. Verify the GPUs are available on the host

```bash
oc project nvidia-gpu-operator
for i in $(oc get pod -lopenshift.driver-toolkit=true --no-headers |awk '{print $1}'); do echo $i; oc exec -it $i -- nvidia-smi ; echo -e '\n' ; done
```
21 changes: 21 additions & 0 deletions charts/nvidia-gpu/files/operatorhub.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Values for the mobb/operatorhub chart: operators required before the
# nvidia-gpu chart can be installed (see charts/nvidia-gpu/README.md).
subscriptions:
  # NVIDIA GPU operator from the certified-operators catalog.
  - name: gpu-operator-certified
    channel: v1.11
    installPlanApproval: Automatic
    source: certified-operators
    sourceNamespace: openshift-marketplace
    namespace: nvidia-gpu-operator
  # Node Feature Discovery operator from the redhat-operators catalog.
  - name: nfd
    channel: stable
    installPlanApproval: Automatic
    source: redhat-operators
    sourceNamespace: openshift-marketplace
    namespace: openshift-nfd

operatorGroups:
  - name: nvidia-gpu-operator-group
    namespace: nvidia-gpu-operator
    targetNamespace: nvidia-gpu-operator
  - name: openshift-nfd
    namespace: openshift-nfd
    # Explicit null (canonical form, instead of `~`): no target namespace
    # is set for this OperatorGroup.
    targetNamespace: null
Empty file.
62 changes: 62 additions & 0 deletions charts/nvidia-gpu/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
Uses .Values.nameOverride when set, otherwise .Chart.Name; the result is
truncated to 63 characters (Kubernetes resource-name limit) and any
trailing "-" left by the truncation is stripped.
*/}}
{{- define "nvidia-gpu.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
Resolution order:
  1. .Values.fullnameOverride, verbatim, when set;
  2. .Release.Name alone, when it already contains the chart name;
  3. "<release-name>-<chart-name>" otherwise.
Every result is truncated to 63 characters (the DNS-label limit that
applies to many Kubernetes name fields) and stripped of a trailing "-".
*/}}
{{- define "nvidia-gpu.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create the chart name and version as used by the helm.sh/chart label.
"+" (SemVer build-metadata separator) is not permitted in label values,
so it is replaced with "_".
*/}}
{{- define "nvidia-gpu.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels applied to resources rendered by this chart: the selector
labels plus chart, app-version and managed-by metadata.
app.kubernetes.io/version is emitted only when .Chart.AppVersion is set.
*/}}
{{- define "nvidia-gpu.labels" -}}
helm.sh/chart: {{ include "nvidia-gpu.chart" . }}
{{ include "nvidia-gpu.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels: the minimal, stable label set (name + release instance)
suitable for use in immutable selectors.
*/}}
{{- define "nvidia-gpu.selectorLabels" -}}
app.kubernetes.io/name: {{ include "nvidia-gpu.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Create the name of the service account to use.
When .Values.serviceAccount.create is true, defaults to the chart
fullname (overridable via .Values.serviceAccount.name); otherwise falls
back to the "default" service account.
NOTE(review): this helper reads .Values.serviceAccount.* but the chart's
values.yaml appears to be empty, so rendering a template that calls this
helper would fail on the missing key — confirm before using it.
*/}}
{{- define "nvidia-gpu.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "nvidia-gpu.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
47 changes: 47 additions & 0 deletions charts/nvidia-gpu/templates/clusterpolicy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# ClusterPolicy consumed by the NVIDIA GPU operator: enables the driver,
# toolkit, device plugin, DCGM monitoring, MIG manager and validator.
# Indentation reconstructed to the structure documented for the operator.
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
  labels:
    {{- include "nvidia-gpu.labels" . | nindent 4 }}
spec:
  migManager:
    enabled: true
  operator:
    # CRI-O is the container runtime used on OpenShift clusters.
    defaultRuntime: crio
    initContainer: {}
    runtimeClass: nvidia
    deployGFD: true
  dcgm:
    enabled: true
  gfd: {}
  dcgmExporter:
    config:
      name: ''
  driver:
    licensingConfig:
      nlsEnabled: false
      configMapName: ''
    certConfig:
      name: ''
    kernelModuleConfig:
      name: ''
    repoConfig:
      configMapName: ''
    virtualTopology:
      config: ''
    enabled: true
    # Build driver containers with the OpenShift driver toolkit
    # (the README validates via pods labelled openshift.driver-toolkit=true).
    use_ocp_driver_toolkit: true
  devicePlugin: {}
  mig:
    strategy: single
  validator:
    plugin:
      env:
        # Quoted so the value stays a string, not a YAML boolean.
        - name: WITH_WORKLOAD
          value: 'true'
  nodeStatusExporter:
    enabled: true
  daemonsets: {}
  toolkit:
    enabled: true
129 changes: 129 additions & 0 deletions charts/nvidia-gpu/templates/nfd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# NodeFeatureDiscovery instance for the NFD operator: labels nodes with
# hardware features so the GPU operator can target GPU nodes
# (the README checks for the feature.node.kubernetes.io/pci-10de label).
# Indentation reconstructed per the documented NFD operand config layout.
kind: NodeFeatureDiscovery
apiVersion: nfd.openshift.io/v1
metadata:
  name: nfd-instance
  # Deployed into the NFD operator's namespace, not the release namespace.
  namespace: openshift-nfd
  labels:
    {{- include "nvidia-gpu.labels" . | nindent 4 }}
spec:
  # Optional user-defined feature rules; examples left commented out.
  customConfig:
    configData: |
      #    - name: "more.kernel.features"
      #      matchOn:
      #      - loadedKMod: ["example_kmod3"]
      #    - name: "more.features.by.nodename"
      #      value: customValue
      #      matchOn:
      #      - nodename: ["special-.*-node-.*"]
  operand:
    # Operand image pinned by digest.
    image: >-
      registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:07658ef3df4b264b02396e67af813a52ba416b47ab6e1d2d08025a350ccd2b7b
    servicePort: 12000
  workerConfig:
    configData: |
      core:
      #  labelWhiteList:
      #  noPublish: false
        sleepInterval: 60s
      #  sources: [all]
      #  klog:
      #    addDirHeader: false
      #    alsologtostderr: false
      #    logBacktraceAt:
      #    logtostderr: true
      #    skipHeaders: false
      #    stderrthreshold: 2
      #    v: 0
      #    vmodule:
      ##   NOTE: the following options are not dynamically run-time
      ##         configurable and require a nfd-worker restart to take effect
      ##         after being changed
      #    logDir:
      #    logFile:
      #    logFileMaxSize: 1800
      #    skipLogHeaders: false
      sources:
      #  cpu:
      #    cpuid:
      ##     NOTE: whitelist has priority over blacklist
      #      attributeBlacklist:
      #        - "BMI1"
      #        - "BMI2"
      #        - "CLMUL"
      #        - "CMOV"
      #        - "CX16"
      #        - "ERMS"
      #        - "F16C"
      #        - "HTT"
      #        - "LZCNT"
      #        - "MMX"
      #        - "MMXEXT"
      #        - "NX"
      #        - "POPCNT"
      #        - "RDRAND"
      #        - "RDSEED"
      #        - "RDTSCP"
      #        - "SGX"
      #        - "SSE"
      #        - "SSE2"
      #        - "SSE3"
      #        - "SSE4.1"
      #        - "SSE4.2"
      #        - "SSSE3"
      #      attributeWhitelist:
      #  kernel:
      #    kconfigFile: "/path/to/kconfig"
      #    configOpts:
      #      - "NO_HZ"
      #      - "X86"
      #      - "DMI"
        # Label nodes by PCI device class; class 03 covers display
        # controllers (GPUs), labelled by vendor (NVIDIA = 10de).
        pci:
          deviceClassWhitelist:
            - "0200"
            - "03"
            - "12"
          deviceLabelFields:
      #      - "class"
            - "vendor"
      #      - "device"
      #      - "subsystem_vendor"
      #      - "subsystem_device"
      #  usb:
      #    deviceClassWhitelist:
      #      - "0e"
      #      - "ef"
      #      - "fe"
      #      - "ff"
      #    deviceLabelFields:
      #      - "class"
      #      - "vendor"
      #      - "device"
      #  custom:
      #    - name: "my.kernel.feature"
      #      matchOn:
      #        - loadedKMod: ["example_kmod1", "example_kmod2"]
      #    - name: "my.pci.feature"
      #      matchOn:
      #        - pciId:
      #            class: ["0200"]
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #        - pciId:
      #            vendor: ["8086"]
      #            device: ["1000", "1100"]
      #    - name: "my.usb.feature"
      #      matchOn:
      #        - usbId:
      #            class: ["ff"]
      #            vendor: ["03e7"]
      #            device: ["2485"]
      #        - usbId:
      #            class: ["fe"]
      #            vendor: ["1a6e"]
      #            device: ["089a"]
      #    - name: "my.combined.feature"
      #      matchOn:
      #        - pciId:
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #          loadedKMod: ["vendor_kmod1", "vendor_kmod2"]
Loading

0 comments on commit 3adf4f4

Please sign in to comment.