Skip to content

Commit

Permalink
NAS-130121 / 24.10 / Add support for GPU ref (#14071)
Browse files Browse the repository at this point in the history
* Add type hints to the NVIDIA GPU utility functions

* Add util function which normalizes gpu details and gives us the data we need to expose gpu resources

* Add public endpoint which exposes gpus which can be consumed by apps

* Add GPU choices to the questions context

* Pass the update flag as true when normalizing and validating the config during an app update

* Properly get normalized gpus for apps

* Normalize gpu ref value

* Fix conditional for non-nvidia gpus
  • Loading branch information
sonicaj authored Jul 24, 2024
1 parent 05dae0f commit dd7b62c
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/middlewared/middlewared/plugins/apps/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def update_internal(self, job, app, data, progress_keyword='Update'):
)

new_values = self.middleware.call_sync(
'app.schema.normalize_and_validate_values', app_version_details, config, False,
'app.schema.normalize_and_validate_values', app_version_details, config, True,
get_installed_app_path(app_name), app
)

Expand Down
27 changes: 26 additions & 1 deletion src/middlewared/middlewared/plugins/apps/resources.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from middlewared.schema import accepts, Dict, Int, List, Ref, returns, Str
from middlewared.service import Service
from middlewared.service import private, Service

from middlewared.utils.gpu import get_nvidia_gpus

from .resources_utils import get_normalized_gpu_choices


class AppService(Service):
Expand Down Expand Up @@ -52,3 +56,24 @@ async def ip_choices(self):
ip['address']: ip['address']
for ip in await self.middleware.call('interface.ip_in_use', {'static': True, 'any': True})
}

@accepts()
@returns(Dict('gpu_choices', additional_attrs=True))
async def gpu_choices(self):
    """
    Returns GPU choices which can be used by applications.

    The result maps each GPU's description to a dictionary holding its
    `vendor`, `description` and `vendor_specific_config`. GPUs whose
    normalized entry carries an error are left out.
    """
    exposed_keys = ('vendor', 'description', 'vendor_specific_config')
    choices = {}
    for gpu in await self.gpu_choices_internal():
        if gpu['error']:
            # Entries flagged with an error are not offered as choices.
            continue

        # NOTE(review): entries are keyed by description, so two GPUs that share
        # a description end up as a single entry (the later one wins).
        label = gpu['description']
        choices[label] = {key: gpu[key] for key in exposed_keys}

    return choices

@private
async def gpu_choices_internal(self):
    """
    Return the normalized list of GPU entries, including entries flagged with an error.

    Combines the result of `device.get_gpus` with the output of `get_nvidia_gpus`
    (executed through `run_in_thread`) via `get_normalized_gpu_choices`.
    """
    detected_gpus = await self.middleware.call('device.get_gpus')
    nvidia_details = await self.middleware.run_in_thread(get_nvidia_gpus)
    return get_normalized_gpu_choices(detected_gpus, nvidia_details)
56 changes: 56 additions & 0 deletions src/middlewared/middlewared/plugins/apps/resources_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
def get_gpu_base_dict() -> dict:
    """
    Build a fresh template for a normalized GPU entry.

    Every call returns a new dictionary (with new nested dictionaries), so
    callers can mutate the result freely without affecting other entries.
    """
    template = dict(
        vendor='',
        description='',
        error=None,
        vendor_specific_config={},
        gpu_details={},
        pci_slot=None,
    )
    return template


def get_normalized_gpu_choices(all_gpus_info: list[dict], nvidia_gpus: dict) -> list[dict]:
    """
    Normalize raw GPU information into one entry per PCI slot.

    Each entry starts from `get_gpu_base_dict()` and is filled with the vendor,
    description, the raw GPU information (`gpu_details`) and the PCI slot. For
    NVIDIA GPUs the matching entry of `nvidia_gpus` (looked up by PCI slot)
    supplies the UUID and, when it has one, the model used as description.

    An entry's `error` is set when:
      * an NVIDIA GPU has no matching entry in `nvidia_gpus`,
      * its `gpu_uuid` is missing/empty or contains a `?`,
      * the GPU is not available to the host.

    :param all_gpus_info: GPU dictionaries; each must provide `addr.pci_slot`,
        `vendor`, `description` and `available_to_host`.
    :param nvidia_gpus: NVIDIA details keyed by PCI slot.
    :return: list of normalized GPU entries, including the ones with an error.
    """
    # Index by PCI slot up front; if a slot shows up twice the last entry wins.
    by_slot = {}
    for entry in all_gpus_info:
        by_slot[entry['addr']['pci_slot']] = entry

    normalized = []
    for slot, details in by_slot.items():
        record = {
            **get_gpu_base_dict(),
            'vendor': details['vendor'],
            'description': details['description'],
            'gpu_details': details,
            'pci_slot': slot,
        }
        # Appended right away so entries which bail out early with an error are still reported.
        normalized.append(record)

        if details['vendor'] == 'NVIDIA':
            if slot not in nvidia_gpus:
                record['error'] = 'Unable to locate GPU details from procfs'
                continue

            nvidia_entry = nvidia_gpus[slot]
            uuid = nvidia_entry.get('gpu_uuid')
            if not uuid:
                problem = 'GPU UUID not found'
            elif '?' in uuid:
                problem = 'Malformed GPU UUID found'
            else:
                problem = None

            if problem:
                record['error'] = problem
                record['nvidia_gpu_details'] = nvidia_entry
                continue

            record['vendor_specific_config'] = {'uuid': uuid}
            # Prefer the NVIDIA reported model, keep the generic description otherwise.
            record['description'] = nvidia_entry.get('model') or record['description']

        if not details['available_to_host']:
            record['error'] = 'GPU not available to host'

    return normalized
15 changes: 15 additions & 0 deletions src/middlewared/middlewared/plugins/apps/schema_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# Maps a schema reference name to the key of the normalizer responsible for it.
# The values line up with the suffixes of the `normalize_<key>` coroutines in this
# module (e.g. `normalize_gpu_configuration`, `normalize_ix_volume`) - presumably
# they are dispatched by name; TODO confirm against the caller.
REF_MAPPING = {
'definitions/certificate': 'certificate',
'definitions/certificate_authority': 'certificate_authorities',
'definitions/gpu_configuration': 'gpu_configuration',
'normalize/acl': 'acl',
'normalize/ix_volume': 'ix_volume',
}
Expand Down Expand Up @@ -103,6 +104,20 @@ async def normalize_certificate_authorities(self, attr, value, complete_config,

return value

async def normalize_gpu_configuration(self, attr, value, complete_config, context):
    """
    Reconcile a GPU configuration value with the GPUs which are currently usable.

    Usable GPUs are the entries of `app.gpu_choices_internal` without an error.
    `use_all_gpus` is forced to `False` when none of the usable GPUs is a
    non-NVIDIA one, and every key of `nvidia_gpu_selection` which does not refer
    to a usable NVIDIA GPU is removed. `value` is modified in place and returned.
    """
    usable = {}
    for gpu in await self.middleware.call('app.gpu_choices_internal'):
        if not gpu['error']:
            usable[gpu['pci_slot']] = gpu

    # Also true when there are no usable GPUs at all.
    if all(gpu['vendor'] == 'NVIDIA' for gpu in usable.values()):
        value['use_all_gpus'] = False

    selection = value['nvidia_gpu_selection']
    # Collect the stale keys first so the selection is not modified while it is being iterated.
    stale_slots = [
        slot for slot in selection
        if slot not in usable or usable[slot]['vendor'] != 'NVIDIA'
    ]
    for slot in stale_slots:
        selection.pop(slot)

    return value

async def normalize_ix_volume(self, attr, value, complete_config, context):
# Let's allow ix volume attr to be a string as well making it easier to define a volume in questions.yaml
assert isinstance(attr, (Dict, Str)) is True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ async def get_normalized_questions_context(self):
'certificates': await self.middleware.call('app.certificate_choices'),
'certificate_authorities': await self.middleware.call('app.certificate_authority_choices'),
'ip_choices': await self.middleware.call('app.ip_choices'),
'gpu_choices': await self.middleware.call('app.gpu_choices_internal'),
}

@private
Expand Down
7 changes: 4 additions & 3 deletions src/middlewared/middlewared/utils/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
import subprocess
from typing import TextIO

import pyudev

Expand All @@ -13,9 +14,9 @@
RE_PCI_ADDR = re.compile(r'(?P<domain>.*):(?P<bus>.*):(?P<slot>.*)\.')


def parse_nvidia_info_file(fileobj):
def parse_nvidia_info_file(file_obj: TextIO) -> tuple[dict, str]:
gpu, bus_loc = dict(), None
for line in fileobj:
for line in file_obj:
k, v = line.split(':', 1)
k, v = k.strip().lower().replace(' ', '_'), v.strip()
gpu[k] = v
Expand All @@ -24,7 +25,7 @@ def parse_nvidia_info_file(fileobj):
return gpu, bus_loc


def get_nvidia_gpus():
def get_nvidia_gpus() -> dict[str, dict]:
"""Don't be so complicated. Return basic information about
NVIDIA devices (if any) that are connected."""
gpus = dict()
Expand Down

0 comments on commit dd7b62c

Please sign in to comment.