From f2674669265a56a8ecaed3d0147f319ee136a231 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 1 May 2024 10:30:41 +0300
Subject: [PATCH] Fix MIG GPU support

---
 clearml/utilities/gpu/gpustat.py      | 57 ++++++++++++++++++++++++---
 clearml/utilities/resource_monitor.py | 40 ++++++++++++++++---
 2 files changed, 85 insertions(+), 12 deletions(-)

diff --git a/clearml/utilities/gpu/gpustat.py b/clearml/utilities/gpu/gpustat.py
index f84d73bb..9bab7fef 100644
--- a/clearml/utilities/gpu/gpustat.py
+++ b/clearml/utilities/gpu/gpustat.py
@@ -56,6 +56,21 @@ def uuid(self):
         """
         return self.entry['uuid']
 
+    @property
+    def mig_index(self):
+        """
+        Returns the index of the MIG partition (as in nvidia-smi).
+        """
+        return self.entry.get("mig_index")
+
+    @property
+    def mig_uuid(self):
+        """
+        Returns the uuid of the MIG partition returned by nvidia-smi when running in MIG mode,
+        e.g. MIG-12345678-abcd-abcd-uuid-123456abcdef
+        """
+        return self.entry.get("mig_uuid")
+
     @property
     def name(self):
         """
@@ -160,6 +175,7 @@ class GPUStatCollection(object):
     _initialized = False
     _device_count = None
     _gpu_device_info = {}
+    _mig_device_info = {}
 
     def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
         self.gpus = gpu_list
@@ -190,7 +206,7 @@ def _decode(b):
                 return b.decode()  # for python3, to unicode
             return b
 
-        def get_gpu_info(index, handle):
+        def get_gpu_info(index, handle, is_mig=False):
             """Get one GPU information specified by nvml handle"""
 
             def get_process_info(nv_process):
@@ -226,12 +242,13 @@ def get_process_info(nv_process):
                     pass
                 return process
 
-            if not GPUStatCollection._gpu_device_info.get(index):
+            device_info = GPUStatCollection._mig_device_info if is_mig else GPUStatCollection._gpu_device_info
+            if not device_info.get(index):
                 name = _decode(N.nvmlDeviceGetName(handle))
                 uuid = _decode(N.nvmlDeviceGetUUID(handle))
-                GPUStatCollection._gpu_device_info[index] = (name, uuid)
+                device_info[index] = (name, uuid)
 
-            name, uuid = GPUStatCollection._gpu_device_info[index]
+            name, uuid = device_info[index]
 
             try:
                 temperature = N.nvmlDeviceGetTemperature(
@@ -327,8 +344,36 @@ def get_process_info(nv_process):
         for index in range(GPUStatCollection._device_count):
             handle = N.nvmlDeviceGetHandleByIndex(index)
             gpu_info = get_gpu_info(index, handle)
-            gpu_stat = GPUStat(gpu_info)
-            gpu_list.append(gpu_stat)
+            mig_cnt = 0
+            # noinspection PyBroadException
+            try:
+                mig_cnt = N.nvmlDeviceGetMaxMigDeviceCount(handle)
+            except Exception:
+                pass
+
+            if mig_cnt <= 0:
+                gpu_list.append(GPUStat(gpu_info))
+                continue
+
+            got_mig_info = False
+            for mig_index in range(mig_cnt):
+                try:
+                    mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
+                    mig_info = get_gpu_info(mig_index, mig_handle, is_mig=True)
+                    mig_info["mig_name"] = mig_info["name"]
+                    mig_info["name"] = gpu_info["name"]
+                    mig_info["mig_index"] = mig_info["index"]
+                    mig_info["mig_uuid"] = mig_info["uuid"]
+                    mig_info["index"] = gpu_info["index"]
+                    mig_info["uuid"] = gpu_info["uuid"]
+                    mig_info["temperature.gpu"] = gpu_info["temperature.gpu"]
+                    mig_info["fan.speed"] = gpu_info["fan.speed"]
+                    gpu_list.append(GPUStat(mig_info))
+                    got_mig_info = True
+                except Exception as e:
+                    pass
+            if not got_mig_info:
+                gpu_list.append(GPUStat(gpu_info))
 
         # 2. additional info (driver version, etc).
         if get_driver_info:
diff --git a/clearml/utilities/resource_monitor.py b/clearml/utilities/resource_monitor.py
index 84e1bf34..ad855cce 100644
--- a/clearml/utilities/resource_monitor.py
+++ b/clearml/utilities/resource_monitor.py
@@ -12,6 +12,7 @@
 from .process.mp import BackgroundMonitor
 from ..backend_api import Session
 from ..binding.frameworks.tensorflow_bind import IsTensorboardInit
+from ..config import config
 
 try:
     from .gpu import gpustat
@@ -46,6 +47,11 @@ def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
         self._last_process_pool = {}
         self._last_process_id_list = []
         self._gpu_memory_per_process = True
+        self._default_gpu_utilization = config.get("resource_monitoring.default_gpu_utilization", 100)
+        # allow default_gpu_utilization as null in the config, in which case we don't log anything
+        if self._default_gpu_utilization is not None:
+            self._default_gpu_utilization = int(self._default_gpu_utilization)
+        self._gpu_utilization_warning_sent = False
 
         # noinspection PyBroadException
         try:
@@ -314,13 +320,18 @@ def mem_usage_children(a_mem_size, pr, parent_mem=None):
 
         return mem_size
 
-    def _skip_nonactive_gpu(self, idx, gpu):
+    def _skip_nonactive_gpu(self, gpu):
         if not self._active_gpus:
             return False
         # noinspection PyBroadException
         try:
             uuid = getattr(gpu, "uuid", None)
-            return str(idx) not in self._active_gpus and (not uuid or uuid not in self._active_gpus)
+            mig_uuid = getattr(gpu, "mig_uuid", None)
+            return (
+                str(gpu.index) not in self._active_gpus
+                and (not uuid or uuid not in self._active_gpus)
+                and (not mig_uuid or mig_uuid not in self._active_gpus)
+            )
         except Exception:
             pass
         return False
@@ -349,7 +360,7 @@ def _get_gpu_stats(self):
                     self._gpu_memory_per_process = False
                     break
                 # only monitor the active gpu's, if none were selected, monitor everything
-                if self._skip_nonactive_gpu(i, g):
+                if self._skip_nonactive_gpu(g):
                     continue
 
                 gpu_mem[i] = 0
@@ -369,10 +380,27 @@ def _get_gpu_stats(self):
 
         for i, g in enumerate(gpu_stat.gpus):
             # only monitor the active gpu's, if none were selected, monitor everything
-            if self._skip_nonactive_gpu(i, g):
+            if self._skip_nonactive_gpu(g):
                 continue
             stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
-            stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
+            if g["utilization.gpu"] is not None:
+                stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
+            else:
+                stats["gpu_%d_utilization" % i] = self._default_gpu_utilization
+                if not self._gpu_utilization_warning_sent:
+                    if g.mig_index is not None:
+                        self._task.get_logger().report_text(
+                            "Running inside MIG, Nvidia driver cannot export utilization, pushing fixed value {}".format(  # noqa
+                                self._default_gpu_utilization
+                            )
+                        )
+                    else:
+                        self._task.get_logger().report_text(
+                            "Nvidia driver cannot export utilization, pushing fixed value {}".format(
+                                self._default_gpu_utilization
+                            )
+                        )
+                    self._gpu_utilization_warning_sent = True
             stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
             # already in MBs
             stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
@@ -400,7 +428,7 @@ def _get_machine_specs(self):
             if self._gpustat:
                 gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
                 if gpu_stat.gpus:
-                    gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(i, g)]
+                    gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(g)]
                     specs.update(
                         gpu_count=int(len(gpus)),
                         gpu_type=', '.join(g.name for g in gpus),
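
Note: the MIG enumeration pattern introduced in gpustat.py above can be exercised on its own with the NVML Python bindings (pynvml / nvidia-ml-py), which is the library gpustat wraps as `N`. The script below is a minimal, hypothetical sketch and not part of the patch; it assumes pynvml is installed and mirrors the patch's approach of treating MIG partitions as child devices of the parent GPU.

# Standalone sketch (assumption: pynvml a.k.a. nvidia-ml-py is installed; MIG-enabled driver optional)
import pynvml as N


def list_devices_with_mig():
    N.nvmlInit()
    try:
        for index in range(N.nvmlDeviceGetCount()):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            name = N.nvmlDeviceGetName(handle)
            name = name.decode() if isinstance(name, bytes) else name  # older bindings return bytes
            # MIG partitions are exposed as child devices of the parent GPU
            try:
                mig_count = N.nvmlDeviceGetMaxMigDeviceCount(handle)
            except Exception:
                mig_count = 0  # driver/GPU without MIG support
            if mig_count <= 0:
                print(index, name)
                continue
            for mig_index in range(mig_count):
                # unused MIG slots raise, hence the broad try/except (same as in the patch)
                try:
                    mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
                except Exception:
                    continue
                mig_uuid = N.nvmlDeviceGetUUID(mig_handle)
                mig_uuid = mig_uuid.decode() if isinstance(mig_uuid, bytes) else mig_uuid
                # MIG UUIDs look like "MIG-<uuid>" and match the mig_uuid checked by _skip_nonactive_gpu
                print(index, name, mig_index, mig_uuid)
    finally:
        N.nvmlShutdown()


if __name__ == "__main__":
    list_devices_with_mig()

The fixed utilization value itself comes from the new resource_monitoring.default_gpu_utilization configuration key (default 100); per the inline comment in __init__, it may be set to null to skip the fixed-value report when the driver cannot export utilization for MIG partitions.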