Fix MIG GPU support
allegroai committed May 1, 2024
1 parent 7a4154f commit f267466
Showing 2 changed files with 85 additions and 12 deletions.
57 changes: 51 additions & 6 deletions clearml/utilities/gpu/gpustat.py
@@ -56,6 +56,21 @@ def uuid(self):
"""
return self.entry['uuid']

@property
def mig_index(self):
"""
Returns the index of the MIG partition (as in nvidia-smi).
"""
return self.entry.get("mig_index")

@property
def mig_uuid(self):
"""
Returns the uuid of the MIG partition returned by nvidia-smi when running in MIG mode,
e.g. MIG-12345678-abcd-abcd-uuid-123456abcdef
"""
return self.entry.get("mig_uuid")

@property
def name(self):
"""
@@ -160,6 +175,7 @@ class GPUStatCollection(object):
_initialized = False
_device_count = None
_gpu_device_info = {}
_mig_device_info = {}

def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
self.gpus = gpu_list
@@ -190,7 +206,7 @@ def _decode(b):
return b.decode() # for python3, to unicode
return b

def get_gpu_info(index, handle):
def get_gpu_info(index, handle, is_mig=False):
"""Get one GPU information specified by nvml handle"""

def get_process_info(nv_process):
@@ -226,12 +242,13 @@ def get_process_info(nv_process):
pass
return process

if not GPUStatCollection._gpu_device_info.get(index):
device_info = GPUStatCollection._mig_device_info if is_mig else GPUStatCollection._gpu_device_info
if not device_info.get(index):
name = _decode(N.nvmlDeviceGetName(handle))
uuid = _decode(N.nvmlDeviceGetUUID(handle))
GPUStatCollection._gpu_device_info[index] = (name, uuid)
device_info[index] = (name, uuid)

name, uuid = GPUStatCollection._gpu_device_info[index]
name, uuid = device_info[index]

try:
temperature = N.nvmlDeviceGetTemperature(
@@ -327,8 +344,36 @@ def get_process_info(nv_process):
for index in range(GPUStatCollection._device_count):
handle = N.nvmlDeviceGetHandleByIndex(index)
gpu_info = get_gpu_info(index, handle)
gpu_stat = GPUStat(gpu_info)
gpu_list.append(gpu_stat)
mig_cnt = 0
# noinspection PyBroadException
try:
mig_cnt = N.nvmlDeviceGetMaxMigDeviceCount(handle)
except Exception:
pass

if mig_cnt <= 0:
gpu_list.append(GPUStat(gpu_info))
continue

got_mig_info = False
for mig_index in range(mig_cnt):
try:
mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
mig_info = get_gpu_info(mig_index, mig_handle, is_mig=True)
mig_info["mig_name"] = mig_info["name"]
mig_info["name"] = gpu_info["name"]
mig_info["mig_index"] = mig_info["index"]
mig_info["mig_uuid"] = mig_info["uuid"]
mig_info["index"] = gpu_info["index"]
mig_info["uuid"] = gpu_info["uuid"]
mig_info["temperature.gpu"] = gpu_info["temperature.gpu"]
mig_info["fan.speed"] = gpu_info["fan.speed"]
gpu_list.append(GPUStat(mig_info))
got_mig_info = True
except Exception as e:
pass
if not got_mig_info:
gpu_list.append(GPUStat(gpu_info))

# 2. additional info (driver version, etc).
if get_driver_info:
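The core of the gpustat.py change above is the NVML pair nvmlDeviceGetMaxMigDeviceCount / nvmlDeviceGetMigDeviceHandleByIndex. Below is a minimal standalone sketch of the same enumeration, assuming the public pynvml bindings rather than the copy bundled with clearml; illustrative only, not part of the commit.

import pynvml as N  # assumption: standalone NVML bindings; the commit uses clearml's bundled copy

N.nvmlInit()
try:
    for index in range(N.nvmlDeviceGetCount()):
        handle = N.nvmlDeviceGetHandleByIndex(index)
        try:
            mig_count = N.nvmlDeviceGetMaxMigDeviceCount(handle)
        except Exception:
            mig_count = 0  # GPU or driver without MIG support
        for mig_index in range(mig_count):
            try:
                mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
            except Exception:
                continue  # MIG slot not configured
            uuid = N.nvmlDeviceGetUUID(mig_handle)
            uuid = uuid.decode() if isinstance(uuid, bytes) else uuid  # older bindings return bytes
            print("GPU %d / MIG %d: %s" % (index, mig_index, uuid))
finally:
    N.nvmlShutdown()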
40 changes: 34 additions & 6 deletions clearml/utilities/resource_monitor.py
@@ -12,6 +12,7 @@
from .process.mp import BackgroundMonitor
from ..backend_api import Session
from ..binding.frameworks.tensorflow_bind import IsTensorboardInit
from ..config import config

try:
from .gpu import gpustat
@@ -46,6 +47,11 @@ def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
self._last_process_pool = {}
self._last_process_id_list = []
self._gpu_memory_per_process = True
self._default_gpu_utilization = config.get("resource_monitoring.default_gpu_utilization", 100)
# allow default_gpu_utilization as null in the config, in which case we don't log anything
if self._default_gpu_utilization is not None:
self._default_gpu_utilization = int(self._default_gpu_utilization)
self._gpu_utilization_warning_sent = False

# noinspection PyBroadException
try:
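The new resource_monitoring.default_gpu_utilization setting can also be read through the same config object imported above. A short sketch of the intended semantics; the clearml.conf section hosting this key is not shown in the diff, so only the programmatic lookup is illustrated.

from clearml.config import config

# 100 is the commit's built-in default; null/None in the configuration means
# "do not report utilization at all when the driver cannot provide it".
default_gpu_utilization = config.get("resource_monitoring.default_gpu_utilization", 100)
if default_gpu_utilization is not None:
    default_gpu_utilization = int(default_gpu_utilization)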
@@ -314,13 +320,18 @@ def mem_usage_children(a_mem_size, pr, parent_mem=None):

return mem_size

def _skip_nonactive_gpu(self, idx, gpu):
def _skip_nonactive_gpu(self, gpu):
if not self._active_gpus:
return False
# noinspection PyBroadException
try:
uuid = getattr(gpu, "uuid", None)
return str(idx) not in self._active_gpus and (not uuid or uuid not in self._active_gpus)
mig_uuid = getattr(gpu, "mig_uuid", None)
return (
str(gpu.index) not in self._active_gpus
and (not uuid or uuid not in self._active_gpus)
and (not mig_uuid or mig_uuid not in self._active_gpus)
)
except Exception:
pass
return False
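With the reworked _skip_nonactive_gpu, an entry in the active-GPU list may be a device index, a full GPU UUID, or a MIG partition UUID. A hedged sketch of the matching rule, using made-up identifiers:

# Illustration only: any of these identifier styles now selects a device for monitoring.
active_gpus = ["0", "GPU-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee", "MIG-12345678-abcd-abcd-uuid-123456abcdef"]

def is_monitored(gpu):
    # Empty/None list means "monitor everything" (mirrors the early return above).
    if not active_gpus:
        return True
    return (
        str(gpu.index) in active_gpus
        or (getattr(gpu, "uuid", None) in active_gpus)
        or (getattr(gpu, "mig_uuid", None) in active_gpus)
    )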
@@ -349,7 +360,7 @@ def _get_gpu_stats(self):
self._gpu_memory_per_process = False
break
# only monitor the active gpu's, if none were selected, monitor everything
if self._skip_nonactive_gpu(i, g):
if self._skip_nonactive_gpu(g):
continue

gpu_mem[i] = 0
@@ -369,10 +380,27 @@

for i, g in enumerate(gpu_stat.gpus):
# only monitor the active gpu's, if none were selected, monitor everything
if self._skip_nonactive_gpu(i, g):
if self._skip_nonactive_gpu(g):
continue
stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
if g["utilization.gpu"] is not None:
stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
else:
stats["gpu_%d_utilization" % i] = self._default_gpu_utilization
if not self._gpu_utilization_warning_sent:
if g.mig_index is not None:
self._task.get_logger().report_text(
"Running inside MIG, Nvidia driver cannot export utilization, pushing fixed value {}".format( # noqa
self._default_gpu_utilization
)
)
else:
self._task.get_logger().report_text(
"Nvidia driver cannot export utilization, pushing fixed value {}".format(
self._default_gpu_utilization
)
)
self._gpu_utilization_warning_sent = True
stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
# already in MBs
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
@@ -400,7 +428,7 @@ def _get_machine_specs(self):
if self._gpustat:
gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
if gpu_stat.gpus:
gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(i, g)]
gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(g)]
specs.update(
gpu_count=int(len(gpus)),
gpu_type=', '.join(g.name for g in gpus),
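To see the combined effect of the gpustat.py changes outside the monitor, here is a small sketch that mirrors the new_query call used in _get_machine_specs above; illustrative only, not part of the commit.

from clearml.utilities.gpu import gpustat

collection = gpustat.new_query(shutdown=True, get_driver_info=True)
for gpu in collection.gpus:
    if gpu.mig_uuid:
        # MIG entry: index / uuid / name refer to the parent GPU,
        # mig_index / mig_uuid (and the "mig_name" field) identify the partition.
        print("GPU %s, MIG %s: %s" % (gpu.index, gpu.mig_index, gpu.mig_uuid))
    else:
        print("GPU %s: %s (%s)" % (gpu.index, gpu.name, gpu.uuid))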
