From 41f564139e496671762e85e67b5d64f31b8b1e3d Mon Sep 17 00:00:00 2001 From: Yassin Khalifa Date: Thu, 22 Aug 2024 12:09:22 -0400 Subject: [PATCH] CRCIdle - adding min-max memory Adding the min max memory calculation to crc-idle and its tests. --- apps/crc_idle.py | 48 +++++++++++++++++++++++++++++++----------- tests/test_crc_idle.py | 26 ++++++++++++++--------- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/apps/crc_idle.py b/apps/crc_idle.py index 20fb413..11be895 100755 --- a/apps/crc_idle.py +++ b/apps/crc_idle.py @@ -58,7 +58,7 @@ def get_cluster_list(self, args: Namespace) -> tuple[str]: return specified_clusters or argument_clusters @staticmethod - def _count_idle_cpu_resources(cluster: str, partition: str) -> dict[int, int]: + def _count_idle_cpu_resources(cluster: str, partition: str) -> dict[int, dict[str, int]]: """Return the idle CPU resources on a given cluster partition. Args: @@ -66,24 +66,36 @@ def _count_idle_cpu_resources(cluster: str, partition: str) -> dict[int, int]: partition: The partition in the parent cluster. Returns: - A dictionary mapping the number of idle resources to the number of nodes with that many idle resources. + A dictionary mapping the number of idle resources to a dictionary with the number of nodes with that many + idle resources, minimum free memory, and maximum free memory on these nodes. """ # Use `sinfo` command to determine the status of each node in the given partition - command = f'sinfo -h -M {cluster} -p {partition} -N -o %N,%C' + command = f'sinfo -h -M {cluster} -p {partition} -N -o %N,%C,%e' slurm_data = Shell.run_command(command).strip().split() # Count the number of nodes having a given number of idle cores/GPUs return_dict = dict() for node_info in slurm_data: - node_name, resource_data = node_info.split(',') + node_name, resource_data, free_mem = node_info.split(',') allocated, idle, other, total = [int(x) for x in resource_data.split('/')] - return_dict[idle] = return_dict.setdefault(idle, 0) + 1 + if idle not in return_dict: + # Initialize a new entry for this idle count + return_dict[idle] = { + 'count': 1, + 'min_free_mem': int(free_mem), + 'max_free_mem': int(free_mem) + } + else: + # Update the count and min/max free memory + return_dict[idle]['count'] += 1 + return_dict[idle]['min_free_mem'] = min(return_dict[idle]['min_free_mem'], int(free_mem)) + return_dict[idle]['max_free_mem'] = max(return_dict[idle]['max_free_mem'], int(free_mem)) return return_dict @staticmethod - def _count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]: + def _count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, dict[str, int]]: """Return idle GPU resources on a given cluster partition. If the host node is in a `drain` state, the GPUs are reported as unavailable. @@ -97,14 +109,14 @@ def _count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]: """ # Use `sinfo` command to determine the status of each node in the given partition - slurm_output_format = "NodeList:'_',gres:5'_',gresUsed:12'_',StateCompact:' '" + slurm_output_format = "NodeList:'_',gres:5'_',gresUsed:12'_',StateCompact:'_',FreeMem ' '" command = f"sinfo -h -M {cluster} -p {partition} -N --Format={slurm_output_format}" slurm_data = Shell.run_command(command).strip().split() # Count the number of nodes having a given number of idle cores/GPUs return_dict = dict() for node_info in slurm_data: - node_name, total, allocated, state = node_info.split('_') + node_name, total, allocated, state, free_mem = node_info.split('_') # If the node is in a downed state, report 0 resource availability. if re.search("drain", state): @@ -115,11 +127,22 @@ def _count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]: total = int(total[-1:]) idle = total - allocated - return_dict[idle] = return_dict.setdefault(idle, 0) + 1 + if idle not in return_dict: + # Initialize a new entry for this idle count + return_dict[idle] = { + 'count': 1, + 'min_free_mem': int(free_mem), + 'max_free_mem': int(free_mem) + } + else: + # Update the count and min/max free memory + return_dict[idle]['count'] += 1 + return_dict[idle]['min_free_mem'] = min(return_dict[idle]['min_free_mem'], int(free_mem)) + return_dict[idle]['max_free_mem'] = max(return_dict[idle]['max_free_mem'], int(free_mem)) return return_dict - def count_idle_resources(self, cluster: str, partition: str) -> dict[int, int]: + def count_idle_resources(self, cluster: str, partition: str) -> dict[int, dict[str, int]]: """Determine the number of idle resources on a given cluster partition. The returned dictionary maps the number of idle resources (e.g., cores) @@ -151,14 +174,15 @@ def print_partition_summary(self, cluster: str, partition: str, idle_resources: idle_resources: Dictionary mapping idle resources to number of nodes """ - output_width = 30 + output_width = 70 header = f'Cluster: {cluster}, Partition: {partition}' unit = self.cluster_types[cluster] print(header) print('=' * output_width) for idle, nodes in sorted(idle_resources.items()): - print(f'{nodes:4d} nodes w/ {idle:3d} idle {unit}') + print(f'{nodes["count"]:4d} nodes w/ {idle:3d} idle {unit} {(nodes["min_free_mem"]/1024):,.2f}G - ' + f'{(nodes["max_free_mem"]/1024):,.2f}G min-max free memory') if not idle_resources: print(' No idle resources') diff --git a/tests/test_crc_idle.py b/tests/test_crc_idle.py index e64b3c7..ad3b6a0 100644 --- a/tests/test_crc_idle.py +++ b/tests/test_crc_idle.py @@ -79,12 +79,14 @@ def test_count_idle_cpu_resources(self, mock_run_command: Mock) -> None: cluster = 'smp' partition = 'default' - mock_run_command.return_value = "node1,2/4/0/4\nnode2,3/2/0/3" + mock_run_command.return_value = "node1,2/4/0/4,3500\nnode2,3/2/0/3,4000" app = CrcIdle() result = app.count_idle_resources(cluster, partition) - expected = {4: 1, 2: 1} + expected = {4: {'count': 1, 'min_free_mem': 3500, 'max_free_mem': 3500}, + 2: {'count': 1, 'min_free_mem': 4000, 'max_free_mem': 4000} + } self.assertEqual(expected, result) @patch('apps.utils.Shell.run_command') @@ -93,11 +95,13 @@ def test_count_idle_gpu_resources(self, mock_run_command: Mock) -> None: cluster = 'gpu' partition = 'default' - mock_run_command.return_value = "node1_4_2_idle\nnode2_4_4_drain" + mock_run_command.return_value = "node1_4_2_idle_3500\nnode2_4_4_drain_4000" app = CrcIdle() result = app.count_idle_resources(cluster, partition) - expected = {2: 1, 0: 1} + expected = {2: {'count': 1, 'min_free_mem': 3500, 'max_free_mem': 3500}, + 0: {'count': 1, 'min_free_mem': 4000, 'max_free_mem': 4000} + } self.assertEqual(expected, result) @@ -110,16 +114,18 @@ def test_print_partition_summary_with_idle_resources(self, mock_print: Mock) -> cluster = 'smp' partition = 'default' - idle_resources = {2: 3, 4: 1} # 3 nodes with 2 idle resources, 1 node with 4 idle resources + idle_resources = {2: {'count': 3, 'min_free_mem': 2500, 'max_free_mem': 3500}, + 4: {'count': 1, 'min_free_mem': 3000, 'max_free_mem': 3000} + } # 3 nodes with 2 idle resources, 1 node with 4 idle resources app = CrcIdle() app.print_partition_summary(cluster, partition, idle_resources) mock_print.assert_has_calls([ call(f'Cluster: {cluster}, Partition: {partition}'), - call('=' * 30), - call(' 3 nodes w/ 2 idle cores'), - call(' 1 nodes w/ 4 idle cores'), + call('=' * 70), + call(' 3 nodes w/ 2 idle cores 2.44G - 3.42G min-max free memory'), + call(' 1 nodes w/ 4 idle cores 2.94G - 2.94G min-max free memory'), call('') ], any_order=False) @@ -135,13 +141,13 @@ def test_print_partition_summary_no_idle_resources(self, mock_print: Mock) -> No app.print_partition_summary(cluster, partition, idle_resources) mock_print.assert_any_call(f'Cluster: {cluster}, Partition: {partition}') - mock_print.assert_any_call('=' * 30) + mock_print.assert_any_call('=' * 70) mock_print.assert_any_call(' No idle resources') mock_print.assert_any_call('') mock_print.assert_has_calls([ call(f'Cluster: {cluster}, Partition: {partition}'), - call('=====' * 6), + call('=====' * 14), call(' No idle resources'), call('') ], any_order=False)