Skip to content

Commit

Permalink
CRCIdle - adding min-max memory
Browse files (browse the repository at this point in the history)
Adds the minimum/maximum free-memory calculation to crc-idle and updates its tests accordingly.
  • Loading branch information
yassinkhalifa committed Aug 22, 2024
1 parent b4d5bd4 commit b0847ae
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 22 deletions.
48 changes: 36 additions & 12 deletions apps/crc_idle.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,32 +58,44 @@ def get_cluster_list(self, args: Namespace) -> tuple[str]:
return specified_clusters or argument_clusters

@staticmethod
def _count_idle_cpu_resources(cluster: str, partition: str) -> dict[int, int]:
def _count_idle_cpu_resources(cluster: str, partition: str) -> dict[int, dict[str, int]]:
"""Return the idle CPU resources on a given cluster partition.
Args:
cluster: The cluster to print a summary for.
partition: The partition in the parent cluster.
Returns:
A dictionary mapping the number of idle resources to the number of nodes with that many idle resources.
A dictionary mapping the number of idle resources to a dictionary with the number of nodes with that many
idle resources, minimum free memory, and maximum free memory on these nodes.
"""

# Use `sinfo` command to determine the status of each node in the given partition
command = f'sinfo -h -M {cluster} -p {partition} -N -o %N,%C'
command = f'sinfo -h -M {cluster} -p {partition} -N -o %N,%C,%e'
slurm_data = Shell.run_command(command).strip().split()

# Count the number of nodes having a given number of idle cores/GPUs
return_dict = dict()
for node_info in slurm_data:
node_name, resource_data = node_info.split(',')
node_name, resource_data, free_mem = node_info.split(',')

Check warning on line 80 in apps/crc_idle.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

apps/crc_idle.py#L80

Unused variable 'node_name'
allocated, idle, other, total = [int(x) for x in resource_data.split('/')]
return_dict[idle] = return_dict.setdefault(idle, 0) + 1
if idle not in return_dict:
# Initialize a new entry for this idle count
return_dict[idle] = {
'count': 1,
'min_free_mem': int(free_mem),
'max_free_mem': int(free_mem)
}
else:
# Update the count and min/max free memory
return_dict[idle]['count'] += 1
return_dict[idle]['min_free_mem'] = min(return_dict[idle]['min_free_mem'], int(free_mem))
return_dict[idle]['max_free_mem'] = max(return_dict[idle]['max_free_mem'], int(free_mem))

return return_dict

@staticmethod
def _count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]:
def _count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, dict[str, int]]:
"""Return idle GPU resources on a given cluster partition.
If the host node is in a `drain` state, the GPUs are reported as unavailable.
Expand All @@ -97,14 +109,14 @@ def _count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]:
"""

# Use `sinfo` command to determine the status of each node in the given partition
slurm_output_format = "NodeList:'_',gres:5'_',gresUsed:12'_',StateCompact:' '"
slurm_output_format = "NodeList:'_',gres:5'_',gresUsed:12'_',StateCompact:'_',FreeMem ' '"
command = f"sinfo -h -M {cluster} -p {partition} -N --Format={slurm_output_format}"
slurm_data = Shell.run_command(command).strip().split()

# Count the number of nodes having a given number of idle cores/GPUs
return_dict = dict()
for node_info in slurm_data:
node_name, total, allocated, state = node_info.split('_')
node_name, total, allocated, state, free_mem = node_info.split('_')

Check warning on line 119 in apps/crc_idle.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

apps/crc_idle.py#L119

Unused variable 'node_name'

# If the node is in a downed state, report 0 resource availability.
if re.search("drain", state):
Expand All @@ -115,11 +127,22 @@ def _count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]:
total = int(total[-1:])
idle = total - allocated

return_dict[idle] = return_dict.setdefault(idle, 0) + 1
if idle not in return_dict:
# Initialize a new entry for this idle count
return_dict[idle] = {
'count': 1,
'min_free_mem': int(free_mem),
'max_free_mem': int(free_mem)
}
else:
# Update the count and min/max free memory
return_dict[idle]['count'] += 1
return_dict[idle]['min_free_mem'] = min(return_dict[idle]['min_free_mem'], int(free_mem))
return_dict[idle]['max_free_mem'] = max(return_dict[idle]['max_free_mem'], int(free_mem))

return return_dict

def count_idle_resources(self, cluster: str, partition: str) -> dict[int, int]:
def count_idle_resources(self, cluster: str, partition: str) -> dict[int, dict[str, int]]:
"""Determine the number of idle resources on a given cluster partition.
The returned dictionary maps the number of idle resources (e.g., cores)
Expand Down Expand Up @@ -151,14 +174,15 @@ def print_partition_summary(self, cluster: str, partition: str, idle_resources:
idle_resources: Dictionary mapping idle resources to number of nodes
"""

output_width = 30
output_width = 70
header = f'Cluster: {cluster}, Partition: {partition}'
unit = self.cluster_types[cluster]

print(header)
print('=' * output_width)
for idle, nodes in sorted(idle_resources.items()):
print(f'{nodes:4d} nodes w/ {idle:3d} idle {unit}')
print(f'{nodes["count"]:4d} nodes w/ {idle:3d} idle {unit} {(nodes["min_free_mem"]/1024):,.2f}G - '
f'{(nodes["max_free_mem"]/1024):,.2f}G min-max free memory')

if not idle_resources:
print(' No idle resources')
Expand Down
26 changes: 16 additions & 10 deletions tests/test_crc_idle.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,14 @@ def test_count_idle_cpu_resources(self, mock_run_command: Mock) -> None:

cluster = 'smp'
partition = 'default'
mock_run_command.return_value = "node1,2/4/0/4\nnode2,3/2/0/3"
mock_run_command.return_value = "node1,2/4/0/4,3500\nnode2,3/2/0/3,4000"

app = CrcIdle()
result = app.count_idle_resources(cluster, partition)

expected = {4: 1, 2: 1}
expected = {4: {'count': 1, 'min_free_mem': 3500, 'max_free_mem': 3500},
2: {'count': 1, 'min_free_mem': 4000, 'max_free_mem': 4000}
}
self.assertEqual(expected, result)

@patch('apps.utils.Shell.run_command')
Expand All @@ -93,11 +95,13 @@ def test_count_idle_gpu_resources(self, mock_run_command: Mock) -> None:

cluster = 'gpu'
partition = 'default'
mock_run_command.return_value = "node1_4_2_idle\nnode2_4_4_drain"
mock_run_command.return_value = "node1_4_2_idle_3500\nnode2_4_4_drain_4000"

app = CrcIdle()
result = app.count_idle_resources(cluster, partition)
expected = {2: 1, 0: 1}
expected = {2: {'count': 1, 'min_free_mem': 3500, 'max_free_mem': 3500},
0: {'count': 1, 'min_free_mem': 4000, 'max_free_mem': 4000}
}
self.assertEqual(expected, result)


Expand All @@ -110,16 +114,18 @@ def test_print_partition_summary_with_idle_resources(self, mock_print: Mock) ->

cluster = 'smp'
partition = 'default'
idle_resources = {2: 3, 4: 1} # 3 nodes with 2 idle resources, 1 node with 4 idle resources
idle_resources = {2: {'count': 3, 'min_free_mem': 2500, 'max_free_mem': 3500},
4: {'count': 1, 'min_free_mem': 3000, 'max_free_mem': 3000}
} # 3 nodes with 2 idle resources, 1 node with 4 idle resources

app = CrcIdle()
app.print_partition_summary(cluster, partition, idle_resources)

mock_print.assert_has_calls([
call(f'Cluster: {cluster}, Partition: {partition}'),
call('=' * 30),
call(' 3 nodes w/ 2 idle cores'),
call(' 1 nodes w/ 4 idle cores'),
call('=' * 70),
call(' 3 nodes w/ 2 idle cores 2.44G - 3.42G min-max free memory'),
call(' 1 nodes w/ 4 idle cores 2.93G - 2.93G min-max free memory'),
call('')
], any_order=False)

Expand All @@ -135,13 +141,13 @@ def test_print_partition_summary_no_idle_resources(self, mock_print: Mock) -> No
app.print_partition_summary(cluster, partition, idle_resources)

mock_print.assert_any_call(f'Cluster: {cluster}, Partition: {partition}')
mock_print.assert_any_call('=' * 30)
mock_print.assert_any_call('=' * 70)
mock_print.assert_any_call(' No idle resources')
mock_print.assert_any_call('')

mock_print.assert_has_calls([
call(f'Cluster: {cluster}, Partition: {partition}'),
call('=====' * 6),
call('=====' * 14),
call(' No idle resources'),
call('')
], any_order=False)

0 comments on commit b0847ae

Please sign in to comment.