Release/v 1.4.2 #1401

Merged: 21 commits, Aug 14, 2024

Commits
88ee919
Update sphinx requirement from <8 to <9
dependabot[bot] Jul 29, 2024
325f689
Merge pull request #1389 from Libensemble/release/v_1.4.1
shuds13 Jul 30, 2024
ddb45d8
Add +dev to version
shuds13 Jul 30, 2024
db1a099
Bump crate-ci/typos from 1.23.4 to 1.23.6
dependabot[bot] Jul 31, 2024
9caa34c
Merge pull request #1390 from Libensemble/dependabot/github_actions/d…
jlnav Aug 1, 2024
eefb0af
Add missing PerlmutterGPU specs
shuds13 Aug 2, 2024
e6373bc
Bump flake8 from 7.1.0 to 7.1.1
dependabot[bot] Aug 5, 2024
0c6df92
Merge pull request #1395 from Libensemble/dependabot/pip/develop/flak…
jlnav Aug 6, 2024
fd555b8
Launch env scripts in shell (#1392)
shuds13 Aug 6, 2024
c4f1851
Detect perlmutter login nodes (#1391)
shuds13 Aug 6, 2024
97900f7
Merge pull request #1393 from Libensemble/bugfix/add_missing_platform…
shuds13 Aug 6, 2024
dae1460
Bump globus-compute-sdk from 2.24.0 to 2.25.0
dependabot[bot] Aug 7, 2024
1257c78
Merge pull request #1397 from Libensemble/dependabot/pip/develop/glob…
jlnav Aug 9, 2024
146b5df
Bump matplotlib from 3.9.1 to 3.9.2
dependabot[bot] Aug 13, 2024
ea60497
Merge pull request #1400 from Libensemble/dependabot/pip/develop/matp…
jmlarson1 Aug 13, 2024
2d90eef
Merge pull request #1387 from Libensemble/dependabot/pip/develop/sphi…
jmlarson1 Aug 13, 2024
36fb0e3
Fix under-utilized resource usage (#1398)
shuds13 Aug 13, 2024
833a9b0
Add Proxystore example (#1326)
jlnav Aug 13, 2024
93c05a6
Prep release v1.4.2
shuds13 Aug 13, 2024
7277629
Whitespace and isort
jmlarson1 Aug 14, 2024
63314c1
Exclude proxystore from CI due to docker constraints
shuds13 Aug 14, 2024
2 changes: 1 addition & 1 deletion .github/workflows/basic.yml
@@ -163,4 +163,4 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: crate-ci/[email protected].4
- uses: crate-ci/[email protected].6
8 changes: 7 additions & 1 deletion .github/workflows/extra.yml
@@ -229,6 +229,12 @@ jobs:
rm ./libensemble/tests/unit_tests/test_ufunc_runners.py
rm ./libensemble/tests/unit_tests/test_executor_balsam.py
- name: Start Redis
if: matrix.os == 'ubuntu-latest'
uses: supercharge/[email protected]
with:
redis-version: 7

- name: Run extensive tests, Ubuntu
if: matrix.os == 'ubuntu-latest'
run: |
Expand All @@ -254,4 +260,4 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: crate-ci/[email protected].4
- uses: crate-ci/[email protected].6
4 changes: 2 additions & 2 deletions .wci.yml
@@ -16,8 +16,8 @@ description: |
language: Python

release:
version: 1.4.1
date: 2024-07-29
version: 1.4.2
date: 2024-08-14

documentation:
general: https://libensemble.readthedocs.io
27 changes: 26 additions & 1 deletion CHANGELOG.rst
@@ -8,6 +8,32 @@ GitHub issues are referenced, and can be viewed with hyperlinks on the `github r

.. _`github releases page`: https://github.com/Libensemble/libensemble/releases

Release 1.4.2
--------------

:Date: August 14, 2024

* Fix under-utilized resource usage. #1398

  * Fixes a bug causing the executor to wrongly increase processor counts when not all nodes are utilized.
  * Fixes a case where setting `num_gpus` to zero was treated as `None`.

* Add missing PerlmutterGPU specs (these were previously detected anyway). #1393
* Handle the case where Perlmutter finds no partition. #1391
* Launch environment scripts in a shell. #1392
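
The `num_gpus` fix comes down to a truthiness check: `if not num_gpus` conflates an explicit 0 with unset. A minimal sketch of the corrected pattern (names are illustrative, not the executor's actual internals):

```python
def resolve_num_gpus(num_gpus, gen_ngpus):
    """Fall back to the generator default only when num_gpus is unset.

    `if not num_gpus` would also trigger for an explicit 0, silently
    re-enabling GPU assignment the caller asked to disable.
    """
    if num_gpus is None:  # unset -> use the generator default
        return gen_ngpus
    return num_gpus  # 0 is a deliberate "no GPUs" request


assert resolve_num_gpus(None, 4) == 4  # unset falls back to the default
assert resolve_num_gpus(0, 4) == 0     # explicit zero is preserved
assert resolve_num_gpus(2, 4) == 2
```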

:Examples:

* Add proxystore example (uses a proxy in history array). #1326

:Note:

* Tests were run on Linux and macOS with Python versions 3.9, 3.10, 3.11, and 3.12.
* Heterogeneous workflows tested on Frontier (OLCF), Polaris (ALCF), and Perlmutter (NERSC).
* Tests were recently run on Aurora (ALCF), but the system was unavailable at the time of release.

:Known Issues:

* See known issues section in the documentation.

Release 1.4.1
--------------

@@ -25,7 +51,6 @@ Release 1.4.1

* See known issues section in the documentation.


Release 1.4.0
--------------

2 changes: 1 addition & 1 deletion docs/requirements.txt
@@ -1,4 +1,4 @@
sphinx<8
sphinx<9
sphinxcontrib-bibtex
sphinxcontrib-spelling
autodoc_pydantic
3 changes: 2 additions & 1 deletion install/misc_feature_requirements.txt
@@ -1 +1,2 @@
globus-compute-sdk==2.24.0
globus-compute-sdk==2.25.0
proxystore==0.7.0
4 changes: 2 additions & 2 deletions install/testing_requirements.txt
@@ -1,9 +1,9 @@
flake8==7.1.0
flake8==7.1.1
coverage==7.3.1
pytest==8.3.2
pytest-cov==5.0.0
pytest-timeout==2.3.1
mock==5.1.0
python-dateutil==2.9.0.post0
anyio==4.4.0
matplotlib==3.9.1
matplotlib==3.9.2
2 changes: 1 addition & 1 deletion libensemble/ensemble.py
@@ -12,8 +12,8 @@
from libensemble.specs import AllocSpecs, ExitCriteria, GenSpecs, LibeSpecs, SimSpecs
from libensemble.tools import add_unique_random_streams
from libensemble.tools import parse_args as parse_args_f
from libensemble.tools.parse_args import mpi_init
from libensemble.tools import save_libE_output
from libensemble.tools.parse_args import mpi_init
from libensemble.utils.misc import specs_dump

ATTR_ERR_MSG = 'Unable to load "{}". Is the function or submodule correctly named?'
12 changes: 6 additions & 6 deletions libensemble/executors/mpi_executor.py
@@ -138,7 +138,7 @@ def set_resources(self, resources: Resources) -> None:
self.resources = resources

def _launch_with_retries(
self, task: Task, subgroup_launch: bool, wait_on_start: Union[bool, int], run_cmd: List[str]
self, task: Task, subgroup_launch: bool, wait_on_start: Union[bool, int], run_cmd: List[str], use_shell: bool
) -> None:
"""Launch task with retry mechanism"""
retry_count = 0
@@ -156,6 +156,7 @@
stdout=out,
stderr=err,
start_new_session=subgroup_launch,
shell=use_shell,
)
except Exception as e:
logger.warning(f"task {task.name} submit command failed on try {retry_count} with error {e}")
@@ -325,12 +326,9 @@ def submit(
if not num_procs and not match_procs_to_gpus:
num_procs = self.gen_nprocs

if not num_gpus:
if num_gpus is None:
num_gpus = self.gen_ngpus

if not num_nodes and (self.gen_ngpus or self.gen_nprocs):
num_nodes = self.resources.worker_resources.local_node_count

if mpi_runner_type is not None:
if isinstance(mpi_runner_type, str):
mpi_config = {"mpi_runner": mpi_runner_type}
@@ -367,8 +365,10 @@

if env_script is not None:
run_cmd = Executor._process_env_script(task, runline, env_script)
use_shell = True
else:
run_cmd = runline
use_shell = False

if dry_run:
logger.info(f"Test (No submit) Runline: {' '.join(run_cmd)}")
@@ -378,7 +378,7 @@
task._implement_env()

# Launch Task
self._launch_with_retries(task, sglaunch, wait_on_start, run_cmd)
self._launch_with_retries(task, sglaunch, wait_on_start, run_cmd, use_shell)

if not task.timer.timing and not task.finished:
task.timer.start()
41 changes: 36 additions & 5 deletions libensemble/executors/mpi_runner.py
@@ -121,7 +121,7 @@ def _set_gpu_cli_option(self, wresources, extra_args, gpu_setting_name, gpu_valu
def _set_gpu_env_var(self, wresources, task, gpus_per_node, gpus_env):
"""Add GPU environment variable setting to the tasks environment"""
jassert(wresources.matching_slots, f"Cannot assign CPUs/GPUs to non-matching slots per node {wresources.slots}")
slot_list = wresources.get_slots_as_string(multiplier=wresources.gpus_per_rset, limit=gpus_per_node)
slot_list = wresources.get_slots_as_string(multiplier=wresources.gpus_per_rset_per_node, limit=gpus_per_node)
task._add_to_env(gpus_env, slot_list)

def _local_runner_set_gpus(self, task, wresources, extra_args, gpus_per_node, ppn):
@@ -171,7 +171,7 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,

# gpus per node for this worker.
if wresources.doihave_gpus():
gpus_avail_per_node = wresources.slot_count * wresources.gpus_per_rset
gpus_avail_per_node = wresources.slot_count * wresources.gpus_per_rset_per_node
else:
gpus_avail_per_node = 0

@@ -224,6 +224,35 @@

return nprocs, nnodes, ppn, extra_args

def _get_min_nodes(self, nprocs, ppn, nnodes, ngpus, resources):
"""Get minimum nodes needed to match configuration"""
if nnodes is not None:
return nnodes
if ppn:
return None # nnodes gets processed later.
if resources is not None:
wresources = resources.worker_resources
total_nodes = wresources.local_node_count
procs_on_node = wresources.slot_count * wresources.procs_per_rset_per_node

if not nprocs and ngpus is None:
# Delay node evaluation to GPU assignment code
return None
proc_min_nodes = 1
gpu_min_nodes = 1
if nprocs:
proc_min_nodes = (nprocs + procs_on_node - 1) // procs_on_node
if ngpus:
gpus_on_node = wresources.slot_count * wresources.gpus_per_rset_per_node
gpu_min_nodes = (ngpus + gpus_on_node - 1) // gpus_on_node

min_nodes = max(proc_min_nodes, gpu_min_nodes)
nnodes = min(min_nodes, total_nodes)
# Must have at least one processor per node to use GPUs
if nprocs:
nnodes = min(nnodes, nprocs)
return nnodes
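
The node-count arithmetic in `_get_min_nodes` is plain ceiling division over per-node capacities; the same math can be checked in isolation (the figures below are hypothetical, not tied to any real platform):

```python
def min_nodes_needed(nprocs, ngpus, procs_on_node, gpus_on_node, total_nodes):
    """Smallest node count satisfying both the proc and GPU requests,
    capped by the nodes the worker actually has."""
    proc_min = -(-nprocs // procs_on_node) if nprocs else 1  # ceiling division
    gpu_min = -(-ngpus // gpus_on_node) if ngpus else 1
    nnodes = min(max(proc_min, gpu_min), total_nodes)
    if nprocs:
        nnodes = min(nnodes, nprocs)  # need at least one proc per node used
    return nnodes


# 10 procs at 8/node -> 2 nodes; 5 GPUs at 4/node -> 2 nodes
assert min_nodes_needed(10, 5, 8, 4, total_nodes=4) == 2
# GPU demand dominates: 9 GPUs at 4/node -> 3 nodes
assert min_nodes_needed(0, 9, 8, 4, total_nodes=4) == 3
```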

def _adjust_procs(self, nprocs, ppn, nnodes, ngpus, resources):
"""Adjust an invalid config"""

@@ -241,8 +270,8 @@ def adjust_resource(n_units, units_attr, units_name):

if resources is not None:
wresources = resources.worker_resources
ngpus = adjust_resource(ngpus, "gpus_per_rset", "ngpus")
nprocs = adjust_resource(nprocs, "procs_per_rset", "nprocs")
ngpus = adjust_resource(ngpus, "gpus_per_rset_per_node", "ngpus")
nprocs = adjust_resource(nprocs, "procs_per_rset_per_node", "nprocs")
return nprocs, ngpus

def get_mpi_specs(
@@ -284,6 +313,8 @@

if match_procs_to_gpus:
jassert(no_config_set, "match_procs_to_gpus is mutually exclusive with either of nprocs/ppn")

nnodes = self._get_min_nodes(nprocs, ppn, nnodes, ngpus, resources)
nprocs, ngpus = self._adjust_procs(nprocs, ppn, nnodes, ngpus, resources)

if auto_assign_gpus or ngpus is not None:
@@ -294,7 +325,7 @@
task, resources, nprocs, nnodes, ppn, ngpus, extra_args, match_procs_to_gpus
)

rm_rpn = True if self.rm_rpn and ppn is None and nnodes is None else False
rm_rpn = self.rm_rpn and ppn is None and nnodes is None

hostlist = None
if machinefile and not self.mfile_support:
4 changes: 2 additions & 2 deletions libensemble/gen_funcs/persistent_aposmm.py
@@ -14,12 +14,12 @@
import numpy as np
from mpmath import gamma

# from scipy.spatial.distance import cdist

from libensemble.gen_funcs.aposmm_localopt_support import ConvergedMsg, LocalOptInterfacer, simulate_recv_from_manager
from libensemble.message_numbers import EVAL_GEN_TAG, FINISHED_PERSISTENT_GEN_TAG, PERSIS_STOP, STOP_TAG
from libensemble.tools.persistent_support import PersistentSupport

# from scipy.spatial.distance import cdist


# Due to recursion error in scipy cdist function
def cdist(XA, XB, metric="euclidean"):
2 changes: 1 addition & 1 deletion libensemble/resources/mpi_resources.py
@@ -213,7 +213,7 @@ def get_resources(resources, num_procs=None, num_nodes=None, procs_per_node=None
)

if num_nodes < local_node_count:
logger.warning(
logger.debug(
"User constraints mean fewer nodes being used "
f"than available. {num_nodes} nodes used. {local_node_count} nodes available"
)
19 changes: 16 additions & 3 deletions libensemble/resources/platforms.py
@@ -8,6 +8,7 @@
option or the environment variable ``LIBE_PLATFORM``.
"""

import logging
import os
import subprocess
from typing import Optional
@@ -16,6 +17,10 @@

from libensemble.utils.misc import specs_dump

logger = logging.getLogger(__name__)
# To change logging level for just this module
# logger.setLevel(logging.DEBUG)


class PlatformException(Exception):
"""Platform module exception"""
@@ -178,6 +183,8 @@ class PerlmutterCPU(Perlmutter):


class PerlmutterGPU(Perlmutter):
cores_per_node: int = 64
logical_cores_per_node: int = 128
gpus_per_node: int = 4
gpu_setting_type: str = "runner_default"
gpu_env_fallback: str = "CUDA_VISIBLE_DEVICES"
@@ -269,6 +276,7 @@ class Known_platforms(BaseModel):
generic_rocm: GenericROCm = GenericROCm()
crusher: Crusher = Crusher()
frontier: Frontier = Frontier()
perlmutter: Perlmutter = Perlmutter()
perlmutter_c: PerlmutterCPU = PerlmutterCPU()
perlmutter_g: PerlmutterGPU = PerlmutterGPU()
polaris: Polaris = Polaris()
@@ -292,10 +300,15 @@ def known_envs():
"""Detect system by environment variables"""
name = None
if os.environ.get("NERSC_HOST") == "perlmutter":
if "gpu_" in os.environ.get("SLURM_JOB_PARTITION"):
name = "perlmutter_g"
partition = os.environ.get("SLURM_JOB_PARTITION")
if partition:
if "gpu_" in partition:
name = "perlmutter_g"
else:
name = "perlmutter_c"
else:
name = "perlmutter_c"
name = "perlmutter"
logger.warning("Perlmutter detected, but no compute partition detected. Are you on login nodes?")
return name


17 changes: 11 additions & 6 deletions libensemble/resources/rset_resources.py
@@ -51,8 +51,9 @@ def __init__(self, num_workers, resources):
self.num_workers = num_workers
self.num_workers_2assign2 = RSetResources.get_workers2assign2(self.num_workers, resources)
self.total_num_rsets = resources.num_resource_sets or self.num_workers_2assign2

self.num_nodes = len(resources.global_nodelist)
self.split_list, self.local_rsets_list = RSetResources.get_partitioned_nodelist(self.total_num_rsets, resources)
self.nodes_in_rset = len(self.split_list[0])

gpus_avail_per_node = resources.gpus_avail_per_node
self.rsets_per_node = RSetResources.get_rsets_on_a_node(self.total_num_rsets, resources)
@@ -67,16 +68,20 @@
self.total_num_gpu_rsets = np.count_nonzero(self.all_rsets["gpus"])
self.total_num_nongpu_rsets = np.count_nonzero(~self.all_rsets["gpus"])

self.gpus_per_rset = gpus_avail_per_node // self.gpu_rsets_per_node if self.gpu_rsets_per_node else 0
self.cores_per_rset = resources.physical_cores_avail_per_node // self.rsets_per_node
self.gpus_per_rset_per_node = gpus_avail_per_node // self.gpu_rsets_per_node if self.gpu_rsets_per_node else 0
self.cores_per_rset_per_node = resources.physical_cores_avail_per_node // self.rsets_per_node

# Oversubscribe
if self.cores_per_rset == 0:
if self.cores_per_rset_per_node == 0:
cpn = resources.physical_cores_avail_per_node
procs_per_core = self.rsets_per_node // cpn + (self.rsets_per_node % cpn > 0)
self.procs_per_rset = resources.physical_cores_avail_per_node * procs_per_core
self.procs_per_rset_per_node = resources.physical_cores_avail_per_node * procs_per_core
else:
self.procs_per_rset = self.cores_per_rset
self.procs_per_rset_per_node = self.cores_per_rset_per_node

self.gpus_per_rset = self.gpus_per_rset_per_node * self.nodes_in_rset
self.cores_per_rset = self.cores_per_rset_per_node * self.nodes_in_rset
self.procs_per_rset = self.procs_per_rset_per_node * self.nodes_in_rset

@staticmethod
def get_group_list(split_list, gpus_per_node=0, gpus_per_group=None):
2 changes: 1 addition & 1 deletion libensemble/resources/worker_resources.py
@@ -273,7 +273,7 @@ def set_env_to_gpus(self, env_var=None, delimiter=","):
"""
assert self.matching_slots, f"Cannot assign GPUs to non-matching slots per node {self.slots}"
if self.doihave_gpus():
env_value = self.get_slots_as_string(multiplier=self.gpus_per_rset, limit=self.gen_ngpus)
env_value = self.get_slots_as_string(multiplier=self.gpus_per_rset_per_node, limit=self.gen_ngpus)
if env_var is None:
if self.platform_info is not None:
if self.platform_info.get("gpu_setting_type") == "env":
1 change: 1 addition & 0 deletions libensemble/sim_funcs/simple_sim.py
@@ -5,6 +5,7 @@
__all__ = ["norm_eval"]

import numpy as np

from libensemble.specs import input_fields, output_data


2 changes: 1 addition & 1 deletion libensemble/sim_funcs/var_resources.py
@@ -279,7 +279,7 @@ def CUDA_variable_resources(H, _, sim_specs, libE_info):
cores_per_node = resources.slot_count

# Set to detected GPUs
# gpus_per_slot = resources.gpus_per_rset
# gpus_per_slot = resources.gpus_per_rset_per_node
# resources.set_env_to_slots("CUDA_VISIBLE_DEVICES", multiplier=gpus_per_slot)
# cores_per_node = resources.slot_count * gpus_per_slot # One CPU per GPU
