Commit 15dead6

Merge branch 'develop' into feature/ufunc_field_decorators

jlnav committed Sep 5, 2023
2 parents 56a70a7 + aa2447c
Showing 27 changed files with 849 additions and 149 deletions.
1 change: 1 addition & 0 deletions .flake8
@@ -42,6 +42,7 @@ per-file-ignores =
examples/calling_scripts/run_libensemble_on_warpx.py:E402
libensemble/gen_funcs/persistent_aposmm.py:E402, E501
libensemble/tests/regression_tests/test_persistent_aposmm*:E402
libensemble/tests/regression_tests/test_with_app_persistent_aposmm_tao_nm.py:E402
libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py:E402
libensemble/tests/regression_tests/test_ytopt_heffte.py:E402
libensemble/tests/functionality_tests/test_uniform_sampling_then_persistent_localopt_runs.py:E402
12 changes: 7 additions & 5 deletions .github/workflows/ci.yml
@@ -30,7 +30,10 @@ jobs:
python-version: "3.10"
mpi-version: "mpich"
comms-type: t

- os: ubuntu-latest
mpi-version: "openmpi"
python-version: "3.10"
comms-type: l
env:
HYDRA_LAUNCHER: "fork"
TERM: xterm-256color
@@ -101,13 +104,12 @@ jobs:
# run: pip install mpi4py

- name: Install mpi4py and MPI from conda
- if: matrix.python-version != '3.10' && matrix.os != 'windows-latest'
+ if: (matrix.python-version != '3.10' && matrix.os == 'ubuntu-latest') || matrix.os == 'macos-latest'
run: |
- conda install ${{ matrix.mpi-version }}
- conda install mpi4py
+ conda install mpi4py ${{ matrix.mpi-version }}
- name: Install mpi4py from pip, MPI from conda
- if: matrix.python-version == '3.10' && matrix.os != 'windows-latest'
+ if: matrix.python-version == '3.10' && matrix.os == 'ubuntu-latest'
run: |
conda install ${{ matrix.mpi-version }}
pip install mpi4py
1 change: 1 addition & 0 deletions .spell
@@ -4,3 +4,4 @@ apoints
numer
hist
inout
slac
10 changes: 10 additions & 0 deletions docs/data_structures/libE_specs.rst
@@ -206,6 +206,16 @@ the ``LibeSpecs`` class. When provided as a Python class, options are validated
By default resources will be divided by workers (excluding
``zero_resource_workers``).

"gen_num_procs" [int] = ``0``:
The default number of processors (MPI ranks) required by generators. Unless
overridden by equivalent `persis_info` settings, generators will be allocated
this many processors for applications launched via the MPIExecutor.

"gen_num_gpus" [int] = ``0``:
The default number of GPUs required by generators. Unless overridden by
the equivalent `persis_info` settings, generators will be allocated this
many GPUs.
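
For illustration, a minimal sketch of setting these defaults through the
``LibeSpecs`` class in a calling script (the other field values here are
placeholders, not part of this commit)::

    from libensemble.specs import LibeSpecs

    # Generators default to 2 MPI ranks and 1 GPU for apps they launch
    specs = LibeSpecs(
        num_resource_sets=8,
        gen_num_procs=2,
        gen_num_gpus=1,
    )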

"enforce_worker_core_bounds" [bool] = ``False``:
Permit submission of tasks with a
higher processor count than the CPUs available to the worker.
24 changes: 20 additions & 4 deletions docs/examples/aposmm.rst
@@ -12,17 +12,33 @@ Optional (see below): petsc4py_, nlopt_, DFO-LS_
Configuring APOSMM
^^^^^^^^^^^^^^^^^^

-By default, APOSMM will import several optimizers which require
-external packages. To import only the optimization packages you are using,
-add the following lines in that calling script, before importing APOSMM::
+APOSMM works with a choice of optimizers, some requiring external packages. To
+import the optimization packages (and their dependencies) at a global level
+(recommended), add the following lines to the calling script, before importing
+APOSMM::

import libensemble.gen_funcs
libensemble.gen_funcs.rc.aposmm_optimizers = <optimizers>

-Where ``optimizers`` is a string (or list of strings) from the available options:
+where ``optimizers`` is a string (or list of strings) from the available options:

``"petsc"``, ``"nlopt"``, ``"dfols"``, ``"scipy"``, ``"external"``

.. dropdown:: Issues with ensemble hanging or failed simulations?

Note that if using **mpi4py** comms, PETSc must be imported at the global
level, or the ensemble may hang.

Exception: if you are using the MPIExecutor (or other MPI) inside a user
function and you are using Open MPI, you must:

- Use ``local`` comms for libEnsemble (not ``mpi4py``)
- **Not** include the *rc* line above

This is because PETSc imports MPI, and a global import of PETSc would result
in nested MPI (which is not supported by Open MPI). When the *rc* line is
omitted, the optimizer import happens locally, inside the optimization function.

To see the optimization algorithms supported, see `LocalOptInterfacer`_.

.. seealso::
15 changes: 11 additions & 4 deletions docs/resource_manager/overview.rst
@@ -193,10 +193,17 @@ if ``split2fit`` is *False*, as this could otherwise never be scheduled.
Varying generator resources
^^^^^^^^^^^^^^^^^^^^^^^^^^^

-For all supporting allocation functions, setting the ``persis_info["gen_resources"]``
-to an integer value will provide resource sets to generators when they are started,
-with the default to provide no resources. This could be set in the calling script
-or inside the allocation function.
+By default, generators are not allocated resources in dynamic mode. Fixed resources
+for the generator can be set using the *libE_specs* options
+``gen_num_procs`` and ``gen_num_gpus``, which take integer values.
+If only ``gen_num_gpus`` is set, the number of processors will match.
+
+To vary generator resources, ``persis_info`` settings can be used in allocation
+functions before calling the ``gen_work`` support function. These settings take
+the same options (``gen_num_procs`` and ``gen_num_gpus``).
+
+Alternatively, ``persis_info["gen_resources"]`` can be set to a number of
+resource sets.
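
As an illustrative sketch (not part of this diff): assuming ``support`` is an
``AllocSupport`` instance created with this same ``persis_info``, and ``wid``
is the worker chosen for the generator, an allocation function might do::

    # Vary generator resources before starting the persistent gen
    persis_info["gen_num_procs"] = 2  # MPI ranks for apps the gen launches
    persis_info["gen_num_gpus"] = 1   # GPUs for apps the gen launches
    Work[wid] = support.gen_work(
        wid, gen_specs["persis_in"], [], persis_info.get(wid), persistent=True
    )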

Note that persistent workers maintain their resources until coming out of a
persistent state.
14 changes: 11 additions & 3 deletions docs/resource_manager/zero_resource_workers.rst
@@ -50,9 +50,17 @@ worker for the persistent generator - a common use-case.

In general, the number of resource sets should be set to enable the maximum
concurrency desired by the ensemble, taking into account generators and simulators.
-Users can set generator resources by setting ``persis_info["gen_resources"]``
-to an integer value, representing the number of resource sets to give to the
-generator. The default is zero.

+Users can set generator resources using the *libE_specs* options
+``gen_num_procs`` and/or ``gen_num_gpus``, which take integer values.
+If only ``gen_num_gpus`` is set, the number of processors will match.
+
+To vary generator resources, ``persis_info`` settings can be used in allocation
+functions before calling the ``gen_work`` support function. These settings take
+the same options (``gen_num_procs`` and ``gen_num_gpus``).
+
+Alternatively, ``persis_info["gen_resources"]`` can be set to a number of
+resource sets.
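
For example (a sketch with placeholder values), fixed generator resources can
be given in the calling script via resource sets::

    libE_specs["num_resource_sets"] = 5  # total resource sets
    persis_info["gen_resources"] = 1     # generator receives one resource set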

The available nodes are always divided by the number of resource sets, and there
may be multiple nodes or a partition of a node in each resource set. If the split
87 changes: 61 additions & 26 deletions libensemble/executors/mpi_executor.py
@@ -15,7 +15,7 @@
import logging
import os
import time
-from typing import List, Optional
+from typing import List, Optional, Union

import libensemble.utils.launcher as launcher
from libensemble.executors.executor import Executor, ExecutorException, Task
@@ -84,11 +84,38 @@ def __init__(self, custom_info: dict = {}) -> None:
self.fail_time = 2
self.retry_delay_incr = 5 # Incremented wait after each launch attempt
self.resources = None
self.platform_info = None
self.gen_nprocs = None
self.gen_ngpus = None

# Apply custom options
self.mpi_runner_type = custom_info.get("mpi_runner")
self.runner_name = custom_info.get("runner_name")
self.subgroup_launch = custom_info.get("subgroup_launch")
self.mpi_runner_obj = None # Do not set here or will override platform

def _create_mpi_runner_obj(self, mpi_runner_type, runner_name, subgroup_launch) -> MPIRunner:
mpi_runner_obj = MPIRunner.get_runner(mpi_runner_type, runner_name, self.platform_info)
if subgroup_launch is not None:
mpi_runner_obj.subgroup_launch = subgroup_launch
return mpi_runner_obj

def _create_mpi_runner_from_config(self, mpi_config: dict = {}) -> MPIRunner:
"""Return an mpi_runner object from given info"""

mpi_runner_type = mpi_config.get("mpi_runner")
runner_name = mpi_config.get("runner_name")
subgroup_launch = mpi_config.get("subgroup_launch")
return self._create_mpi_runner_obj(mpi_runner_type, runner_name, subgroup_launch)

def _create_mpi_runner_from_attr(self) -> MPIRunner:
"""Create mpi_runner_obj based on existing attributes
If runner type has not been given, then detect
"""
if not self.mpi_runner_type:
self.mpi_runner_type = get_MPI_variant()
return self._create_mpi_runner_obj(self.mpi_runner_type, self.runner_name, self.subgroup_launch)

def add_platform_info(self, platform_info={}):
"""Add user supplied platform info to executor"""
@@ -97,17 +124,10 @@ def add_platform_info(self, platform_info={}):
if platform_info:
self.mpi_runner_type = self.mpi_runner_type or platform_info.get("mpi_runner")
self.runner_name = self.runner_name or platform_info.get("runner_name")
+self.platform_info = platform_info

-# If runner type has not been given, then detect
-if not self.mpi_runner_type:
-self.mpi_runner_type = get_MPI_variant()
-self.mpi_runner = MPIRunner.get_runner(self.mpi_runner_type, self.runner_name, platform_info)
-
-if self.subgroup_launch is not None:
-self.mpi_runner.subgroup_launch = self.subgroup_launch
-
-self.gen_nprocs = None
-self.gen_ngpus = None
+self.mpi_runner_obj = self._create_mpi_runner_from_attr()

def set_gen_procs_gpus(self, libE_info):
"""Add gen supplied procs and gpus"""
@@ -117,22 +137,15 @@ def set_gen_procs_gpus(self, libE_info):
def set_resources(self, resources: Resources) -> None:
self.resources = resources

-def _launch_with_retries(
-self, task: Task, runline: List[str], subgroup_launch: bool, wait_on_start: bool, env_script: str
-) -> None:
+def _launch_with_retries(self, task: Task, subgroup_launch: bool, wait_on_start: bool, run_cmd: List[str]) -> None:
"""Launch task with retry mechanism"""
retry_count = 0

-if env_script is not None:
-run_cmd = Executor._process_env_script(task, runline, env_script)
-else:
-run_cmd = runline

while retry_count < self.max_launch_attempts:
retry = False
try:
retry_string = f" (Retry {retry_count})" if retry_count > 0 else ""
logger.info(f"Launching task {task.name}{retry_string}: {' '.join(runline)}")
logger.info(f"Launching task {task.name}{retry_string}: {' '.join(run_cmd)}")
task.run_attempts += 1
with open(task.stdout, "w") as out, open(task.stderr, "w") as err:
task.process = launcher.launch(
@@ -185,6 +198,7 @@ def submit(
auto_assign_gpus: Optional[bool] = False,
match_procs_to_gpus: Optional[bool] = False,
env_script: Optional[str] = None,
mpi_runner_type: Optional[Union[str, dict]] = None,
) -> Task:
"""Creates a new task, and either executes or schedules execution.
@@ -250,12 +264,12 @@
resources determination unless also supplied in the direct
options.
-auto_assign_gpus: bool, optional
+auto_assign_gpus: bool, Optional
Auto-assign GPUs available to this worker using either the method
supplied in configuration or determined by detected environment.
Default: False
-match_procs_to_gpus: bool, optional
+match_procs_to_gpus: bool, Optional
For use with auto_assign_gpus. Auto-assigns MPI processors to match
the assigned GPUs. Default: False unless auto_assign_gpus is True and
no other CPU configuration is supplied.
@@ -265,6 +279,13 @@
launched task. This will be run in the subprocess, and not affect
the worker environment. The script should start with a shebang.
mpi_runner_type: (str|dict), Optional
An MPI runner to be used for this submission only. Supply either a string
for the MPI runner type or a dictionary for detailed configuration
(see custom_info on the MPIExecutor constructor). This will not change
the default MPI runner for the executor.
Example string inputs are "mpich", "openmpi", "srun", "jsrun", "aprun".
Returns
-------
@@ -303,7 +324,16 @@ def submit(
if not num_nodes and (self.gen_ngpus or self.gen_nprocs):
num_nodes = self.resources.worker_resources.local_node_count

-mpi_specs = self.mpi_runner.get_mpi_specs(
+if mpi_runner_type is not None:
+if isinstance(mpi_runner_type, str):
+mpi_config = {"mpi_runner": mpi_runner_type}
+else:
+mpi_config = mpi_runner_type
+mpi_runner_obj = self._create_mpi_runner_from_config(mpi_config)
+else:
+mpi_runner_obj = self.mpi_runner_obj or self._create_mpi_runner_from_attr()
+
+mpi_specs = mpi_runner_obj.get_mpi_specs(
task,
num_procs,
num_nodes,
@@ -318,8 +348,8 @@
self.workerID,
)

-mpi_command = self.mpi_runner.mpi_command
-sglaunch = self.mpi_runner.subgroup_launch
+mpi_command = mpi_runner_obj.mpi_command
+sglaunch = mpi_runner_obj.subgroup_launch
runline = launcher.form_command(mpi_command, mpi_specs)

runline.extend(task.app.app_cmd.split())
@@ -328,16 +358,21 @@

task.runline = " ".join(runline) # Allow to be queried

if env_script is not None:
run_cmd = Executor._process_env_script(task, runline, env_script)
else:
run_cmd = runline

if dry_run:
task.dry_run = True
logger.info(f"Test (No submit) Runline: {' '.join(runline)}")
logger.info(f"Test (No submit) Runline: {' '.join(run_cmd)}")
task._set_complete(dry_run=True)
else:
# Set environment variables and launch task
task._implement_env()

# Launch Task
-self._launch_with_retries(task, runline, sglaunch, wait_on_start, env_script)
+self._launch_with_retries(task, sglaunch, wait_on_start, run_cmd)

if not task.timer.timing and not task.finished:
task.timer.start()
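
The new ``mpi_runner_type`` argument above enables a one-off MPI runner for a
single submission, without changing the executor default. A usage sketch
(illustrative only; assumes an application has been registered under the name
"sim_app")::

    from libensemble.executors.mpi_executor import MPIExecutor

    exctr = MPIExecutor()  # default runner detected or set via platform info

    # String form: use srun for this submission only
    task = exctr.submit(app_name="sim_app", num_procs=4, mpi_runner_type="srun")

    # Dict form: detailed configuration (same keys as custom_info)
    task = exctr.submit(
        app_name="sim_app",
        num_procs=4,
        mpi_runner_type={"mpi_runner": "openmpi", "subgroup_launch": True},
    )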
(Diff truncated: 19 more changed files not shown.)