Testing/forces update #1076

Merged: 28 commits, Sep 13, 2023
Commits
ba9ad73
Update forces_simple to use persistent gen
shuds13 Sep 3, 2023
cccffb8
Convert forces_simple to OO interface
shuds13 Sep 4, 2023
9cee243
Convert forces_gpu_persis_gen to OO interface
shuds13 Sep 4, 2023
48cae3e
Add generator uniform_sample_with_var_gpus
shuds13 Sep 6, 2023
6fb64e5
Update test_GPU_variable_resources.py to use uniform_sample_with_var_…
shuds13 Sep 6, 2023
9e1332c
Remove old forces_gpu
shuds13 Sep 6, 2023
32ac45f
Rename forces_gpu_persis_gen to forces_gpu
shuds13 Sep 6, 2023
1b4849e
Improve dir exists warning
shuds13 Sep 6, 2023
3f959ae
Remove var resources lines in forces_gpu and clean scripts
shuds13 Sep 6, 2023
b50343c
Update forces_gpu readme and submission script
shuds13 Sep 6, 2023
24cd717
Add forces_gpu_var_resources test
shuds13 Sep 6, 2023
a5ea2e2
Make forces scripts consistent
shuds13 Sep 6, 2023
99ead25
Fix up forces multi-task
shuds13 Sep 8, 2023
ec9dcb3
Convert multi-task to OO interface
shuds13 Sep 8, 2023
f758e8c
Make app_type optional in uniform_sample_diff_simulations
shuds13 Sep 8, 2023
681dec0
Make multi-task tests more flexible
shuds13 Sep 8, 2023
2003646
Rename forces_multi_task to forces_multi_app
shuds13 Sep 8, 2023
a3948b1
Merge from develop
shuds13 Sep 8, 2023
d067431
Update forces tests to use new executor transfer
shuds13 Sep 8, 2023
40fca16
Update forces tutorial for OO and persistent gen
shuds13 Sep 8, 2023
04dbc9c
Minor docs fixes
shuds13 Sep 8, 2023
f0a31a0
Update forces tutorial sim func and output
shuds13 Sep 11, 2023
5952547
Minor fixes for forces_simple tutorial
shuds13 Sep 11, 2023
a1e6d1b
Update forces_gpu tutorial
shuds13 Sep 11, 2023
6bb2667
Ref. multi-app example in forces_gpu tutorial
shuds13 Sep 11, 2023
d9f8f11
Fix multiapp link
shuds13 Sep 11, 2023
2adf632
Forces tut: Update example output
shuds13 Sep 12, 2023
ca41e61
out to outputs & other minor fixes
shuds13 Sep 13, 2023
384 changes: 227 additions & 157 deletions docs/tutorials/executor_forces_tutorial.rst

Large diffs are not rendered by default.

101 changes: 54 additions & 47 deletions docs/tutorials/forces_gpu_tutorial.rst
@@ -13,26 +13,27 @@ number of particles (allows live GPU usage to be viewed).

In the first example, each worker will be using one GPU. The code will assign the
GPUs available to each worker, using the appropriate method. This works on systems
using nVidia, AMD and intel GPUs.
using **Nvidia**, **AMD**, and **Intel** GPUs without modifying the scripts.

Videos demonstrate running this example on Perlmutter_, Spock_, and Polaris_.
*The first two videos are from an earlier release - you no longer need to change
particle count or modify the `forces.c` file).*
particle count or modify the `forces.c` file.* Also, on Polaris, it is no
longer necessary to change the MPI runner.

Simulation function
-------------------

The ``sim_f`` (``forces_simf.py``) is as follows. The lines that are different
to the forces simple example are highlighted:
to the simple forces example are highlighted:

.. code-block:: python
:linenos:
:emphasize-lines: 29-30, 37
:emphasize-lines: 31-32, 39

import numpy as np

# To retrieve our MPI Executor
from libensemble.executors.executor import Executor
# Optional status codes to display in libE_stats.txt for each gen or sim
from libensemble.message_numbers import TASK_FAILED, WORKER_DONE

# Optional - to print GPU settings
from libensemble.tools.test_support import check_gpu_setting
@@ -44,14 +45,16 @@ to the forces simple example are highlighted:
Assigns one MPI rank to each GPU assigned to the worker.
"""

calc_status = 0

# Parse out num particles, from generator function
particles = str(int(H["x"][0][0]))

# app arguments: num particles, timesteps, also using num particles as seed
args = particles + " " + str(10) + " " + particles

# Retrieve our MPI Executor
exctr = Executor.executor
exctr = libE_info["executor"]

# Submit our forces app for execution.
task = exctr.submit(
@@ -67,29 +70,36 @@ to the forces simple example are highlighted:
# Optional - prints GPU assignment (method and numbers)
check_gpu_setting(task, assert_setting=False, print_setting=True)

# Stat file to check for bad runs
# Try loading final energy reading, set the sim's status
statfile = "forces.stat"
try:
data = np.loadtxt(statfile)
final_energy = data[-1]
calc_status = WORKER_DONE
except Exception:
final_energy = np.nan
calc_status = TASK_FAILED

# Read final energy
data = np.loadtxt(statfile)
final_energy = data[-1]

# Define our output array, populate with energy reading
# Define our output array, populate with energy reading
output = np.zeros(1, dtype=sim_specs["out"])
output["energy"][0] = final_energy
output["energy"] = final_energy

# Return final information to worker, for reporting to manager
return output, persis_info, calc_status

return output

Line 37 simply prints out how the GPUs were assigned. If this is not as desired,
Lines 31-32 tell the executor to use the GPUs assigned to this worker, and
to match processors (MPI ranks) to GPUs.
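
For reference, the collapsed part of the submit call takes roughly this shape
(a sketch only; the keyword names are assumptions based on the description
above, not copied from the file):

.. code-block:: python

    # Sketch: request this worker's assigned GPUs and match MPI ranks to GPUs
    # (keyword names assumed)
    task = exctr.submit(
        app_name="forces",
        app_args=args,
        auto_assign_gpus=True,
        match_procs_to_gpus=True,
    )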

The user can also set ``num_procs`` and ``num_gpus`` in the generator as in
the `forces_gpu_var_resources`_ example, and skip lines 31-32.

Line 39 simply prints out how the GPUs were assigned. If this is not as expected,
a :attr:`platform_specs<libensemble.specs.LibeSpecs.platform_specs>` *libE_specs*
option can be provided in the calling script. Alternatively, for known systems,
the LIBE_PLATFORM environment variable can be set.
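
For example, a minimal calling-script sketch (the import path and field values
here are assumptions and should be adapted to your system):

.. code-block:: python

    from libensemble.resources.platforms import Platform

    # Illustrative values only - describe your own system
    libE_specs["platform_specs"] = Platform(
        mpi_runner="srun",
        cores_per_node=64,
        gpus_per_node=4,
    )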

The user can also set ``num_procs`` and ``num_gpus`` in the generator as in
the `test_GPU_variable_resources.py`_ example.

While this is sufficient for many users, note that it is possible to query
While this is sufficient for most users, note that it is possible to query
the resources assigned to *this* worker (nodes and partitions of nodes),
and use this information however you want.

@@ -153,7 +163,7 @@ and use this information however you want.

return output

The above code will assign a GPU to each worker on CUDA capable systems,
The above code will assign a GPU to each worker on CUDA-capable systems,
so long as the number of workers is chosen to fit the resources.

If you want to have one rank with multiple GPUs, then change source lines 30/31
@@ -206,15 +216,11 @@ Running the example
-------------------

As an example, if you have been allocated two nodes, each with four GPUs, then assign
eight workers. For example::
nine workers (the extra worker runs the persistent generator).

python run_libe_forces.py --comms local --nworkers 8
For example::

Note that if you are running one persistent generator that does not require
resources, then assign nine workers and fix the number of *resource_sets* in
your calling script::

libE_specs["num_resource_sets"] = 8
python run_libe_forces.py --comms local --nworkers 9

See :ref:`zero resource workers<zero_resource_workers>` for more ways to express this.
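
One such way is to fix the number of resource sets in the calling script so
that the persistent generator takes no resources (a sketch assuming nine
workers, as in the regression test further down in this PR):

.. code-block:: python

    # Eight resource sets are shared by the eight simulation workers;
    # the ninth (persistent generator) worker receives no resources
    libE_specs["num_resource_sets"] = 8  # i.e., nworkers - 1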

@@ -228,29 +234,31 @@ forces run.
Varying resources
-----------------

The same code can be used when varying worker resources. In this case, you may
add an integer field called ``resource_sets`` as a ``gen_specs["out"]`` in your
calling script.

In the generator function, assign the ``resource_sets`` field of
:ref:`H<funcguides-history>` for each point generated. For example
if a larger simulation requires two MPI tasks (and two GPUs), set the ``resource_sets``
field to *2* for that sim_id in the generator function.
A variant of this example where you may specify any number of processors
and GPUs for each simulation is given in the `forces_gpu_var_resources`_ example.

The calling script run_libe_forces.py_ contains alternative commented-out lines for
a variable resource example. Search for "Uncomment for var resources"

In this case, the simulator function will work unmodified, assigning one CPU processor
and one GPU to each MPI rank.
In this example, when simulations are parameterized in the generator function,
the ``gen_specs["out"]`` field ``num_gpus`` is set for each simulation (based
on the number of particles). These values will automatically be used for each
simulation (they do not need to be passed as a ``sim_specs["in"]``).
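
The generator-side pattern is essentially the following (condensed from the
``uniform_sample_with_var_gpus`` function added in this PR):

.. code-block:: python

    H_o = np.zeros(b, dtype=gen_specs["out"])
    H_o["x"] = x
    H_o["num_gpus"] = ngpus  # one value per generated point, derived from x
    tag, Work, calc_in = ps.send_recv(H_o)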

Further guidance on varying the resources assigned to workers can be found under the
:doc:`resource manager<../resource_manager/resources_index>` section.

Multiple Applications
---------------------

Another variant of this example, forces_multi_app_, has two applications, one that
uses GPUs, and another that only uses CPUs. The dynamic resource management can
manage both types of resources and assign these to the same nodes concurrently, for
maximum efficiency.
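
A rough sketch of the calling-script side of such a setup (the paths below are
placeholders; the app names match the ``app_type`` values written by the
generator in this PR):

.. code-block:: python

    from libensemble.executors.mpi_executor import MPIExecutor

    exctr = MPIExecutor()
    # The generator sets H_o["app_type"] to "gpu_app" or "cpu_app" per simulation
    exctr.register_app(full_path="/path/to/forces_gpu.x", app_name="gpu_app")
    exctr.register_app(full_path="/path/to/forces_cpu.x", app_name="cpu_app")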

Checking GPU usage
------------------

The output of `forces.x` will say if it has run on the host or device. When running
libEnsemble, this can be found under the ``ensemble`` directory.
libEnsemble, this can be found in the simulation directories (under the ``ensemble``
directory).

You can check that you are running forces on the GPUs as expected by using profiling tools and/or
by using a monitoring utility. For NVIDIA GPUs, for example, the **Nsight** profiler is
@@ -295,12 +303,10 @@ that runs 8 workers on 2 nodes:

export MPICH_GPU_SUPPORT_ENABLED=1
export SLURM_EXACT=1
export SLURM_MEM_PER_NODE=0

python run_libe_forces.py --comms local --nworkers 8
python run_libe_forces.py --comms local --nworkers 9

where ``SLURM_EXACT`` and ``SLURM_MEM_PER_NODE`` are set to prevent
resource conflicts on each node.
where ``SLURM_EXACT`` is set to help prevent resource conflicts on each node.

.. _forces_gpu: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/scaling_tests/forces/forces_gpu
.. _forces.c: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/scaling_tests/forces/forces_app/forces.c
@@ -309,4 +315,5 @@ resource conflicts on each node.
.. _Spock: https://www.youtube.com/watch?v=XHXcslDORjU
.. _Polaris: https://youtu.be/Ff0dYYLQzoU
.. _run_libe_forces.py: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/scaling_tests/forces/forces_gpu/run_libe_forces.py
.. _test_GPU_variable_resources.py: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/regression_tests/test_GPU_variable_resources.py
.. _forces_gpu_var_resources: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/scaling_tests/forces/forces_gpu_var_resources/run_libe_forces.py
.. _forces_multi_app: https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/scaling_tests/forces/forces_multi_app/run_libe_forces.py
51 changes: 50 additions & 1 deletion libensemble/gen_funcs/persistent_sampling_var_resources.py
@@ -2,6 +2,10 @@

Each function generates points uniformly over the domain defined by ``gen_specs["user"]["ub"]``
and ``gen_specs["user"]["lb"]``.

Most functions use a random request of resources over a range, setting num_procs, num_gpus or
resource sets. The function ``uniform_sample_with_var_gpus`` uses the ``x`` value to determine
the number of GPUs requested.
"""

import numpy as np
@@ -56,6 +60,47 @@ def uniform_sample(_, persis_info, gen_specs, libE_info):
return H_o, persis_info, FINISHED_PERSISTENT_GEN_TAG


def uniform_sample_with_var_gpus(_, persis_info, gen_specs, libE_info):
"""
Requests a number of GPUs based on the ``x`` value to be used in the evaluation
of the generated points. By default, simulations will assign one MPI processor
per GPU.

Note that the ``num_gpus`` gen_specs["out"] option (similar to ``num_procs``) does
not need to be passed as a sim_specs["in"]. It will automatically be passed to
simulation functions and used by any MPI Executor unless overridden in the
``executor.submit`` function.

.. seealso::
`test_GPU_variable_resources.py <https://github.com/Libensemble/libensemble/blob/develop/libensemble/tests/regression_tests/test_GPU_variable_resources.py>`_
""" # noqa

b, n, lb, ub = _get_user_params(gen_specs["user"])
rng = persis_info["rand_stream"]
ps = PersistentSupport(libE_info, EVAL_GEN_TAG)
tag = None
max_gpus = gen_specs["user"]["max_gpus"]

while tag not in [STOP_TAG, PERSIS_STOP]:
x = rng.uniform(lb, ub, (b, n))
bucket_size = (ub[0] - lb[0]) / max_gpus

# Determine number of GPUs based on linear split over x range (first dimension).
ngpus = [int((num - lb[0]) / bucket_size) + 1 for num in x[:, 0]]

H_o = np.zeros(b, dtype=gen_specs["out"])
H_o["x"] = x
H_o["num_gpus"] = ngpus

print(f"GEN created {b} sims requiring {ngpus} GPUs", flush=True)

tag, Work, calc_in = ps.send_recv(H_o)
if hasattr(calc_in, "__len__"):
b = len(calc_in)

return H_o, persis_info, FINISHED_PERSISTENT_GEN_TAG


def uniform_sample_with_procs_gpus(_, persis_info, gen_specs, libE_info):
"""
Randomly requests a different number of processors and gpus to be used in the
@@ -137,11 +182,15 @@ def uniform_sample_diff_simulations(_, persis_info, gen_specs, libE_info):
while tag not in [STOP_TAG, PERSIS_STOP]:
H_o = np.zeros(b, dtype=gen_specs["out"])
H_o["x"] = rng.uniform(lb, ub, (b, n))

nprocs = rng.integers(1, gen_specs["user"]["max_procs"] + 1, b)
use_gpus = rng.choice([True, False], b)
H_o["num_procs"] = nprocs
H_o["num_gpus"] = np.where(use_gpus, nprocs, 0)
print(f"GEN created {b} sims requiring {nprocs} procs. Use GPUs {use_gpus}", flush=True)
if "app_type" in H_o.dtype.names:
H_o["app_type"] = np.where(use_gpus, "gpu_app", "cpu_app")

print(f"\nGEN created {b} sims requiring {nprocs} procs. Use GPUs {use_gpus}", flush=True)

tag, Work, calc_in = ps.send_recv(H_o)
if hasattr(calc_in, "__len__"):
1 change: 1 addition & 0 deletions libensemble/manager.py
@@ -226,6 +226,7 @@ def __init__(
raise ManagerException(
"Manager errored on initialization",
"Ensemble directory already existed and wasn't empty.",
"To reuse ensemble dir, set libE_specs['reuse_output_dir'] = True",
e,
)

24 changes: 19 additions & 5 deletions libensemble/tests/regression_tests/test_GPU_variable_resources.py
@@ -30,7 +30,8 @@
from libensemble import Ensemble
from libensemble.alloc_funcs.start_only_persistent import only_persistent_gens as alloc_f
from libensemble.executors.mpi_executor import MPIExecutor
from libensemble.gen_funcs.persistent_sampling_var_resources import uniform_sample_with_procs_gpus as gen_f
from libensemble.gen_funcs.persistent_sampling_var_resources import uniform_sample_with_procs_gpus as gen_f1
from libensemble.gen_funcs.persistent_sampling_var_resources import uniform_sample_with_var_gpus as gen_f2

# Import libEnsemble items for this test
from libensemble.sim_funcs import six_hump_camel
@@ -50,12 +51,13 @@
exctr = MPIExecutor()
exctr.register_app(full_path=six_hump_camel_app, app_name="six_hump_camel")

gpu_test = Ensemble(parse_args=True)
gpu_test = Ensemble(parse_args=True, executor=exctr)
gpu_test.libE_specs = LibeSpecs(
num_resource_sets=gpu_test.nworkers - 1,
resource_info={"cores_on_node": (8, 16), "gpus_on_node": 4},
sim_dirs_make=True,
ensemble_dir_path="./ensemble_GPU_variable_w" + str(gpu_test.nworkers),
reuse_output_dir=True,
)

gpu_test.sim_specs = SimSpecs(
@@ -65,12 +67,12 @@
user={"dry_run": False},
)
gpu_test.gen_specs = GenSpecs(
gen_f=gen_f,
gen_f=gen_f1,
persis_in=["f", "x", "sim_id"],
out=[("num_procs", int), ("num_gpus", int), ("x", float, 2)],
user={
"initial_batch_size": gpu_test.nworkers - 1,
"max_procs": (gpu_test.nworkers - 1) // 2, # Any sim created can req. 1 worker up to max
"max_procs": gpu_test.nworkers - 1, # Any sim created can req. 1 worker up to max
"lb": np.array([-3, -2]),
"ub": np.array([3, 2]),
},
@@ -84,10 +86,22 @@
},
)

# Run with random num_procs/num_gpus for each simulation
gpu_test.persis_info = add_unique_random_streams({}, gpu_test.nworkers + 1)
gpu_test.exit_criteria = ExitCriteria(sim_max=40)
gpu_test.exit_criteria = ExitCriteria(sim_max=20)

gpu_test.run()
if gpu_test.is_manager:
assert gpu_test.flag == 0

# Run with num_gpus based on x[0] for each simulation
gpu_test.gen_specs.gen_f = gen_f2
gpu_test.gen_specs.user["max_gpus"] = gpu_test.nworkers - 1
gpu_test.persis_info = add_unique_random_streams({}, gpu_test.nworkers + 1)
gpu_test.exit_criteria = ExitCriteria(sim_max=20)
gpu_test.run()

if gpu_test.is_manager:
assert gpu_test.flag == 0

gpu_test.save_output(__file__)
@@ -26,7 +26,7 @@

This test must be run with 9 or more workers (8 sim workers), in order
to resource all work units. More generally:
((nworkers - 1) - gpus_on_node) >= gen_specs["user"][max_resource_sets]
((nworkers - 1) - gpus_on_node) >= gen_specs["user"]["max_procs"]

"""

@@ -55,7 +55,13 @@

# Main block is necessary only when using local comms with spawn start method (default on macOS and Windows).
if __name__ == "__main__":
gpu_test = Ensemble(parse_args=True)

# Get paths for applications to run
six_hump_camel_app = six_hump_camel.__file__
exctr = MPIExecutor()
exctr.register_app(full_path=six_hump_camel_app, app_name="six_hump_camel")

gpu_test = Ensemble(parse_args=True, executor=exctr)
nworkers = gpu_test.nworkers
gpu_test.libE_specs = LibeSpecs(
num_resource_sets=gpu_test.nworkers - 1,
@@ -64,11 +70,6 @@
ensemble_dir_path="./ensemble_GPU_variable_multi_task_w" + str(nworkers),
)

# Get paths for applications to run
six_hump_camel_app = six_hump_camel.__file__
exctr = MPIExecutor()
exctr.register_app(full_path=six_hump_camel_app, app_name="six_hump_camel")

gpu_test.sim_specs = SimSpecs(
sim_f=sim_f,
inputs=["x"],