Merge branch 'mongodb_hyperopt' into hyperopt_loss_on_fk-refactor
APJansen committed Feb 23, 2024
2 parents 8d24ed8 + 08e16c4 commit 22f6394
Showing 13 changed files with 525 additions and 21 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/python_installation.yml
@@ -34,6 +34,11 @@ jobs:
conda config --append channels conda-forge
conda config --set show_channel_urls true
conda install lhapdf pandoc
- name: Install MongoDB for parallel hyperopts
shell: bash -l {0}
run: |
conda install mongodb
mongod --version
- name: Install nnpdf with testing and qed extras
shell: bash -l {0}
run: |
5 changes: 4 additions & 1 deletion conda-recipe/meta.yaml
@@ -32,6 +32,8 @@ requirements:
- psutil # to ensure n3fit affinity is with the right processors
- blas==1.0 *mkl* # [osx] # Host's blas is mkl, force also runtime blas to be
- hyperopt
- mongodb
- pymongo <4
- seaborn
- lhapdf
- sqlite
@@ -57,7 +59,8 @@ requirements:
- pineappl >=0.6.2
- eko >=0.14.1
- fiatlux
- curio >=1.0 # reportengine uses it but it's not in its dependencies
- frozendict # needed for caching of data loading
- curio >=1.0 # reportengine uses it but it's not in its dependencies

test:
requires:
41 changes: 41 additions & 0 deletions doc/sphinx/source/n3fit/hyperopt.rst
@@ -386,3 +386,44 @@ To achieve this, you can use the ``--restart`` option within the ``n3fit`` comma
The above command example is effective when the number of saved trials in the ``test_run/nnfit/replica_1/tries.pkl`` is
less than ``20``. If there are ``20`` or more saved trials, ``n3fit`` will simply terminate, displaying the best results.
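The serial restart mechanism relies on the trial history being serialized to disk and reloaded on the next run. A minimal sketch of that idea, using a plain stand-in class with hypothetical names rather than the real ``FileTrials`` API:

```python
import pickle
import tempfile
from pathlib import Path


class ToyTrials:
    """Stand-in for a trials object: stores completed trial results."""

    def __init__(self):
        self.results = []

    def to_pkl(self, path):
        # Persist the full trial history so a later run can resume from it.
        with open(path, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def from_pkl(cls, path):
        # Restore the saved state instead of starting from scratch.
        with open(path, "rb") as f:
            return pickle.load(f)


# Simulate a run of 3 trials, save, and "restart".
trials = ToyTrials()
trials.results = [{"loss": loss} for loss in (2.1, 1.7, 1.9)]
pkl = Path(tempfile.mkdtemp()) / "tries.pkl"
trials.to_pkl(pkl)

resumed = ToyTrials.from_pkl(pkl)
print(len(resumed.results))  # 3 trials already done; only the remainder would run
```

With 3 of 20 requested trials already stored, a restarted scan would only evaluate the missing 17; with 20 or more stored, there is nothing left to do, which is why ``n3fit`` terminates immediately in that case.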


Running hyperoptimizations in parallel with MongoDB
---------------------------------------------------

In NNPDF, you can effectively run hyperoptimization experiments in parallel using `MongoDB <https://www.mongodb.com>`_.
This functionality is provided by the :class:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials` class,
which extends the capabilities of `hyperopt <https://github.com/hyperopt/hyperopt>`_'s `MongoTrials` and enables the
simultaneous evaluation of multiple trials.

To set up and run a parallelized hyperopt search, follow these steps:

1. **Start the MongoDB database:** Begin by setting up the database in your current directory.
   This database is referred to as ``hyperopt-db`` in the following instructions. You can initiate it with the command:

   .. code-block:: bash

      mongod --dbpath ./hyperopt-db

   By default, ``mongod`` listens on port ``27017``, which is also the default port for the ``n3fit --db-port`` option.
   If you wish to use a different port, pass it to ``mongod`` via its ``--port`` option
   (``mongod --dbpath ./hyperopt-db --port YOUR_PORT_NUMBER``) and give the same value to ``n3fit --db-port``.

2. **Launch NNPDF with MongoDB integration:** Open a new terminal and run ``n3fit`` with the desired configuration:

   .. code-block:: bash

      n3fit hyper-quickcard.yml 1 -r N_replicas --hyperopt N_trials --parallel-hyperopt --num-mongo-workers N

   Here, ``N`` is the number of MongoDB workers to launch in parallel.
   Each mongo worker handles one hyperopt trial at a time, so launching more workers allows more trials to be evaluated simultaneously.
   Note that there is no need to launch mongo workers manually, as the ``hyperopt-mongo-worker`` command is automatically
   executed by the :meth:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials.start_mongo_workers` method.
   By default, the ``host`` argument is set to ``localhost`` and the database is named ``hyperopt``;
   if necessary, these settings can be changed with the ``n3fit --db-host`` and ``n3fit --db-name`` options.
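Putting the two steps together, a full session on a non-default port might look as follows (a command sketch: ``hyper-quickcard.yml``, the port number, and the worker count are placeholder values, and the two commands run in separate terminals):

```shell
# Terminal 1: start the database on a custom port (mongod's own flag is --port)
mongod --dbpath ./hyperopt-db --port 27018

# Terminal 2: point n3fit at the same port; 4 mongo workers evaluate
# up to 4 trials simultaneously
n3fit hyper-quickcard.yml 1 -r 2 --hyperopt 100 \
      --parallel-hyperopt --num-mongo-workers 4 --db-port 27018
```

The key constraint is that the port given to ``mongod`` and the one given to ``n3fit --db-port`` must match, otherwise the workers cannot reach the trials database.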


.. note::

   Unlike in serial execution, parallel hyperoptimization runs do not generate ``tries.pkl`` files.
   To resume an experiment, simply retain the MongoDB database created during your previous run,
   then follow steps 1 and 2 as described above to restart the experiment.
1 change: 1 addition & 0 deletions n3fit/src/n3fit/backends/__init__.py
@@ -15,6 +15,7 @@
)
from n3fit.backends.keras_backend.internal_state import (
clear_backend_state,
get_physical_gpus,
set_eager,
set_initial_state,
)
11 changes: 11 additions & 0 deletions n3fit/src/n3fit/backends/keras_backend/internal_state.py
@@ -143,3 +143,14 @@ def set_initial_state(debug=False, external_seed=None, max_cores=None, double_pr
# Once again, if in debug mode or external_seed set, set also the TF seed
if debug or external_seed:
tf.random.set_seed(use_seed)


def get_physical_gpus():
    """
    Retrieve a list of all physical GPU devices available on the system.

    Returns
    -------
    list
        A list of TensorFlow physical devices of type 'GPU'.
    """
    return tf.config.list_physical_devices('GPU')
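One natural use of such a helper in parallel runs is deciding how to spread workers over the available devices. A hypothetical round-robin assignment, with a plain list standing in for the device list that ``tf.config.list_physical_devices('GPU')`` would return (so the sketch runs without TensorFlow):

```python
def assign_workers_to_gpus(num_workers, gpus):
    """Map each worker index to a device in round-robin fashion.

    `gpus` stands in for the list returned by a helper such as
    get_physical_gpus(); an empty list means CPU-only execution.
    """
    if not gpus:
        # No GPU available: every worker falls back to CPU.
        return {w: None for w in range(num_workers)}
    return {w: gpus[w % len(gpus)] for w in range(num_workers)}


# Example: 4 workers sharing 2 (mock) GPU devices.
mapping = assign_workers_to_gpus(4, ["GPU:0", "GPU:1"])
print(mapping)  # {0: 'GPU:0', 1: 'GPU:1', 2: 'GPU:0', 3: 'GPU:1'}
```

This is only an illustration of why a GPU-listing helper is useful when several mongo workers share one machine; the actual device placement in n3fit is not shown in this diff.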
59 changes: 48 additions & 11 deletions n3fit/src/n3fit/hyper_optimization/hyper_scan.py
@@ -21,6 +21,7 @@

from n3fit.backends import MetaLayer, MetaModel
from n3fit.hyper_optimization.filetrials import FileTrials
from n3fit.hyper_optimization.mongofiletrials import MongoFileTrials

log = logging.getLogger(__name__)

@@ -120,28 +121,53 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
"""
# Tell the trainer we are doing hyperopt
model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys)

# Generate the trials object
trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict())
if hyperscanner.parallel_hyperopt:
# Instantiate `MongoFileTrials`
# Mongo database should have already been initiated at this point
trials = MongoFileTrials(
replica_path_set,
db_host=hyperscanner.db_host,
db_port=hyperscanner.db_port,
db_name=hyperscanner.db_name,
num_workers=hyperscanner.num_mongo_workers,
parameters=hyperscanner.as_dict(),
)
else:
# Instantiate `FileTrials`
trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict())

# Initialize seed for hyperopt
trials.rstate = np.random.default_rng(HYPEROPT_SEED)

# For restarts, reset the state of `FileTrials` saved in the pickle file
if hyperscanner.restart_hyperopt:
pickle_file_to_load = f"{replica_path_set}/tries.pkl"
log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load)
trials = FileTrials.from_pkl(pickle_file_to_load)

# Perform the scan
best = hyperopt.fmin(
fn=_status_wrapper(model_trainer.hyperparametrizable),
# For parallel hyperopt restarts, extract the database tar file
if hyperscanner.parallel_hyperopt:
log.info("Restarting hyperopt run using the MongoDB database %s", trials.db_name)
trials.extract_mongodb_database()
else:
# For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file
pickle_file_to_load = f"{replica_path_set}/tries.pkl"
log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load)
trials = FileTrials.from_pkl(pickle_file_to_load)

# Call to hyperopt.fmin
fmin_args = dict(
fn=model_trainer.hyperparametrizable,
space=hyperscanner.as_dict(),
algo=hyperopt.tpe.suggest,
max_evals=max_evals,
show_progressbar=False,
trials=trials,
rstate=trials.rstate,
trials_save_file=trials.pkl_file,
)
if hyperscanner.parallel_hyperopt:
trials.start_mongo_workers()
best = hyperopt.fmin(**fmin_args, show_progressbar=True, max_queue_len=trials.num_workers)
trials.stop_mongo_workers()
trials.compress_mongodb_database()
else:
best = hyperopt.fmin(**fmin_args, show_progressbar=False, trials_save_file=trials.pkl_file)
return hyperscanner.space_eval(best)
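The branching above can be condensed into a small pure function that shows which ``hyperopt.fmin`` arguments differ between the two modes — here with plain values instead of the real ``FileTrials``/``MongoFileTrials`` objects (the function name is a hypothetical stand-in, not part of the codebase):

```python
def build_fmin_kwargs(parallel, num_workers=None, pkl_file=None):
    """Return the mode-dependent keyword arguments for hyperopt.fmin.

    Mirrors the branching in hyper_scan_wrapper: parallel runs enable the
    progress bar and bound the queue by the number of mongo workers, while
    serial runs persist trial state to a pickle file instead.
    """
    kwargs = {"show_progressbar": parallel}
    if parallel:
        kwargs["max_queue_len"] = num_workers
    else:
        kwargs["trials_save_file"] = pkl_file
    return kwargs


print(build_fmin_kwargs(True, num_workers=4))
print(build_fmin_kwargs(False, pkl_file="tries.pkl"))
```

In the real wrapper these keywords are merged with the shared ``fmin_args`` (objective, search space, TPE algorithm, ``max_evals``), and the parallel branch additionally starts and stops the mongo workers around the ``fmin`` call.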


@@ -213,6 +239,17 @@ def __init__(self, parameters, sampling_dict, steps=5):
restart_config = sampling_dict.get("restart")
self.restart_hyperopt = True if restart_config else False

# adding extra options for parallel execution
parallel_config = sampling_dict.get("parallel")
self.parallel_hyperopt = True if parallel_config else False

# setting up MongoDB options
if self.parallel_hyperopt:
self.db_host = sampling_dict.get("db_host")
self.db_port = sampling_dict.get("db_port")
self.db_name = sampling_dict.get("db_name")
self.num_mongo_workers = sampling_dict.get("num_mongo_workers")

self.hyper_keys = set([])

if "parameters" in sampling_dict:
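The option handling added to ``__init__`` boils down to reading keys from the sampling dictionary, with the MongoDB settings only looked up when parallel mode is requested. A standalone sketch of the same pattern (a plain function with a hypothetical name instead of the real class):

```python
def parse_parallel_options(sampling_dict):
    """Extract parallel-hyperopt settings, mirroring the __init__ above."""
    opts = {"parallel_hyperopt": bool(sampling_dict.get("parallel"))}
    if opts["parallel_hyperopt"]:
        # MongoDB connection settings are only relevant in parallel mode
        for key in ("db_host", "db_port", "db_name", "num_mongo_workers"):
            opts[key] = sampling_dict.get(key)
    return opts


cfg = parse_parallel_options(
    {
        "parallel": True,
        "db_host": "localhost",
        "db_port": 27017,
        "db_name": "hyperopt",
        "num_mongo_workers": 4,
    }
)
print(cfg["num_mongo_workers"])  # 4
```

Using ``dict.get`` means any missing key quietly becomes ``None``, matching the behaviour of the diffed code, where defaults are supplied elsewhere via the ``n3fit`` command-line options.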
