sync with internal AZ version 4.5.11

MolecularAI · Nov 22, 2024 · c3d97b0 · c3d97b0
1 parent f814377
commit c3d97b0
Show file tree

Hide file tree

Showing 134 changed files with 2,663 additions and 737 deletions.
diff --git a/.env b/.env
@@ -1,4 +1,4 @@
 # example dotenv file
 
 # make the scoring components in contrib/ available
-PYTHONPATH=/location/to/REINVENT4/contrib
+#PYTHONPATH=/location/to/REINVENT4/contrib
diff --git a/.gitattributes b/.gitattributes
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,214 @@ This follows the guideline on [keep a changelog](https://keepachangelog.com/)
 - CAZP scoring component
 
 
+## [4.5.11] 2024-11-18
+
+### Changed
+
+- Convert float NaN and inf values to valid JSON format before remote reporting
+
+
+## [4.5.10] 2024-11-16
+
+### Added
+
+- optional tautomer canonicalisation in data pipeline
+
+
+## [4.5.9] 2024-11-07
+
+### Fixed
+
+- read configuration file from stdin
+
+
+## [4.5.8] 2024-11-07
+
+### Changed
+
+- refactor of top level code
+
+
+## [4.5.7] 2024-11-07
+
+### Fixed
+
+- check if DF is set
+
+
+## [4.5.6] 2024-11-05
+
+### Added
+
+- YAML configuration file reader
+
+
+## [4.5.5] 2024-11-05
+
+### Added
+
+- Logging of configuration file absolute path
+
+### Changed
+
+- Automatic configuration file format detection
+
+
+## [4.5.4] 2024-10-28
+
+### Added
+
+- Exponential decay transform
+
+### Fixed
+
+- Ambiguity in parsing optional parameters with multiple endpoints and multiple optional parameters
+
+
+## [4.5.3] 2024-10-23
+
+### Added
+
+- component-level parameters for scoring components
+
+
+## [4.5.2] 2024-10-23
+
+### Added
+
+- executable module: can run `python -m reinvent`
+
+
+## [4.5.1] 2024-10-23
+
+### Added
+
+- SIGUSR1 for controlled termination
+
+
+## [4.5.0] 2024-10-08
+
+### Added
+
+- PepInvent in Sampling and Staged learning mode with example toml config provided 
+- PepInvent prior
+
+
+## [4.4.37] 2024-10-07
+
+### Fixed
+
+- Atom map number removal for Libinvent sampling dropped SMILES
+
+
+## [4.4.36] 2024-09-27
+
+### Added
+
+- Stage number for JSON to remote monitor
+
+### Changed
+
+- Relaxed dependencies
+
+
+## [4.4.35] 2024-09-26
+
+### Added
+
+- Terminate staged learning on SIGTERM and check if running in multiprocessing environment
+
+### Changed
+
+- ValueError for all scoring components such that the staged learning handler can handle failing components
+
+
+## [4.4.34] 2024-09-16
+
+### Fixed
+
+- SMILES in DF memory were wrongly computed
+
+
+## [4.4.33] 2024-09-14
+
+### Fixed
+
+- run-qsartuna.py: convert ndarray to list to make it JSON serializable
+
+
+## [4.4.32] 2024-09-13
+
+### Fixed
+
+- PMI component: check for embedding failure in RDKit's conformer generator
+
+
+## [4.4.31] 2024-09-13
+
+### Fixed
+
+- Dockstream component wrongly quoted the SMILES string
+- Diversity filter setup in config was ignored
+
+
+## [4.4.30] 2024-09-12
+
+### Fixed
+
+- Fixed config reading bug for DF
+
+
+## [4.4.29] 2024-09-05
+
+### Changed
+
+- Changed Molformer sampling valid and unique from percentage to fraction on tensorboard
+
+
+## [4.4.28] 2024-08-29
+
+### Fixed
+
+- Fixed incorrect tanimoto similarity log in Mol2Mol sampling mode
+
+
+## [4.4.27] 2024-07-23
+
+### Fixed
+
+- Corrected typo in Libinvent report
+
+
+## [4.4.26] 2024-07-21
+
+### Fixed
+
+- Report for sampling returned np.array which is incompatible with JSON serialization
+
+
+## [4.4.25] 2024-07-19
+
+### Fixed
+
+- Allowed responder as an optional input in scoring input validation
+
+
+## [4.4.24] 2024-07-19
+
+### Fixed
+
+- Fixed remote for Libinvent
+- Batchsize defaults to 1 for TL
+
+
+## [4.4.23] 2024-07-18
+
+### Fixed
+
+- Added temperature parameter in Sampling and RL config validation 
+
+
 ## [4.4.22] 2024-07-10
 
 ### Fixed
@@ -314,7 +522,7 @@ Various code improvements.
 
 ### Added
 
-- Stages can now defined their own diversity filters.  Global filter always overwrites stage settings.  Currently no mechanism to carry over DF from previous stage, use single stage runs.
+- Stages can now define their own diversity filters.  Global filter always overwrites stage settings.  Currently no mechanism to carry over DF from previous stage, use single stage runs.
 
 
 ## [4.3.11] 2024-04-30

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,23 @@
+New in REINVENT 4.5
+===================
+
+For details see CHANGELOG.md.
+
+* PepINVENT: transformer (SMILES) based peptide generator and prior model
+* Temperature factor parameter (transformer generators) for sampling and RL
+* Support script run-qsartuna.py to play QSARtuna models in external environment
+* Component-level parameters for scoring components
+* Renamed Qptuna scoring component to [QSARtuna](https://github.com/MolecularAI/QSARtuna)
+* Staged learning terminates on SIGTERM (Ctrl-C) and writes out checkpoint file
+* SIGUSR1 for graceful termination of staged learning runs
+* Relaxed dependencies to accommodate installation of other software in the same environment e.g. QSARtuna
+* Updated some dependencies e.g. PyTorch (now at version 2.4.1)
+* New notebook in contrib demoing docking with DockStream and OpenEye
+* YAML configuration file reader
+* Configuration file format is automatically detected from filename extension
+* Various code improvements and fixes
+
+
 New in REINVENT 4.4
 ===================
 

diff --git a/README.md b/README.md
@@ -146,9 +146,9 @@ Unit and Integration Tests
 --------------------------
 
 This is primarily for developers and admins/users who wish to ensure that the
-installation works.  The information here is not relevant to the
-practical use of REINVENT.  Please refer to _Basic Usage_ for instructions on
-how to use the `reinvent` command.
+installation works.  The information here is not relevant to the practical use
+of REINVENT.  Please refer to _Basic Usage_ for instructions on how to use the 
+`reinvent` command.
 
 The REINVENT project uses the `pytest` framework for its tests.  Before you run
 them you first have to create a configuration file for the tests.

diff --git a/configs/json/scoring.json b/configs/json/scoring.json
@@ -1,9 +1,9 @@
 {
     "run_type": "scoring",
-    "output_csv": "scoring.csv",
     "json_out_config": "_scoring.json",
     "parameters": {
-        "smiles_file": "compounds.smi"
+        "smiles_file": "compounds.smi",
+        "output_csv": "scoring.csv"
     },
     "scoring": {
         "type": "geometric_mean",

diff --git a/configs/toml/pepinvent.smi b/configs/toml/pepinvent.smi
@@ -0,0 +1,6 @@
+# Example peptide file for REINVENT4 PepInvent
+#
+# One masked peptide with CHUCKLES representation per line
+# ? for mask
+
+?|N[C@@H](CO)C(=O)|?|N[C@@H](Cc1ccc(O)cc1)C(=O)|N(C)[C@@H]([C@@H](C)O)C(=O)|N[C@H](Cc1c[nH]cn1)C(=O)|N[C@@H](CC(=O)N)C2(=O)
diff --git a/configs/toml/sampling.toml b/configs/toml/sampling.toml
@@ -30,6 +30,13 @@ model_file = "priors/reinvent.prior"
 #temperature = 1.0 # temperature in multinomial sampling
 #tb_logdir = "tb_logs"  # name of the TensorBoard logging directory
 
+## Pepinvent
+#model_file = "priors/pepinvent.prior"
+#smiles_file = "pepinvent.smi"
+#sample_strategy = "beamsearch"  # multinomial or beamsearch (deterministic)
+#temperature = 1.0 # temperature in multinomial sampling
+#tb_logdir = "tb_logs"  # name of the TensorBoard logging directory
+
 output_file = 'sampling.csv'  # sampled SMILES and NLL in CSV format
 
 num_smiles = 157  # number of SMILES to be sampled, 1 per input SMILES

diff --git a/configs/toml/staged_learning.toml b/configs/toml/staged_learning.toml
@@ -45,6 +45,13 @@ agent_file = "priors/reinvent.prior"
 #sample_strategy = "multinomial"  # multinomial or beamsearch (deterministic)
 #distance_threshold = 100
 
+## Pepinvent
+#prior_file = "priors/pepinvent.prior"
+#agent_file = "priors/pepinvent.prior"
+#smiles_file = "pepinvent.smi"
+#sample_strategy = "multinomial"  # multinomial or beamsearch (deterministic)
+#distance_threshold = 100
+
 batch_size = 64          # network
 
 unique_sequences = true  # if true remove all duplicate raw sequences in each step

diff --git a/contrib/tests/reinvent_plugins/unit_tests/components/test_comp_unwanted_substructures.py b/contrib/tests/reinvent_plugins/unit_tests/components/test_comp_unwanted_substructures.py
@@ -2,8 +2,8 @@
 
 import numpy as np
 
-from reinvent_plugins.components.comp_unwanted_substructures import Parameters
-from reinvent_plugins.components.comp_unwanted_substructures import UnwantedSubstructures
+from reinvent_plugins.components.RDKit_extra.comp_unwanted_substructures import Parameters
+from reinvent_plugins.components.RDKit_extra.comp_unwanted_substructures import UnwantedSubstructures
 
 SMILIES = [
     "CC1=C(C=C(C=C1)N2C(=O)C(=C(N2)C)N=NC3=CC=CC(=C3O)C4=CC(=CC=C4)C(=O)O)C", # Eltrombopag

diff --git a/pyproject.toml b/pyproject.toml
@@ -47,9 +47,8 @@ dependencies = [
   "mmpdb >=2.1,<3",
   "molvs >=0.1.1,<0.2",
   "numpy >=1.21,<2",
-  "OpenEye-toolkits >=2022",  # Requires --extra-index-url=https://pypi.anaconda.org/OpenEye/simple
+  "openEye-toolkits >=2022",  # Requires --extra-index-url=https://pypi.anaconda.org/OpenEye/simple
   "pandas >=2,<3",
-  "pathos >=0.3.0,<2",
   "Pillow >=10.0,<11.0",
   "pydantic >=2,<3",
   "pytest >=8,<9",
@@ -59,20 +58,18 @@ dependencies = [
   "rdkit >=2021.0",
   "requests >=2.28,<3",
   "requests_mock >=1.10,<2",
-  "scikit-learn==1.2.2",
-  "scipy >=1.10,<2",
   "tenacity >=8.2,<9",
-  "tensorboard",
+  "tensorboard >=2,<3",
   "tomli >=2.0,<3",
-  "torch==2.3.1+cu121", # Requires --extra-index-url https://download.pytorch.org/whl/cu121
-  "torchvision==0.18.1+cu121",  # Needed to log molecular images to Tensorboard.
+  "torch==2.5.1+cu124", # Requires --extra-index-url https://download.pytorch.org/whl/cu124
   "tqdm >=4.64,<5",
   "typing_extensions >=4.0,<5",
   "xxhash >=3,<4",
 ]
 
 [project.scripts]
-reinvent = "reinvent.Reinvent:main"
+reinvent = "reinvent.Reinvent:main_script"
+reinvent_datapre = "reinvent.datapipeline.preprocess:main_script"
 
 
 # FIXME: change urls for public release.