fix: fixed enzyme optimization with Kcat fitness function
Signed-off-by: yvesnana <[email protected]>
yvesnana committed Apr 24, 2024
1 parent 510123b commit e73bac8
Showing 2 changed files with 56 additions and 17 deletions.
63 changes: 51 additions & 12 deletions examples/enzeptional/example_enzeptional.py
@@ -1,13 +1,20 @@
import logging
import pandas as pd
from typing import Tuple, List, Optional
from gt4sd.frameworks.enzeptional.processing import HFandTAPEModelUtility
from gt4sd.frameworks.enzeptional.core import SequenceMutator, EnzymeOptimizer
from gt4sd.configuration import GT4SDConfiguration, sync_algorithm_with_s3


def initialize_environment(model = "feasibility"):
"""Synchronize with GT4SD S3 storage and set up the environment."""
# NOTE: To optimize kcat values instead, point the scorer path at the kcat model: f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/kcat/model.pkl". The matching scaler, located in the same directory as the scorer, must also be specified for the model to perform correctly.
def initialize_environment(model = "feasibility") -> Tuple[str, Optional[str]]:
"""Synchronize with GT4SD S3 storage and set up the environment.
Args:
model (str): Type of optimization ("feasibility" or "kcat").
Returns:
Tuple[str, Optional[str]]: Paths to the scorer file and, if available, the scaler file.
"""
configuration = GT4SDConfiguration.get_instance()
sync_algorithm_with_s3("proteins/enzeptional/scorers", module="properties")
name = model.lower()
@@ -17,16 +24,39 @@ def initialize_environment(model = "feasibility"):
return f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/{name}/model.pkl", None


def load_experiment_parameters():
def load_experiment_parameters() -> Tuple[List, List, List, List]:
"""Load experiment parameters from a CSV file."""
df = pd.read_csv("data.csv").iloc[1]
return df["substrates"], df["products"], df["sequences"], eval(df["intervals"])


def setup_optimizer(
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, fitness_kcat
substrate_smiles: str,
product_smiles: str,
sample_sequence: str,
intervals: List[List[int]],
scorer_path: str,
scaler_path: str,
concat_order: List[str],
use_xgboost_scorer: bool
):
"""Set up and return the optimizer with all necessary components configured."""
"""Set up and return the optimizer with all necessary components configured
Args:
substrate_smiles (str): SMILES representation of
the substrate.
product_smiles (str): SMILES representation of the
product.
sample_sequence (str): The initial protein sequence.
intervals (List[List[int]]): Intervals for mutation.
scorer_path (str): File path to the scoring model.
scaler_path (str): Path to the scaler, required when using the Kcat model.
concat_order (List[str]): Order of concatenating embeddings.
use_xgboost_scorer (bool): Flag to specify whether the Kcat (XGBoost) scorer is used as the fitness function.
Returns:
Initialized optimizer.
"""
model_tokenizer_paths = "facebook/esm2_t33_650M_UR50D"
chem_paths = "seyonec/ChemBERTa-zinc-base-v1"

@@ -58,47 +88,56 @@ def setup_optimizer(
"crossover_type": "single_point",
"concat_order": concat_order,
"scaler_filepath": scaler_path,
"fitness_kcat": fitness_kcat
"use_xgboost_scorer": use_xgboost_scorer
}
return EnzymeOptimizer(**optimizer_config)


def optimize_sequences(optimizer):
"""Optimize sequences using the configured optimizer."""
"""Optimize sequences using the configured optimizer.
Args:
optimizer: Initialized optimizer
Returns:
Optimized sequences
"""
return optimizer.optimize(
num_iterations=3, num_sequences=5, num_mutations=5, time_budget=3600
)


def main_kcat():
"""Optimization using Kcat model"""
logging.basicConfig(level=logging.INFO)
scorer_path, scaler_path = initialize_environment(model="kcat")
concat_order, fitness_kcat = ["substrate", "sequence"], True
concat_order, use_xgboost_scorer = ["substrate", "sequence"], True
(
substrate_smiles,
product_smiles,
sample_sequence,
intervals,
) = load_experiment_parameters()
optimizer = setup_optimizer(
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, fitness_kcat
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, use_xgboost_scorer
)
optimized_sequences, iteration_info = optimize_sequences(optimizer)
logging.info("Optimization completed.")


def main_feasibility():
"""Optimization using Feasibility model"""
logging.basicConfig(level=logging.INFO)
scorer_path, scaler_path = initialize_environment()
concat_order, fitness_kcat = ["substrate", "sequence", "product"], False
concat_order, use_xgboost_scorer = ["substrate", "sequence", "product"], False
(
substrate_smiles,
product_smiles,
sample_sequence,
intervals,
) = load_experiment_parameters()
optimizer = setup_optimizer(
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, fitness_kcat
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, use_xgboost_scorer
)
optimized_sequences, iteration_info = optimize_sequences(optimizer)
logging.info("Optimization completed.")
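For readers who want to try the kcat path described in the NOTE above, a minimal driver sketch follows. It only strings together the example's own helpers; the importable module name `example_enzeptional` and the local `data.csv` layout (columns "substrates", "products", "sequences", "intervals") are assumptions based on the example file, not part of this commit.

# Minimal sketch: run the kcat variant end-to-end by reusing the example's helpers.
import logging

from example_enzeptional import (
    initialize_environment,
    load_experiment_parameters,
    optimize_sequences,
    setup_optimizer,
)

logging.basicConfig(level=logging.INFO)

# Fetch the kcat scorer and its scaler from the GT4SD cache.
scorer_path, scaler_path = initialize_environment(model="kcat")
substrate_smiles, product_smiles, sample_sequence, intervals = load_experiment_parameters()

optimizer = setup_optimizer(
    substrate_smiles,
    product_smiles,
    sample_sequence,
    intervals,
    scorer_path,
    scaler_path,
    ["substrate", "sequence"],  # kcat model concatenates substrate and sequence embeddings only
    True,                       # use_xgboost_scorer=True selects the XGBoost kcat scorer
)

optimized_sequences, iteration_info = optimize_sequences(optimizer)
logging.info("Optimized sequences: %s", optimized_sequences)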
10 changes: 5 additions & 5 deletions src/gt4sd/frameworks/enzeptional/core.py
@@ -392,7 +392,7 @@ def __init__(
pad_intervals: bool = False,
concat_order=["sequence", "substrate", "product"],
scaler_filepath: Optional[str] = None,
fitness_kcat: Optional[bool] = False,
use_xgboost_scorer: Optional[bool] = False,
):
"""
Initializes the optimizer with models, sequences, and
@@ -423,7 +423,7 @@ def __init__(
pad_intervals (bool): Flag to pad the intervals.
concat_order (list): Order of concatenating embeddings.
scaler_filepath (str): Path to the scaler, required when using the Kcat model.
fitness_kcat (bool): flag to specify if the fitness function is the Kcat.
use_xgboost_scorer (bool): Flag to specify whether the Kcat (XGBoost) scorer is used as the fitness function.
"""
self.sequence = sequence
self.protein_model = protein_model
@@ -444,7 +444,7 @@ def __init__(
self.scorer = load(scorer_filepath)
if scaler_filepath is not None:
self.scaler = load(scaler_filepath)
self.fitness_kcat = fitness_kcat
self.use_xgboost_scorer = use_xgboost_scorer

# Initialize chem_model for SMILES embeddings
self.chem_model = HFandTAPEModelUtility(chem_model_path, chem_tokenizer_path)
@@ -624,7 +624,7 @@ def score_sequence(self, sequence: str) -> Dict[str, Any]:
combined_embedding = np.concatenate(ordered_embeddings)
combined_embedding = combined_embedding.reshape(1, -1)

if self.fitness_kcat:
if self.use_xgboost_scorer:
if self.scaler is not None:
combined_embedding = self.scaler.transform(combined_embedding)
score = self.scorer.predict(xgb.DMatrix(combined_embedding))[0]
@@ -660,7 +660,7 @@ def score_sequences(self, sequences: List[str]) -> List[Dict[str, float]]:
combined_embedding = np.concatenate(ordered_embeddings)
combined_embedding = combined_embedding.reshape(1, -1)

if self.fitness_kcat:
if self.use_xgboost_scorer:
if self.scaler is not None:
combined_embedding = self.scaler.transform(combined_embedding)
score = self.scorer.predict(xgb.DMatrix(combined_embedding))[0]
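The renamed `use_xgboost_scorer` flag only changes how the combined embedding is turned into a score. The sketch below isolates that branching into a hypothetical helper: the XGBoost path (optional scaler transform followed by an `xgb.DMatrix` prediction) mirrors the lines shown above, while the fallback branch is an assumption, treating the feasibility scorer as a scikit-learn-style classifier, which this diff does not show.

# Hypothetical helper mirroring the scoring branch in EnzymeOptimizer.score_sequence.
# The XGBoost branch follows the diff above; the fallback branch is an assumption
# (a scikit-learn-style classifier exposing predict_proba).
import numpy as np
import xgboost as xgb


def score_embedding(scorer, combined_embedding: np.ndarray, use_xgboost_scorer: bool, scaler=None) -> float:
    combined_embedding = combined_embedding.reshape(1, -1)
    if use_xgboost_scorer:
        # Kcat scorer: optionally rescale the embedding, then predict via an XGBoost Booster.
        if scaler is not None:
            combined_embedding = scaler.transform(combined_embedding)
        return float(scorer.predict(xgb.DMatrix(combined_embedding))[0])
    # Feasibility scorer (assumed): probability of the positive class.
    return float(scorer.predict_proba(combined_embedding)[0][1])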
