fix: fixed enzyme optimization with Kcat fitness function
Signed-off-by: yvesnana <[email protected]>
yvesnana committed Apr 24, 2024
1 parent 510123b commit e73bac8
Showing 2 changed files with 56 additions and 17 deletions.
63 changes: 51 additions & 12 deletions examples/enzeptional/example_enzeptional.py
@@ -1,13 +1,20 @@
import logging
import pandas as pd
from typing import Tuple, List, Optional
from gt4sd.frameworks.enzeptional.processing import HFandTAPEModelUtility
from gt4sd.frameworks.enzeptional.core import SequenceMutator, EnzymeOptimizer
from gt4sd.configuration import GT4SDConfiguration, sync_algorithm_with_s3


def initialize_environment(model = "feasibility"):
"""Synchronize with GT4SD S3 storage and set up the environment."""
# NOTE: To optimize kcat values instead, point the scorer path at the kcat model: f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/kcat/model.pkl". The matching scaler, located in the same directory as the scorer, must also be specified for the model to perform correctly.
def initialize_environment(model = "feasibility") -> Tuple[str, Optional[str]]:
"""Synchronize with GT4SD S3 storage and set up the environment.
Args:
model (str): Type of optimization ("feasibility" or "kcat").
Returns:
Tuple[str, Optional[str]]: Paths to the scorer file and, if available, the scaler file.
"""
configuration = GT4SDConfiguration.get_instance()
sync_algorithm_with_s3("proteins/enzeptional/scorers", module="properties")
name = model.lower()
@@ -17,16 +24,39 @@ def initialize_environment(model = "feasibility"):
return f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/{name}/model.pkl", None


def load_experiment_parameters():
def load_experiment_parameters() -> Tuple[List, List, List, List]:
"""Load experiment parameters from a CSV file."""
df = pd.read_csv("data.csv").iloc[1]
return df["substrates"], df["products"], df["sequences"], eval(df["intervals"])


def setup_optimizer(
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, fitness_kcat
substrate_smiles: str,
product_smiles: str,
sample_sequence: str,
intervals: List[List[int]],
scorer_path: str,
scaler_path: str,
concat_order: List[str],
use_xgboost_scorer: bool
):
"""Set up and return the optimizer with all necessary components configured."""
"""Set up and return the optimizer with all necessary components configured
Args:
substrate_smiles (str): SMILES representation of
the substrate.
product_smiles (str): SMILES representation of the
product.
sample_sequence (str): The initial protein sequence.
intervals (List[List[int]]): Intervals for mutation.
scorer_path (str): File path to the scoring model.
scaler_path (str): Path to the scaler, required when using the Kcat model.
concat_order (List[str]): Order of concatenating embeddings.
use_xgboost_scorer (bool): Flag to specify whether the Kcat (XGBoost) scorer is used as the fitness function.
Returns:
Initialized optimizer.
"""
model_tokenizer_paths = "facebook/esm2_t33_650M_UR50D"
chem_paths = "seyonec/ChemBERTa-zinc-base-v1"

@@ -58,47 +88,56 @@ def setup_optimizer(
"crossover_type": "single_point",
"concat_order": concat_order,
"scaler_filepath": scaler_path,
"fitness_kcat": fitness_kcat
"use_xgboost_scorer": use_xgboost_scorer
}
return EnzymeOptimizer(**optimizer_config)


def optimize_sequences(optimizer):
"""Optimize sequences using the configured optimizer."""
"""Optimize sequences using the configured optimizer.
Args:
optimizer: Initialized optimizer
Returns:
Optimized sequences
"""
return optimizer.optimize(
num_iterations=3, num_sequences=5, num_mutations=5, time_budget=3600
)


def main_kcat():
"""Optimization using Kcat model"""
logging.basicConfig(level=logging.INFO)
scorer_path, scaler_path = initialize_environment(model="kcat")
concat_order, fitness_kcat = ["substrate", "sequence"], True
concat_order, use_xgboost_scorer = ["substrate", "sequence"], True
(
substrate_smiles,
product_smiles,
sample_sequence,
intervals,
) = load_experiment_parameters()
optimizer = setup_optimizer(
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, fitness_kcat
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, use_xgboost_scorer
)
optimized_sequences, iteration_info = optimize_sequences(optimizer)
logging.info("Optimization completed.")


def main_feasibility():
"""Optimization using Feasibility model"""
logging.basicConfig(level=logging.INFO)
scorer_path, scaler_path = initialize_environment()
concat_order, fitness_kcat = ["substrate", "sequence", "product"], False
concat_order, use_xgboost_scorer = ["substrate", "sequence", "product"], False
(
substrate_smiles,
product_smiles,
sample_sequence,
intervals,
) = load_experiment_parameters()
optimizer = setup_optimizer(
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, fitness_kcat
substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path, scaler_path, concat_order, use_xgboost_scorer
)
optimized_sequences, iteration_info = optimize_sequences(optimizer)
logging.info("Optimization completed.")
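For readers who want to try the kcat path described in the NOTE above, a minimal driver sketch follows. It only strings together the example's own helpers; the importable module name `example_enzeptional` and the local `data.csv` layout (columns "substrates", "products", "sequences", "intervals") are assumptions based on the example file, not part of this commit.

# Minimal sketch: run the kcat variant end-to-end by reusing the example's helpers.
import logging

from example_enzeptional import (
    initialize_environment,
    load_experiment_parameters,
    optimize_sequences,
    setup_optimizer,
)

logging.basicConfig(level=logging.INFO)

# Fetch the kcat scorer and its scaler from the GT4SD cache.
scorer_path, scaler_path = initialize_environment(model="kcat")
substrate_smiles, product_smiles, sample_sequence, intervals = load_experiment_parameters()

optimizer = setup_optimizer(
    substrate_smiles,
    product_smiles,
    sample_sequence,
    intervals,
    scorer_path,
    scaler_path,
    ["substrate", "sequence"],  # kcat model concatenates substrate and sequence embeddings only
    True,                       # use_xgboost_scorer=True selects the XGBoost kcat scorer
)

optimized_sequences, iteration_info = optimize_sequences(optimizer)
logging.info("Optimized sequences: %s", optimized_sequences)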
10 changes: 5 additions & 5 deletions src/gt4sd/frameworks/enzeptional/core.py
@@ -392,7 +392,7 @@ def __init__(
pad_intervals: bool = False,
concat_order=["sequence", "substrate", "product"],
scaler_filepath: Optional[str] = None,
fitness_kcat: Optional[bool] = False,
use_xgboost_scorer: Optional[bool] = False,
):
"""
Initializes the optimizer with models, sequences, and
@@ -423,7 +423,7 @@ def __init__(
pad_intervals (bool): Flag to pad the intervals.
concat_order (list): Order of concatenating embeddings.
scaler_filepath (str): Path to the scaler, required when using the Kcat model.
fitness_kcat (bool): flag to specify if the fitness function is the Kcat.
use_xgboost_scorer (bool): Flag to specify whether the Kcat (XGBoost) scorer is used as the fitness function.
"""
self.sequence = sequence
self.protein_model = protein_model
@@ -444,7 +444,7 @@ def __init__(
self.scorer = load(scorer_filepath)
if scaler_filepath is not None:
self.scaler = load(scaler_filepath)
self.fitness_kcat = fitness_kcat
self.use_xgboost_scorer = use_xgboost_scorer

# Initialize chem_model for SMILES embeddings
self.chem_model = HFandTAPEModelUtility(chem_model_path, chem_tokenizer_path)
@@ -624,7 +624,7 @@ def score_sequence(self, sequence: str) -> Dict[str, Any]:
combined_embedding = np.concatenate(ordered_embeddings)
combined_embedding = combined_embedding.reshape(1, -1)

if self.fitness_kcat:
if self.use_xgboost_scorer:
if self.scaler is not None:
combined_embedding = self.scaler.transform(combined_embedding)
score = self.scorer.predict(xgb.DMatrix(combined_embedding))[0]
@@ -660,7 +660,7 @@ def score_sequences(self, sequences: List[str]) -> List[Dict[str, float]]:
combined_embedding = np.concatenate(ordered_embeddings)
combined_embedding = combined_embedding.reshape(1, -1)

if self.fitness_kcat:
if self.use_xgboost_scorer:
if self.scaler is not None:
combined_embedding = self.scaler.transform(combined_embedding)
score = self.scorer.predict(xgb.DMatrix(combined_embedding))[0]
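The renamed `use_xgboost_scorer` flag only changes how the combined embedding is turned into a score. The sketch below isolates that branching into a hypothetical helper: the XGBoost path (optional scaler transform followed by an `xgb.DMatrix` prediction) mirrors the lines shown above, while the fallback branch is an assumption, treating the feasibility scorer as a scikit-learn-style classifier, which this diff does not show.

# Hypothetical helper mirroring the scoring branch in EnzymeOptimizer.score_sequence.
# The XGBoost branch follows the diff above; the fallback branch is an assumption
# (a scikit-learn-style classifier exposing predict_proba).
import numpy as np
import xgboost as xgb


def score_embedding(scorer, combined_embedding: np.ndarray, use_xgboost_scorer: bool, scaler=None) -> float:
    combined_embedding = combined_embedding.reshape(1, -1)
    if use_xgboost_scorer:
        # Kcat scorer: optionally rescale the embedding, then predict via an XGBoost Booster.
        if scaler is not None:
            combined_embedding = scaler.transform(combined_embedding)
        return float(scorer.predict(xgb.DMatrix(combined_embedding))[0])
    # Feasibility scorer (assumed): probability of the positive class.
    return float(scorer.predict_proba(combined_embedding)[0][1])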
