Fixed error in handling of soft-masked (lower case) DNA sequences

USDA-ARS-GBRU · Sep 19, 2023 · 871e8c3
1 parent bd62cf6
commit 871e8c3
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 7 deletions.
diff --git a/guidemaker/core.py b/guidemaker/core.py
@@ -1079,11 +1079,11 @@ def get_fastas(filelist, input_format="genbank", tempdir=None):
                 if is_gzip(file):
                     with gzip.open(file, 'rt') as f:
                         records = SeqIO.parse(f, input_format)
-                        SeqIO.write(records, f1, "fasta")
+                        SeqIO.write(map(lambda obj: obj.upper(), records), f1, "fasta")
                 else:
                     with open(file, 'r') as f:
                         records = (SeqIO.parse(f, input_format))
-                        SeqIO.write(records, f1, "fasta")
+                        SeqIO.write(map(lambda obj: obj.upper(), records), f1, "fasta")
         return fastpath
     except Exception as e:
         logger.exception("An error occurred in the input file %s" % file)
@@ -1152,7 +1152,8 @@ def get_max_cfd(cfdlist):
 def get_doench_efficiency_score(df, pam_orientation, num_threads=1):
     checkset={'AGG','CGG','TGG','GGG'}
     if pam_orientation == "3prime" and set(df.PAM)==checkset:
-        doenchscore = doench_predict.predict(np.array(df.target_seq30), num_threads=num_threads)
+
+        doenchscore = doench_predict.predict(np.array([x.upper() for x in df.target_seq30]), num_threads=num_threads)
         df["Efficiency"] = doenchscore
     else:
         logger.warning("NOTE: doench_efficiency_score based on Doench et al. 2016 - can only  be used for NGG PAM).Check PAM sequence and PAM orientation")

diff --git a/guidemaker/doench_featurization.py b/guidemaker/doench_featurization.py
@@ -22,8 +22,6 @@
 Nature Biotechnology Jan 2016, doi:10.1038/nbt.3437.
 """
 from itertools import product, islice
-from time import time
-from typing import List
 import logging
 from functools import partial
 from multiprocessing import Pool
@@ -52,7 +50,6 @@ def featurize_data(data: pd.DataFrame, learn_options: dict, pam_audit: bool=True
     if np.any(data["30mer"].str.len() != 30):
         raise AssertionError(f"Sequences should be 30 nt long")
 
-
     feature_sets = {}
 
     if learn_options["nuc_features"]:
@@ -171,6 +168,7 @@ def one_hot(seq, idmat, lookup):
             pos  = lookup[let]
             featurevect.extend(list(idmat[pos,:]))
         return pd.Series(featurevect)
+
 
     def sliding_window(iterable, n):
         """Create a generator of substrings

diff --git a/guidemaker/doench_predict.py b/guidemaker/doench_predict.py
@@ -92,7 +92,7 @@ def predict(
 
     Args:
         seq (numpy.ndarray) numpy array of 30 nt sequences with 25 nt of guide, NGG pam in 25:27 and the following 2 nts.
-        model_file (str): file path of the onnx Boosted Gradien Regressor model file without position data
+        model_file (str): file path of the onnx Boosted Gradient Regressor model file without position data
         model_metadata (str): file path of the json model parameters metadata file.
         pam_audit (bool): check PAM of each sequence.
         length_audit(bool) : check length of each sequence.