Skip to content

Commit

Permalink
Fixed error in handling of soft-masked (lower case) DNA sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
Adam R. Rivers committed Sep 19, 2023
1 parent bd62cf6 commit 871e8c3
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 7 deletions.
7 changes: 4 additions & 3 deletions guidemaker/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1079,11 +1079,11 @@ def get_fastas(filelist, input_format="genbank", tempdir=None):
if is_gzip(file):
with gzip.open(file, 'rt') as f:
records = SeqIO.parse(f, input_format)
SeqIO.write(records, f1, "fasta")
SeqIO.write(map(lambda obj: obj.upper(), records), f1, "fasta")
else:
with open(file, 'r') as f:
records = (SeqIO.parse(f, input_format))
SeqIO.write(records, f1, "fasta")
SeqIO.write(map(lambda obj: obj.upper(), records), f1, "fasta")
return fastpath
except Exception as e:
logger.exception("An error occurred in the input file %s" % file)
Expand Down Expand Up @@ -1152,7 +1152,8 @@ def get_max_cfd(cfdlist):
def get_doench_efficiency_score(df, pam_orientation, num_threads=1):
checkset={'AGG','CGG','TGG','GGG'}
if pam_orientation == "3prime" and set(df.PAM)==checkset:
doenchscore = doench_predict.predict(np.array(df.target_seq30), num_threads=num_threads)

doenchscore = doench_predict.predict(np.array([x.upper() for x in df.target_seq30]), num_threads=num_threads)
df["Efficiency"] = doenchscore
else:
logger.warning("NOTE: doench_efficiency_score based on Doench et al. 2016 - can only be used for NGG PAM).Check PAM sequence and PAM orientation")
Expand Down
4 changes: 1 addition & 3 deletions guidemaker/doench_featurization.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
Nature Biotechnology Jan 2016, doi:10.1038/nbt.3437.
"""
from itertools import product, islice
from time import time
from typing import List
import logging
from functools import partial
from multiprocessing import Pool
Expand Down Expand Up @@ -52,7 +50,6 @@ def featurize_data(data: pd.DataFrame, learn_options: dict, pam_audit: bool=True
if np.any(data["30mer"].str.len() != 30):
raise AssertionError(f"Sequences should be 30 nt long")


feature_sets = {}

if learn_options["nuc_features"]:
Expand Down Expand Up @@ -171,6 +168,7 @@ def one_hot(seq, idmat, lookup):
pos = lookup[let]
featurevect.extend(list(idmat[pos,:]))
return pd.Series(featurevect)


def sliding_window(iterable, n):
"""Create a generator of substrings
Expand Down
2 changes: 1 addition & 1 deletion guidemaker/doench_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def predict(
Args:
seq (numpy.ndarray) numpy array of 30 nt sequences with 25 nt of guide, NGG pam in 25:27 and the following 2 nts.
model_file (str): file path of the onnx Boosted Gradien Regressor model file without position data
model_file (str): file path of the onnx Boosted Gradient Regressor model file without position data
model_metadata (str): file path of the json model parameters metadata file.
pam_audit (bool): check PAM of each sequence.
length_audit(bool) : check length of each sequence.
Expand Down

0 comments on commit 871e8c3

Please sign in to comment.