diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ac7077..83ac37b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# v0.4.2 + +Fixed a bug where calculating Doench efficiency scores raised an error if there was an 'N' in the first three nucleotides past the PAM in the flanking genomic sequence. Guidemaker now removes those guides from consideration and reports it as a warning if the flag `--doench_efficiency_score` is used. + # v0.4.1 * Changed how Guidemaker handles DNA sequences that are soft-masked with lowercase letters. The new behavior unmasks all @@ -14,4 +18,5 @@ * replaced append methods with concat methods for Pandas 2.1.1 * output data is now gzipped * updated Dockerfile to use Minimamba base image -* Updates to Python dependencies \ No newline at end of file +* Updates to Python dependencies + diff --git a/guidemaker/core.py b/guidemaker/core.py index 7d2fdd3..456d111 100644 --- a/guidemaker/core.py +++ b/guidemaker/core.py @@ -1151,11 +1151,16 @@ def get_max_cfd(cfdlist): def get_doench_efficiency_score(df, pam_orientation, num_threads=1): checkset={'AGG','CGG','TGG','GGG'} - if pam_orientation == "3prime" and set(df.PAM)==checkset: - - doenchscore = doench_predict.predict(np.array([x.upper() for x in df.target_seq30]), num_threads=num_threads) - df["Efficiency"] = doenchscore + # filter out lines with N's after the PAM, these cannot be scored + df2 = df[-df.target_seq30.str.contains('N')] + if len(df) != len(df2): + n_removed = len(df) - len(df2) + logger.warning("{} guides were removed from consideration becasue there were N's in the region flanking the PAM site. These cannot be scored.".format(n_removed) ) + if pam_orientation == "3prime" and set(df2.PAM)==checkset: + + doenchscore = doench_predict.predict(np.array([x.upper() for x in df2.target_seq30]), num_threads=num_threads) + df2["Efficiency"] = doenchscore else: logger.warning("NOTE: doench_efficiency_score based on Doench et al. 
2016 - can only be used for NGG PAM).Check PAM sequence and PAM orientation") - df["Efficiency"] = "Not Available" - return df.drop('target_seq30', axis=1) + df2["Efficiency"] = "Not Available" + return df2.drop('target_seq30', axis=1) diff --git a/guidemaker/doench_predict.py b/guidemaker/doench_predict.py index 7ccd927..62eb910 100644 --- a/guidemaker/doench_predict.py +++ b/guidemaker/doench_predict.py @@ -88,7 +88,7 @@ def predict( length_audit: bool = False, num_threads: int = 1 ) -> np.array: - """Pedicts regressions scored from sequences. + """Predicts regression scores from sequences. Args: seq (numpy.ndarray) numpy array of 30 nt sequences with 25 nt of guide, NGG pam in 25:27 and the following 2 nts. diff --git a/requirements.txt b/requirements.txt index c038620..dfa2c06 100644 --- a/requirements.txt +++ b/requirements.txt @@ -70,7 +70,7 @@ tornado==6.3.3 typing_extensions==4.7.1 tzdata==2023c tzlocal==4.3.1 -urllib3==2.0.6 +urllib3==2.0.7 validators==0.22.0 watchdog==3.0.0 zipp==3.16.2