Skip to content

Commit

Permalink
Bug Fix - 367 (#35)
Browse files Browse the repository at this point in the history
* - Fixed references to ref_names_for_pe

* removed extra tabs

* trying to match empty line, no tabs

* - changed references to ref_names[0]

* Mckay/pd warnings (#45)

* refactor errors='ignore' to try except

* refactored integer slice to iloc[]

* moved to_numeric try except to function

* Refactor to_numeric_ignore_errors to to_numeric_ignore_columns

This change is slightly cleaner because it addresses the root issue that some
columns are strings (and can therefore not be converted to numeric types). Now
if an error does occur when converting the dfs to numeric types it won't be
swallowed up.

* Add documentation to to_numeric_ignore_columns

---------

Co-authored-by: Cole Lyman <[email protected]>

---------

Co-authored-by: Cole Lyman <[email protected]>
  • Loading branch information
2 people authored and kclem committed Mar 14, 2024
1 parent 0c834b3 commit ad3bd4e
Showing 1 changed file with 28 additions and 28 deletions.
56 changes: 28 additions & 28 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -4590,14 +4590,14 @@ def get_scaffold_len(row, scaffold_start_loc, scaffold_seq):
mod_pcts.append(np.concatenate(([ref_name, 'Deletions'], np.array(ref1_all_deletion_count_vectors[ref_name]).astype(float)/tot)))
mod_pcts.append(np.concatenate(([ref_name, 'Substitutions'], np.array(ref1_all_substitution_count_vectors[ref_name]).astype(float)/tot)))
mod_pcts.append(np.concatenate(([ref_name, 'All_modifications'], np.array(ref1_all_indelsub_count_vectors[ref_name]).astype(float)/tot)))
mod_pcts.append(np.concatenate(([ref_name, 'Total'], [counts_total[ref_name]]*refs[ref_names_for_pe[0]]['sequence_length'])))
colnames = ['Batch', 'Modification']+list(refs[ref_names_for_pe[0]]['sequence'])
mod_pcts.append(np.concatenate(([ref_name, 'Total'], [counts_total[ref_name]]*refs[ref_names[0]]['sequence_length'])))
colnames = ['Batch', 'Modification']+list(refs[ref_names[0]]['sequence'])
pe_modification_percentage_summary_df = to_numeric_ignore_columns(pd.DataFrame(mod_pcts, columns=colnames), {'Batch', 'Modification'})

sgRNA_intervals = refs[ref_names_for_pe[0]]['sgRNA_intervals']
sgRNA_names = refs[ref_names_for_pe[0]]['sgRNA_names']
sgRNA_mismatches = refs[ref_names_for_pe[0]]['sgRNA_mismatches']
include_idxs_list = refs[ref_names_for_pe[0]]['include_idxs']
sgRNA_intervals = refs[ref_names[0]]['sgRNA_intervals']
sgRNA_names = refs[ref_names[0]]['sgRNA_names']
sgRNA_mismatches = refs[ref_names[0]]['sgRNA_mismatches']
include_idxs_list = refs[ref_names[0]]['include_idxs']

plot_root = _jp('11a.Prime_editing_nucleotide_percentage_quilt')
plot_11a_input = {
Expand All @@ -4613,24 +4613,24 @@ def get_scaffold_len(row, scaffold_start_loc, scaffold_seq):
}
info('Plotting prime editing nucleotide percentage quilt', {'percent_complete': 96})
plot(CRISPRessoPlot.plot_nucleotide_quilt, plot_11a_input)
crispresso2_info['results']['refs'][ref_names_for_pe[0]]['plot_11a_root'] = os.path.basename(plot_root)
crispresso2_info['results']['refs'][ref_names_for_pe[0]]['plot_11a_caption'] = "Figure 11a: Nucleotide distribution across all amplicons. At each base in the reference amplicon, the percentage of each base as observed in sequencing reads is shown (A = green; C = orange; G = yellow; T = purple). Black bars show the percentage of reads for which that base was deleted. Brown bars between bases show the percentage of reads having an insertion at that position."
crispresso2_info['results']['refs'][ref_names_for_pe[0]]['plot_11a_data'] = [('Nucleotide frequency table for ' + ref_name, os.path.basename(crispresso2_info['results']['refs'][ref_name]['nuc_freq_filename'])) for ref_name in ref_names_for_pe]

crispresso2_info['results']['refs'][ref_names_for_pe[0]]['plot_11b_roots'] = []
crispresso2_info['results']['refs'][ref_names_for_pe[0]]['plot_11b_captions'] = []
crispresso2_info['results']['refs'][ref_names_for_pe[0]]['plot_11b_datas'] = []

pe_sgRNA_sequences = refs[ref_names_for_pe[0]]['sgRNA_sequences']
pe_sgRNA_orig_sequences = refs[ref_names_for_pe[0]]['sgRNA_orig_sequences']
pe_sgRNA_cut_points = refs[ref_names_for_pe[0]]['sgRNA_cut_points']
pe_sgRNA_plot_cut_points = refs[ref_names_for_pe[0]]['sgRNA_plot_cut_points']
pe_sgRNA_intervals = refs[ref_names_for_pe[0]]['sgRNA_intervals']
pe_sgRNA_names = refs[ref_names_for_pe[0]]['sgRNA_names']
pe_sgRNA_plot_idxs = refs[ref_names_for_pe[0]]['sgRNA_plot_idxs']
pe_sgRNA_mismatches = refs[ref_names_for_pe[0]]['sgRNA_mismatches']
pe_ref_len = refs[ref_names_for_pe[0]]['sequence_length']
pe_include_idxs_list = refs[ref_names_for_pe[0]]['include_idxs']
crispresso2_info['results']['refs'][ref_names[0]]['plot_11a_root'] = os.path.basename(plot_root)
crispresso2_info['results']['refs'][ref_names[0]]['plot_11a_caption'] = "Figure 11a: Nucleotide distribution across all amplicons. At each base in the reference amplicon, the percentage of each base as observed in sequencing reads is shown (A = green; C = orange; G = yellow; T = purple). Black bars show the percentage of reads for which that base was deleted. Brown bars between bases show the percentage of reads having an insertion at that position."
crispresso2_info['results']['refs'][ref_names[0]]['plot_11a_data'] = [('Nucleotide frequency table for ' + ref_name, os.path.basename(crispresso2_info['results']['refs'][ref_name]['nuc_freq_filename'])) for ref_name in ref_names_for_pe]

crispresso2_info['results']['refs'][ref_names[0]]['plot_11b_roots'] = []
crispresso2_info['results']['refs'][ref_names[0]]['plot_11b_captions'] = []
crispresso2_info['results']['refs'][ref_names[0]]['plot_11b_datas'] = []

pe_sgRNA_sequences = refs[ref_names[0]]['sgRNA_sequences']
pe_sgRNA_orig_sequences = refs[ref_names[0]]['sgRNA_orig_sequences']
pe_sgRNA_cut_points = refs[ref_names[0]]['sgRNA_cut_points']
pe_sgRNA_plot_cut_points = refs[ref_names[0]]['sgRNA_plot_cut_points']
pe_sgRNA_intervals = refs[ref_names[0]]['sgRNA_intervals']
pe_sgRNA_names = refs[ref_names[0]]['sgRNA_names']
pe_sgRNA_plot_idxs = refs[ref_names[0]]['sgRNA_plot_idxs']
pe_sgRNA_mismatches = refs[ref_names[0]]['sgRNA_mismatches']
pe_ref_len = refs[ref_names[0]]['sequence_length']
pe_include_idxs_list = refs[ref_names[0]]['include_idxs']

for i in range(len(pe_sgRNA_cut_points)):
cut_point = pe_sgRNA_cut_points[i]
Expand All @@ -4653,7 +4653,7 @@ def get_scaffold_len(row, scaffold_start_loc, scaffold_seq):
#get new intervals
new_sgRNA_intervals = []
#add annotations for each sgRNA (to be plotted on this sgRNA's plot)
for (int_start, int_end) in refs[ref_names_for_pe[0]]['sgRNA_intervals']:
for (int_start, int_end) in refs[ref_names[0]]['sgRNA_intervals']:
new_sgRNA_intervals += [(int_start - new_sel_cols_start, int_end - new_sel_cols_start)]
new_include_idx = []
for x in pe_include_idxs_list:
Expand All @@ -4672,9 +4672,9 @@ def get_scaffold_len(row, scaffold_start_loc, scaffold_seq):
}
info('Plotting nucleotide quilt', {'percent_complete': 97})
plot(CRISPRessoPlot.plot_nucleotide_quilt, plot_11b_input)
crispresso2_info['results']['refs'][ref_names_for_pe[0]]['plot_11b_roots'].append(os.path.basename(plot_root))
crispresso2_info['results']['refs'][ref_names_for_pe[0]]['plot_11b_captions'].append('Figure 11b: Nucleotide distribution around the ' + sgRNA_legend + '.')
crispresso2_info['results']['refs'][ref_names_for_pe[0]]['plot_11b_datas'].append([('Nucleotide frequency in quantification window for ' + ref_name, os.path.basename(crispresso2_info['results']['refs'][ref_name]['quant_window_nuc_freq_filename'])) for ref_name in ref_names_for_pe])
crispresso2_info['results']['refs'][ref_names[0]]['plot_11b_roots'].append(os.path.basename(plot_root))
crispresso2_info['results']['refs'][ref_names[0]]['plot_11b_captions'].append('Figure 11b: Nucleotide distribution around the ' + sgRNA_legend + '.')
crispresso2_info['results']['refs'][ref_names[0]]['plot_11b_datas'].append([('Nucleotide frequency in quantification window for ' + ref_name, os.path.basename(crispresso2_info['results']['refs'][ref_name]['quant_window_nuc_freq_filename'])) for ref_name in ref_names_for_pe])

if args.prime_editing_pegRNA_scaffold_seq != "" and df_scaffold_insertion_sizes.shape[0] > 0 and df_scaffold_insertion_sizes['Num_match_scaffold'].max() > 0 and df_scaffold_insertion_sizes['Num_gaps'].max() > 0:
plot_root = _jp('11c.Prime_editing_scaffold_insertion_sizes')
Expand Down

0 comments on commit ad3bd4e

Please sign in to comment.