2_AssembleProbes.py

from Bio import SeqIO
from Bio.SeqUtils import GC
from Bio.SeqUtils import MeltingTemp as mt
from io import StringIO
import io
import subprocess
import os
import sys
import random
import glob
import argparse

def main(mismatch_cutoff):
	#Need to specify these values.
	adapts = open('adapters.txt', 'r')
	adapt_dict = {}
	for adapt in adapts:
		adapt = adapt.strip()
		adapt_name = adapt.split('\t')[0]
		adapt_seq = adapt.split('\t')[1]
		adapt_dict[adapt_name] = adapt_seq

	initial_barcode_file = 'barcodes.txt'
	# Increasing the below number increases the number of probes available, but
	# at the cost of probes with stronger predicted secondary structure
	# NOTE: This number is usually produced as a negative value by RNAfold, but
	# is used as an absolute value in this script. So 7.6 in this code behaves
	# like -7.6 in RNAfold
	allowed_ss_score = 9.7

	summary_file_name = 'summary_file_{}off.txt'.format(mismatch_cutoff)
	summary_file = open(summary_file_name, 'w')

	try:
		barcode_file = open(initial_barcode_file, 'r')
	except (FileNotFoundError, OSError) as e:
		raise e
	except Exception as e:
		print("Unexpected error:")
		raise e

	# Create a folder named 'probes' within the current working directory, this
	# will be where probes are saved.
	working_dir = os.getcwd()
	if os.path.exists(os.path.dirname('./probes/')):
		pass
	else:
		os.makedirs(os.path.dirname('./probes/'))

	# Assign barcodes. Use 'running_barcode_list.txt' to make sure barcodes are
	# conserved between runs
	unused_barcodes = []
	used_barcodes = {}
	for barcode in barcode_file:
		barcode = str(barcode).strip()
		if '\t' in barcode:
			code = barcode.split('\t')
			# barcodes are used in lower case for easier visualizing of final
			# probes
			used_barcodes[code[1]] = code[0].lower()
		else:
			barcode = barcode.lower()
			unused_barcodes.append(barcode)
	barcode_file.close()

	# use sub_sam files generated by the first script
	target_files = glob.glob('./part1/*.sub_sam')
	for target_file in target_files:
		target_name = str(target_file).replace('./part1/',
											   '').replace('.sub_sam','')
		adapter_seq = adapt_dict[target_name]
		bowtie_output = open(target_file, 'r')
		print('Running probe design for {}'.format(target_name))

		# Open file with all probes discovered from first script
		probe_seqs = SeqIO.parse('{}_AllProbes.fasta'
								 ''.format(target_file.replace('.sub_sam', '')),
								 'fasta')


		# if paralogs exist, open file
		paralog_list = []
		print('target_name={}'.format(target_name))
		try:
			paralog_file = open('./paralogs/{}_paralogs.txt'
								''.format(target_name), 'r')
			for line in paralog_file:
				line = line.strip()
				paralog_list.append(line)
			paralog_file.close()
		except FileNotFoundError:
			print('Warning: {} has no paralogs'.format(target_name))


		# initialize a dictionary of the probes
		probe_dict = {}
		for probe in probe_seqs:
			probe_dict[probe.name] = str(probe.seq)

		# use correct barcodes or assign new ones if not present
		if target_name in used_barcodes.keys():
			target_barcode = used_barcodes[target_name]
		else:
			target_barcode = random.choice(unused_barcodes)
			unused_barcodes.remove(target_barcode)
			used_barcodes[target_name] = target_barcode

		# dict for storing all probe names, and bowtie results
		bowtie_results_dict = {}

		for probe in probe_dict.keys():
			bowtie_results_dict[probe] = []

		for result in bowtie_output:
			result = result.split('\t')
			probe_name = result[0]
			hit_name = result[1]
			mismatches = int(result[3])

			# only keep hits with an edit distance of less than
			if mismatches < mismatch_cutoff:
				bowtie_results_dict[probe_name].append(hit_name)

		# list to sort by Tm after parsing bowtie results
		post_bowtie_list = []
		bowtie_output.close()

		for bowtie_probe in bowtie_results_dict.keys():
			comparison = False
			if len(bowtie_results_dict[bowtie_probe]) == 0:
				post_bowtie_list.append(genPostBowtieItem(bowtie_probe,
														  bowtie_probe,
														  probe_dict))
				# post_bowtie_list.append({'Name' : bowtie_probe,
				# 						 'Tm' : bowtie_probe.split('_')[2],
				# 						 'Sequence' : probe_dict[bowtie_probe]})
			elif len(bowtie_results_dict[bowtie_probe]) == 1:
				if (bowtie_probe.split('_')[0] ==
						bowtie_results_dict[bowtie_probe][0]):
					post_bowtie_list.append(genPostBowtieItem(bowtie_probe,
															  bowtie_probe,
															  probe_dict))
					# post_bowtie_list.append({'Name' : bowtie_probe,
					# 						 'Tm' : bowtie_probe.split('_')[2],
					# 						 'Sequence' : probe_dict[bowtie_probe]})
			elif len(bowtie_results_dict[bowtie_probe]) > 1:
				# keep track of number of bowtie hits that are itself or paralog
				required_score = len(bowtie_results_dict[bowtie_probe])
				score = 0
				for item in bowtie_results_dict[bowtie_probe]:
					if bowtie_probe.split('_')[0] == item:
						probe_name = bowtie_probe
						score += 1
					elif item in paralog_list:
						probe_name = bowtie_probe
						score += 1
					else:
						# breaks out of loop if non-paralog found
						break
				if score == required_score:
					post_bowtie_list.append(genPostBowtieItem(probe_name,
															  bowtie_probe,
															  probe_dict))
					# post_bowtie_list.append({'Name' : probe_name,
					# 						 'Tm' : bowtie_probe.split('_')[2],
					# 						 'Sequence' : probe_dict[bowtie_probe]})
			else:
				print('Could not find any bowtie hits for {}'
					  ''.format(target_name))
		print('{} probes passed offtarget prediction'
			  ''.format(len(post_bowtie_list)))

		# save all good probes to file and add barcodes and adapter
		post_bowtie_results = open('./probes/{}_{}off_PostBowtie.fasta'
								   ''.format(target_name, mismatch_cutoff), 'w')
		for a in post_bowtie_list:
			name = a['Name']
			f_arm = a['Sequence'][7:]
			t_arm = a['Sequence'][:7]
			final_sequence = "{}{}{}{}{}".format(f_arm, target_barcode[4:],
												 adapter_seq, target_barcode,
												 t_arm)
			post_bowtie_results.write('>{}_{}off\n{}\n'.format(name,
															   mismatch_cutoff,
															   final_sequence))
		post_bowtie_results.close()

		# Use RNAfold to predict secondary structure
		RNAfold_infile = ('./probes/{}_{}off_PostBowtie.fasta'
						  ''.format(target_name, mismatch_cutoff))

		# If no probes passed bowtie2 analysis, don't bother going through
		# RNAfold analysis. This actually throws an error if file is empty
		if len(post_bowtie_list) == 0:
			print('number of good probes\t{}'.format(len(post_bowtie_list)))
			summary_file.write('{}\t{}\t{}\n'
							   ''.format(target_name, len(post_bowtie_list),
										 target_barcode))
		else:
			try:
				# remove RNAfold outfile because it automatically appends
				# results and does not overwright.
				os.remove('tmp1.rna_out')
			except FileNotFoundError:
				pass
			buildcmd = ['RNAfold', '-p', '-d2', '--noLP', '--noPS',
						'-P', 'dna_mathews2004.par',
						'--noconv', '--MEA',
						'--infile={}'.format(RNAfold_infile),
						'--outfile=tmp1.rna_out']
			subprocess.call(buildcmd)

			# remove unused ps_files that RNAfold produced even with ps files
			# are suppressed
			ps_files = glob.glob('*.ps')
			for a in ps_files:
				os.remove(a)

			# Open the temporary RNAfold file
			ss_result_file = open('tmp1.rna_out', 'r')
			ss = ss_result_file.read()
			ss_result_file.close()
			ss_results = str(ss).split('>')
			ss_results_list = []

			# parse RNAfold output
			for ss_result in ss_results:
				b = ss_result.split('\n')
				if len(b) > 1:
					probe_name = b[0]
					probe_seq = b[1]
					data = str(b[2]).split(' ', 1)[1]
					data = float(data[1:7].strip(' ').strip('-'))

					if data <= allowed_ss_score:
						ss_results_list.append({'Name' : probe_name,
											'Sequence' : probe_seq,
											'Result' : data})

			# Sort probes with least secondary structure to most.
			sorted_probes = sorted(ss_results_list,
								   key=lambda k: k['Result'],
								   reverse=False)
			# post_RNAfold_results = open('./probes/{}_{}off_PostRNAfold.fasta'.format(target_name, mismatch_cutoff), 'w')
			# make sure there is no probe overlap
			# Use position_list to store all base positions that are already covered
			# by a probe
			position_list = []
			good_probes = []

			for probe in sorted_probes:
				probe_name = probe['Name'] + '_' + str(probe['Result']) + 'ss'
				probe_seq = probe['Sequence']

				# post_RNAfold_results.write('>{}\n{}\n'.format(probe_name, probe_seq))
				position = probe_name.split('_')[1]
				position = int(position.split('-')[0])

				# Ensure there is a 4 bp space between each probe
				if len(position_list) == 0:
					start = position - 2
					if start < 0:
						start = 0
					end = position + 27
					position_list.extend(range(start,end))
					good_probes.append(genGoodProbesItem(probe_name, probe_seq))
				else:
					if ((position - 2 not in position_list)
							and (position + 27 not in position_list)):
						start = position - 2

						if start < 0:
							start = 0
						end = position + 27

						position_list.extend(range(start,end))
						good_probes.append(genGoodProbesItem(probe_name,
															 probe_seq))
					else:
						pass

			# Save entire probe and only the template binding region seperately
			# for each analysis.
			final_results = open('./probes/{}_{}off_FinalProbes.fasta'
								 ''.format(target_name, mismatch_cutoff), 'w')
			final_results2 = open('./probes/{}_{}off_FinalTargets.fasta'
								 ''.format( target_name, mismatch_cutoff), 'w')

			for item in good_probes:
				target_sequence = item['Sequence'][53:] + item['Sequence'][:18]
				final_results.write('>{}\n/5phos/{}\n'.format(item['Name'],
															  item['Sequence']))
				final_results2.write('>{}\n{}\n'.format(item['Name'],
														target_sequence))

			print('number of good probes for {}\t{}'.format(target_name,
															len(good_probes)))
			summary_file.write('{}\t{}\t{}\n'.format(target_name,
													 len(good_probes),
													 target_barcode))
			# post_RNAfold_results.close()
			final_results.close()
			final_results2.close()



	# Recreate barcode list with new additions
	barcode_results = open('barcodes.txt', 'w')
	for key in used_barcodes.keys():
		barcode_results.write('{}\t{}\n'.format(used_barcodes[key], key))
	for leftover_code in unused_barcodes:
		barcode_results.write('{}\n'.format(leftover_code))
	summary_file.close()
	barcode_results.close()


def genPostBowtieItem(probe_name, bowtie_probe, probe_dict):
	d = {}
	d['Name'] = probe_name
	d['Tm'] = bowtie_probe.split('_')[2],
	d['Sequence'] = probe_dict[bowtie_probe]
	return d


def genGoodProbesItem(probe_name, probe_seq):
	d = {}
	d['Name'] = probe_name
	d['Sequence'] = probe_seq
	return d


if __name__ == '__main__':
	# set up argument parsing
	parser = argparse.ArgumentParser()

	# mismatches allowed
	parser.add_argument("mm", metavar="MM", nargs='?', default=6, type=int,
						help='Number of mismatches allowed in offtarget '
							 'alignments (default: 6).')

	# parse args
	args = parser.parse_args()

	# run main script
	main(args.mm)