forked from ignacio3437/agalmacap
-
Notifications
You must be signed in to change notification settings - Fork 0
/
AgalmaAA2dna.py
247 lines (215 loc) · 9.83 KB
/
AgalmaAA2dna.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#!/usr/bin/env python3
import subprocess
import sys
from Bio import SeqIO
from pathlib import Path
from textwrap import dedent
helpful_text = """Agalma finds homologous genes from RNA-seq data. The RNA-seq assemblies (transcriptomes)
are DNA fasta files and the homologous gene alignments are amino acid fasta alignments.
The purpose of AgalmaAA2dna is to convert the amino acid homologous gene
alignments into dna fasta filesnamed after the RefSeq Protein ID that they
annotate too. A copy of the RefSeqProtein CDS from the reference
genome is also inlcuded in the DNA fasta file if write_ref option specified. A reciprical blast of
the AA sequence to the correspoding DNA assembly retrieves the transcriptome data
and the RefSeq CDS is retrieved by the fasta header info.
The output files for each gene (annotated by RefSeqProtID are saved here:
alnaa_folder/DNAbyLoci/RefSeqProtIDxx.fas
Not a stand-alone module.
"""
def run_command(command, command_out_path=None):
"""Runs command with subprocess library.
Prints error if command did not run.
if outpath given it will print output there, otherwise this function returns it. """
command_output = subprocess.getstatusoutput(command)
if command_output[0]:
print(f"{command} Failed")
return None
if command_out_path:
with open(command_out_path, "w") as out_handle:
out_handle.write(command_output[1])
return command_output[1]
def dna_or_aa(seq_file):
"""Reads 5 lines of fasta seq_file and guesses if DNA or RNA.
Exits if invalid residues are present"""
records = SeqIO.parse(seq_file, "fasta")
first2 = []
for n in range(3):
try:
first2.append(next(records))
except:
print(f"Empty file {seq_file}")
sys.exit(1)
seqset = set(''.join([str(record.seq) for record in first2]))
dna = set("NATGCactg-X")
nondna = seqset-dna
if len(nondna) == 0:
alphabet = 'dna'
elif len(nondna) > 4:
alphabet = 'aa'
else:
print(f"Error reading {seq_file} ")
# import pdb; pdb.set_trace()
sys.exit(1)
return alphabet
def blaster(num_threads, query, blastdb_prefix, blast_out_path=None):
"""Runs a blast search and returns the names of the top hit sequences from the blast database
(min eval = 1e-30)
Args:
threads: the number of threads
query: path to query file for blast search
blastdb_prefix: path to blast database file. Does not include blast extension.
Creates a blast database if does not exist
Returns:
blast results as a string. Saves string to text file in blast_out_path if specified
"""
qalphabet = dna_or_aa(query)
dbalphabet = dna_or_aa(blastdb_prefix)
if dbalphabet == 'aa':
blast_suffix, dbtype = '.pin', 'prot'
if qalphabet == 'dna':
blast_prog = 'blastx'
elif qalphabet == 'aa':
blast_prog = 'blastp'
if dbalphabet == 'dna':
blast_suffix, dbtype = '.nin', 'nucl'
if qalphabet == 'dna':
blast_prog = 'blastn'
elif qalphabet == 'aa':
blast_prog = 'tblastn'
# Create blast database if there is none
if not Path(f'{blastdb_prefix}{blast_suffix}').exists():
dbcommand = f"makeblastdb -in {blastdb_prefix} -dbtype {dbtype}"
run_command(dbcommand)
# Run Blast
bcommand = dedent(f'''
{blast_prog} -query {query}
-db {blastdb_prefix}
-evalue 1e-30
-outfmt '6 qseqid sseqid'
-max_target_seqs 1
-max_hsps 1
-num_threads {num_threads}
''').replace('\n', ' ')
blast_output = run_command(bcommand, command_out_path=blast_out_path)
return blast_output
def codex_file_reader(codex_file):
"""The codex dictionary is the blast result of the
agalma alignment consensus to the reference genome Protein fasta.
The dicitonary key is the agalma gene alignment ID and the
dictionary value is the RefSeq Protein ID from the reference genome.
"""
codex_dict = {}
with open(codex_file) as f:
for line in f:
if 'Warning' not in line:
(agalID, RefseqProtID) = line.split()
codex_dict[agalID] = RefseqProtID
return codex_dict
def groupagalmabytxtm(codex_dict, alnaa_folder, txtm):
"""Saves the sequence that has the same header as txtm from each
alignment in alnaa_folder to a file named "AAbytxtm/<txtm>.fas"
Strips the gaps from the AA sequences and removes seqs shorter than 50AA"""
txtm_aaseq_dict = {}
for agalID, RefseqProtID in codex_dict.items():
agal_aln_path = str(Path(alnaa_folder)/f"{agalID}.fas")
agal_aln = SeqIO.index(agal_aln_path, 'fasta')
try:
aaseq = str(agal_aln[txtm].seq).replace('-', '')
if len(aaseq) > 50: # Causes errors with blast if too short
txtm_aaseq_dict[RefseqProtID] = aaseq
except KeyError:
txtm_aaseq_dict[RefseqProtID] = None
txtm_aa_out_folder = Path(alnaa_folder)/"AAbytxtm"
txtm_aa_out_folder.mkdir(exist_ok=True)
with open(txtm_aa_out_folder/f"{txtm}.fas", 'w') as txtm_aa_out_handle:
for RefseqProtID, aaseq in txtm_aaseq_dict.items():
if aaseq:
txtm_aa_out_handle.write(f">{RefseqProtID}\n{aaseq}\n")
return
def fetchseq(names_tofetch, seqfile):
"""Searches for a list of names in fasta seqfile and
returns them as a biopython SeqIO records"""
seq_dict = SeqIO.to_dict(SeqIO.parse(seqfile, "fasta"))
fetched_records = [seq_dict[name] for name in names_tofetch]
return fetched_records
def blastbytxtm(alnaa_folder, txtm_folder, txtm, num_threads):
"""Each amino acid sequence is blasted to the DNA transcriptome assembly.
The results are written to the txtm_dna_out_folder in fasta format.
The fasta headers are '>txtm-loci' to be legible for cat_by_gene()"""
query_path = Path(alnaa_folder)/"AAbytxtm"
blastdb_path = str(Path(txtm_folder)/f"{txtm}.fas")
blast_out = str(query_path/f"{txtm}_recipblast.txt")
blaster(
num_threads=7, query=str(query_path/f"{txtm}.fas"),
blastdb_prefix=blastdb_path, blast_out_path=blast_out)
# store blast results as dictionary
rblast_dict = {}
with open(blast_out, 'r') as rblastout_handle:
rblast_lines = [line.split() for line in rblastout_handle.readlines() if 'Warning' not in line]## for new blast version a header line is inserted with word 'Warning' and should be ignored
for line in rblast_lines:
rblast_dict[line[1]] = line[0]
# fetch each sequences from the transcriptome and write them to a new file send to cat_by_gene()
blast_hit_records = fetchseq(names_tofetch=rblast_dict.keys(), seqfile=blastdb_path)
txtm_dna_out_folder = Path(alnaa_folder)/"DNAbytxtm"
txtm_dna_out_folder.mkdir(exist_ok=True)
dna_out_path = str(txtm_dna_out_folder/f"{txtm}_DNA.fas")
with open(dna_out_path, 'w') as dna_out_handle:
for blast_hit_record in blast_hit_records:
string = f">{txtm}-{rblast_dict[blast_hit_record.name]}\n{blast_hit_record.seq}\n"
dna_out_handle.write(string)
return txtm_dna_out_folder
def cat_by_gene(txtm_list, loci_list, in_path, file_ending, out_path):
"""This concatenates sequence files for all txtmanisms by loci.
Sequence files must have fasta heading of '>txtm-loci'
"""
txtm_dict = {}
for txtm in txtm_list:
seq_dict = SeqIO.to_dict(SeqIO.parse(str(in_path/f"{txtm}{file_ending}"), "fasta"))
txtm_dict[txtm] = seq_dict
for loci in loci_list:
out_file_path = f"{out_path}/{loci}.fas"
with open(out_file_path, 'a') as out_handle:
for txtm in txtm_list:
record_id = f"{txtm}-{loci}"
try:
out_handle.write(f">{txtm}\n{txtm_dict[txtm][record_id].seq}\n")
except KeyError:
pass
return
def agalmaaa2txtmdna(codex_file, alnaa_folder, txtm_folder, loci_dna_out_folder, num_threads):
"""Starting from Agalma AA loci alignments, this will blast each AA sequence to
the transcriptome it came from and will save the results by loci as DNA fasta files.
Args:
codex_file: textfile of each loci as a line: agalma_gene_id \t RefSeqCDS_topBlastHit
alnaa_folder: path to output of alncutter() on agalma supermatrix.
The loci AA alignments with agalma fasta headers
txtm_folder: folder with transcriptomes from agalma.
The names of the transcriptome files must match the fasta headers in the
AA alignments (whatever agalma sp was designated in the catalog)
Out:
written to loci_dna_out_folder
"""
txtms = []
for file in Path(txtm_folder).iterdir():
if not file.stem.startswith('.'):
if 'fa' in file.suffix:
txtms.append(file.stem)
# Create a dictionary to translate agalma ID's to the top hit refseq Protein ID
codex_dict = codex_file_reader(codex_file)
# Make new fasta of agalma output grouped by transcriptome, strips gaps
for txtm in txtms:
groupagalmabytxtm(codex_dict, alnaa_folder, txtm)
txtm_dna_out_folder = blastbytxtm(alnaa_folder, txtm_folder, txtm, num_threads)
# writes sequences with fasta header of txtm and the fasta file name is the loci name
Path(loci_dna_out_folder).mkdir(exist_ok=True)
cat_by_gene(
txtm_list=txtms, loci_list=codex_dict.values(),
in_path=txtm_dna_out_folder, file_ending="_DNA.fas",
out_path=loci_dna_out_folder)
return
def main():
# Not meant to be run as a stand-alone
print(helpful_text)
return
if __name__ == '__main__':
main()