-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_sequences.py
57 lines (51 loc) · 1.77 KB
/
get_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from tqdm import tqdm
import pandas as pd
import shelve
import sys
import re
def parse_fasta(filename):
"""
Parse a fasta file and put it in a shelve DB and rename it with the prefix
of the filename. It will also write a combined fasta with the renamed
entries
:param fil: name of fasta files
:param fn2: name of the shelve
:return: shelve db name
"""
fn2 = '%s.shelve' % filename[:filename.rfind('.fasta')]
with shelve.open(fn2) as dic:
name = None
seq = ''
with open(filename) as F:
for line in tqdm(F, desc="Parsing %s" % filename):
if line.startswith('>'):
if name is not None:
dic[name] = seq
seq = ''
name = '%s' % (line.strip())
else:
seq += line
if name not in dic:
dic[name] = seq
return fn2
def main(filtered, fasta, taxa):
taxa = taxa.lower()
prefix = fasta[:fasta.rfind('.fasta')]
fas = parse_fasta(fasta)
df = pd.read_csv(filtered, sep='\t')
with shelve.open(fas) as dic:
seqs = [x[1:] for x in list(dic.keys())]
sdf = df[df.qseqid.isin(seqs)]
for name, d in tqdm(sdf.groupby(taxa), desc="Getting %s" % taxa):
name = re.sub('[^A-Za-z0-9_.]+', '', name)
if d.empty:
continue
with open('%s_%s.fas' % (prefix, name.replace(' ', '_')), 'w'
) as out:
q = d.qseqid.unique().tolist()
for seq in q:
out.write('>%s\n%s' % (seq, dic['>%s' % seq]))
if __name__ == '__main__':
# Usage:
# python get_sequences.py filtered_blast fasta taxa
main(sys.argv[1], sys.argv[2], sys.argv[3])