forked from eukref/pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
eukref_dbparser03.py
87 lines (73 loc) · 2.39 KB
/
eukref_dbparser03.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""
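Usage:
    python editfasta.py expandedclusters.txt current_DB.fas current_DB-cleared.fas

Removes the sequences that were annotated as "remove" in FigTree and then
passed through annotatetree.py followed by annotateclusters.py.

Note: current_DB.fas is rewritten in place so that every sequence occupies a
single line before the filtered copy is written to current_DB-cleared.fas.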
"""
import sys
def oneline(infasta):
    """Rewrite a FASTA file in place so each sequence sits on a single line.

    The file is read fully into memory and then overwritten. Header lines
    (those starting with ">") are kept; every other line is stripped of
    surrounding whitespace and appended to the current sequence line, so
    wrapped sequences are joined and blank lines vanish.

    Args:
        infasta: Path of the FASTA file to rewrite. The file is modified in
            place; no backup copy is made.
    """
    with open(infasta, "r") as infile:
        lines = infile.readlines()

    with open(infasta, "w") as outfile:
        # Tracks whether any output exists yet, so blank lines ahead of the
        # first header do not produce a spurious empty first line.
        wrote_anything = False
        for line in lines:
            if line.startswith(">"):
                if wrote_anything:
                    # Terminate the sequence line of the previous record.
                    outfile.write("\n")
                if not line.endswith("\n"):
                    # A header at end-of-file may lack its newline; add it so
                    # the header is still followed by a (possibly empty)
                    # sequence line.
                    line += "\n"
                outfile.write(line)
                wrote_anything = True
            else:
                sequence = line.strip()
                if sequence:
                    outfile.write(sequence)
                    wrote_anything = True
        # Terminate the final sequence line.
        outfile.write("\n")
# Annotation labels (second column of expandedclusters.txt) that mark a
# sequence for removal.
REMOVE_LABELS = ("remove", "REMOVE", "Remove")


def parse_removal_ids(cluster_lines):
    """Return the ids of all rows whose annotation column says "remove".

    Args:
        cluster_lines: Lines of the tab-delimited output of
            annotateclusters.py ("expandedclusters.txt"); column 0 is the
            sequence id and column 1 the annotation.

    Returns:
        List of ids, in file order, whose stripped annotation is one of
        REMOVE_LABELS. Rows with fewer than two columns are skipped.
    """
    removal_ids = []
    for row in cluster_lines:
        fields = row.split("\t")
        if len(fields) < 2:
            # Blank or malformed row: there is no annotation column to read.
            continue
        # Compare against each accepted spelling explicitly; the expression
        # `x == ("remove" or "REMOVE" or "Remove")` only ever tests "remove".
        if fields[1].strip() in REMOVE_LABELS:
            removal_ids.append(fields[0])
    return removal_ids


def filter_fasta(fastalines, removal_ids, outfile):
    """Write every FASTA record whose accession is not listed for removal.

    Each header line is expected to be followed by one sequence line (the
    layout produced by oneline()). Lines that do not start with ">" are only
    ever written as the sequence of the header preceding them.

    Args:
        fastalines: Lines of the single-line-per-sequence FASTA file.
        removal_ids: Iterable of accessions to drop.
        outfile: Writable text file object that receives the kept records.
    """
    removal_set = set(removal_ids)  # O(1) membership tests inside the loop
    last_index = len(fastalines) - 1
    for j, line in enumerate(fastalines):
        if not line.startswith(">"):
            continue
        if "noginumber" in line:
            # Accession is the last "|"-separated field of the header.
            accession = line.split("|")[-1].strip()
            print(accession)
            keep = accession not in removal_set
            if keep:
                print('Not_deleteing')
        elif line.startswith(">gi+"):
            # Accession is the fourth "+"-separated field; strip it in case
            # it is the last field and carries the line ending.
            accession = line.split("+")[3].strip()
            keep = accession not in removal_set
        elif line.startswith(">gi|"):
            # Accession is the fourth "|"-separated field.
            accession = line.split("|")[3].strip()
            keep = accession not in removal_set
        else:
            # Unrecognised header format: always kept.
            print('sssssss')
            keep = True
        if keep:
            outfile.write(line)
            # A header on the very last line has no sequence line after it;
            # emit an empty one instead of indexing past the end.
            outfile.write(fastalines[j + 1] if j < last_index else "\n")


def main(argv):
    """Filter "remove"-annotated sequences out of a FASTA database.

    Args:
        argv: Command-line vector: argv[1] is the tab-delimited output of
            annotateclusters.py ("expandedclusters.txt"), argv[2] the big
            FASTA file ("current_DB.fas", rewritten in place by oneline()),
            and argv[3] the output FASTA cleared of "remove" sequences
            ("current_DB-cleared.fas").
    """
    if len(argv) < 4:
        sys.exit("usage: python %s expandedclusters.txt current_DB.fas "
                 "current_DB-cleared.fas" % argv[0])

    # tab delimited outfile of annotateclusters.py ==> "expandedclusters.txt"
    with open(argv[1], "r") as infile:
        lines = infile.readlines()

    infasta = argv[2]  # big fasta file ==> current_DB.fas
    oneline(infasta)
    with open(infasta, "r") as infastafile:
        fastalines = infastafile.readlines()

    removallist = parse_removal_ids(lines)
    print(removallist)

    # cleared fastafile of "remove" sequences ==> "current_DB-cleared.fas"
    # Opened only after the inputs were parsed, so a bad input does not leave
    # a truncated output file behind.
    with open(argv[3], "w") as outfile:
        filter_fasta(fastalines, removallist, outfile)


if __name__ == "__main__":
    main(sys.argv)
    sys.exit()