-
Notifications
You must be signed in to change notification settings - Fork 4
/
mtAlign.py
executable file
·171 lines (132 loc) · 6.09 KB
/
mtAlign.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env pypy
## Script generates an alignment of variable sites, a list of positions
## Allows for masking of given elements of genome
## uses pypy rather than python
import sys, re, os, gzip, time, multiprocessing, math, random, datetime
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio.SeqRecord import SeqRecord
from optparse import OptionParser
from subprocess import call
usage = '''
mtAlign instructions
Usage: mtAlign.py [options] list_of_fasta.txt ref.fa output_prefix
list_of_fasta.txt: tab delimited 1) name, 2) fasta path and filename
ref.fa: reference in fasta format
output_prefix: output folder
Options
-m, --mask optional mask file for mobile elements,
tab delimited - start and end coordinates, counting from one, not zero
-p, --nprocs number of cores to use, default 1
Example:
./mtAlign.py -m mask.txt -p 4 list_of_fasta.txt ref.fa output_prefix
'''
if __name__=='__main__':
start = datetime.datetime.now()
### PROCESS INPUT OPTIONS AND ARGUMENTS ###
parser = OptionParser()
parser.add_option( '-m', '--mask', action = 'store', type='string', dest = 'maskfile', default = '' )
parser.add_option( '-p', '--nprocs', action = 'store', type='int', dest = 'nprocs', default = 1 )
opts, args = parser.parse_args()
#check have correct number of arguments
if len(args) == 3:
listoffasta, refpath, outname_prefix = args
maskfile = opts.maskfile
nprocs = opts.nprocs
else:
sys.stdout.write(usage)
sys.exit(1)
#read in reference, concatenate multiple chromasomes
#refseq = SeqIO.read( refpath, 'fasta' ).seq
refseq = "".join([s.seq._data for s in SeqIO.parse( refpath, 'fasta' )])
reflen = len( refseq )
sys.stderr.write('Successfully read in reference, length %s.\n'%reflen)
#read in mask file if provided
# set up default which is to use all sites, i.e. mask none
masksites = [1]*reflen
if maskfile:
r = open(maskfile,'r')
mask = [(int(l.split('\n')[0].split('\t')[0])-1, int(l.split('\n')[0].split('\t')[1])-1) for l in r] #convert back to base0
r.close()
for m in mask:
for i in range(m[0], m[1]+1):
masksites[i]=0
sys.stderr.write('Successfully read in mobile elements mask.\n')
## Read in all the sequences, and replace the id with the required nicename
seqlist = []
with open( listoffasta ) as fp:
for line in fp:
if line:
nicename, fapath = line.strip().split('\t')
if os.path.exists(fapath):
sys.stdout.write('%s\n'%fapath)
with gzip.open(fapath, 'rt') as fa_fh:
fa = list(SeqIO.parse(fa_fh, 'fasta'))[0]
fa.id = nicename
fa.description = ''
seqlist.append(fa)
else:
sys.stderr.write(fapath+' does not exist, skipping...')
time_elapsed = datetime.datetime.now() - start
time_elapsed = float(time_elapsed.seconds) + float(time_elapsed.microseconds/1e6)
sys.stdout.write("Successfully read in sequences in %0.1f seconds.\n"%time_elapsed)
## obtain non-shared positions
start = datetime.datetime.now()
def subset_pos(j, seqlist_j, masksites_j, start_i, end_i, out_q, outname_prefix):
sys.stdout.write('Starting from %s to %s\n'%(start_i, end_i-1))
sys.stdout.flush()
masksites_j = masksites_j[start_i: end_i]
seqlist_j = [s[start_i: end_i] for s in seqlist_j]
seq_generator = zip( masksites_j, *seqlist_j )
nonshared_pos =[ i+start_i for ( i, a ) in enumerate( seq_generator )
if a[0]==1 and len( set( [ ai for ai in a[1:] if ai in 'ACGT' ] ) ) > 1 ]
with open('%s_positions_%s'%(outname_prefix, j), 'w' ) as out:
out.write( '\n'.join( [ str( n+1 ) for n in nonshared_pos ] ) )
if len(nonshared_pos)>0:
out.write('\n')
out_q = multiprocessing.Queue()
chunksize = int(math.ceil(reflen / float(nprocs)))
procs = []
for i in range(nprocs):
p = multiprocessing.Process(
target=subset_pos,
args=(i, seqlist, masksites, i*chunksize, (i+1)*chunksize, out_q, outname_prefix))
procs.append(p)
p.start()
# Wait for all worker processes to finish
for p in procs:
p.join()
time_elapsed = datetime.datetime.now() - start
time_elapsed = float(time_elapsed.seconds) + float(time_elapsed.microseconds/1e6)
sys.stdout.write("Successfully wrote positions in %0.1f seconds.\n"%time_elapsed)
sys.stdout.flush()
start = datetime.datetime.now()
cmd = 'cat %s_positions_* > %s_positions.txt'%(outname_prefix, outname_prefix)
call(cmd, shell=True)
cmd = 'rm %s_positions_*'%outname_prefix
call(cmd, shell=True)
# read back in non-shared positions
posfile = '%s_positions.txt'%outname_prefix
f = open(posfile, 'r')
nonshared_pos = [int(l.strip())-1 for l in f]
f.close()
sys.stdout.write('Successfully obtained masked nonshared_diffs; there are %s of them.\n'%len( nonshared_pos ) )
sys.stdout.flush()
## Sort out the SNPs
for seq in seqlist:
nonshared_bases = ''.join( seq.seq[ i ] for i in nonshared_pos )
seq.seq._data = nonshared_bases
SeqIO.write( seqlist , '%s_snps.fa'%outname_prefix, 'fasta' )
sys.stderr.write('Successfully wrote snps fasta file.\n')
sys.stdout.flush()
## Write the positions
with open('%s_positions.txt'%outname_prefix, 'w' ) as out:
out.write( '\n'.join( [ str( n+1 ) for n in nonshared_pos ] ) )
out.write('\n')
sys.stderr.write('Successfully wrote nonshared positions.\n')
sys.stdout.flush()
time_elapsed = datetime.datetime.now() - start
time_elapsed = float(time_elapsed.seconds) + float(time_elapsed.microseconds/1e6)
sys.stdout.write("Successfully completed alignment in %0.1f seconds.\n"%time_elapsed)
sys.stdout.flush()