-
Notifications
You must be signed in to change notification settings - Fork 19
/
split_approx_mappings_in_chunks.py
executable file
·57 lines (43 loc) · 1.98 KB
/
split_approx_mappings_in_chunks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/python
# Usage
# Run:
# python3 split_approx_mappings_in_chunks.py approximate_mappings.paf 4
# It will generate the following files:
# approximate_mappings.paf.chunk_0.paf
# approximate_mappings.paf.chunk_1.paf
# approximate_mappings.paf.chunk_2.paf
# approximate_mappings.paf.chunk_3.paf
import sys
# This script takes the approximate mappings, weighs each mapping by its length * (1 - estimated identity),
# then creates N new files whose mapping sets have a similar sum of weights.
def split_chunks(l, n):
    """Distribute weighted items over ``n`` chunks, greedily balancing weight sums.

    Items are processed in input order. Each item is appended to the chunk
    whose running weight total is currently the smallest; ties go to the
    lowest-numbered chunk. The first item always lands in chunk 0.

    Args:
        l: Iterable of indexable items whose element at index 1 is a numeric
            weight (the script in this file passes ``(rank, weight)`` tuples).
        n: Number of chunks to create; must be at least 1.

    Returns:
        A list of ``n`` lists holding the original items. Some chunks may be
        empty (for example when there are fewer items than chunks).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f'number of chunks must be at least 1, got {n}')

    result = [[] for _ in range(n)]
    sums = [0] * n
    i = 0  # index of the chunk that currently has the smallest weight sum
    for e in l:
        result[i].append(e)
        sums[i] += e[1]
        # list.index returns the first match, so ties resolve to the lowest index.
        i = sums.index(min(sums))
    return result
def compute_mapping_weight(line):
    """Return the balancing weight of one tab-separated approximate-mapping line.

    The weight is ``num_mapped_bases * (1 - estimated_identity)``, where
    ``num_mapped_bases`` is the larger of the query span (column 4 minus
    column 3) and the target span (column 9 minus column 8), and the estimated
    identity is read from the ``id:f:`` tag in column 13 and divided by 100.

    Args:
        line: One line of the mappings file, with or without a trailing newline.

    Returns:
        The weight as a float.

    Raises:
        ValueError: If the line has fewer than 13 columns, column 13 carries no
            ``id:f:`` tag, or a coordinate/identity value is not numeric.
    """
    fields = line.strip().split('\t')
    if len(fields) < 13:
        raise ValueError(f'expected at least 13 tab-separated columns, got {len(fields)}')

    query_span = int(fields[3]) - int(fields[2])
    target_span = int(fields[8]) - int(fields[7])
    num_mapped_bases = max(query_span, target_span)

    _, tag_found, identity_text = fields[12].partition('id:f:')
    if not tag_found:
        raise ValueError(f"column 13 does not contain an 'id:f:' tag: {fields[12]!r}")
    estimated_identity = float(identity_text) / 100.0

    # High divergence makes alignment more difficult
    return num_mapped_bases * (1 - estimated_identity)


if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.exit(f'Usage: python3 {sys.argv[0]} approximate_mappings.paf num_of_chunks')

    path_approx_mappings = sys.argv[1]
    num_of_chunks = int(sys.argv[2])
    if num_of_chunks < 1:
        sys.exit('num_of_chunks must be a positive integer')

    # We could avoid keeping everything in memory by reading the file again later
    mapping_lines = []  # original lines; a mapping's rank is its index in this list
    mapping_list = []   # (rank, weight) tuples in input order

    with open(path_approx_mappings) as f:
        for line in f:
            # Blank lines carry no mapping and would otherwise fail to parse.
            if not line.strip():
                continue
            rank = len(mapping_lines)
            mapping_lines.append(line)
            mapping_list.append((rank, compute_mapping_weight(line)))

    # Chunk the tuples by looking at their weights
    chunk_list = split_chunks(mapping_list, num_of_chunks)

    # Collect the ranks from the tuples to generate balanced chunks
    for num_chunk, element_list in enumerate(chunk_list):
        with open(path_approx_mappings + f'.chunk_{num_chunk}.paf', 'w') as fw:
            fw.writelines(mapping_lines[rank] for rank, _ in element_list)