-
Notifications
You must be signed in to change notification settings - Fork 0
/
Known_Motif_Search.py
136 lines (108 loc) · 4.24 KB
/
Known_Motif_Search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import itertools
import sys
peak_file = sys.argv[1]
genome_file = sys.argv[2]
output_file = sys.argv[3]
custom_motif = "no input"
try:
custom_motif = sys.argv[4]
except:
pass
def get_sequence_variables(sequence):
nucleotides = {
'A': ['A'],
'C': ['C'],
'G': ['G'],
'T': ['T'],
'U': ['U'],
'R': ['G', 'A'],
'Y': ['C', 'T'],
'K': ['G', 'T'],
'M': ['A', 'C'],
'S': ['G', 'C'],
'W': ['A', 'T'],
'B': ['G', 'T', 'C'],
'D': ['G', 'A', 'T'],
'H': ['A', 'C', 'T'],
'V': ['G', 'C', 'A'],
'N': ['A', 'G', 'C', 'T']
}
sequences = []
options = [nucleotides.get(base, [base]) for base in sequence]
for combination in itertools.product(*options):
sequences.append(''.join(combination))
return sequences
def extract_peak_fragments(peak_file, genome_file):
fragments = []
with open(peak_file, 'r') as peak, open(genome_file, 'r') as genome:
genome_sequence = genome.read().replace('\n', '')
# Skip header lines in the peak file
for line in peak:
if line.startswith("#"):
continue
else:
break
# Process peak lines
for line in peak:
fields = line.strip().split('\t')
if len(fields) < 4:
continue
peak_id = fields[0]
chrom = fields[1]
start = int(fields[2]) - 1
end = int(fields[3])
p_value_vs_local = float(fields[13])
sequence = genome_sequence[start:end]
fragments.append((peak_id, sequence, p_value_vs_local))
return fragments
#peak_file = './peaks.txt'
#genome_file = './GRCm38.chr17.fa'
fragments = extract_peak_fragments(peak_file, genome_file)
def extract_motifs_from_file(file_path):
motifs = []
with open(file_path, 'r') as file:
lines = file.readlines()
# Iterate over the lines and extract motifs
i = 0
while i < len(lines):
line = lines[i].strip()
if line.startswith('>'):
motif_sequence = line.split()[0][1:]
motif_name = line.split()[1]
motif_probability_matrix = []
i += 1
while i < len(lines) and not lines[i].startswith('>'):
probabilities = list(map(float, lines[i].strip().split()))
motif_probability_matrix.append(probabilities)
i += 1
motifs.append((motif_sequence,motif_name, motif_probability_matrix))
variables = get_sequence_variables(motif_sequence)
for j in variables:
motifs.append((j,motif_name, motif_probability_matrix))
else:
i += 1
return motifs
def search_motifs(fragments, known_motifs, output_file):
motifs_found = {}
with open(output_file, 'w') as file:
for fragment in fragments:
peak_id, sequence, p_value_vs_local = fragment
for motif_sequence,motif_name, motif_probability_matrix in known_motifs:
if motif_sequence in sequence:
if motif_sequence in motifs_found:
motifs_found[motif_sequence] = [min(motifs_found[motif_sequence][0], p_value_vs_local), motif_name]
else:
motifs_found[motif_sequence] = [p_value_vs_local, motif_name]
for motif_sequence, name_p_value in motifs_found.items():
file.write(f">{motif_sequence} {name_p_value[1]} P:{name_p_value[0]}\n")
motif_probability_matrix = next(
(matrix for sequence, name, matrix in known_motifs if sequence == motif_sequence), None)
if motif_probability_matrix:
for probabilities in motif_probability_matrix:
file.write(' '.join(map(str, probabilities)) + '\n')
#output_file = 'motifs_output_Oct4_test.txt'
if custom_motif != "no input":
known_motifs = extract_motifs_from_file(custom_motif)
else:
known_motifs = extract_motifs_from_file("./Homer_motifs.txt")
search_motifs(fragments, known_motifs, output_file)