-
Notifications
You must be signed in to change notification settings - Fork 0
/
lcsm.py
49 lines (34 loc) · 881 Bytes
/
lcsm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from fasta_tools import fasta_read
import numpy as np
# Default dataset path handed to lcsm() when this module is run as a script.
# It is relative, so it resolves against the current working directory.
# NOTE(review): presumably a FASTA file for the Rosalind LCSM problem - confirm.
file = './datasets/rosalind_lcsm.txt'
def _longest_common_substring(sequences):
    """Return the longest string that occurs in every one of *sequences*.

    Candidates are the substrings of the first sequence. When several
    distinct candidates share the maximum length, the one whose first
    occurrence starts latest in the first sequence is returned.

    Returns ``''`` when *sequences* is empty or the sequences share no
    character at all.
    """
    if not sequences:
        return ''
    reference, others = sequences[0], sequences[1:]
    if not others:
        # A lone sequence is trivially its own longest common substring.
        return reference

    seen = set()  # common substrings found so far (O(1) duplicate check)
    best = ''
    for start in range(len(reference)):
        for end in range(start + 1, len(reference) + 1):
            candidate = reference[start:end]
            if candidate in seen:
                # Already known to be common; its extensions may still be new.
                continue
            # Extending a string that is missing from some sequence can never
            # make it common again, so stop growing from this start position.
            if not all(candidate in seq for seq in others):
                break
            seen.add(candidate)
            # ">=" keeps the most recently discovered candidate among ties.
            if len(candidate) >= len(best):
                best = candidate
    return best


def lcsm(file):
    """Return a longest substring shared by all sequences read from *file*.

    Args:
        file: Passed unchanged to ``fasta_read``, which must return a mapping
            whose values are the sequence strings (presumably FASTA record
            id -> sequence; confirm against ``fasta_tools``).

    Returns:
        The longest string contained in every sequence, using the first
        sequence as the source of candidates. If several candidates tie for
        the maximum length, one of them is returned. An empty string is
        returned when there are no sequences or nothing is shared.
    """
    records = fasta_read(file)
    # Only the sequences matter; record IDs are discarded.
    return _longest_common_substring(list(records.values()))
# Script entry point: solve the default dataset and print the answer.
if __name__ == '__main__':
    answer = str(lcsm(file))
    print(answer)