-
Notifications
You must be signed in to change notification settings - Fork 0
/
consurf_runner.py
280 lines (227 loc) · 10.1 KB
/
consurf_runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
import requests
class ConsurfRunner:
    r"""
    Classname: ConsurfRunner
    Description: runs the Consurf server (through a Selenium-driven Chrome window) to calculate the
    conservation score for each residue in a protein.
    Variables:
    self._pdb_id: PDB ID of the protein; also used as the stem of the input and output file names
    self._email: User's email that receives notification when the job is done
    self._job_id: Job name typed into the title field of the submission form
    self._chain_id: Chain Identifier in the PDB file, chosen interactively at construction time
    (A query-sequence attribute, self._q_seq, is currently disabled; see the commented-out _get_q_seq.)
    Input: "<pdb_id>.pdb" and "<pdb_id>_MSA.fasta", both read from the current working directory
    Output: "<pdb_id>_CONS.txt", a text file containing conservation score for each amino acid.
    Reference:
    Ashkenazy H., Abadi S., Martz E., Chay O., Mayrose I., Pupko T., and Ben-Tal N. 2016
    ConSurf 2016: an improved methodology to estimate and visualize evolutionary conservation in macromolecules
    Nucl. Acids Res. 2016; DOI: 10.1093/nar/gkw408; PMID: 27166375
    Celniker G., Nimrod G., Ashkenazy H., Glaser F., Martz E., Mayrose I., Pupko T., and Ben-Tal N. 2013.
    ConSurf: Using Evolutionary Data to Raise Testable Hypotheses about Protein Function
    Isr. J. Chem. 2013 March 10, doi: 10.1002/ijch.201200096
    Ashkenazy H., Erez E., Martz E., Pupko T. and Ben-Tal N. 2010
    ConSurf 2010: calculating evolutionary conservation in sequence and structure of proteins and nucleic acids.
    Nucl. Acids Res. 2010; DOI: 10.1093/nar/gkq399; PMID: 20478830
    Landau M., Mayrose I., Rosenberg Y., Glaser F., Martz E., Pupko T. and Ben-Tal N. 2005.
    ConSurf 2005: the projection of evolutionary conservation scores of residues on protein structures.
    Nucl. Acids Res. 33:W299-W302.
    Glaser F., Pupko T., Paz I., Bell R.E., Bechor D., Martz E. and Ben-Tal N. 2003.
    ConSurf: Identification of Functional Regions in Proteins by Surface-Mapping of Phylogenetic Information.
    Bioinformatics 19:163-164.
    """

    def __init__(
        self,
        pdb_id,
        email,
        job_id
    ):
        r"""
        Object constructor. Prompts on stdin for the chain id (see _get_chain_id).
        :param pdb_id: PDB ID of the protein
        :param email: User's email that receives notification when the job is done
        :param job_id: Job ID for the job
        :raises ValueError: if "<pdb_id>.pdb" contains no SEQRES record
        """
        self._pdb_id = pdb_id
        self._email = email
        # _get_chain_id reads self._pdb_id, so it must run after that assignment.
        self._chain_id = self._get_chain_id()
        # self._q_seq = self._get_q_seq()
        self._job_id = job_id

    @staticmethod
    def _read_chain_ids(pdb_path) -> list:
        r"""
        Collects the chain ids named in the first run of SEQRES records of a PDB file.
        :param pdb_path: path of the PDB file to scan
        :return: chain ids in order of first appearance, without duplicates
        :raises ValueError: if the file contains no SEQRES record
        """
        chain_ids = []
        with open(pdb_path, "r") as file:
            for line in file:
                if line.startswith("SEQRES"):
                    # Third whitespace-separated field of a SEQRES line is used as the chain id.
                    chain = line.split()[2]
                    if chain not in chain_ids:
                        chain_ids.append(chain)
                elif chain_ids:
                    # First non-SEQRES line after the SEQRES run: stop scanning.
                    break
        if not chain_ids:
            # Reaching EOF without a SEQRES line used to spin forever on empty reads.
            raise ValueError(f"No SEQRES records found in {pdb_path}")
        return chain_ids

    def _get_chain_id(self) -> str:
        r"""
        Gets user's input for chain id and checks if the chain id is present in the PDB file.
        The PDB file is "<pdb_id>.pdb" in the current working directory.
        :return: A valid chain id
        :raises ValueError: if the PDB file contains no SEQRES record
        """
        pdb_path = os.path.join(os.getcwd(), f"{self._pdb_id}.pdb")
        chain_id_list = self._read_chain_ids(pdb_path)
        chain_user = input(f"Please give your chain id from {chain_id_list}: ")
        # Keep asking until the answer matches one of the chains found in the file.
        while chain_user not in chain_id_list:
            chain_user = input("Chain identifier invalid. Please try again: ")
        return chain_user

    # def _get_q_seq(self) -> str:
    #
    #     r"""
    #     Gets user's input for query sequence and checks if the query sequence is present in the MSA file.
    #     :return: A valid query sequence
    #     """
    #
    #     current_path = os.getcwd()
    #     MSA_path = os.path.join(current_path, f"{self._pdb_id}.a3m")
    #     q_seq_list = []
    #     with open(MSA_path, "r") as file:
    #         line = file.readline()
    #         while line[0] == "-" or line[0] == ">" or line[0].isalpha():
    #             q_seq_list.append(line.split()[0][1:])
    #             line = file.readline()
    #             line = file.readline()
    #     q_seq_user = input("Please select query sequence: ")
    #     while True:
    #         if q_seq_user in q_seq_list:
    #             break
    #         else:
    #             q_seq_user = input("Chain identifier invalid. Please try again: ")
    #     return q_seq_user

    def out_chain_id(self) -> str:
        r"""
        Access the chain id from the main program.
        :return: chain_id
        """
        return self._chain_id

    def run_job(self, query_name="101"):
        r"""
        The program opens a Chrome window and runs the Consurf server and download result automatically.
        THIS MAY TAKE HOURS.
        The Chrome session is closed when the method finishes, whether it succeeds or raises.
        :param query_name: visible text of the entry to select in the server's query-sequence drop-down;
            defaults to "101", the value that used to be hard-coded
        :return: N/A
        """
        current_path = os.getcwd()
        # NOTE(review): newer Selenium 4 releases reportedly no longer accept the driver path
        # positionally (they expect service=Service(path)) - confirm against the pinned version.
        driver = webdriver.Chrome(ChromeDriverManager().install())
        try:
            server_url = "https://consurf.tau.ac.il/?redirect=NO"

            # Access the Consurf server
            print("Starting Consurf server...")
            driver.get(server_url)

            # Wait for the page to load
            WebDriverWait(driver, 90).until(
                EC.element_to_be_clickable((By.XPATH, "//*[@id='maincontentcontainer']/div/form/div[2]/label"))
            )

            # Select Amino Acid
            DNA_AA = driver.find_element(By.XPATH, "//*[@id='maincontentcontainer']/div/form/div[2]/label")
            DNA_AA.click()

            # Wait for the page to load
            WebDriverWait(driver, 90).until(
                EC.url_changes(server_url)
            )

            # Choose to upload PDB
            PDB_yes_no = driver.find_element(By.XPATH, "//*[@id='yes_no_box']/div[1]/label")
            PDB_yes_no.click()

            # Wait for the page to load
            WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, "//*[@id='pdb_file_field']"))
            )

            # Upload PDB file
            pdb_FILE = driver.find_element(By.XPATH, "//*[@id='pdb_file_field']")
            PDB_path = os.path.join(current_path, f"{self._pdb_id}.pdb")
            pdb_FILE.send_keys(PDB_path)
            print("Analyzing PDB...")

            # Wait for the page to load
            WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, "//*[@id='case_pdb_yes']/input[4]"))
            )

            # Go to the next page
            next_page = driver.find_element(By.XPATH, "//*[@id='case_pdb_yes']/input[4]")
            PDB_url = driver.current_url
            next_page.click()

            # Wait for the page to load
            WebDriverWait(driver, 90).until(
                EC.url_changes(PDB_url)
            )
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='form']/select"))
            )

            # Get a list of options from the chain identifiers
            chainID = driver.find_element(By.XPATH, "//*[@id='form']/select")
            chainID_drop = Select(chainID)

            # Choose a chain identifier
            chainID_drop.select_by_value(self._chain_id)

            # Wait for the page to load
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='yes_no_box']/div[1]/label"))
            )

            # Choose to upload MSA
            MSA_yes_no = driver.find_element(By.XPATH, "/html/body/div[3]/div/form/div[3]/div[1]/div[1]/label/input")
            MSA_yes_no.click()

            # Wait for the page to load
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='fileSelect']"))
            )

            # Upload MSA
            MSA_upload = driver.find_element(By.XPATH, "//*[@id='fileSelect']")
            MSA_path = os.path.join(current_path, f"{self._pdb_id}_MSA.fasta")
            MSA_upload.send_keys(MSA_path)
            print("Fetching query sequences...")

            # Wait for the page to load
            WebDriverWait(driver, 90).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='queryName']"))
            )

            # Choose query sequence name
            QS = driver.find_element(By.XPATH, "//*[@id='queryName']")
            QS_drop = Select(QS)
            QS_drop.select_by_visible_text(query_name)

            # Update selection
            update_selection = driver.find_element(
                By.XPATH, "/html/body/div[3]/div/form/div[3]/div[2]/div[1]/div[2]/a[2]"
            )
            update_selection.click()

            # Wait for the page to load
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='yes_no_box']/div[1]/label/input"))
            )

            # No Tree file to upload
            TREE_yes_no = driver.find_element(By.XPATH, "/html/body/div[3]/div/form/div[5]/div[1]/div[2]/label/input")
            TREE_yes_no.click()

            # Wait for the page to load
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='qtitle']"))
            )

            # Provide a job name
            job_name = driver.find_element(By.ID, "qtitle")
            job_name.send_keys(self._job_id)

            # Provide a Email to receive update
            email = driver.find_element(By.XPATH, "/html/body/div[3]/div/form/div[6]/div[8]/input")
            email.send_keys(self._email)

            # Submit Job
            print("Submitting job...")
            submit_button = driver.find_element(By.XPATH, "/html/body/div[3]/div/form/div[6]/input")
            final_url = driver.current_url
            submit_button.click()

            # Wait for the page to load
            WebDriverWait(driver, 90).until(
                EC.url_changes(final_url)
            )

            # Read the confirmation text once; the id used in the results URL is its
            # eighth whitespace-separated token with the trailing character removed.
            confirmation = driver.find_element(By.XPATH, "/html/body/div[3]/b").text
            print(confirmation)
            server_job_id = confirmation.split()[7][:-1]

            # Block (up to 36000 s) until the link at this location becomes clickable.
            WebDriverWait(driver, 36000).until(
                EC.element_to_be_clickable((By.XPATH, "/html/body/div[3]/ul[9]/li/a"))
            )

            # NOTE(review): verify=False disables TLS certificate verification; kept from the
            # original behaviour - confirm it is still required for this host.
            result = requests.get(
                f"https://consurf.tau.ac.il/results/{server_job_id}/consurf.grades",
                verify=False,
                timeout=120,
            )
            # Fail loudly instead of saving an HTTP error page as if it were the grades file.
            result.raise_for_status()
            with open(f"{self._pdb_id}_CONS.txt", "w") as out:
                out.write(result.text)
        finally:
            # Always release the browser, including when a wait times out or a locator fails.
            driver.quit()