-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsePPD.py
164 lines (125 loc) · 5.71 KB
/
parsePPD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 09 13:28:30 2016
Compile Plasma Proteome Database into FASTA format
@author: vgor
"""
import re
import requests
import numpy as np
from os import walk, stat
from io import StringIO
from lxml import etree
from pandas import DataFrame
from pyteomics import fasta
xmlPath = "E:\\Fasta\\PlasmaProteomeDB\\Experimental_evidence\\"
fastaPath = "E:\\Fasta\\"
concentrationPath = "E:\\Fasta\\PlasmaProteomeDB\\Protein_concentration.xml"
resDir = "E:\\RawData\\20161120_NP\\res\\"
#ill-formed XML tag
#groups are
#0 - opening brace
#1 - element name with spaces
#2 - attributes
#3 - closing brace
invalidXMLTag = re.compile(r'(<\/*)(\w+(?:\s\w+(?!\s*=\s*".*?"))+)((?:\s\w+\s*=\s*".*?")*)(\/*>)')
def curePPDSyntax(filename):
"""
Remove spaces in XML tags that come from PPD
i.e. <Experimental evidence> become <ExperimentalEvidence>
"""
with open(filename, "r") as fileIn:
text = fileIn.read()
curedText = ""
pos = 0
for match in invalidXMLTag.finditer(text):
parts = match.groups()
#groups are
#0 - opening brace
#1 - element name with spaces
#2 - attributes
#3 - closing brace
curedTag = parts[0] + "".join(map(str.capitalize, parts[1].split())) + parts[2] + parts[3]
curedText += text[pos:match.start()] + curedTag
pos = match.end()
curedText += text[pos:]
return curedText
def queryUniprot(accessionList):
"""
Retrieve list of accessions from Uniprot in FASTA format
"""
params = {
'from':'ACC',
'to':'ACC',
'format':'fasta',
'query':' '.join(accessionList)
}
return requests.post('http://www.uniprot.org/uploadlists/', params).text
data = []
for name in walk(xmlPath, False).next()[2]:
if stat(xmlPath + name).st_size > 0: #ignore empty files
d = etree.fromstring(curePPDSyntax(xmlPath + name))
#collect protein information
protein = {}
protein["filename"] = name
protein["ID"] = d.xpath("//protein")[0].get("ID")
protein["Description"] = d.xpath("//protein/title")[0].text.strip()
protein["GeneID"] = d.xpath("//protein/gene_symbol")[0].text.strip()
protein["UniprotID"] = d.xpath("//protein/swissprot")[0].text.strip()
protein["NrReference"] = len(d.xpath("//Experimental_evidence[@Reference]"))
protein["NrPlasma"] = len(d.xpath("//Experimental_evidence[@Sample=\"Plasma\"]"))
data.append(protein)
data = DataFrame(data)#convert to DataFrame for easier manin=pulation
#keep only entries with more than 1 experimental evidence in total and at least one in plasma
ids = data.loc[np.logical_and(data["NrPlasma"] > 0, data["NrReference"] > 1), "UniprotID"].values
print("Number of entries from PPD:", len(ids))
#remove entries whitout SwissProt identifier
ids = ids[np.logical_and(ids != "-", ids != "None")]
print("Removed empty SwissProt identifiers:", len(ids))
#Has only a single SwissProt id
singleIDs = filter(lambda x: len(x) == 6, ids)
print("Have single ID:", len(singleIDs))
#some ids are duplicates, thus we will use set
singleIDfasta = queryUniprot(set(singleIDs))
#collect data from fasta
proteins = [{"Description": p.description, "Sequence": p.sequence}
for p in fasta.read(StringIO(singleIDfasta))]
multipleIDs = filter(lambda x: len(x) > 6, ids)
print("Have multiple IDs:", len(multipleIDs))
multipleIDfasta = ""
for ID in multipleIDs:
fastaText = queryUniprot(ID.split(","))
multipleIDfasta += fastaText
#collect further data
proteins.extend([{"Description": p.description, "Sequence": p.sequence}
for p in fasta.read(StringIO(multipleIDfasta))])
proteins = DataFrame(proteins)
proteins["ID"] = proteins["Description"].apply(lambda s: s.split("|")[1])#extract UniprotID
proteins["db"] = proteins["Description"].apply(lambda s: s.split("|")[0])#extract database type (SwissProt or TrEMBL)
print("Number of proteins in fasta:", len(proteins))
proteins.drop_duplicates("ID", inplace = True)
print("Remove duplicates:", len(proteins))
print("SwissProt: {}; TrEMBL: {}".format(sum(proteins["db"] == "sp"), sum(proteins["db"] == "tr")))
#writing result
fasta.write(zip(proteins["Description"], proteins["Sequence"]), open(fastaPath + "PPD.fasta", "w"))
fasta.write_decoy_db(fastaPath + "PPD.fasta", fastaPath + "PPD+reverse.fasta", file_mode = "w")
fasta.write_decoy_db(fastaPath + "PPD.fasta", fastaPath + "DECOY_PPD.fasta", file_mode = "w", decoy_only = True)
#add contaminants
ppdProteins = [p for p in fasta.read(open(fastaPath + "PPD.fasta", "r"))]
contProteins = [p for p in fasta.read(open(fastaPath + "MQCont.fasta", "r"))]
fasta.write(ppdProteins + contProteins, open(fastaPath + "PPD_MQCont.fasta", "w"))
fasta.write_decoy_db(fastaPath + "PPD_MQCont.fasta", fastaPath + "PPD_MQCont+Rev.fasta", file_mode = "w")
#%% Plasma protein concentrations
def readConcentrations(concentrationPath):
#cleaning concentration XML
concXML = open(concentrationPath).read()
concXML = re.sub(r"±", "±", concXML)#substitute html to utf-8
concXML = re.sub(r"encoding=\".+?\"(?=.+\?>)", "", concXML)#remove encoding
xmlData = etree.fromstring(concXML)
concData = DataFrame([dict(row.attrib) for row in xmlData.xpath("//Row")])
concData = concData.rename(columns=concData.iloc[0]).drop(0)
#further cleaning
concData["concentration"] = concData["Protein concentration"].str.replace("·", ".")
concData["concentration"] = concData["concentration"].str.replace("[^\\d\\.\\+\\-±]", "")
return concData.drop("Protein concentration", axis = 1)
readConcentrations(concentrationPath).to_csv("{}plasma_protein_concentrations.csv".format(resDir), index=False)