-
Notifications
You must be signed in to change notification settings - Fork 10
/
PathLinker.py
312 lines (227 loc) · 9.27 KB
/
PathLinker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
"""
PathLinker finds high-scoring paths that connect a source node set to a
target node set in a large, optionally weighted directed network. To
reconstruct signaling pathways, PathLinker returns high-scoring paths
from any receptor protein (source node) to any transcriptional regulator
(target node) in a protein-protein interactome.
See the following paper for more details:
Pathways on Demand: Automated Reconstruction of Human Signaling Networks.
Anna Ritz, Christopher L. Poirel, Allison N. Tegge, Nicholas Sharp, Allison Powell, Kelsey Simmons, Shiv D. Kale, and T. M. Murali.
npj Systems Biology and Applications, 2, Article number 16002, 2016
Virginia Tech, Blacksburg, VA
This code is authored by:
Mitch Wagner: [email protected]
Nicholas Sharp: [email protected]
Anna Ritz: [email protected]
Christopher L. Poirel: [email protected]
T. M. Murali: [email protected]
"""
from math import log
import networkx as nx
def modifyGraphForKSP_removeEdgesToSources(net, sources):
"""
Modifies the structure of a graph by removing all edges entering
sources. These edges will never contribute to a path, according
to the PathLinker problem formulation.
:param net: NetworkX graph
:param sources: List of source nodes
"""
# We will never use edges leaving a target or entering a source, since
# we do not allow start/end nodes to be internal to any path.
for u,v in net.edges():
if not net.has_edge(u, v):
continue
# remove edges coming into sources
elif v in sources:
net.remove_edge(u,v)
return
def modifyGraphForKSP_removeEdgesFromTargets(net, targets):
"""
Modifies the structure of a graph by removing all edges existing
targets. These edges will never contribute to a path, according
to the PathLinker problem formulation.
:param net: NetworkX graph
:param sources: List of target nodes
"""
# We will never use edges leaving a target or entering a source, since
# we do not allow start/end nodes to be internal to any path.
for u,v in net.edges():
if not net.has_edge(u, v):
continue
# remove edges leaving targets
elif u in targets:
net.remove_edge(u,v)
return
def modifyGraphForKSP_addSuperSourceSink(net, sources, targets, weightForArtificialEdges=0):
"""
Modifies the structure of a graph by adding a supersource with an
edge to every source, and a supersink with an edge from every
target. These artificial edges are given the weight
'weightForArtificialEdges'. A weight of 0 should correspond to a
"free" edge in the current interpretation of the graph.
:param net: NetworkX graph
:param sources: List of source nodes
:param targets: List of target nodes
:param weightForArtificialEdges: Weight to be assigned to edges
created by this process
"""
# Add a supersource and supersink. Shortest paths from source to sink are the same
# as shortest paths from "sources" to "targets".
for s in sources:
net.add_edge('source', s, weight=1)
net.edge['source'][s]['ksp_weight'] = weightForArtificialEdges
for t in targets:
net.add_edge(t, 'sink', weight=1)
net.edge[t]['sink']['ksp_weight'] = weightForArtificialEdges
return
def applyEdgePenalty(net, weight):
"""
Applies a user-specified edge penalty to each edge. This weight
penalizes the score of every path by a factor equal to
(the number of edges in the path)^(this factor).
This was previously done in the logTransformEdgeWeights method
with a parameter weight=(sum of all edge weights). In the
"standard" PathLinker case, this was necessary to account for the
probability that is lost when edges are removed in
modifyGraphForKSP_removeEdges(), along with the probability lost
to zero degree nodes in the edge flux calculation.
:param net: NetworkX graph
:param weight: user-specified edge penalty
"""
if weight == 1.0:
return
for u,v in net.edges():
w = net.edge[u][v]['ksp_weight']/weight
net.edge[u][v]['ksp_weight'] = w
return
def logTransformEdgeWeights(net):
"""
Apply a negative logarithmic transformation to edge weights,
converting multiplicative values (where higher is better) to
additive costs (where lower is better).
Before the transformation, weights are normalized to sum to one,
supporting an interpretation as probabilities.
If the weights in the input graph correspond to probabilities,
shortest paths in the output graph are maximum-probability paths
in the input graph.
:param net: NetworkX graph
"""
for u,v in net.edges():
w = -log(max([0.000000001, net.edge[u][v]['ksp_weight']]))/log(10)
net.edge[u][v]['ksp_weight'] = w
return
def undoLogTransformPathLengths(paths):
"""
Undoes the logarithmic transform to path lengths, converting the
path lengths back into terms of the original edge weights
:param paths: paths to apply the transform to
"""
new_paths_list = []
# Reconstructs the path list with each edge distance un-log transformed.
# We build a new list because tuples are unmodifiable.
for path in paths:
new_path = [(x[0], 10 ** (-1 * x[1])) for x in path]
new_paths_list.append(new_path)
return new_paths_list
def calculateFluxEdgeWeights(net, nodeWeights):
"""
Given a probability distribution over the nodes, calculate the
probability "flowing" through the outgoing edges of every node.
Used to assign edge weights after PageRank-ing nodes.
:param net: NetworkX graph
:parma nodeWeights: dictionary containing weights of the nodes
"""
# the flux score for and edge (u,v) is f_uv = (w_uv p_u)/d_u where
# w_uv is the weight of the edge (u,v), p_u is the normalized visitation
# probability (or node score) for u, and d_u is the weighted outdegree of node u.
# assign EdgeFlux scores to the edges
for u,v in net.edges():
w = nodeWeights[u] * net[u][v]['weight']/net.out_degree(u, 'weight')
net.edge[u][v]['ksp_weight'] = w
return
def printKSPGraph(f, graph):
"""
Print the edge in the k shortest paths graph computed by
PathLinker. This creates a tab-delimited file with one edge per
line with three columns: tail, head, and KSP index.
Here, 'ksp index' indicates the index of the first shortest path
in which the edge is used.
:param f: Name of the file to write out
:param graph: NetworkX graph
"""
outf = open(f, 'w')
outf.write('#tail\thead\tKSP index\tpath cost\n')
edges = graph.edges(data=True)
# Print in increasing order of KSP identifier.
for e in sorted(edges, key=lambda x: x[2]['ksp_id']):
t, h, attr_dict = e
ksp = attr_dict['ksp_id']
cost = attr_dict['path_cost']
outf.write('%s\t%s\t%d\t%0.5e\n' %(t, h, ksp, cost))
outf.close()
return
def printKSPPaths(f, paths):
"""
Print the k shortest paths in order. This creates a tab-delimited
file with three columns: the rank of the path (which k it was
returned for), the length of the path (sum of weights), and the
sequence of nodes in the path.
:param f: Name of the file to write out
:param paths: The paths to print out
"""
outf = open(f, 'w')
outf.write('#KSP\tpath_length\tpath\n')
for k,path in enumerate(paths, 1):
pathNodes = [n for n,w in path]
length = path[-1][1]
outf.write('%d\t%0.5e\t%s\n' %(k, length, '|'.join(pathNodes[1:-1]) ))
outf.close()
return
def printEdgeFluxes(f, graph):
"""
Print the edges with the flux weight. Sort by decreasing flux
weight.
:param f: Name of the file to write out
:param graph: NetworkX graph
"""
outf = open(f, 'w')
outf.write('#tail\thead\tedge_flux\n')
edges = graph.edges(data=True)
# Print in decreasing flux weight
for e in sorted(edges, key=lambda x: x[2]['ksp_weight'], reverse=True):
t, h, attr_dict = e
w = attr_dict['ksp_weight']
outf.write('%s\t%s\t%0.5e\n' %(t, h, w))
outf.close()
return
def readNetworkFile(network_file, pagerank=False):
net = nx.DiGraph()
# Read the network file
# infile = open(network_file, 'r')
for line in network_file:
items = [x.strip() for x in line.rstrip().split('\t')]
# Skip empty lines or those beginning with '#' comments
if line=='\n':
continue
if line[0]=='#':
continue
id1 = items[0]
id2 = items[1]
# Ignore self-edges
if id1==id2:
continue
# Possibly use an edge weight
eWeight = 1
if (not pagerank):
if(len(items) > 2):
eWeight = float(items[2])
else:
raise Exception(
"ERROR: All edges must have a weight "
"unless --PageRank is used. Edge (%s --> %s) does "
"not have a weight entry." % (id1, id2))
# Assign the weight. Note in the PageRank case, "weight" is
# interpreted as running PageRank and edgeflux on a weighted
# graph.
net.add_edge(id1, id2, ksp_weight=eWeight, weight=eWeight)
return net