-
Notifications
You must be signed in to change notification settings - Fork 2
/
visualize_query_ratios.py
210 lines (174 loc) · 7.79 KB
/
visualize_query_ratios.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/python
import numpy as np
import pandas as pd
import math
import argparse
import re
from collections import Counter
from bokeh.io import output_file, show
from bokeh.models import HoverTool
from bokeh.plotting import figure
from bokeh.transform import linear_cmap
from bokeh.transform import log_cmap
from bokeh.util.hex import hexbin
from bokeh.util.hex import cartesian_to_axial
from bokeh.palettes import Blues9
from bokeh.palettes import Reds9
from bokeh.models import LinearColorMapper, BasicTicker, ColorBar
from bokeh.layouts import column, row
def create_data_source(table, size, orientation, ratio):
'''
This function recieves x and y vectors and returns a dataframe with hexiconal coordinates + counts
'''
# Create array with mass and logP values
x, y = create_array(table)
q, r = cartesian_to_axial(x, y, size, orientation=orientation, aspect_scale=ratio)
# create dataframe with hexagonal coordinates as rows (row = hexagon)
df = table.reset_index()
df.loc[:,"q"] = q
df.loc[:,"r"] = r
df = df.drop(['ChEBI', 'Class', 'Names'], axis=1)
df = df.groupby(['q', 'r']).agg({'Count': 'sum', 'TFIDF': 'sum'}).reset_index()
return df
def import_table(file):
'''
This function imports a pickle file (from the 'table' folder) and returns a dataframe
'''
table = pd.read_pickle(file)
return table
def create_array(table):
'''
This function recieves a dataframe with logP and Mass values for every ChEBI identifier.
It returns two numpy arrays: for mass and logP.
'''
# create lists
x = [float(logP) for logP in table.logP]
y = [float(mass) for mass in table.Mass]
return np.asarray(x), np.asarray(y)
def get_ratio(x, y, total_x, total_y, lb, normalize):
'''
This function is used to calculate the ratio between counts of two dataframes in one hexagon.
If the normalize argument is given, then both counts in the hexaon will be normalized for the total amount of counts found in the dataframes.
'''
if normalize:
factor = total_x / total_y
if total_x < total_y:
if x > lb:
x = x / factor
else:
if y > lb:
y = y * factor
ratio = x / y
return ratio
def calculate_ratios(df1, df2, lb, normalize):
'''
This function recieves two dataframes, merges these together and calculates the count ratio's for every hexagon.
The ratio per hexagon will be represented in the plot with red (high ratio), blue (low ratio), and white (equal) colors.
Two dataframes for low and high ratio levels are returned.
'''
# Merge dataframes
df_merged = pd.merge(df1, df2, how='outer', on=['q', 'r'])
# Apply lower bound to every hexagon
df_merged.loc[:,['Count_x', 'Count_y']] = df_merged.loc[:,['Count_x', 'Count_y']].fillna(lb)
df_merged.loc[df_merged.Count_x < lb, "Count_x"] = lb
df_merged.loc[df_merged.Count_y < lb, "Count_y"] = lb
# Get total counts
total_count_x = df_merged.Count_x.sum()
total_count_y = df_merged.Count_y.sum()
# Calculate ratio's per hexagon
df_merged.loc[:,'ratio'] = [get_ratio(x, y, total_count_x, total_count_y, lb, normalize) for x, y in zip(df_merged.Count_x, df_merged.Count_y)]
df_merged.loc[:,'log_ratio'] = np.log(df_merged.ratio)
# Divide dataframe into low and high ratio dataframe (for plot colors)
minimum = min(df_merged.log_ratio)
maximum = max(df_merged.log_ratio)
df_ratio_low = df_merged[df_merged.log_ratio < 0]
df_ratio_high = df_merged[df_merged.log_ratio >= 0]
return df_ratio_low, df_ratio_high, minimum, maximum
def normalize_df(df):
'''
This function applies normalization tot the total counts in a dataframe
'''
total_count = df['counts'].sum()
df['counts'] = df['counts'] / total_count
return df
def find_max_values(table_1, table_2):
'''
This function recieves the input tables, creates x and y arrays and finds the min and max values for the plot.
'''
# Find max values to calculate ratio for the plot
x, y = create_array(table_1)
xs, ys = create_array(table_2)
max_x = max(np.append(x, xs))
min_x = min(np.append(x, xs))
max_y = max(np.append(y, ys))
min_y = min(np.append(y, ys))
ratio = (max_y - min_y) / (max_x - min_x)
return max_x, min_x, max_y, min_y, ratio
def plot_ratio(table_1, table_2, query_1, query_2, normalize):
# Define constants
SIZE = 10
LOWER_BOUND = 2
ORIENTATION = "pointytop"
# Plot variables
title = "Hexbin plot comparing %s with %s" % (query_1, query_2)
max_x, min_x, max_y, min_y, ratio = find_max_values(table_1, table_2)
# Create figure
p = figure(title=title, match_aspect=True, aspect_scale = ratio, x_range = [min_x, max_x],y_range=[min_y, max_y],
tools="wheel_zoom,reset, save", background_fill_color= '#D3D3D3')
p.grid.visible = False
# Create hexagonal dataframes (turn LogP and Mass values into hexagonal coordinates)
df1 = create_data_source(table_1, SIZE, ORIENTATION, ratio)
df2 = create_data_source(table_2, SIZE, ORIENTATION, ratio)
# Calculate ratio's for plot dataframes, and min max values for
df_ratio_low, df_ratio_high, minimum, maximum = calculate_ratios(df1, df2, LOWER_BOUND, normalize)
extreme = max(abs(minimum), maximum)
# Reverse list of colors for color bar
red_reversed = list(Reds9)
red_reversed.reverse()
# Create blue and red hex tiles for low and high ratio's.
p.hex_tile(q="q", r="r", size=SIZE, line_color=None, source=df_ratio_low,aspect_scale=ratio,
fill_color=linear_cmap('log_ratio', 'Blues9', -extreme, 0))
p.hex_tile(q="q", r="r", size=SIZE, line_color=None, source=df_ratio_high,aspect_scale=ratio,
fill_color=linear_cmap('log_ratio', red_reversed, 0, extreme))
hover = HoverTool(tooltips=[("log_ratio", "@log_ratio")])
p.add_tools(hover)
pbr = list(Blues9) + ['#FFFFFF'] + red_reversed
# Color bar
color_mapper = LinearColorMapper(palette=pbr, low=-5, high=5)
color_bar = ColorBar(color_mapper=color_mapper, ticker=BasicTicker(),major_label_overrides={4: 'More '+str(query_1)+' counts',-4:"More "+str(query_2)+" counts"},major_label_text_align='left',
label_standoff=6, border_line_color=None, location=(0,0))
# Color bar is added to a dummy figure, to prevent the shrinking of the original plot
dummy = figure(
toolbar_location=None,
min_border=0,
outline_line_color=None)
dummy.add_layout(color_bar, 'left')
dummy.title.align = 'center'
dummy.title.text_font_size = '10pt'
# Output
output_file("plots/%s_vs_%s.html" % (query_1, query_2))
layout = row(p, dummy)
show(layout)
def parser():
parser = argparse.ArgumentParser(description='This script makes a table of the query IDs, their names and their properties')
parser.add_argument('-i', required=False, metavar='input_file', dest='input_file', help='[i] to select input file from the results folder')
parser.add_argument('-i2', required=False, metavar='input_file_2', dest='input_file_2', help='[i] to select second input file from the results folder')
parser.add_argument('-n', default=False, action='store_true', help='[n] to select normalization')
arguments = parser.parse_args()
return arguments
def main():
# Arguments
args = parser()
file_1 = args.input_file
file_2 = args.input_file_2
normalize = args.n
# Import input files
table_1 = import_table(file_1) # , term ?
table_2 = import_table(file_2)
# Get query names from input (Windows/Linux)
query_1 = re.split(r'[\\/]',file_1)[1].split('_')[0]
query_2 = re.split(r'[\\/]',file_2)[1].split('_')[0]
# Plot!
plot_ratio(table_1, table_2, query_1, query_2, normalize)
if __name__ == '__main__':
main()