forked from Jamesyang2333/SAM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluate.py
230 lines (189 loc) · 7.81 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import numpy as np
import pandas as pd
import random
import time
OPS = {
'>': np.greater,
'<': np.less,
'>=': np.greater_equal,
'<=': np.less_equal,
'=': np.equal
}
def Query(table, columns, operators, vals, return_masks=False, return_crad_and_masks=False):
assert len(columns) == len(operators) == len(vals)
bools = None
# print(table.name)
for c, o, v in zip(columns, operators, vals):
if table.name in ['DMV', 'Census']:
inds = [False] * table.cardinality
inds = np.array(inds)
is_nan = pd.isnull(c.data)
if np.any(is_nan):
v = np.array(v).astype(c.data.dtype)
# print(type(v))
# print(v)
inds[~is_nan] = OPS[o](c.data[~is_nan], v)
else:
v = np.array(v).astype(c.data.dtype)
inds = OPS[o](c.data, v)
else:
v = np.array(v).astype(c.data.dtype)
inds = OPS[o](c.data, v)
# print(v)
# print(v.dtype)
# print(c.name)
# print(c.data.dtype)
# print(c.data)
# print(o)
if bools is None:
bools = inds
else:
bools &= inds
c = bools.sum()
if return_masks:
return bools
elif return_crad_and_masks:
return c, bools
return c
def ComputeCE(gt_table, gen_table, gt_caches, eps=1e-9):
# fill nan value with -1 for both data frames
gt_table = gt_table.fillna(-1)
gen_table = gen_table.fillna(-1)
col_names = gt_table.columns.tolist()
unique_rows = list(gt_table.groupby(col_names).groups)
ce = 0.
if not len(gt_caches):
gt_counts_df = gt_table.groupby(
col_names).size().reset_index(name='counts')
gen_counts_df = gen_table.groupby(
col_names).size().reset_index(name='counts')
for row in unique_rows:
value = row
value_str = ','.join([str(item) for item in row])
if value_str in gt_caches:
gt_prob = gt_caches[value_str]
else:
gt_prob = gt_counts_df[gt_counts_df[col_names[0]] == value[0]]
for i in range(len(col_names) - 1):
gt_prob = gt_prob[gt_prob[col_names[i + 1]] == value[i + 1]]
if len(gt_prob) == 0:
print(row)
# gt_prob = gt_counts_df[gt_counts_df[col_names[0]] == value[0]]
# if len(gt_prob) == 0:
# if value[0] != value[0]:
# print(value[0])
# print(gt_counts_df[gt_counts_df[col_names[0]].isnull()])
# print(gt_prob)
# for i in range(len(col_names) - 1):
# gt_prob = gt_prob[gt_prob[col_names[i + 1]] == value[i + 1]]
# print(gt_prob)
gt_prob = gt_prob.iloc[0]['counts'] / len(gt_table)
gt_caches[value_str] = gt_prob
gen_prob = gen_counts_df[gen_counts_df[col_names[0]] == value[0]]
for i in range(len(col_names) - 1):
gen_prob = gen_prob[gen_prob[col_names[i + 1]] == value[i + 1]]
if len(gen_prob) > 0:
gen_prob = gen_prob.iloc[0]['counts'] / len(gen_table)
else:
gen_prob = eps
ce -= gt_prob * np.log(gen_prob)
return ce
def relation_generation(sample_frame_dict, join_name_set):
title_cols = sample_frame_dict['title'].columns.tolist()
sample_frame_dict['title'] = sample_frame_dict['title'].fillna(-1)
# generate the other relations from views
missing_rows = {}
for col in title_cols:
missing_rows[col] = []
union_row_difference = set()
initial_title_rows = list(
sample_frame_dict['title'].groupby(title_cols).groups)
initial_title_rows_str = [
','.join([str(item) for item in row]) for row in initial_title_rows]
for name in join_name_set:
print(sample_frame_dict[name].dtypes)
for name in join_name_set:
if name == 'title':
continue
# fill nan value with -1 for both data frames
view_frame = sample_frame_dict[name].fillna(-1)
all_col = view_frame.columns.tolist()
print("all columns: {}".format(all_col))
relation_col = list(set(all_col).difference(title_cols))
print("columns of the join relation: {}".format(relation_col))
view_title_rows = list(view_frame.groupby(title_cols).groups)
for row in view_title_rows:
if len(str(row[0])) == 1:
print(name)
print(row)
view_title_rows_str = [
','.join([str(item) for item in row]) for row in view_title_rows]
row_difference = set(view_title_rows_str).difference(
set(initial_title_rows_str))
if row_difference:
union_row_difference = union_row_difference | row_difference
print("view {} contain {} rows not found in title".format(
name, len(row_difference)))
print("all views together contain {} rows not found in title: {}".format(
len(union_row_difference), union_row_difference))
for row_str in union_row_difference:
row_val = row_str.split(',')
for i in range(len(title_cols)):
missing_rows[title_cols[i]].append(row_val[i])
missing_row_frame = pd.DataFrame.from_dict(missing_rows)
missing_row_frame = missing_row_frame.astype({'title.kind_id': sample_frame_dict['title'].dtypes[
'title.kind_id'], 'title.production_year': sample_frame_dict['title'].dtypes['title.production_year']})
sample_frame_dict['title'] = sample_frame_dict['title'].append(
missing_row_frame)
sample_frame_dict['title']['id'] = list(
range(sample_frame_dict['title'].shape[0]))
title_frame = sample_frame_dict['title']
renameDict = {}
for col in title_cols:
renameDict[col] = col.split('.')[1]
print(renameDict)
title_frame.rename(columns=renameDict, inplace=True)
title_val_dict = {}
for row in initial_title_rows_str:
title_val_dict[row] = []
for row in union_row_difference:
title_val_dict[row] = []
for idx, row in title_frame.iterrows():
row_str = ",".join([str(row['kind_id']), str(row['production_year'])])
title_val_dict[row_str].append(idx)
for name in join_name_set:
fk_list = []
if name == 'title':
continue
start = time.time()
# fill nan value with -1 for both data frames
view_frame = sample_frame_dict[name].fillna(-1)
for i in range(view_frame.shape[0]):
row = view_frame.iloc[i]
row_str = ",".join(
[str(row['title.kind_id']), str(row['title.production_year'])])
fk = random.choice(title_val_dict[row_str])
fk_list.append(fk)
end = time.time()
print("Total time takes for assigning foreign keys: {}".format(
end - start))
all_cols = view_frame.columns.tolist()
renameDict = {}
for col in all_cols:
renameDict[col] = col.split('.')[1]
view_frame.rename(columns=renameDict, inplace=True)
fk_table_name = name.split(',')[0]
view_frame['movie_id'] = fk_list
saved_cols = view_frame.columns.tolist()
saved_cols.remove('kind_id')
saved_cols.remove('production_year')
# undo fillna operation
view_frame = view_frame.applymap(lambda x: np.nan if x == -1 else x)
view_frame.to_csv(
'./generated_tables_60/{}.csv'.format(fk_table_name), columns=saved_cols, index=False)
print("Saved {} into csv.".format(fk_table_name))
# undo fillna operation for title frame
title_frame = title_frame.applymap(lambda x: np.nan if x == -1 else x)
print(title_frame.head())
title_frame.to_csv('./generated_tables_60/title.csv', index=False)
print("Saved title into csv.")