forked from Jamesyang2333/SAM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_loader.py
104 lines (81 loc) · 3.09 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import json
import datasets
from data_utils import MakeTable
import torch
import csv
from torch.utils.data import Dataset, DataLoader
def load_dataset(dataset, file):
    """Load a JSON workload file and keep only queries with positive cardinality.

    The JSON document must contain the keys ``'card_list'`` and
    ``'query_list'``.  Every cardinality is converted to ``float`` and divided
    by ``table.cardinality`` of the table returned by ``MakeTable(dataset)``.
    Queries whose normalized cardinality is not greater than zero are dropped.

    Args:
        dataset: Passed unchanged to ``MakeTable``.
        file: Path of the JSON workload file (read as UTF-8).

    Returns:
        A dict with the parallel lists ``"column"``, ``"operator"``, ``"val"``
        (taken from positions 0, 1 and 2 of each kept query) and ``"card"``
        (the normalized cardinality of each kept query).
    """
    with open(file, 'r', encoding="utf8") as handle:
        stats = json.load(handle)
    raw_cards = stats['card_list']
    queries = stats['query_list']

    # Only the first element of MakeTable's result is needed here.
    table, _ = MakeTable(dataset)
    normalized = [float(raw) / table.cardinality for raw in raw_cards]

    result = {"column": [], "operator": [], "val": [], "card": []}
    for position, query in enumerate(queries):
        selectivity = normalized[position]
        if not selectivity > 0:
            continue
        cols, ops, vals = query[0], query[1], query[2]
        result["column"].append(cols)
        result["operator"].append(ops)
        result["val"].append(vals)
        result["card"].append(selectivity)
    return result
class RelationDataset(Dataset):
    """Dataset wrapping three index-aligned sequences: data, label_log and label."""

    # Names of the attributes exposed per sample, in output order.
    _FIELDS = ("data", "label_log", "label")

    def __init__(self, data, label_log, label):
        """Store the three sequences as given; nothing is copied or validated."""
        self.data = data
        self.label_log = label_log
        self.label = label

    def __len__(self):
        """Return the number of samples, taken from the length of ``label``."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return a dict holding the ``idx``-th entry of each stored sequence."""
        return {field: getattr(self, field)[idx] for field in self._FIELDS}
def collate_fn(batch):
    """Collate a list of sample dicts into a dict of ``torch.FloatTensor``.

    Args:
        batch: Sequence of dicts, each providing the keys ``"data"``,
            ``"label_log"`` and ``"label"``.

    Returns:
        A dict with the same three keys; each value is a ``torch.FloatTensor``
        built from that key's entries across the batch, in batch order.
    """
    return {
        key: torch.FloatTensor([sample[key] for sample in batch])
        for key in ("data", "label_log", "label")
    }
def get_loader(data, label_log, label, batch_size):
    """Build a ``DataLoader`` over a ``RelationDataset`` of the given sequences.

    Args:
        data: Per-sample inputs.
        label_log: Per-sample values stored under ``"label_log"``.
        label: Per-sample values stored under ``"label"``.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` that batches with ``collate_fn``.  No other
        ``DataLoader`` options are set, so its defaults apply.
    """
    return DataLoader(
        RelationDataset(data, label_log, label),
        batch_size=batch_size,
        collate_fn=collate_fn,
    )
def _split_predicates(predicate_field):
    """Split a comma-separated ``col,op,val,...`` field into three parallel lists."""
    tokens = predicate_field.split(',')
    # Keep only complete (column, operator, value) triples; leftover tokens,
    # including the single '' produced by an empty field, are ignored.
    usable = tokens[:len(tokens) // 3 * 3]
    cols = usable[0::3]
    operators = usable[1::3]
    # Get rid of leading/trailing single quotes around values, if present.
    vals = [value.strip("'") for value in usable[2::3]]
    return cols, operators, vals


def load_dataset_join(file_name, tables_list, loaded_tables):
    """Parse a ``#``-delimited join workload file into a list of query dicts.

    Each non-blank row must have at least four ``#``-separated fields:
    a comma-separated table list, the join condition, a comma-separated flat
    list of ``column,operator,value`` triples, and a numeric label.

    Args:
        file_name: Path of the workload file.
        tables_list: Unused; kept so existing callers keep working.
        loaded_tables: Unused; kept so existing callers keep working.

    Returns:
        A list with one dict per non-blank row, holding the keys
        ``"tables"`` (list of str), ``"join_con"`` (str), ``"predicates"``
        (``[columns, operators, values]``) and ``"label"`` (float).

    Raises:
        IndexError: If a non-blank row has fewer than four fields.
        ValueError: If the label field cannot be converted to ``float``.
    """
    data = []
    # newline="" is the csv module's recommended way to open its input file.
    with open(file_name, newline="") as csv_file:
        for row in csv.reader(csv_file, delimiter='#'):
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing on row[0].
                continue
            cols, operators, vals = _split_predicates(row[2])
            data.append({
                "tables": row[0].split(','),
                "join_con": row[1],
                "predicates": [cols, operators, vals],
                "label": float(row[3]),
            })
    return data