-
Notifications
You must be signed in to change notification settings - Fork 0
/
spotlight_helpers.py
240 lines (207 loc) · 11.2 KB
/
spotlight_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import glob
import itertools
import os

import numpy as np
import torch

from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
from spotlight.interactions import Interactions
def train_load(filename):
    """
    Read `<filename>.csv` from the current folder and return its columns.

    The CSV must have a header row with userID, itemID and rating columns.
    Returns a (users, items, ratings) tuple of numpy arrays.
    """
    # '' prefix means the file is looked up in the working directory
    URL_PREFIX = ''
    path = URL_PREFIX + filename + '.csv'
    # names=True uses the header row; dtype pins (int, int, float) columns
    records = np.genfromtxt(path, delimiter=',', names=True,
                            dtype=(int, int, float))
    return (records['userID'], records['itemID'], records['rating'])
def test_load(filename):
"""
takes filename, reads filename.csv in the same folder as code running
assumes userID, itemID columns are present
"""
# set extension to .csv
extension = '.csv'
# set URL_PREFIX, set to "" to check the same folder
URL_PREFIX = ''
# read data using np.genfromtxt
data = np.genfromtxt(URL_PREFIX + filename + extension, delimiter=',', names=True, dtype=(int, int))
# take userID column
users = data['userID']
# take itemID column
items = data['itemID']
# return in a tuple
return (users, items)
def get_train(myfile):
    """
    Build a training spotlight.interactions.Interactions from `<myfile>.csv`.
    """
    users, items, ratings = train_load(myfile)
    return Interactions(users, items, ratings)
def get_test(myfile):
    """
    Build a testing spotlight.interactions.Interactions from `<myfile>.csv`.
    """
    users, items = test_load(myfile)
    return Interactions(users, items)
def train_spotlight_models(train, test, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, is_save = False):
    """
    Train one ExplicitFactorizationModel per hyper-parameter combination.

    Parameters
    ----------
    train, test, dataset_testing : spotlight.interactions.Interactions
        Training set, held-out test set, and the final testing set.
    embedding_dims, n_iters, batch_sizes, l2s, learning_rates : iterable
        Hyper-parameter grids; every combination (Cartesian product)
        is trained.
    is_save : bool, optional
        When True, each fitted model is saved under "models/".

    Returns
    -------
    tuple
        (preds_train_trains, preds_train_tests, preds_tests,
         train_rmses, test_rmses): per-model prediction lists for the
        three datasets, plus numpy arrays of per-model RMSE on train
        and test.
    """
    # collect rmses in plain lists; np.append in a loop copies the whole
    # array each iteration (quadratic) — convert once at the end instead
    train_rmse_list = []
    test_rmse_list = []
    # per-model predictions on train, test and dataset_testing
    preds_train_trains = []
    preds_train_tests = []
    preds_tests = []
    # itertools.product flattens the 5-level nested loop over the grid
    for embedding_dim, n_iter, batch_size, l2, learning_rate in itertools.product(
            embedding_dims, n_iters, batch_sizes, l2s, learning_rates):
        # single tag reused for the progress print and the save path
        tag = "embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(
            embedding_dim, n_iter, batch_size, l2, learning_rate)
        # initialize model; use GPU if torch.cuda.is_available(), else CPU
        model = ExplicitFactorizationModel(loss='regression',
                                           embedding_dim=embedding_dim,  # latent dimensionality
                                           n_iter=n_iter,  # number of epochs of training
                                           batch_size=batch_size,  # minibatch size
                                           l2=l2,  # strength of L2 regularization
                                           learning_rate=learning_rate,
                                           use_cuda=torch.cuda.is_available())
        # print which model is being trained
        print(tag)
        # fit model
        model.fit(train, verbose=True)
        # rmse on train and test sets
        train_rmse = rmse_score(model, train)
        test_rmse = rmse_score(model, test)
        train_rmse_list.append(train_rmse)
        test_rmse_list.append(test_rmse)
        print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
        # if is_save given, save the model to disk
        if is_save:
            torch.save(model, "models/" + tag)
        # store predictions of this model on all three datasets
        preds_train_trains.append(model.predict(train.user_ids, train.item_ids))
        preds_train_tests.append(model.predict(test.user_ids, test.item_ids))
        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))
    return (preds_train_trains, preds_train_tests, preds_tests,
            np.asarray(train_rmse_list), np.asarray(test_rmse_list))
def train_spotlight_models_using_all_data(train, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, verbose=True):
    """
    Train one ExplicitFactorizationModel per hyper-parameter combination
    on the full training data and save every fitted model to disk.

    Parameters
    ----------
    train, dataset_testing : spotlight.interactions.Interactions
        Full training set and the final testing set.
    embedding_dims, n_iters, batch_sizes, l2s, learning_rates : iterable
        Hyper-parameter grids; every combination (Cartesian product)
        is trained.
    verbose : bool, optional
        When True, print the parameter tag and training progress.

    Returns
    -------
    list
        Per-model predictions on dataset_testing.
    """
    # per-model predictions on dataset_testing
    preds_tests = []
    # itertools.product flattens the 5-level nested loop over the grid
    for embedding_dim, n_iter, batch_size, l2, learning_rate in itertools.product(
            embedding_dims, n_iters, batch_sizes, l2s, learning_rates):
        # single tag reused for the progress print and the save path
        tag = "embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(
            embedding_dim, n_iter, batch_size, l2, learning_rate)
        # initialize model; use GPU if torch.cuda.is_available(), else CPU
        model = ExplicitFactorizationModel(loss='regression',
                                           embedding_dim=embedding_dim,  # latent dimensionality
                                           n_iter=n_iter,  # number of epochs of training
                                           batch_size=batch_size,  # minibatch size
                                           l2=l2,  # strength of L2 regularization
                                           learning_rate=learning_rate,
                                           use_cuda=torch.cuda.is_available())
        if verbose:
            print(tag)
        # fit model using the full train dataset
        model.fit(train, verbose=verbose)
        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))
        # save model to disk
        torch.save(model, "models_all_data/" + tag)
    return preds_tests
def load_spotlight_models(train, test, dataset_testing, verbose=False):
    """
    Load every pretrained spotlight model saved under "models/" and
    evaluate it.

    Parameters
    ----------
    train, test, dataset_testing : spotlight.interactions.Interactions
        Datasets used to generate predictions; RMSE is computed on
        train and test.
    verbose : bool, optional
        When True, print each model filename as it is loaded.

    Returns
    -------
    tuple
        (preds_train_trains, preds_train_tests, preds_tests,
         train_rmses, test_rmses): per-model prediction lists plus
        numpy arrays of per-model RMSE on train and test.
    """
    preds_train_trains = []
    preds_train_tests = []
    preds_tests = []
    # collect rmses in lists; np.append in a loop is quadratic —
    # convert once at the end instead
    train_rmse_list = []
    test_rmse_list = []
    # glob order is OS-dependent; sort so results line up deterministically
    # across runs (and with other functions iterating the same folder)
    for file in sorted(glob.glob("models/*")):
        # print filenames, if given True
        if verbose:
            print(file)
        # load pickled model from disk
        # NOTE(review): assumes models were saved on a device available
        # here; pass map_location to torch.load if loading GPU models on CPU
        model = torch.load(file)
        # rmse on train and test datasets
        train_rmse_list.append(rmse_score(model, train))
        test_rmse_list.append(rmse_score(model, test))
        # predictions on train, test and dataset_testing datasets
        preds_train_trains.append(model.predict(train.user_ids, train.item_ids))
        preds_train_tests.append(model.predict(test.user_ids, test.item_ids))
        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))
    return (preds_train_trains, preds_train_tests, preds_tests,
            np.asarray(train_rmse_list), np.asarray(test_rmse_list))
def load_spotlight_models_using_all_data(dataset_testing, verbose=False):
    """
    Load every pretrained spotlight model saved under "models_all_data/"
    and predict on dataset_testing.

    Parameters
    ----------
    dataset_testing : spotlight.interactions.Interactions
        Dataset used to generate predictions.
    verbose : bool, optional
        When True, print each model filename as it is loaded.

    Returns
    -------
    list
        Per-model predictions on dataset_testing.
    """
    preds_tests = []
    # glob order is OS-dependent; sort so predictions line up
    # deterministically across runs
    for file in sorted(glob.glob("models_all_data/*")):
        # print filenames, if given True
        if verbose:
            print(file)
        # load pickled model from disk
        # NOTE(review): assumes models were saved on a device available
        # here; pass map_location to torch.load if loading GPU models on CPU
        model = torch.load(file)
        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))
    return preds_tests
def get_scores(train, test):
    """
    Return the rating arrays of the given train and test datasets.

    Both arguments are expected to expose a `.ratings` attribute
    (e.g. spotlight.interactions.Interactions); the two rating
    arrays are returned as a (train_ratings, test_ratings) pair.
    """
    return train.ratings, test.ratings