-
Notifications
You must be signed in to change notification settings - Fork 2
/
viki-videos-similarity.py
286 lines (263 loc) · 14.8 KB
/
viki-videos-similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
from __future__ import division
import pandas as pd
import time
import csv
import numpy as np
import datetime
import re
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics.pairwise import cosine_similarity
def read_data():
    """Read the raw CSV exports and build the all-pairs video matrix.

    Reads video attributes, casts and viewing behaviors from ./data, then:
      * flattens casts to one space-joined ``person_id`` string per container,
      * flattens behaviors to one space-joined ``"<user_id>_<score>"`` string
        per video (column ``user_id``),
      * left-joins both onto the video attributes,
      * cross-joins the result with itself (via a constant ``dummy`` key) so
        every row of the returned frame is one (left, right) video pair with
        ``_left`` / ``_right`` suffixed attribute columns.

    Returns:
        pandas.DataFrame: the all-pairs matrix described above.
    """
    videos = pd.read_csv('./data/20150701094451-Video_attributes.csv')
    casts = pd.read_csv('./data/20150701094451-Video_casts.csv')
    # Keep ids and scores as strings so they can be concatenated below.
    # (Fix: the `pd.np` accessor was removed in pandas 1.0; plain `str`
    # is the portable spelling of the same read_csv dtype request.)
    behaviors = pd.read_csv('./data/20150701094451-Behavior_training.csv',
                            dtype={'user_id': str, 'score': str})
    # we don't care about these for now
    behaviors = behaviors.drop(['date_hour', 'mv_ratio'], axis=1)
    # flattening casts: one space-joined person_id list per container
    casts = casts.drop(['country', 'gender'], axis=1).groupby(
        'container_id', as_index=False).agg(lambda x: ' '.join(x.person_id))
    videos = pd.merge(videos, casts, on=['container_id'], how='left',
                      suffixes=['_left', '_right'])
    # combined ppl who watched this video (and their "scores"),
    # e.g. "189500_2 328741_2 ..."
    behaviors = behaviors.groupby('video_id', as_index=False).agg(
        lambda x: ' '.join(x.user_id + '_' + x.score)).drop('score', axis=1)
    videos = pd.merge(videos, behaviors, on=['video_id'], how='left',
                      suffixes=['_left', '_right'])
    # Constructing videos_matrix: a constant join key turns the self-merge
    # into a cartesian product (cross join) of videos with themselves.
    videos['dummy'] = 1
    videos_matrix = pd.merge(videos, videos, on=['dummy'],
                             suffixes=['_left', '_right'])
    videos_matrix = videos_matrix.drop('dummy', axis=1)
    return videos_matrix
def feature_similarity(videos_matrix):
    """Score attribute similarity for every (left, right) video pair.

    Adds one column per attribute:
      * sim_country / sim_language / sim_adult / sim_content_owner_id --
        0/1 exact-match flags,
      * sim_broadcast -- 1 / (|from-from| + |to-to| gap in days) between the
        two broadcast windows,
      * sim_season / sim_episode_count -- min/max ratio,
      * sim_genres / sim_cast -- Jaccard index over genre tags / cast ids.
    Missing or malformed attributes score 0.  The raw attribute columns are
    dropped from the returned frame; video/user id columns are kept.
    """
    # Country Similarity: 0/1 exact match
    def sim_country(row):
        return 1 if row['origin_country_left'] == row['origin_country_right'] else 0
    videos_matrix['sim_country'] = videos_matrix.apply(sim_country, axis=1)

    # Language similarity: 0/1 exact match
    def sim_language(row):
        return 1 if row['origin_language_left'] == row['origin_language_right'] else 0
    videos_matrix['sim_language'] = videos_matrix.apply(sim_language, axis=1)

    # Adult similarity: 0/1 exact match
    def sim_adult(row):
        return 1 if row['adult_left'] == row['adult_right'] else 0
    videos_matrix['sim_adult'] = videos_matrix.apply(sim_adult, axis=1)

    # Content_owner_id similarity: 0/1 exact match
    def sim_content_owner_id(row):
        return 1 if row['content_owner_id_left'] == row['content_owner_id_right'] else 0
    videos_matrix['sim_content_owner_id'] = videos_matrix.apply(sim_content_owner_id, axis=1)

    # Broadcast_from / broadcast_to date similarity
    def sim_broadcast(row):
        # Inverse of the summed day-gaps between the two broadcast windows.
        # NOTE(review): identical windows divide by zero and fall through to
        # 0 rather than a maximal score -- preserved as-is, confirm intent.
        try:
            bfl_date = datetime.datetime.strptime(row['broadcast_from_left'], "%Y-%m").date()
            bfr_date = datetime.datetime.strptime(row['broadcast_from_right'], "%Y-%m").date()
            btl_date = datetime.datetime.strptime(row['broadcast_to_left'], "%Y-%m").date()
            btr_date = datetime.datetime.strptime(row['broadcast_to_right'], "%Y-%m").date()
            return 1 / (abs((bfl_date - bfr_date).days) + abs((btl_date - btr_date).days))
        except Exception:  # missing dates, bad format, or zero gap
            return 0
    videos_matrix['sim_broadcast'] = videos_matrix.apply(sim_broadcast, axis=1)

    # Season_number: min/max ratio; non-numeric or missing -> 0
    def sim_season(row):
        try:
            left = int(row['season_number_left'] if row['season_number_left'].isdigit() else 0)
            right = int(row['season_number_right'] if row['season_number_right'].isdigit() else 0)
            return min(left, right) / max(left, right)
        except Exception:  # NaN (no .isdigit) or both zero
            return 0
    videos_matrix['sim_season'] = videos_matrix.apply(sim_season, axis=1)

    # Episode_count: min/max ratio
    def sim_episode_count(row):
        try:
            return min(row['episode_count_left'], row['episode_count_right']) / \
                max(row['episode_count_left'], row['episode_count_right'])
        except Exception:
            return 0
    videos_matrix['sim_episode_count'] = videos_matrix.apply(sim_episode_count, axis=1)

    # Genres: Jaccard index over genre-id tags such as "(1038g)".
    def sim_genres(row):
        # BUG FIX: the previous pattern "\(*.g\)" captured only the final
        # digit before "g)" (e.g. "(1038g)" -> "8g)"), conflating distinct
        # multi-digit genre ids; r"\(\d+g\)" captures the whole tag.
        try:
            left = set(re.findall(r"\(\d+g\)", row['genres_left']))
            right = set(re.findall(r"\(\d+g\)", row['genres_right']))
            return len(left & right) / len(left | right)
        except Exception:  # NaN genres or empty union
            return 0
    videos_matrix['sim_genres'] = videos_matrix.apply(sim_genres, axis=1)

    # Casts: Jaccard index over space-separated person ids
    def sim_cast(row):
        try:
            left = set(row['person_id_left'].split())
            right = set(row['person_id_right'].split())
            return len(left & right) / len(left | right)
        except Exception:  # NaN cast list or empty union
            return 0
    videos_matrix['sim_cast'] = videos_matrix.apply(sim_cast, axis=1)

    # The raw attributes have served their purpose; keep only ids, the
    # user-behavior strings and the new sim_* columns.
    return videos_matrix.drop(
        ['container_id_left', 'origin_country_left', 'origin_language_left', 'adult_left',
         'broadcast_from_left', 'broadcast_to_left', 'season_number_left',
         'content_owner_id_left', 'genres_left', 'episode_count_left', 'person_id_left',
         'container_id_right', 'origin_country_right', 'origin_language_right', 'adult_right',
         'broadcast_from_right', 'broadcast_to_right', 'season_number_right',
         'content_owner_id_right', 'genres_right', 'episode_count_right', 'person_id_right'],
        axis=1)
def jaccard_similarity(videos_matrix):
    """Append score-bucketed Jaccard-index columns for every video pair.

    User ids carry a ``_<score>`` suffix (1 = dislike, 2 = neutral, 3 = like).
    Adds, in order: ``jaccard_high`` (non-dislikers of LEFT vs non-dislikers
    of RIGHT), ``jaccard_1_3``, ``jaccard_2_3`` and ``jaccard_3_3`` (LEFT
    users with the first score vs RIGHT users with the second).  Pairs whose
    combined user pool is smaller than 1000 score 0.
    NOTE(review): the < 1000 cutoff looks like a popularity floor to suppress
    noisy small-sample indexes -- confirm intent.  Missing user lists score 0.
    The raw ``user_id_left`` / ``user_id_right`` columns are dropped.
    """
    MIN_POOL = 1000  # pairs with fewer combined raters than this score 0

    def _bucketed_jaccard(row, keep_left, keep_right):
        # Best-effort: NaN user lists (no .split) fall through to 0.
        try:
            left = set(u for u in row['user_id_left'].split() if keep_left(u))
            right = set(u for u in row['user_id_right'].split() if keep_right(u))
            pool = left | right
            if len(pool) < MIN_POOL:
                return 0
            return len(left & right) / len(pool)
        except Exception:
            return 0

    def _not_dislike(u):  # score 2 or 3: likes or kind of likes
        return not u.endswith('_1')

    def _ends(suffix):  # predicate factory for a single score bucket
        return lambda u: u.endswith(suffix)

    # (column, progress message, LEFT predicate, RIGHT predicate)
    specs = [
        ('jaccard_high', "Calculating Jaccard indexes for high scores - ",
         _not_dislike, _not_dislike),
        ('jaccard_1_3', "Calculating Jaccard indexes #1-3 - ",
         _ends('_1'), _ends('_3')),
        ('jaccard_2_3', "Calculating Jaccard indexes #2-3 - ",
         _ends('_2'), _ends('_3')),
        ('jaccard_3_3', "Calculating Jaccard indexes #3-3 - ",
         _ends('_3'), _ends('_3')),
    ]
    for col, msg, keep_left, keep_right in specs:
        # print() with one argument is valid (and identical) in Python 2 and 3
        print(msg + str(datetime.datetime.now()))
        # bind the predicates as defaults to avoid late-binding surprises
        videos_matrix[col] = videos_matrix.apply(
            lambda row, kl=keep_left, kr=keep_right: _bucketed_jaccard(row, kl, kr),
            axis=1)
    return videos_matrix.drop(['user_id_left', 'user_id_right'], axis=1)
def cos_similarity(videos_matrix, behaviors_path='./data/20150701094451-Behavior_training.csv'):
    """Append a ``sim_cosine_mv_ratio`` column based on watch ratios.

    Builds a video x user matrix of mean ``mv_ratio`` from the behavior CSV,
    computes pairwise cosine similarity between video rows, and looks up the
    value for each (left, right) pair.  Pairs with no behavior data score 0.

    Args:
        videos_matrix: all-pairs frame with video_id_left/video_id_right.
        behaviors_path: behavior CSV location; generalizes the previously
            hard-coded path (default unchanged, existing callers unaffected).
    """
    behaviors = pd.read_csv(behaviors_path)
    # user - video matrix: rows = videos, columns = users, cells = mean mv_ratio
    behaviors_wide = pd.pivot_table(behaviors, values=["mv_ratio"],
                                    index=["video_id", "user_id"],
                                    aggfunc=np.mean).unstack()
    # any cells that are missing data (i.e. a user didn't watch a particular
    # video) count as 0
    behaviors_wide = behaviors_wide.fillna(0)
    # Pairwise cosine similarity between video rows, computed with numpy
    # (equivalent to sklearn.metrics.pairwise.cosine_similarity, but without
    # needing sklearn for this step): normalize rows, then take dot products.
    mat = behaviors_wide.values.astype(float)
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0  # zero rows stay zero, matching sklearn
    normed = mat / norms
    # stuff the similarity matrix into a DataFrame indexed both ways by
    # video_id so it's easy to look pairs up
    cosine_video_matrix = pd.DataFrame(normed.dot(normed.T),
                                       columns=behaviors_wide.index)
    cosine_video_matrix.index = cosine_video_matrix.columns

    def sim_cosine_mv_ratio(row):
        try:
            return cosine_video_matrix[row['video_id_left']][row['video_id_right']]
        except Exception:  # no behavior data for either video id
            return 0
    videos_matrix['sim_cosine_mv_ratio'] = videos_matrix.apply(sim_cosine_mv_ratio, axis=1)
    return videos_matrix
def output_videos_matrix_to_csv(videos_matrix, path="./data/videos_similarity_matrix.csv"):
    """Write the similarity matrix to CSV (UTF-8, no index column).

    Args:
        videos_matrix: frame to persist.
        path: destination file; generalizes the previously hard-coded
            location (default unchanged, existing callers unaffected).
    """
    videos_matrix.to_csv(path, encoding='utf-8', index=False)
def main():
    """Run the full pipeline: read data, score similarities, write CSV.

    Prints a timestamped progress line before each stage.  (print() with a
    single argument is valid and behaves identically under Python 2 and 3,
    replacing the Python-2-only print statements.)
    """
    print("=> Processing data - " + str(datetime.datetime.now()))
    videos_matrix = read_data()
    print("=> Calculating feature similarities - " + str(datetime.datetime.now()))
    videos_matrix = feature_similarity(videos_matrix)
    print("=> Calculating cosine similarities - " + str(datetime.datetime.now()))
    videos_matrix = cos_similarity(videos_matrix)
    print("=> Calculating jaccard similarities - " + str(datetime.datetime.now()))
    videos_matrix = jaccard_similarity(videos_matrix)
    print("=> Output to csv - " + str(datetime.datetime.now()))
    output_videos_matrix_to_csv(videos_matrix)

if __name__ == "__main__":
    main()