# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import logging

import numpy as np
from scipy import sparse

logger = logging.getLogger()


def exponential_decay(value, max_val, half_life):
    """Compute decay factor for a given value based on an exponential decay.

    Values greater than ``max_val`` will be set to 1.

    Args:
        value (numeric): Value for which to calculate the decay factor.
        max_val (numeric): Value at which the decay factor will be 1.
        half_life (numeric): Value at which the decay factor will be 0.5.

    Returns:
        float: Decay factor.
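
    Example:
        An illustrative sketch with made-up numbers: a value one half-life
        below ``max_val`` decays to 0.5, and values at or above ``max_val``
        are clipped to 1.0.

        >>> float(exponential_decay(value=5, max_val=10, half_life=5))
        0.5
        >>> float(exponential_decay(value=12, max_val=10, half_life=5))
        1.0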
"""
return np.minimum(1.0, np.power(0.5, (max_val - value) / half_life))
def jaccard(cooccurrence):
    """Helper method to calculate the Jaccard similarity of a matrix of
    co-occurrences.

    Args:
        cooccurrence (np.array): The symmetric matrix of co-occurrences of items.

    Returns:
        np.array: The matrix of Jaccard similarities between any two items.
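
    Example:
        An illustrative sketch: for co-occurrence counts, the Jaccard
        similarity of items i and j is ``c_ij / (c_ii + c_jj - c_ij)``, so
        with the hypothetical counts below, item pair (0, 1) scores
        2 / (4 + 3 - 2) = 0.4.

        >>> co = np.array([[4.0, 2.0], [2.0, 3.0]])
        >>> jaccard(co).tolist()
        [[1.0, 0.4], [0.4, 1.0]]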
"""
diag = cooccurrence.diagonal()
diag_rows = np.expand_dims(diag, axis=0)
diag_cols = np.expand_dims(diag, axis=1)
with np.errstate(invalid="ignore", divide="ignore"):
result = cooccurrence / (diag_rows + diag_cols - cooccurrence)
return np.array(result)
def lift(cooccurrence):
    """Helper method to calculate the Lift of a matrix of co-occurrences.

    Args:
        cooccurrence (np.array): The symmetric matrix of co-occurrences of items.

    Returns:
        np.array: The matrix of Lifts between any two items.
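
    Example:
        An illustrative sketch: lift is ``c_ij / (c_ii * c_jj)``, so with the
        hypothetical counts below, item pair (0, 1) scores
        2 / (4 * 3) ~= 0.1667.

        >>> co = np.array([[4.0, 2.0], [2.0, 3.0]])
        >>> np.round(lift(co), 4).tolist()
        [[0.25, 0.1667], [0.1667, 0.3333]]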
"""
diag = cooccurrence.diagonal()
diag_rows = np.expand_dims(diag, axis=0)
diag_cols = np.expand_dims(diag, axis=1)
with np.errstate(invalid="ignore", divide="ignore"):
result = cooccurrence / (diag_rows * diag_cols)
return np.array(result)
def get_top_k_scored_items(scores, top_k, sort_top_k=False):
    """Extract the top K items from a matrix of scores for each user-item pair,
    optionally sorting the results per user.

    Args:
        scores (np.array): Score matrix (users x items).
        top_k (int): Number of top items to recommend.
        sort_top_k (bool): Flag to sort the top k results.

    Returns:
        np.array, np.array: Indices into the score matrix for each user's top
        items, and the scores corresponding to those items.
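
    Example:
        An illustrative sketch with made-up scores for 2 users and 3 items;
        with ``sort_top_k=True`` each row is ordered by descending score.

        >>> scores = np.array([[0.1, 0.9, 0.3], [0.8, 0.2, 0.5]])
        >>> items, vals = get_top_k_scored_items(scores, top_k=2, sort_top_k=True)
        >>> items.tolist()
        [[1, 2], [0, 2]]
        >>> vals.tolist()
        [[0.9, 0.3], [0.8, 0.5]]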
"""
# ensure we're working with a dense ndarray
if isinstance(scores, sparse.spmatrix):
scores = scores.todense()
if scores.shape[1] < top_k:
logger.warning(
"Number of items is less than top_k, limiting top_k to number of items"
)
k = min(top_k, scores.shape[1])
test_user_idx = np.arange(scores.shape[0])[:, None]
# get top K items and scores
# this determines the un-ordered top-k item indices for each user
top_items = np.argpartition(scores, -k, axis=1)[:, -k:]
top_scores = scores[test_user_idx, top_items]
if sort_top_k:
sort_ind = np.argsort(-top_scores)
top_items = top_items[test_user_idx, sort_ind]
top_scores = top_scores[test_user_idx, sort_ind]
return np.array(top_items), np.array(top_scores)
def binarize(a, threshold):
    """Binarize the values.

    Args:
        a (np.ndarray): Input array that needs to be binarized.
        threshold (float): Values strictly greater than the threshold are set
            to 1, all others to 0.

    Returns:
        np.ndarray: Binarized array.
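
    Example:
        An illustrative sketch; note the comparison is strict, so a value
        equal to the threshold maps to 0.

        >>> binarize(np.array([0.3, 0.5, 0.7]), threshold=0.5).tolist()
        [0.0, 0.0, 1.0]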
"""
return np.where(
a > threshold,
1.0,
0.0
)
def rescale(data, new_min=0, new_max=1, data_min=None, data_max=None):
    """Rescale/normalize the data to be within the range ``[new_min, new_max]``.

    If ``data_min`` and ``data_max`` are explicitly provided, they will be used
    as the old min/max values instead of being inferred from the data.

    Note: this is the same as ``sklearn.preprocessing.MinMaxScaler``, except
    that here the min/max of the old scale can be overridden.

    Args:
        data (np.array): 1d scores vector or 2d score matrix (users x items).
        new_min (int|float): The minimum of the newly scaled data.
        new_max (int|float): The maximum of the newly scaled data.
        data_min (None|number): The minimum of the passed data [if omitted it
            will be inferred].
        data_max (None|number): The maximum of the passed data [if omitted it
            will be inferred].

    Returns:
        np.array: The newly scaled/normalized data.
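
    Example:
        An illustrative sketch; the second call overrides the inferred min/max
        with an assumed original scale of [0, 10].

        >>> rescale(np.array([1.0, 2.0, 3.0]), new_min=0, new_max=10).tolist()
        [0.0, 5.0, 10.0]
        >>> rescale(np.array([5.0]), data_min=0, data_max=10).tolist()
        [0.5]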
"""
data_min = data.min() if data_min is None else data_min
data_max = data.max() if data_max is None else data_max
return (data - data_min) / (data_max - data_min) * (new_max - new_min) + new_min