diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 7a0a0a6..c902713 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -19,8 +19,8 @@ Steps to reproduce the behavior: **Operating environment(运行环境):** - python version [e.g. 3.6, 3.7, 3.8] - - tensorflow version [e.g. 1.4.0, 1.14.0, 2.5.0] - - deepmatch version [e.g. 0.2.1,] + - tensorflow version [e.g. 1.9.0, 1.14.0, 2.5.0] + - deepmatch version [e.g. 0.3.0,] **Additional context** Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index 2cabbcb..b306341 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -16,5 +16,5 @@ Add any other context about the problem here. **Operating environment(运行环境):** - python version [e.g. 3.6, 3.7, 3.8] - - tensorflow version [e.g. 1.4.0, 1.14.0, 2.5.0] - - deepmatch version [e.g. 0.2.1,] + - tensorflow version [e.g. 1.9.0, 1.14.0, 2.5.0] + - deepmatch version [e.g. 0.3.0,] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac474c3..1cc83f4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,15 +18,35 @@ jobs: strategy: matrix: python-version: [3.6,3.7,3.8] - tf-version: [1.4.0,1.14.0,2.5.0] + tf-version: [1.9.0,1.14.0,2.5.0] exclude: - python-version: 3.7 tf-version: 1.4.0 + - python-version: 3.7 + tf-version: 1.9.0 + - python-version: 3.7 + tf-version: 1.10.0 + - python-version: 3.7 + tf-version: 1.11.0 + - python-version: 3.7 + tf-version: 1.12.0 + - python-version: 3.7 + tf-version: 1.13.0 - python-version: 3.7 tf-version: 1.15.0 - python-version: 3.8 tf-version: 1.4.0 + - python-version: 3.8 + tf-version: 1.9.0 + - python-version: 3.8 + tf-version: 1.10.0 + - python-version: 3.8 + tf-version: 1.11.0 + - python-version: 3.8 + tf-version: 1.12.0 + - python-version: 3.8 + tf-version: 1.13.0 - python-version: 3.8 tf-version: 1.14.0 - python-version: 3.8 diff --git a/README.md b/README.md index 69dea2a..a02cb8c 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # DeepMatch [![Python Versions](https://img.shields.io/pypi/pyversions/deepmatch.svg)](https://pypi.org/project/deepmatch) -[![TensorFlow Versions](https://img.shields.io/badge/TensorFlow-1.4+/2.0+-blue.svg)](https://pypi.org/project/deepmatch) +[![TensorFlow Versions](https://img.shields.io/badge/TensorFlow-1.9+/2.0+-blue.svg)](https://pypi.org/project/deepmatch) +[![Downloads](https://pepy.tech/badge/deepmatch)](https://pepy.tech/project/deepmatch) [![PyPI Version](https://img.shields.io/pypi/v/deepmatch.svg)](https://pypi.org/project/deepmatch) [![GitHub Issues](https://img.shields.io/github/issues/shenweichen/deepmatch.svg )](https://github.com/shenweichen/deepmatch/issues) @@ -11,7 +12,8 @@ [![Documentation Status](https://readthedocs.org/projects/deepmatch/badge/?version=latest)](https://deepmatch.readthedocs.io/) ![CI status](https://github.com/shenweichen/deepmatch/workflows/CI/badge.svg) [![codecov](https://codecov.io/gh/shenweichen/DeepMatch/branch/master/graph/badge.svg)](https://codecov.io/gh/shenweichen/DeepMatch) -[![Disscussion](https://img.shields.io/badge/chat-wechat-brightgreen?style=flat)](./README.md#disscussiongroup) +[![Codacy Badge](https://app.codacy.com/project/badge/Grade/c5a2769ec35444d8958f6b58ff85029b)](https://www.codacy.com/gh/shenweichen/DeepMatch/dashboard?utm_source=github.com&utm_medium=referral&utm_content=shenweichen/DeepMatch&utm_campaign=Badge_Grade) 
+[![Disscussion](https://img.shields.io/badge/chat-wechat-brightgreen?style=flat)](https://github.com/shenweichen/DeepMatch#disscussiongroup) [![License](https://img.shields.io/github/license/shenweichen/deepmatch.svg)](https://github.com/shenweichen/deepmatch/blob/master/LICENSE) DeepMatch is a deep matching model library for recommendations & advertising. It's easy to **train models** and to **export representation vectors** for user and item which can be used for **ANN search**.You can use any complex model with `model.fit()`and `model.predict()` . @@ -72,31 +74,12 @@ Let's [**Get Started!**](https://deepmatch.readthedocs.io/en/latest/Quick-Start. -## DisscussionGroup & Related Projects +## DisscussionGroup + +- [Github Discussions](https://github.com/shenweichen/DeepMatch/discussions) +- Wechat Discussions + +|公众号:浅梦学习笔记|微信:deepctrbot|学习小组 [加入](https://t.zsxq.com/026UJEuzv) [主题集合](https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MjM5MzY4NzE3MA==&action=getalbum&album_id=1361647041096843265&scene=126#wechat_redirect)| +|:--:|:--:|:--:| +| [![公众号](./docs/pics/code.png)](https://github.com/shenweichen/AlgoNotes)| [![微信](./docs/pics/deepctrbot.png)](https://github.com/shenweichen/AlgoNotes)|[![学习小组](./docs/pics/planet_github.png)](https://t.zsxq.com/026UJEuzv)| - - - - - - - -
- (removed: legacy README HTML table with WeChat QR-code images for 公众号:浅梦的学习笔记 and 微信:deepctrbot)
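Note: the README text above describes the intended workflow: train any model with `model.fit()`, export user and item representation vectors, and feed them to an ANN index. Below is a minimal sketch of that workflow, assuming a model built as in the examples later in this patch; `model`, the input dictionaries and `train_label` are placeholders prepared exactly as shown there, not new API.

```python
# Minimal sketch of the train -> export-embeddings -> ANN-search workflow from the README.
# `model`, `train_model_input`, `train_label`, `test_user_model_input` and
# `all_item_model_input` are assumed to be prepared as in the Examples section of this patch.
import faiss
import numpy as np
from tensorflow.python.keras.models import Model
from deepmatch.utils import sampledsoftmaxloss

model.compile(optimizer="adam", loss=sampledsoftmaxloss)
model.fit(train_model_input, train_label, batch_size=256, epochs=1)

# Export representation vectors through the exposed user/item input and embedding tensors.
user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

# Exact inner-product search with faiss; any ANN index can be swapped in.
index = faiss.IndexFlatIP(item_embs.shape[1])
index.add(item_embs.astype(np.float32))
scores, topk = index.search(user_embs.astype(np.float32), 50)  # top-50 item rows per user
```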
- diff --git a/deepmatch/__init__.py b/deepmatch/__init__.py index 6ed5791..5852002 100644 --- a/deepmatch/__init__.py +++ b/deepmatch/__init__.py @@ -1,4 +1,4 @@ from .utils import check_version -__version__ = '0.2.1' +__version__ = '0.3.0' check_version(__version__) diff --git a/deepmatch/layers/__init__.py b/deepmatch/layers/__init__.py index a6053ed..08fc23c 100644 --- a/deepmatch/layers/__init__.py +++ b/deepmatch/layers/__init__.py @@ -1,8 +1,8 @@ from deepctr.layers import custom_objects from deepctr.layers.utils import reduce_sum -from .core import PoolingLayer, Similarity, LabelAwareAttention, CapsuleLayer, SampledSoftmaxLayer, EmbeddingIndex, \ - MaskUserEmbedding +from .core import PoolingLayer, LabelAwareAttention, CapsuleLayer, SampledSoftmaxLayer, EmbeddingIndex, \ + MaskUserEmbedding, InBatchSoftmaxLayer from .interaction import DotAttention, ConcatAttention, SoftmaxWeightedSum, AttentionSequencePoolingLayer, \ SelfAttention, \ SelfMultiHeadAttention, UserAttention @@ -10,11 +10,11 @@ from ..utils import sampledsoftmaxloss _custom_objects = {'PoolingLayer': PoolingLayer, - 'Similarity': Similarity, 'LabelAwareAttention': LabelAwareAttention, 'CapsuleLayer': CapsuleLayer, 'reduce_sum': reduce_sum, 'SampledSoftmaxLayer': SampledSoftmaxLayer, + 'InBatchSoftmaxLayer': InBatchSoftmaxLayer, 'sampledsoftmaxloss': sampledsoftmaxloss, 'EmbeddingIndex': EmbeddingIndex, 'DotAttention': DotAttention, diff --git a/deepmatch/layers/core.py b/deepmatch/layers/core.py index 1267653..f921505 100644 --- a/deepmatch/layers/core.py +++ b/deepmatch/layers/core.py @@ -1,9 +1,15 @@ +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import numpy as np import tensorflow as tf -from deepctr.layers.activation import activation_layer from deepctr.layers.utils import reduce_max, reduce_mean, reduce_sum, concat_func, div, softmax -from tensorflow.python.keras.initializers import RandomNormal, Zeros, TruncatedNormal +from tensorflow.python.keras.initializers import Zeros from tensorflow.python.keras.layers import Layer -from tensorflow.python.keras.regularizers import l2 class PoolingLayer(Layer): @@ -45,45 +51,103 @@ def get_config(self, ): class SampledSoftmaxLayer(Layer): - def __init__(self, num_sampled=5, **kwargs): - self.num_sampled = num_sampled + def __init__(self, sampler_config, temperature=1.0, **kwargs): + self.sampler_config = sampler_config + self.temperature = temperature + self.sampler = self.sampler_config['sampler'] + self.item_count = self.sampler_config['item_count'] + super(SampledSoftmaxLayer, self).__init__(**kwargs) def build(self, input_shape): - self.size = input_shape[0][0] - self.zero_bias = self.add_weight(shape=[self.size], + self.vocabulary_size = input_shape[0][0] + self.zero_bias = self.add_weight(shape=[self.vocabulary_size], initializer=Zeros, dtype=tf.float32, trainable=False, name="bias") super(SampledSoftmaxLayer, self).build(input_shape) - def call(self, inputs_with_label_idx, training=None, **kwargs): - """ - The first input should be the model as it were, and the second the - target (i.e., a repeat of the training data) to compute the labels - argument - """ - embeddings, inputs, label_idx = inputs_with_label_idx - - loss = tf.nn.sampled_softmax_loss(weights=embeddings, # self.item_embedding. 
- biases=self.zero_bias, - labels=label_idx, - inputs=inputs, - num_sampled=self.num_sampled, - num_classes=self.size, # self.target_song_size - ) + def call(self, inputs_with_item_idx, training=None, **kwargs): + item_embeddings, user_vec, item_idx = inputs_with_item_idx + if item_idx.dtype != tf.int64: + item_idx = tf.cast(item_idx, tf.int64) + user_vec /= self.temperature + if self.sampler == "inbatch": + item_vec = tf.gather(item_embeddings, tf.squeeze(item_idx, axis=1)) + logits = tf.matmul(user_vec, item_vec, transpose_b=True) + loss = inbatch_softmax_cross_entropy_with_logits(logits, self.item_count, item_idx) + + else: + num_sampled = self.sampler_config['num_sampled'] + if self.sampler == "frequency": + sampled_values = tf.nn.fixed_unigram_candidate_sampler(item_idx, 1, num_sampled, True, + self.vocabulary_size, + distortion=self.sampler_config['distortion'], + unigrams=np.maximum(self.item_count, 1).tolist(), + seed=None, + name=None) + elif self.sampler == "adaptive": + sampled_values = tf.nn.learned_unigram_candidate_sampler(item_idx, 1, num_sampled, True, + self.vocabulary_size, seed=None, name=None) + elif self.sampler == "uniform": + try: + sampled_values = tf.nn.uniform_candidate_sampler(item_idx, 1, num_sampled, True, + self.vocabulary_size, seed=None, name=None) + except AttributeError: + sampled_values = tf.random.uniform_candidate_sampler(item_idx, 1, num_sampled, True, + self.vocabulary_size, seed=None, name=None) + else: + raise ValueError(' `%s` sampler is not supported ' % self.sampler) + + loss = tf.nn.sampled_softmax_loss(weights=item_embeddings, + biases=self.zero_bias, + labels=item_idx, + inputs=user_vec, + num_sampled=num_sampled, + num_classes=self.vocabulary_size, + sampled_values=sampled_values + ) return tf.expand_dims(loss, axis=1) def compute_output_shape(self, input_shape): return (None, 1) def get_config(self, ): - config = {'num_sampled': self.num_sampled} + config = {'sampler_config': self.sampler_config, 'temperature': self.temperature} base_config = super(SampledSoftmaxLayer, self).get_config() return dict(list(base_config.items()) + list(config.items())) +class InBatchSoftmaxLayer(Layer): + def __init__(self, sampler_config, temperature=1.0, **kwargs): + self.sampler_config = sampler_config + self.temperature = temperature + self.item_count = self.sampler_config['item_count'] + + super(InBatchSoftmaxLayer, self).__init__(**kwargs) + + def build(self, input_shape): + super(InBatchSoftmaxLayer, self).build(input_shape) + + def call(self, inputs_with_item_idx, training=None, **kwargs): + user_vec, item_vec, item_idx = inputs_with_item_idx + if item_idx.dtype != tf.int64: + item_idx = tf.cast(item_idx, tf.int64) + user_vec /= self.temperature + logits = tf.matmul(user_vec, item_vec, transpose_b=True) + loss = inbatch_softmax_cross_entropy_with_logits(logits, self.item_count, item_idx) + return tf.expand_dims(loss, axis=1) + + def compute_output_shape(self, input_shape): + return (None, 1) + + def get_config(self, ): + config = {'sampler_config': self.sampler_config, 'temperature': self.temperature} + base_config = super(InBatchSoftmaxLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + class LabelAwareAttention(Layer): def __init__(self, k_max, pow_p=1, **kwargs): self.k_max = k_max @@ -128,38 +192,6 @@ def get_config(self, ): return dict(list(base_config.items()) + list(config.items())) -class Similarity(Layer): - - def __init__(self, gamma=1, axis=-1, type='cos', **kwargs): - self.gamma = gamma - self.axis = 
axis - self.type = type - super(Similarity, self).__init__(**kwargs) - - def build(self, input_shape): - # Be sure to call this somewhere! - super(Similarity, self).build(input_shape) - - def call(self, inputs, **kwargs): - query, candidate = inputs - if self.type == "cos": - query_norm = tf.norm(query, axis=self.axis) - candidate_norm = tf.norm(candidate, axis=self.axis) - cosine_score = reduce_sum(tf.multiply(query, candidate), -1) - if self.type == "cos": - cosine_score = div(cosine_score, query_norm * candidate_norm + 1e-8) - cosine_score = tf.clip_by_value(cosine_score, -1, 1.0) * self.gamma - return cosine_score - - def compute_output_shape(self, input_shape): - return (None, 1) - - def get_config(self, ): - config = {'gamma': self.gamma, 'axis': self.axis, 'type': self.type} - base_config = super(Similarity, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - class CapsuleLayer(Layer): def __init__(self, input_units, out_units, max_len, k_max, iteration_times=3, init_std=1.0, **kwargs): @@ -245,6 +277,23 @@ def squash(inputs): return vec_squashed +def inbatch_softmax_cross_entropy_with_logits(logits, item_count, item_idx): + Q = tf.gather(tf.constant(item_count / np.sum(item_count), 'float32'), + tf.squeeze(item_idx, axis=1)) + try: + logQ = tf.reshape(tf.math.log(Q), (1, -1)) + logits -= logQ # subtract_log_q + labels = tf.linalg.diag(tf.ones_like(logits[0])) + except AttributeError: + logQ = tf.reshape(tf.log(Q), (1, -1)) + logits -= logQ # subtract_log_q + labels = tf.diag(tf.ones_like(logits[0])) + + loss = tf.nn.softmax_cross_entropy_with_logits( + labels=labels, logits=logits) + return loss + + class EmbeddingIndex(Layer): def __init__(self, index, **kwargs): diff --git a/deepmatch/layers/interaction.py b/deepmatch/layers/interaction.py index 6c6f5ab..96b4048 100644 --- a/deepmatch/layers/interaction.py +++ b/deepmatch/layers/interaction.py @@ -1,3 +1,10 @@ +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + import tensorflow as tf from deepctr.layers.normalization import LayerNormalization from deepctr.layers.utils import softmax, reduce_mean @@ -109,7 +116,7 @@ def call(self, inputs, mask=None, training=None, **kwargs): lower_tri = tf.ones([length, length]) try: lower_tri = tf.contrib.linalg.LinearOperatorTriL(lower_tri).to_dense() - except: + except AttributeError: lower_tri = tf.linalg.LinearOperatorLowerTriangular(lower_tri).to_dense() masks = tf.tile(tf.expand_dims(lower_tri, 0), [tf.shape(align)[0], 1, 1]) align = tf.where(tf.equal(masks, 0), paddings, align) @@ -199,8 +206,8 @@ def build(self, input_shape): super(SelfAttention, self).build(input_shape) def call(self, inputs, mask=None, **kwargs): - input, key_masks = inputs - querys, keys, values = input, input, input + _input, key_masks = inputs + querys, keys, values = _input, _input, _input align = self.attention([querys, keys]) output = self.softmax_weight_sum([align, values, key_masks]) if self.use_layer_norm: diff --git a/deepmatch/layers/sequence.py b/deepmatch/layers/sequence.py index 10e12b9..c89ff9a 100644 --- a/deepmatch/layers/sequence.py +++ b/deepmatch/layers/sequence.py @@ -1,3 +1,10 @@ +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + import tensorflow as tf from tensorflow.python.keras.layers import Layer diff --git a/deepmatch/models/dssm.py b/deepmatch/models/dssm.py index f00eb3f..14505b5 100644 --- a/deepmatch/models/dssm.py +++ b/deepmatch/models/dssm.py @@ -1,6 +1,8 @@ """ Author: - Zhe Wang,734914022@qq.com + Zhe Wang, 734914022@qq.com + Weichen 
Shen, weichenswc@163.com + Reference: Huang P S , He X , Gao J , et al. Learning deep structured semantic models for web search using clickthrough data[C]// Acm International Conference on Conference on Information & Knowledge Management. ACM, 2013. """ @@ -10,13 +12,16 @@ from tensorflow.python.keras.models import Model from ..inputs import input_from_feature_columns -from ..layers.core import Similarity +from ..layers.core import InBatchSoftmaxLayer +from ..utils import l2_normalize, inner_product def DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32), item_dnn_hidden_units=(64, 32), dnn_activation='relu', dnn_use_bn=False, - l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, gamma=10, seed=1024, metric='cos'): + l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, loss_type='softmax', temperature=0.05, + sampler_config=None, + seed=1024, ): """Instantiates the Deep Structured Semantic Model architecture. :param user_feature_columns: An iterable containing user's features used by the model. @@ -28,9 +33,10 @@ def DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, :param l2_reg_dnn: float. L2 regularizer strength applied to DNN :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. - :param gamma: float. Scaling factor. + :param loss_type: string. Loss type. + :param temperature: float. Scaling factor. + :param sampler_config: negative sample config. :param seed: integer ,to use as random seed. - :param metric: str, ``"cos"`` for cosine or ``"ip"`` for inner product :return: A Keras model instance. """ @@ -57,13 +63,24 @@ def DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, output_activation='linear', seed=seed)(user_dnn_input) - - item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, - dnn_use_bn, output_activation='linear', seed=seed)(item_dnn_input) - - score = Similarity(type=metric, gamma=gamma)([user_dnn_out, item_dnn_out]) - - output = PredictionLayer("binary", False)(score) + user_dnn_out = l2_normalize(user_dnn_out) + + if len(item_dnn_hidden_units) > 0: + item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, + dnn_use_bn, output_activation='linear', seed=seed)(item_dnn_input) + else: + item_dnn_out = item_dnn_input + item_dnn_out = l2_normalize(item_dnn_out) + + if loss_type == "logistic": + score = inner_product(user_dnn_out, item_dnn_out, temperature) + output = PredictionLayer("binary", False)(score) + + elif loss_type == "softmax": + output = InBatchSoftmaxLayer(sampler_config._asdict(), temperature)( + [user_dnn_out, item_dnn_out, item_features[sampler_config.item_name]]) + else: + raise ValueError(' `loss_type` must be `logistic` or `softmax` ') model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) diff --git a/deepmatch/models/fm.py b/deepmatch/models/fm.py index 5b25fe0..aea5542 100644 --- a/deepmatch/models/fm.py +++ b/deepmatch/models/fm.py @@ -1,3 +1,8 @@ +""" +Author: + Weichen Shen, weichenswc@163.com + +""" from deepctr.feature_column import build_input_features from deepctr.layers.core import PredictionLayer from deepctr.layers.utils import concat_func, reduce_sum @@ -5,18 +10,22 @@ from tensorflow.python.keras.models import Model from ..inputs import create_embedding_matrix, 
input_from_feature_columns -from ..layers.core import Similarity +from ..layers.core import InBatchSoftmaxLayer +from ..utils import l2_normalize, inner_product -def FM(user_feature_columns, item_feature_columns, l2_reg_embedding=1e-6, gamma=10, seed=1024, metric='cos'): +def FM(user_feature_columns, item_feature_columns, l2_reg_embedding=1e-6, loss_type='softmax', temperature=0.05, + sampler_config=None, seed=1024, + ): """Instantiates the FM architecture. :param user_feature_columns: An iterable containing user's features used by the model. :param item_feature_columns: An iterable containing item's features used by the model. :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector - :param gamma: float. Scaling factor. + :param loss_type: string. Loss type. + :param temperature: float. Scaling factor. + :param sampler_config: negative sample config. :param seed: integer ,to use as random seed. - :param metric: str, ``"cos"`` for cosine or ``"ip"`` for inner product :return: A Keras model instance. """ @@ -27,29 +36,37 @@ def FM(user_feature_columns, item_feature_columns, l2_reg_embedding=1e-6, gamma= user_features = build_input_features(user_feature_columns) user_inputs_list = list(user_features.values()) - user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features, - user_feature_columns, - l2_reg_embedding, seed=seed, - support_dense=False, - embedding_matrix_dict=embedding_matrix_dict) + user_sparse_embedding_list, _ = input_from_feature_columns(user_features, + user_feature_columns, + l2_reg_embedding, seed=seed, + support_dense=False, + embedding_matrix_dict=embedding_matrix_dict) item_features = build_input_features(item_feature_columns) item_inputs_list = list(item_features.values()) - item_sparse_embedding_list, item_dense_value_list = input_from_feature_columns(item_features, - item_feature_columns, - l2_reg_embedding, seed=seed, - support_dense=False, - embedding_matrix_dict=embedding_matrix_dict) + item_sparse_embedding_list, _ = input_from_feature_columns(item_features, + item_feature_columns, + l2_reg_embedding, seed=seed, + support_dense=False, + embedding_matrix_dict=embedding_matrix_dict) user_dnn_input = concat_func(user_sparse_embedding_list, axis=1) user_vector_sum = Lambda(lambda x: reduce_sum(x, axis=1, keep_dims=False))(user_dnn_input) + user_vector_sum = l2_normalize(user_vector_sum) item_dnn_input = concat_func(item_sparse_embedding_list, axis=1) item_vector_sum = Lambda(lambda x: reduce_sum(x, axis=1, keep_dims=False))(item_dnn_input) + item_vector_sum = l2_normalize(item_vector_sum) - score = Similarity(type=metric, gamma=gamma)([user_vector_sum, item_vector_sum]) + if loss_type == "logistic": + score = inner_product(user_vector_sum, item_vector_sum, temperature) + output = PredictionLayer("binary", False)(score) - output = PredictionLayer("binary", False)(score) + elif loss_type == "softmax": + output = InBatchSoftmaxLayer(sampler_config._asdict(), temperature)( + [user_vector_sum, item_vector_sum, item_features[sampler_config.item_name]]) + else: + raise ValueError(' `loss_type` must be `logistic` or `softmax` ') model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) diff --git a/deepmatch/models/mind.py b/deepmatch/models/mind.py index 556e318..a6f2831 100755 --- a/deepmatch/models/mind.py +++ b/deepmatch/models/mind.py @@ -1,7 +1,8 @@ """ Author: - Qingliang Cai,leocaicoder@163.com - Weichen Shen,wcshen1994@164.com + Qingliang Cai, leocaicoder@163.com + Weichen Shen, 
weichenswc@163.com + Reference: Li C, Liu Z, Wu M, et al. Multi-interest network with dynamic routing for recommendation at Tmall[C]//Proceedings of the 28th ACM International Conference on Information and Knowledge Management. 2019: 2615-2623. """ @@ -45,9 +46,9 @@ def adaptive_interest_num(seq_len, k_max): return k_user -def MIND(user_feature_columns, item_feature_columns, num_sampled=5, k_max=2, p=100, dynamic_k=True, +def MIND(user_feature_columns, item_feature_columns, k_max=2, p=100, dynamic_k=False, user_dnn_hidden_units=(64, 32), dnn_activation='relu', dnn_use_bn=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, - dnn_dropout=0, output_activation='linear', seed=1024): + dnn_dropout=0, output_activation='linear', sampler_config=None, seed=1024): """Instantiates the MIND Model architecture. :param user_feature_columns: An iterable containing user's features used by the model. @@ -63,8 +64,9 @@ def MIND(user_feature_columns, item_feature_columns, num_sampled=5, k_max=2, p=1 :param l2_reg_dnn: L2 regularizer strength applied to DNN :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. - :param seed: integer ,to use as random seed. :param output_activation: Activation function to use in output layer + :param sampler_config: negative sample config. + :param seed: integer ,to use as random seed. :return: A Keras model instance. """ @@ -151,6 +153,7 @@ def MIND(user_feature_columns, item_feature_columns, num_sampled=5, k_max=2, p=1 dnn_dropout, dnn_use_bn, output_activation=output_activation, seed=seed, name="user_dnn")( user_deep_input) + item_inputs_list = list(item_features.values()) item_embedding_matrix = embedding_matrix_dict[item_feature_name] @@ -166,8 +169,8 @@ def MIND(user_feature_columns, item_feature_columns, num_sampled=5, k_max=2, p=1 user_embedding_final = LabelAwareAttention(k_max=k_max, pow_p=p)((user_embeddings, target_emb, interest_num)) else: user_embedding_final = LabelAwareAttention(k_max=k_max, pow_p=p)((user_embeddings, target_emb)) - - output = SampledSoftmaxLayer(num_sampled=num_sampled)( + print("swc") + output = SampledSoftmaxLayer(sampler_config._asdict())( [pooling_item_embedding_weight, user_embedding_final, item_features[item_feature_name]]) model = Model(inputs=inputs_list + item_inputs_list, outputs=output) diff --git a/deepmatch/models/ncf.py b/deepmatch/models/ncf.py index 95be769..ac835f1 100644 --- a/deepmatch/models/ncf.py +++ b/deepmatch/models/ncf.py @@ -48,7 +48,7 @@ def NCF(user_feature_columns, item_feature_columns, user_gmf_embedding_dim=20, i for feat, size in user_feature_columns.items()] user_features = build_input_features(user_gmf_feature_columns) user_inputs_list = list(user_features.values()) - user_gmf_sparse_embedding_list, user_gmf_dense_value_list = input_from_feature_columns(user_features, + user_gmf_sparse_embedding_list, _ = input_from_feature_columns(user_features, user_gmf_feature_columns, l2_reg_embedding, seed=seed, prefix='gmf_') @@ -59,7 +59,7 @@ def NCF(user_feature_columns, item_feature_columns, user_gmf_embedding_dim=20, i for feat, size in item_feature_columns.items()] item_features = build_input_features(item_gmf_feature_columns) item_inputs_list = list(item_features.values()) - item_gmf_sparse_embedding_list, item_gmf_dense_value_list = input_from_feature_columns(item_features, + item_gmf_sparse_embedding_list, _ = input_from_feature_columns(item_features, item_gmf_feature_columns, l2_reg_embedding, 
seed=seed, prefix='gmf_') @@ -92,11 +92,11 @@ def NCF(user_feature_columns, item_feature_columns, user_gmf_embedding_dim=20, i mlp_input = Concatenate(axis=1)([user_mlp_out, item_mlp_out]) mlp_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, - dnn_use_bn, seed = seed, name="mlp_embedding")(mlp_input) + dnn_use_bn, seed=seed, name="mlp_embedding")(mlp_input) # Fusion of GMF and MLP neumf_input = Concatenate(axis=1)([gmf_out, mlp_out]) - neumf_out = DNN(hidden_units=[1], activation='sigmoid',seed=seed)(neumf_input) + neumf_out = DNN(hidden_units=[1], activation='sigmoid', seed=seed)(neumf_input) output = Lambda(lambda x: x, name='neumf_out')(neumf_out) # output = PredictionLayer(task, False)(neumf_out) diff --git a/deepmatch/models/sdm.py b/deepmatch/models/sdm.py index 5fbd5c4..3fb2bdc 100644 --- a/deepmatch/models/sdm.py +++ b/deepmatch/models/sdm.py @@ -1,7 +1,7 @@ # -*- coding:utf-8 -*- """ Author: - Zhe Wang,734914022@qq.com + Zhe Wang, 734914022@qq.com Reference: [1] Lv, Fuyu, Jin, Taiwei, Yu, Changlong etc. SDM: Sequential Deep Matching Model for Online Large-scale Recommender System[J]. @@ -18,13 +18,13 @@ from ..layers.core import PoolingLayer, SampledSoftmaxLayer, EmbeddingIndex from ..layers.interaction import UserAttention, SelfMultiHeadAttention, AttentionSequencePoolingLayer from ..layers.sequence import DynamicMultiRNN -from ..utils import get_item_embedding +from ..utils import get_item_embedding, l2_normalize -def SDM(user_feature_columns, item_feature_columns, history_feature_list, num_sampled=5, units=64, rnn_layers=2, +def SDM(user_feature_columns, item_feature_columns, history_feature_list, units=64, rnn_layers=2, dropout_rate=0.2, rnn_num_res=1, - num_head=4, l2_reg_embedding=1e-6, dnn_activation='tanh', seed=1024): + num_head=4, l2_reg_embedding=1e-6, dnn_activation='tanh', temperature=0.05, sampler_config=None, seed=1024): """Instantiates the Sequential Deep Matching Model architecture. :param user_feature_columns: An iterable containing user's features used by the model. @@ -38,6 +38,8 @@ def SDM(user_feature_columns, item_feature_columns, history_feature_list, num_sa :param num_head: int int, the number of attention head :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector :param dnn_activation: Activation function to use in deep net + :param temperature: float. Scaling factor. + :param sampler_config: negative sample config. :param seed: integer ,to use as random seed. :return: A Keras model instance. 
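Note: the models in this patch now take a `sampler_config` and a `temperature` instead of `num_sampled`/`gamma`. For the `inbatch` sampler added in `deepmatch/layers/core.py` above, each user vector is scored against the item vectors of the same batch and log Q (the log of each item's sampling probability, derived from `item_count`) is subtracted from the logits before a softmax cross-entropy against an identity label matrix. The following NumPy sketch illustrates that correction on toy arrays; it is illustrative only and not the library implementation.

```python
import numpy as np

def inbatch_softmax_loss(user_vec, item_vec, item_idx, item_count, temperature=0.05):
    """Per-example in-batch softmax loss with log-Q correction (illustrative only)."""
    logits = (user_vec / temperature) @ item_vec.T        # [batch, batch] similarities
    q = item_count[item_idx] / item_count.sum()           # sampling prob. of each in-batch item
    logits = logits - np.log(q)[None, :]                  # subtract log Q, as in core.py above
    labels = np.eye(len(item_idx))                        # i-th item is the positive for the i-th user
    log_prob = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return -(labels * log_prob).sum(axis=1)

# toy batch of 4 users/items drawn from a 10-item catalogue
rng = np.random.default_rng(0)
user_vec, item_vec = rng.normal(size=(4, 8)), rng.normal(size=(4, 8))
loss = inbatch_softmax_loss(user_vec, item_vec, np.array([1, 3, 0, 7]),
                            item_count=np.arange(1.0, 11.0))
print(loss.shape)  # (4,)
```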
@@ -102,7 +104,7 @@ def SDM(user_feature_columns, item_feature_columns, history_feature_list, num_sa prefer_sess_length = features['prefer_sess_length'] prefer_att_outputs = [] - for i, prefer_emb in enumerate(prefer_emb_list): + for prefer_emb in prefer_emb_list: prefer_attention_output = AttentionSequencePoolingLayer(dropout_rate=0)( [user_emb_output, prefer_emb, prefer_sess_length]) prefer_att_outputs.append(prefer_attention_output) @@ -131,14 +133,15 @@ def SDM(user_feature_columns, item_feature_columns, history_feature_list, num_sa gate_output = Lambda(lambda x: tf.multiply(x[0], x[1]) + tf.multiply(1 - x[0], x[2]))( [gate, short_output, prefer_output]) gate_output_reshape = Lambda(lambda x: tf.squeeze(x, 1))(gate_output) + gate_output_reshape = l2_normalize(gate_output_reshape) item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_features[item_feature_name]) item_embedding_matrix = embedding_matrix_dict[item_feature_name] item_embedding_weight = NoMask()(item_embedding_matrix(item_index)) pooling_item_embedding_weight = PoolingLayer()([item_embedding_weight]) - - output = SampledSoftmaxLayer(num_sampled=num_sampled)([ + pooling_item_embedding_weight = l2_normalize(pooling_item_embedding_weight) + output = SampledSoftmaxLayer(sampler_config._asdict(), temperature)([ pooling_item_embedding_weight, gate_output_reshape, item_features[item_feature_name]]) model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) diff --git a/deepmatch/models/youtubednn.py b/deepmatch/models/youtubednn.py index f146e87..5def6f8 100644 --- a/deepmatch/models/youtubednn.py +++ b/deepmatch/models/youtubednn.py @@ -1,6 +1,6 @@ """ Author: - Weichen Shen, wcshen1994@163.com + Weichen Shen, weichenswc@163.com Reference: Covington P, Adams J, Sargin E. Deep neural networks for youtube recommendations[C]//Proceedings of the 10th ACM conference on recommender systems. 2016: 191-198. """ @@ -11,13 +11,14 @@ from ..inputs import input_from_feature_columns, create_embedding_matrix from ..layers.core import SampledSoftmaxLayer, EmbeddingIndex, PoolingLayer -from ..utils import get_item_embedding +from ..utils import get_item_embedding, l2_normalize -def YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, +def YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32), dnn_activation='relu', dnn_use_bn=False, - l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, output_activation='linear', seed=1024, ): + l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, output_activation='linear', temperature=0.05, + sampler_config=None, seed=1024): """Instantiates the YoutubeDNN Model architecture. :param user_feature_columns: An iterable containing user's features used by the model. @@ -29,8 +30,10 @@ def YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, :param l2_reg_dnn: float. L2 regularizer strength applied to DNN :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. - :param seed: integer ,to use as random seed. :param output_activation: Activation function to use in output layer + :param temperature: float. Scaling factor. + :param sampler_config: negative sample config. + :param seed: integer ,to use as random seed. :return: A Keras model instance. 
""" @@ -54,6 +57,7 @@ def YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, item_inputs_list = list(item_features.values()) user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, output_activation=output_activation, seed=seed)(user_dnn_input) + user_dnn_out = l2_normalize(user_dnn_out) item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_features[item_feature_name]) @@ -63,7 +67,8 @@ def YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, pooling_item_embedding_weight = PoolingLayer()([item_embedding_weight]) - output = SampledSoftmaxLayer(num_sampled=num_sampled)( + pooling_item_embedding_weight = l2_normalize(pooling_item_embedding_weight) + output = SampledSoftmaxLayer(sampler_config._asdict(), temperature)( [pooling_item_embedding_weight, user_dnn_out, item_features[item_feature_name]]) model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) diff --git a/deepmatch/utils.py b/deepmatch/utils.py index ad24242..becdc9f 100644 --- a/deepmatch/utils.py +++ b/deepmatch/utils.py @@ -2,28 +2,58 @@ """ Author: - Weichen Shen,wcshen1994@163.com + Weichen Shen, wcshenswc@163.com """ import json import logging -from threading import Thread - import requests +from collections import namedtuple +from threading import Thread try: from packaging.version import parse except ImportError: from pip._vendor.packaging.version import parse - import tensorflow as tf from tensorflow.python.keras import backend as K from tensorflow.python.keras.layers import Lambda +class NegativeSampler( + namedtuple('NegativeSampler', ['sampler', 'num_sampled', 'item_name', 'item_count', 'distortion'])): + """ NegativeSampler + Args: + sampler: sampler name,['inbatch', 'uniform', 'frequency' 'adaptive',] . + num_sampled: negative samples number per one positive sample. + item_name: pkey of item features . + item_count: global frequency of item . + distortion: skew factor of the unigram probability distribution. 
+ """ + __slots__ = () + + def __new__(cls, sampler, num_sampled, item_name, item_count=None, distortion=1.0, ): + if sampler not in ['inbatch', 'uniform', 'frequency', 'adaptive']: + raise ValueError(' `%s` sampler is not supported ' % sampler) + if sampler in ['inbatch', 'frequency'] and item_count is None: + raise ValueError(' `item_count` must not be `None` when using `inbatch` or `frequency` sampler') + return super(NegativeSampler, cls).__new__(cls, sampler, num_sampled, item_name, item_count, distortion) + + # def __hash__(self): + # return self.sampler.__hash__() + + +def l2_normalize(x, axis=-1): + return Lambda(lambda x: tf.nn.l2_normalize(x, axis))(x) + + +def inner_product(x, y, temperature=1.0): + return Lambda(lambda x: tf.reduce_sum(tf.multiply(x[0], x[1])) / temperature)([x, y]) + + def recall_N(y_true, y_pred, N=50): return len(set(y_pred[:N]) & set(y_true)) * 1.0 / len(y_true) @@ -31,12 +61,12 @@ def recall_N(y_true, y_pred, N=50): def sampledsoftmaxloss(y_true, y_pred): return K.mean(y_pred) + def get_item_embedding(item_embedding, item_input_layer): return Lambda(lambda x: tf.squeeze(tf.gather(item_embedding, x), axis=1))( item_input_layer) - def check_version(version): """Return version of package on pypi.python.org using json.""" diff --git a/docs/source/Examples.md b/docs/source/Examples.md index e7efaad..4c52d56 100644 --- a/docs/source/Examples.md +++ b/docs/source/Examples.md @@ -1,9 +1,13 @@ # Examples -## Run YoutubeDNN on MovieLen1M on Google colab +## Run models on MovieLen1M in Google colab -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/shenweichen/DeepMatch/blob/master/examples/colab_MovieLen1M_YoutubeDNN.ipynb) +YoutubeDNN: [![Run YoutubeDNN In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/shenweichen/DeepMatch/blob/master/examples/colab_MovieLen1M_YoutubeDNN.ipynb) + +SDM: [![Run SDM In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/shenweichen/DeepMatch/blob/master/examples/colab_MovieLen1M_SDM.ipynb) + +DSSM InBatchSoftmax: [![Run DSSM InBatchSoftmax In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/shenweichen/DeepMatch/blob/master/examples/colab_MovieLen1M_DSSM_InBatchSoftmax.ipynb) ## YoutubeDNN/MIND with sampled softmax @@ -21,37 +25,30 @@ This example shows how to use ``YoutubeDNN`` to solve a matching task. 
You can g ```python import pandas as pd from deepctr.feature_column import SparseFeat, VarLenSparseFeat +from deepmatch.models import * +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler from preprocess import gen_data_set, gen_model_input from sklearn.preprocessing import LabelEncoder from tensorflow.python.keras import backend as K from tensorflow.python.keras.models import Model -from deepmatch.models import * -from deepmatch.utils import sampledsoftmaxloss - if __name__ == "__main__": data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt") + data['genres'] = list(map(lambda x: x.split('|')[0], data['genres'].values)) + sparse_features = ["movie_id", "user_id", - "gender", "age", "occupation", "zip", ] + "gender", "age", "occupation", "zip", "genres"] SEQ_LEN = 50 - negsample = 0 # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input` - features = ['user_id', 'gender', 'age', 'occupation', 'zip'] feature_max_idx = {} - for feature in features: + for feature in sparse_features: lbe = LabelEncoder() data[feature] = lbe.fit_transform(data[feature]) + 1 feature_max_idx[feature] = data[feature].max() + 1 - id_count = data['movie_id'].value_counts() - mapdict = {t[0]: i for i, t in - enumerate(sorted([(k, v) for k, v in id_count.to_dict().items()], key=lambda x: x[1], reverse=True))} - data['movie_id'] = data['movie_id'].map(mapdict) - feature_max_idx['movie_id'] = data['movie_id'].max() + 1 - user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') item_profile = data[["movie_id"]].drop_duplicates('movie_id') @@ -60,7 +57,7 @@ if __name__ == "__main__": user_item_list = data.groupby("user_id")['movie_id'].apply(list) - train_set, test_set = gen_data_set(data, negsample) + train_set, test_set = gen_data_set(data, SEQ_LEN, 0) train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) @@ -76,30 +73,40 @@ if __name__ == "__main__": SparseFeat("zip", feature_max_idx['zip'], embedding_dim), VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim, embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'), + VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim, + embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len') ] item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)] + from collections import Counter + + train_counter = Counter(train_model_input['movie_id']) + item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] + sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='movie_id', item_count=item_count) + # 3.Define Model and train import tensorflow as tf - + if tf.__version__ >= '2.0.0': tf.compat.v1.disable_eager_execution() else: K.set_learning_phase(True) - model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, user_dnn_hidden_units=(64, embedding_dim)) - # model = MIND(user_feature_columns,item_feature_columns,dynamic_k=True,k_max=2,num_sampled=5,user_dnn_hidden_units=(64,embedding_dim),init_std=0.001) + model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, embedding_dim), + sampler_config=sampler_config) + # model = MIND(user_feature_columns, item_feature_columns, dynamic_k=False, k_max=2, + # user_dnn_hidden_units=(64, embedding_dim), 
sampler_config=sampler_config) - model.compile(optimizer="adam", loss=sampledsoftmaxloss) # "binary_crossentropy") + model.compile(optimizer="adam", loss=sampledsoftmaxloss) history = model.fit(train_model_input, train_label, # train_label, batch_size=256, epochs=1, verbose=1, validation_split=0.0, ) # 4. Generate user features for testing and full item features for retrieval test_user_model_input = test_model_input - all_item_model_input = {"movie_id": item_profile['movie_id'].values, "movie_idx": item_profile['movie_id'].values} + all_item_model_input = {"movie_id": item_profile['movie_id'].values} user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) @@ -113,7 +120,7 @@ if __name__ == "__main__": # 5. [Optional] ANN search by faiss and evaluate the result - # test_true_label = {line[0]:[line[2]] for line in test_set} + # test_true_label = {line[0]:[line[1]] for line in test_set} # # import numpy as np # import faiss @@ -157,18 +164,18 @@ This example shows how to use ``SDM`` to solve a matching task. You can get the ```python import pandas as pd -from deepctr.inputs import SparseFeat, VarLenSparseFeat +from deepctr.feature_column import SparseFeat, VarLenSparseFeat +from deepmatch.models import SDM +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler from preprocess import gen_data_set_sdm, gen_model_input_sdm from sklearn.preprocessing import LabelEncoder from tensorflow.python.keras import backend as K -from tensorflow.python.keras import optimizers from tensorflow.python.keras.models import Model -from deepmatch.models import SDM -from deepmatch.utils import sampledsoftmaxloss - if __name__ == "__main__": data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt") + data['genres'] = list(map(lambda x: x.split('|')[0], data['genres'].values)) + sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip", "genres"] SEQ_LEN_short = 5 @@ -176,20 +183,13 @@ if __name__ == "__main__": # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input` - features = ['user_id', 'gender', 'age', 'occupation', 'zip'] feature_max_idx = {} - for feature in features: + for feature in sparse_features: lbe = LabelEncoder() data[feature] = lbe.fit_transform(data[feature]) + 1 feature_max_idx[feature] = data[feature].max() + 1 - id_count = data['movie_id'].value_counts() - mapdict = {t[0]: i for i, t in - enumerate(sorted([(k, v) for k, v in id_count.to_dict().items()], key=lambda x: x[1], reverse=True))} - data['movie_id'] = data['movie_id'].map(mapdict) - feature_max_idx['movie_id'] = data['movie_id'].max() + 1 - - user_profile = data[["user_id", "gender", "age", "occupation", "zip", "genres"]].drop_duplicates('user_id') + user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') item_profile = data[["movie_id"]].drop_duplicates('movie_id') @@ -197,7 +197,7 @@ if __name__ == "__main__": # # user_item_list = data.groupby("user_id")['movie_id'].apply(list) - train_set, test_set = gen_data_set_sdm(data, seq_short_len=SEQ_LEN_short, seq_prefer_len=SEQ_LEN_prefer) + train_set, test_set = gen_data_set_sdm(data, seq_short_max_len=SEQ_LEN_short, seq_prefer_max_len=SEQ_LEN_prefer) train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer) test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, 
SEQ_LEN_prefer) @@ -227,6 +227,14 @@ if __name__ == "__main__": item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)] + from collections import Counter + + train_counter = Counter(train_model_input['movie_id']) + item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] + sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='movie_id', item_count=item_count) + + K.set_learning_phase(True) + import tensorflow as tf if tf.__version__ >= '2.0.0': @@ -236,18 +244,15 @@ if __name__ == "__main__": # units must be equal to item embedding dim! model = SDM(user_feature_columns, item_feature_columns, history_feature_list=['movie_id', 'genres'], - units=embedding_dim, num_sampled=100, ) + units=embedding_dim, sampler_config=sampler_config) - optimizer = optimizers.Adam(lr=0.001, clipnorm=5.0) - - model.compile(optimizer=optimizer, loss=sampledsoftmaxloss) # "binary_crossentropy") + model.compile(optimizer='adam', loss=sampledsoftmaxloss) history = model.fit(train_model_input, train_label, # train_label, batch_size=512, epochs=1, verbose=1, validation_split=0.0, ) - # model.save_weights('SDM_weights.h5') K.set_learning_phase(False) - # 4. Generate user features for testing and full item features for retrieval + # 3.Define Model,train,predict and evaluate test_user_model_input = test_model_input all_item_model_input = {"movie_id": item_profile['movie_id'].values, } @@ -260,10 +265,8 @@ if __name__ == "__main__": print(user_embs.shape) print(item_embs.shape) - - # 5. [Optional] ANN search by faiss and evaluate the result - # test_true_label = {line[0]: [line[3]] for line in test_set} + # test_true_label = {line[0]: [line[1]] for line in test_set} # # import numpy as np # import faiss @@ -291,11 +294,10 @@ if __name__ == "__main__": # print("recall", np.mean(s)) # print("hit rate", hit / len(test_user_model_input['user_id'])) - ``` -## DSSM with negative sampling +## DSSM with in batch softmax The MovieLens data has been used for personalized tag recommendation,which contains 668, 953 tag applications of users on movies. @@ -310,45 +312,44 @@ This example shows how to use ``DSSM`` to solve a matching task. 
You can get the ```python import pandas as pd from deepctr.feature_column import SparseFeat, VarLenSparseFeat +from deepmatch.models import * +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler from preprocess import gen_data_set, gen_model_input from sklearn.preprocessing import LabelEncoder from tensorflow.python.keras.models import Model -from deepmatch.models import * - if __name__ == "__main__": data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt") sparse_features = ["movie_id", "user_id", - "gender", "age", "occupation", "zip", ] + "gender", "age", "occupation", "zip", "genres"] SEQ_LEN = 50 - negsample = 3 + negsample = 10 # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input` - features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip'] feature_max_idx = {} - for feature in features: + for feature in sparse_features: lbe = LabelEncoder() data[feature] = lbe.fit_transform(data[feature]) + 1 feature_max_idx[feature] = data[feature].max() + 1 user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') - item_profile = data[["movie_id"]].drop_duplicates('movie_id') + item_profile = data[["movie_id", "genres"]].drop_duplicates('movie_id') user_profile.set_index("user_id", inplace=True) user_item_list = data.groupby("user_id")['movie_id'].apply(list) - train_set, test_set = gen_data_set(data, negsample) + train_set, test_set = gen_data_set(data, SEQ_LEN, negsample) train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) # 2.count #unique features for each sparse field and generate feature config for sequence feature - embedding_dim = 16 + embedding_dim = 32 user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim), SparseFeat("gender", feature_max_idx['gender'], embedding_dim), @@ -357,22 +358,40 @@ if __name__ == "__main__": SparseFeat("zip", feature_max_idx['zip'], embedding_dim), VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim, embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'), + VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim, + embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'), ] - item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)] + item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim), + SparseFeat('genres', feature_max_idx['genres'], embedding_dim) + ] + + from collections import Counter + + train_counter = Counter(train_model_input['movie_id']) + item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] + sampler_config = NegativeSampler('inbatch', num_sampled=5, item_name='movie_id', item_count=item_count) # 3.Define Model and train - model = DSSM(user_feature_columns, item_feature_columns) # FM(user_feature_columns,item_feature_columns) + import tensorflow as tf - model.compile(optimizer='adagrad', loss="binary_crossentropy") + if tf.__version__ >= '2.0.0': + tf.compat.v1.disable_eager_execution() + else: + K.set_learning_phase(True) - history = model.fit(train_model_input, train_label, # train_label, + model = DSSM(user_feature_columns, item_feature_columns, loss_type="softmax", sampler_config=sampler_config) + # model = FM(user_feature_columns, item_feature_columns, loss_type="softmax", 
sampler_config=sampler_config) + + model.compile(optimizer='adagrad', loss=sampledsoftmaxloss) + + history = model.fit(train_model_input, train_label, batch_size=256, epochs=1, verbose=1, validation_split=0.0, ) # 4. Generate user features for testing and full item features for retrieval test_user_model_input = test_model_input - all_item_model_input = {"movie_id": item_profile['movie_id'].values,} + all_item_model_input = {"movie_id": item_profile['movie_id'].values, "genres": item_profile['genres'].values} user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) @@ -383,9 +402,9 @@ if __name__ == "__main__": print(user_embs.shape) print(item_embs.shape) - # 5. [Optional] ANN search by faiss and evaluate the result + # 5. [Optional] ANN search by faiss and evaluate the result - # test_true_label = {line[0]:[line[2]] for line in test_set} + # test_true_label = {line[0]:[line[1]] for line in test_set} # # import numpy as np # import faiss @@ -412,5 +431,135 @@ if __name__ == "__main__": # print("recall", np.mean(s)) # print("hr", hit / len(test_user_model_input['user_id'])) +``` + + + +## DSSM with negative sampling + +The MovieLens data has been used for personalized tag recommendation,which +contains 668, 953 tag applications of users on movies. +Here is a small fraction of data include only sparse field. + +![](../pics/movielens_sample.png) + + +This example shows how to use ``DSSM`` to solve a matching task. You can get the demo data +[movielens_sample.txt](https://github.com/shenweichen/DeepMatch/tree/master/examples/movielens_sample.txt) and run the following codes. + +```python +import pandas as pd +from deepctr.feature_column import SparseFeat, VarLenSparseFeat +from deepmatch.models import * +from preprocess import gen_data_set, gen_model_input +from sklearn.preprocessing import LabelEncoder +from tensorflow.python.keras.models import Model + +if __name__ == "__main__": + + data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt") + sparse_features = ["movie_id", "user_id", + "gender", "age", "occupation", "zip", "genres"] + SEQ_LEN = 50 + negsample = 10 + + # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input` + + feature_max_idx = {} + for feature in sparse_features: + lbe = LabelEncoder() + data[feature] = lbe.fit_transform(data[feature]) + 1 + feature_max_idx[feature] = data[feature].max() + 1 + + user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') + + item_profile = data[["movie_id", "genres"]].drop_duplicates('movie_id') + + user_profile.set_index("user_id", inplace=True) + + user_item_list = data.groupby("user_id")['movie_id'].apply(list) + + train_set, test_set = gen_data_set(data, SEQ_LEN, negsample) + + train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) + test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) + + # 2.count #unique features for each sparse field and generate feature config for sequence feature + + embedding_dim = 32 + + user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16), + SparseFeat("gender", feature_max_idx['gender'], 16), + SparseFeat("age", feature_max_idx['age'], 16), + SparseFeat("occupation", feature_max_idx['occupation'], 16), + SparseFeat("zip", feature_max_idx['zip'], 16), + VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], 
embedding_dim, + embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'), + VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim, + embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'), + ] + + item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim), + SparseFeat('genres', feature_max_idx['genres'], embedding_dim) + ] + + # 3.Define Model and train + + import tensorflow as tf + + if tf.__version__ >= '2.0.0': + tf.compat.v1.disable_eager_execution() + else: + K.set_learning_phase(True) + + model = DSSM(user_feature_columns, item_feature_columns, loss_type="logistic") + # model = FM(user_feature_columns,item_feature_columns) + + model.compile(optimizer='adagrad', loss="binary_crossentropy") + + history = model.fit(train_model_input, train_label, + batch_size=256, epochs=1, verbose=1, validation_split=0.0, ) + + # 4. Generate user features for testing and full item features for retrieval + test_user_model_input = test_model_input + all_item_model_input = {"movie_id": item_profile['movie_id'].values, "genres": item_profile['genres'].values} + + user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) + item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) + + user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12) + item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12) + + print(user_embs.shape) + print(item_embs.shape) + + # 5. [Optional] ANN search by faiss and evaluate the result + + # test_true_label = {line[0]:[line[1]] for line in test_set} + # + # import numpy as np + # import faiss + # from tqdm import tqdm + # from deepmatch.utils import recall_N + # + # index = faiss.IndexFlatIP(embedding_dim) + # # faiss.normalize_L2(item_embs) + # index.add(item_embs) + # # faiss.normalize_L2(user_embs) + # D, I = index.search(user_embs, 50) + # s = [] + # hit = 0 + # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])): + # try: + # pred = [item_profile['movie_id'].values[x] for x in I[i]] + # filter_item = None + # recall_score = recall_N(test_true_label[uid], pred, N=50) + # s.append(recall_score) + # if test_true_label[uid] in pred: + # hit += 1 + # except: + # print(i) + # print("recall", np.mean(s)) + # print("hr", hit / len(test_user_model_input['user_id'])) ``` diff --git a/docs/source/History.md b/docs/source/History.md index 3fbc964..cb5e222 100644 --- a/docs/source/History.md +++ b/docs/source/History.md @@ -1,4 +1,5 @@ # History +- 07/04/2022 : [v0.3.0](https://github.com/shenweichen/DeepMatch/releases/tag/v0.3.0) released.Support different negative sampling strategies, including `inbatch`, `uniform`, `frequency`, `adaptive`. - 06/17/2022 : [v0.2.1](https://github.com/shenweichen/DeepMatch/releases/tag/v0.2.1) released.Fix some bugs. - 10/12/2020 : [v0.2.0](https://github.com/shenweichen/DeepMatch/releases/tag/v0.2.0) released.Support different initializers for different embedding weights and loading pretrained embeddings. - 05/17/2020 : [v0.1.3](https://github.com/shenweichen/DeepMatch/releases/tag/v0.1.3) released.Add `SDM` model . diff --git a/docs/source/Quick-Start.md b/docs/source/Quick-Start.md index 271ee44..877b6d7 100644 --- a/docs/source/Quick-Start.md +++ b/docs/source/Quick-Start.md @@ -1,7 +1,7 @@ # Quick-Start ## Installation Guide -Now `deepmatch` is available for python `2.7 `and `3.5, 3.6, 3.7, 3.8`. +Now `deepmatch` is available for python `2.7 `and `3.6, 3.7, 3.8`. 
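Note on the DSSM/FM examples above: the old `Similarity` layer (cosine with a `gamma` scale) is replaced in this release by `l2_normalize` on both towers followed by an inner product divided by `temperature`; on unit vectors the two are equivalent up to the scale factor. A small illustrative sketch, where `scaled_cosine` is a hypothetical helper and not part of deepmatch:

```python
import numpy as np

def scaled_cosine(u, v, temperature=0.05):
    # after L2 normalization, inner product / temperature is just a scaled cosine,
    # i.e. gamma in the removed Similarity layer roughly corresponds to 1 / temperature
    u = u / np.linalg.norm(u, axis=-1, keepdims=True)
    v = v / np.linalg.norm(v, axis=-1, keepdims=True)
    return (u * v).sum(axis=-1) / temperature

u, v = np.random.randn(4, 16), np.random.randn(4, 16)
print(scaled_cosine(u, v))  # cosine(u, v) scaled by 20 when temperature=0.05
```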
`deepmatch` depends on tensorflow, you can specify to install the cpu version or gpu version through `pip`. ### CPU version @@ -16,7 +16,9 @@ $ pip install deepmatch[gpu] ``` ## Run examples !! -- [Run YoutubeDNN on MovieLen1M on Google colab](https://colab.research.google.com/github/shenweichen/DeepMatch/blob/dev_shenweichen/examples/colab_MovieLen1M_YoutubeDNN.ipynb) +- [Run models on MovieLen1M in Google colab](./Examples.html#run-models-on-movielen1m-in-google-colab) + - [YoutubeDNN/MIND with sampled softmax](./Examples.html#youtubednn-mind-with-sampled-softmax) - [SDM with sampled softmax](./Examples.html#sdm-with-sampled-softmax) +- [DSSM with in batch softmax](./Examples.html#dssm-with-in-batch-softmax) - [DSSM with negative sampling](./Examples.html#dssm-with-negative-sampling) diff --git a/docs/source/conf.py b/docs/source/conf.py index fdddcde..6d4ebaf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -20,13 +20,13 @@ # -- Project information ----------------------------------------------------- project = 'DeepMatch' -copyright = '2020, Weichen Shen' +copyright = '2020-present, Weichen Shen' author = 'Weichen Shen' # The short X.Y version version = '' # The full version, including alpha/beta/rc tags -release = '0.2.1' +release = '0.3.0' # -- General configuration --------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 7ebbb81..db54bb5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -6,6 +6,25 @@ Welcome to DeepMatch's documentation! ===================================== +|Downloads|_ |Stars|_ |Forks|_ |PyPii|_ |Issues|_ |Chat|_ + +.. |Downloads| image:: https://pepy.tech/badge/deepmatch +.. _Downloads: https://pepy.tech/project/deepmatch + +.. |Stars| image:: https://img.shields.io/github/stars/shenweichen/deepmatch.svg +.. _Stars: https://github.com/shenweichen/DeepMatch + +.. |Forks| image:: https://img.shields.io/github/forks/shenweichen/deepmatch.svg +.. _Forks: https://github.com/shenweichen/DeepMatch/fork + +.. |PyPii| image:: https://img.shields.io/pypi/v/deepmatch.svg +.. _PyPii: https://pypi.org/project/deepmatch + +.. |Issues| image:: https://img.shields.io/github/issues/shenweichen/deepmatch.svg +.. _Issues: https://github.com/shenweichen/deepmatch/issues + +.. |Chat| image:: https://img.shields.io/badge/chat-wechat-brightgreen?style=flat +.. _Chat: ./#disscussiongroup DeepMatch is a deep matching model library for recommendations, advertising, and search. It's easy to **train models** and to **export representation vectors** for user and item which can be used for **ANN search**.You can use any complex model with ``model.fit()`` and ``model.predict()`` . @@ -18,18 +37,22 @@ You can read the latest code at https://github.com/shenweichen/DeepMatch News ----- +07/04/2022 : Support different negative sampling strategies, including `inbatch` , `uniform` , `frequency` , `adaptive` . `Changelog `_ + 06/17/2022 : Fix some bugs. `Changelog `_ 10/12/2020 : Support different initializers for different embedding weights and loading pretrained embeddings. `Changelog `_ -05/17/2020 : Add ``SDM`` model. `Changelog `_ - DisscussionGroup ----------------------- -公众号:**浅梦的学习笔记** wechat ID: **deepctrbot** -.. image:: ../pics/code.png + 公众号:**浅梦学习笔记** wechat ID: **deepctrbot** + + `Discussions `_ `学习小组主题集合 `_ + +.. image:: ../pics/code2.jpg + .. 
toctree:: :maxdepth: 2 diff --git a/examples/colab_MovieLen1M_DSSM_InBatchSoftmax.ipynb b/examples/colab_MovieLen1M_DSSM_InBatchSoftmax.ipynb new file mode 100644 index 0000000..0133553 --- /dev/null +++ b/examples/colab_MovieLen1M_DSSM_InBatchSoftmax.ipynb @@ -0,0 +1,517 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "rtox72csOQUN" + }, + "source": [ + "# DeepMatch 样例代码\n", + "- https://github.com/shenweichen/DeepMatch\n", + "- https://deepmatch.readthedocs.io/en/latest/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bTWHz-heMkyw" + }, + "source": [ + "# 下载movielens-1M数据 安装依赖包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 13146, + "status": "ok", + "timestamp": 1656786953325, + "user": { + "displayName": "沈伟臣", + "userId": "00399522274399293678" + }, + "user_tz": -480 + }, + "id": "yTl6d6jO1oqf", + "outputId": "16a19888-344e-4dbe-c723-effe370222f7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2022-07-02 18:35:40-- http://files.grouplens.org/datasets/movielens/ml-1m.zip\n", + "Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152\n", + "Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5917549 (5.6M) [application/zip]\n", + "Saving to: ‘./ml-1m.zip’\n", + "\n", + "./ml-1m.zip 100%[===================>] 5.64M 3.43MB/s in 1.6s \n", + "\n", + "2022-07-02 18:35:42 (3.43 MB/s) - ‘./ml-1m.zip’ saved [5917549/5917549]\n", + "\n", + "--2022-07-02 18:35:42-- https://raw.githubusercontent.com/shenweichen/DeepMatch/master/examples/preprocess.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5642 (5.5K) [text/plain]\n", + "Saving to: ‘preprocess.py’\n", + "\n", + "preprocess.py 100%[===================>] 5.51K --.-KB/s in 0s \n", + "\n", + "2022-07-02 18:35:42 (75.7 MB/s) - ‘preprocess.py’ saved [5642/5642]\n", + "\n", + "Archive: ml-1m.zip\n", + " inflating: ml-1m/movies.dat \n", + " inflating: ml-1m/ratings.dat \n", + " inflating: ml-1m/README \n", + " inflating: ml-1m/users.dat \n", + "\u001b[33mWARNING: Skipping tensorflow as it is not installed.\u001b[0m\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "deepctr 0.9.1 requires h5py==2.10.0, but you have h5py 3.1.0 which is incompatible.\u001b[0m\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow-gpu 2.5.0 requires h5py~=3.1.0, but you have h5py 2.10.0 which is incompatible.\u001b[0m\n" + ] + } + ], + "source": [ + "! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip \n", + "! wget https://raw.githubusercontent.com/shenweichen/DeepMatch/master/examples/preprocess.py -O preprocess.py\n", + "! unzip -o ml-1m.zip \n", + "! pip uninstall -y -q tensorflow\n", + "! pip install -q tensorflow-gpu==2.5.0\n", + "! 
pip install -q deepmatch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p9UxNHuPMuW2" + }, + "source": [ + "# 导入需要的库" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "executionInfo": { + "elapsed": 3234, + "status": "ok", + "timestamp": 1656786956552, + "user": { + "displayName": "沈伟臣", + "userId": "00399522274399293678" + }, + "user_tz": -480 + }, + "id": "C_ZR6gzp1E2N" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n", + "from preprocess import gen_data_set, gen_model_input\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from tensorflow.python.keras import backend as K\n", + "from tensorflow.python.keras.models import Model\n", + "\n", + "from deepmatch.models import *\n", + "from deepmatch.utils import sampledsoftmaxloss, NegativeSampler" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fQq6O9XAMzPF" + }, + "source": [ + "# 读取数据" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 5508, + "status": "ok", + "timestamp": 1656786962055, + "user": { + "displayName": "沈伟臣", + "userId": "00399522274399293678" + }, + "user_tz": -480 + }, + "id": "lcO29zFb21Od", + "outputId": "9c60dffa-5829-40bc-c54d-cde4ae2bea72" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " after removing the cwd from sys.path.\n", + "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " \n", + "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " \n" + ] + } + ], + "source": [ + "data_path = \"./\"\n", + "\n", + "unames = ['user_id','gender','age','occupation','zip']\n", + "user = pd.read_csv(data_path+'ml-1m/users.dat',sep='::',header=None,names=unames)\n", + "rnames = ['user_id','movie_id','rating','timestamp']\n", + "ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)\n", + "mnames = ['movie_id','title','genres']\n", + "movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding=\"unicode_escape\")\n", + "movies['genres'] = list(map(lambda x: x.split('|')[0], movies['genres'].values))\n", + "\n", + "data = pd.merge(pd.merge(ratings,movies),user)#.iloc[:10000]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L0yCWxQxM3se" + }, + "source": [ + "# 构建特征列,训练模型,导出embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 400136, + "status": "ok", + "timestamp": 
1656787362187, + "user": { + "displayName": "沈伟臣", + "userId": "00399522274399293678" + }, + "user_tz": -480 + }, + "id": "BMOvk_de2ML3", + "outputId": "2c5ac944-b866-4a63-be70-fedc9c257812" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 6040/6040 [00:12<00:00, 482.49it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8 8\n", + "Train on 988129 samples\n", + "Epoch 1/20\n", + "988129/988129 [==============================] - 44s 44us/sample - loss: 4.7422\n", + "Epoch 2/20\n", + "988129/988129 [==============================] - 47s 47us/sample - loss: 4.3175\n", + "Epoch 3/20\n", + "988129/988129 [==============================] - 52s 53us/sample - loss: 4.1604\n", + "Epoch 4/20\n", + "988129/988129 [==============================] - 56s 56us/sample - loss: 4.0652\n", + "Epoch 5/20\n", + "988129/988129 [==============================] - 53s 54us/sample - loss: 3.9986\n", + "Epoch 6/20\n", + "988129/988129 [==============================] - 51s 52us/sample - loss: 3.9487\n", + "Epoch 7/20\n", + "988129/988129 [==============================] - 45s 45us/sample - loss: 3.9083\n", + "Epoch 8/20\n", + "988129/988129 [==============================] - 44s 45us/sample - loss: 3.8760\n", + "Epoch 9/20\n", + "988129/988129 [==============================] - 51s 51us/sample - loss: 3.8502\n", + "Epoch 10/20\n", + "988129/988129 [==============================] - 51s 52us/sample - loss: 3.8282\n", + "Epoch 11/20\n", + "988129/988129 [==============================] - 54s 55us/sample - loss: 3.8102\n", + "Epoch 12/20\n", + "988129/988129 [==============================] - 51s 51us/sample - loss: 3.7938\n", + "Epoch 13/20\n", + "988129/988129 [==============================] - 41s 41us/sample - loss: 3.7804\n", + "Epoch 14/20\n", + "988129/988129 [==============================] - 48s 48us/sample - loss: 3.7676\n", + "Epoch 15/20\n", + "988129/988129 [==============================] - 65s 65us/sample - loss: 3.7577\n", + "Epoch 16/20\n", + "988129/988129 [==============================] - 53s 53us/sample - loss: 3.7482\n", + "Epoch 17/20\n", + "988129/988129 [==============================] - 50s 50us/sample - loss: 3.7394\n", + "Epoch 18/20\n", + "988129/988129 [==============================] - 56s 57us/sample - loss: 3.7311\n", + "Epoch 19/20\n", + "988129/988129 [==============================] - 52s 53us/sample - loss: 3.7238\n", + "Epoch 20/20\n", + "988129/988129 [==============================] - 56s 57us/sample - loss: 3.7180\n", + "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + "(6040, 32)\n", + "(3706, 32)\n" + ] + } + ], + "source": [ + "#data = pd.read_csvdata = pd.read_csv(\"./movielens_sample.txt\")\n", + "sparse_features = [\"movie_id\", \"user_id\",\n", + " \"gender\", \"age\", \"occupation\", \"zip\", \"genres\"]\n", + "SEQ_LEN = 50\n", + "negsample = 0\n", + "\n", + "# 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`\n", + "\n", + "#features = ['user_id','movie_id','gender', 'age', 'occupation', 'zip']\n", + "feature_max_idx = {}\n", + "for feature in sparse_features:\n", + " lbe = LabelEncoder()\n", + " 
data[feature] = lbe.fit_transform(data[feature]) + 1\n", + " feature_max_idx[feature] = data[feature].max() + 1\n", + "\n", + "user_profile = data[[\"user_id\", \"gender\", \"age\", \"occupation\", \"zip\"]].drop_duplicates('user_id')\n", + "\n", + "item_profile = data[[\"movie_id\",\"genres\"]].drop_duplicates('movie_id')\n", + "\n", + "user_profile.set_index(\"user_id\", inplace=True)\n", + "\n", + "user_item_list = data.groupby(\"user_id\")['movie_id'].apply(list)\n", + "\n", + "train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)\n", + "\n", + "train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)\n", + "test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)\n", + "\n", + "# 2.count #unique features for each sparse field and generate feature config for sequence feature\n", + "\n", + "embedding_dim = 32\n", + "\n", + "user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),\n", + " SparseFeat(\"gender\", feature_max_idx['gender'], 16),\n", + " SparseFeat(\"age\", feature_max_idx['age'], 16),\n", + " SparseFeat(\"occupation\", feature_max_idx['occupation'], 16),\n", + " SparseFeat(\"zip\", feature_max_idx['zip'], 16),\n", + " VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,\n", + " embedding_name=\"movie_id\"), SEQ_LEN, 'mean', 'hist_len'),\n", + " VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,\n", + " embedding_name=\"genres\"), SEQ_LEN, 'mean', 'hist_len'),\n", + " ]\n", + "\n", + "item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim),\n", + " SparseFeat('genres', feature_max_idx['genres'], embedding_dim)\n", + " ]\n", + "\n", + "from collections import Counter\n", + "train_counter = Counter(train_model_input['movie_id'])\n", + "item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)]\n", + "sampler_config = NegativeSampler('inbatch',num_sampled=255,item_name=\"movie_id\",item_count=item_count)\n", + "\n", + "# 3.Define Model and train\n", + "\n", + "import tensorflow as tf\n", + "if tf.__version__ >= '2.0.0':\n", + " tf.compat.v1.disable_eager_execution()\n", + "else:\n", + " K.set_learning_phase(True)\n", + " \n", + "model = DSSM(user_feature_columns, item_feature_columns,user_dnn_hidden_units=(128,64, embedding_dim),\n", + " item_dnn_hidden_units=(64, embedding_dim,),loss_type='softmax',sampler_config=sampler_config)\n", + "\n", + "model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss)\n", + "\n", + "history = model.fit(train_model_input, train_label, # train_label,\n", + " batch_size=256, epochs=20, verbose=1, validation_split=0.0, )\n", + "\n", + "# 4. 
Generate user features for testing and full item features for retrieval\n", + "test_user_model_input = test_model_input\n", + "all_item_model_input = {\"movie_id\": item_profile['movie_id'].values,\"genres\": item_profile['genres'].values}\n", + "\n", + "user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)\n", + "item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)\n", + "\n", + "user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)\n", + "# user_embs = user_embs[:, i, :] # i in [0,k_max) if MIND\n", + "item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n", + "\n", + "print(user_embs.shape)\n", + "print(item_embs.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w_G3KWslKmJo" + }, + "source": [ + "# 使用faiss进行ANN查找并评估结果" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5SvyQLNVKkcs" + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 2988, + "status": "ok", + "timestamp": 1656787365168, + "user": { + "displayName": "沈伟臣", + "userId": "00399522274399293678" + }, + "user_tz": -480 + }, + "id": "j2ZNYNBOOqrN", + "outputId": "7c66659a-582f-4747-e025-5f6c6253c15f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: faiss-cpu in /usr/local/lib/python3.7/dist-packages (1.7.2)\n" + ] + } + ], + "source": [ + "! pip install faiss-cpu" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 3764, + "status": "ok", + "timestamp": 1656787368915, + "user": { + "displayName": "沈伟臣", + "userId": "00399522274399293678" + }, + "user_tz": -480 + }, + "id": "6TY1l27iJU8U", + "outputId": "6e842add-350e-41d8-8e8b-fd4652e74e17" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "6040it [00:02, 2487.51it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "recall 0.3554635761589404\n", + "hit rate 0.3554635761589404\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "test_true_label = {line[0]:[line[1]] for line in test_set}\n", + "\n", + "import numpy as np\n", + "import faiss\n", + "from tqdm import tqdm\n", + "from deepmatch.utils import recall_N\n", + "\n", + "index = faiss.IndexFlatIP(embedding_dim)\n", + "# faiss.normalize_L2(item_embs)\n", + "index.add(item_embs)\n", + "# faiss.normalize_L2(user_embs)\n", + "D, I = index.search(np.ascontiguousarray(user_embs), 50)\n", + "s = []\n", + "hit = 0\n", + "for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):\n", + " try:\n", + " pred = [item_profile['movie_id'].values[x] for x in I[i]]\n", + " filter_item = None\n", + " recall_score = recall_N(test_true_label[uid], pred, N=50)\n", + " s.append(recall_score)\n", + " if test_true_label[uid] in pred:\n", + " hit += 1\n", + " except:\n", + " print(i)\n", + "print(\"\")\n", + "print(\"recall\", np.mean(s))\n", + "print(\"hit rate\", hit / len(test_user_model_input['user_id']))" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": 
"colab_MovieLen1M_YoutubeDNN.ipynb", + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/colab_MovieLen1M_SDM.ipynb b/examples/colab_MovieLen1M_SDM.ipynb new file mode 100644 index 0000000..95de07d --- /dev/null +++ b/examples/colab_MovieLen1M_SDM.ipynb @@ -0,0 +1,482 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "rtox72csOQUN" + }, + "source": [ + "# DeepMatch 样例代码\n", + "- https://github.com/shenweichen/DeepMatch\n", + "- https://deepmatch.readthedocs.io/en/latest/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bTWHz-heMkyw" + }, + "source": [ + "# 下载movielens-1M数据 安装依赖包" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yTl6d6jO1oqf", + "outputId": "ca32c49d-102b-46e8-d613-1c33885326ff" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2022-07-03 12:34:44-- http://files.grouplens.org/datasets/movielens/ml-1m.zip\n", + "Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152\n", + "Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5917549 (5.6M) [application/zip]\n", + "Saving to: ‘./ml-1m.zip’\n", + "\n", + "./ml-1m.zip 100%[===================>] 5.64M 19.1MB/s in 0.3s \n", + "\n", + "2022-07-03 12:34:44 (19.1 MB/s) - ‘./ml-1m.zip’ saved [5917549/5917549]\n", + "\n", + "--2022-07-03 12:34:44-- https://raw.githubusercontent.com/shenweichen/DeepMatch/dev/sample/examples/preprocess.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 6705 (6.5K) [text/plain]\n", + "Saving to: ‘preprocess.py’\n", + "\n", + "preprocess.py 100%[===================>] 6.55K --.-KB/s in 0s \n", + "\n", + "2022-07-03 12:34:45 (57.6 MB/s) - ‘preprocess.py’ saved [6705/6705]\n", + "\n", + "Archive: ml-1m.zip\n", + " creating: ml-1m/\n", + " inflating: ml-1m/movies.dat \n", + " inflating: ml-1m/ratings.dat \n", + " inflating: ml-1m/README \n", + " inflating: ml-1m/users.dat \n", + "\u001b[K |████████████████████████████████| 454.3 MB 16 kB/s \n", + "\u001b[K |████████████████████████████████| 14.8 MB 43.1 MB/s \n", + "\u001b[K |████████████████████████████████| 4.0 MB 52.7 MB/s \n", + "\u001b[K |████████████████████████████████| 132 kB 71.2 MB/s \n", + "\u001b[K |████████████████████████████████| 1.2 MB 61.6 MB/s \n", + "\u001b[K |████████████████████████████████| 4.0 MB 63.3 MB/s \n", + "\u001b[K |████████████████████████████████| 462 kB 54.4 MB/s \n", + "\u001b[?25h Building wheel for wrapt (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "kapre 0.3.7 requires tensorflow>=2.0.0, which is not installed.\n", + "xarray-einstats 0.2.2 requires numpy>=1.21, but you have numpy 1.19.5 which is incompatible.\n", + "deepctr 0.9.1 requires h5py==2.10.0, but you have h5py 3.1.0 which is incompatible.\n", + "datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.\n", + "albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow-gpu 2.5.0 requires h5py~=3.1.0, but you have h5py 2.10.0 which is incompatible.\u001b[0m\n" + ] + } + ], + "source": [ + "! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip \n", + "! wget https://raw.githubusercontent.com/shenweichen/DeepMatch/master/examples/preprocess.py -O preprocess.py\n", + "! unzip -o ml-1m.zip \n", + "! pip uninstall -y -q tensorflow\n", + "! pip install -q tensorflow-gpu==2.5.0\n", + "! pip install -q deepmatch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p9UxNHuPMuW2" + }, + "source": [ + "# 导入需要的库" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "C_ZR6gzp1E2N" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n", + "from preprocess import gen_data_set, gen_model_input,gen_data_set_sdm,gen_model_input_sdm\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from tensorflow.python.keras import backend as K\n", + "from tensorflow.python.keras.models import Model\n", + "\n", + "from deepmatch.models import *\n", + "from deepmatch.utils import sampledsoftmaxloss, NegativeSampler" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fQq6O9XAMzPF" + }, + "source": [ + "# 读取数据" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lcO29zFb21Od", + "outputId": "cda19a71-6a6e-4113-f42d-ab80f06273b8" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " after removing the cwd from sys.path.\n", + "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " \n", + "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " \n" + ] + } + ], + "source": [ + "data_path = \"./\"\n", + "\n", + "unames = ['user_id','gender','age','occupation','zip']\n", + "user = 
pd.read_csv(data_path+'ml-1m/users.dat',sep='::',header=None,names=unames)\n", + "rnames = ['user_id','movie_id','rating','timestamp']\n", + "ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)\n", + "mnames = ['movie_id','title','genres']\n", + "movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding=\"unicode_escape\")\n", + "movies['genres'] = list(map(lambda x: x.split('|')[0], movies['genres'].values))\n", + "\n", + "data = pd.merge(pd.merge(ratings,movies),user)#.iloc[:10000]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L0yCWxQxM3se" + }, + "source": [ + "# 构建特征列,训练模型,导出embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 913 + }, + "id": "BMOvk_de2ML3", + "outputId": "eba1ad5c-7a45-4b30-84f6-0d19f556834c" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 6040/6040 [00:14<00:00, 402.76it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 10\n", + "WARNING:tensorflow:From /Users/swc/study/DeepMatch/deepmatch/layers/sequence.py:35: BasicLSTMCell.__init__ (from tensorflow.python.keras.layers.legacy_rnn.rnn_cell_impl) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.\n", + "WARNING:tensorflow:From /Users/swc/study/DeepMatch/deepmatch/layers/sequence.py:65: MultiRNNCell.__init__ (from tensorflow.python.keras.layers.legacy_rnn.rnn_cell_impl) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.\n", + "WARNING:tensorflow:From /Users/swc/study/DeepMatch/deepmatch/layers/sequence.py:78: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n", + "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/layers/legacy_rnn/rnn_cell_impl.py:740: Layer.add_variable (from tensorflow.python.keras.engine.base_layer_v1) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Please use `layer.add_weight` method instead.\n", + "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/layers/legacy_rnn/rnn_cell_impl.py:744: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers/initializers_v1.py:68: calling TruncatedNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py:158: calling LinearOperator.__init__ (from 
tensorflow.python.ops.linalg.linear_operator) with graph_parents is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Do not pass `graph_parents`. They will no longer be used.\n", + "Train on 988129 samples\n", + "Epoch 1/20\n", + "988129/988129 [==============================] - 114s 116us/sample - loss: 5.1889\n", + "Epoch 2/20\n", + "988129/988129 [==============================] - 123s 125us/sample - loss: 4.2234\n", + "Epoch 3/20\n", + "988129/988129 [==============================] - 121s 123us/sample - loss: 4.0599\n", + "Epoch 4/20\n", + "988129/988129 [==============================] - 130s 131us/sample - loss: 3.9574\n", + "Epoch 5/20\n", + "988129/988129 [==============================] - 124s 125us/sample - loss: 3.8822\n", + "Epoch 6/20\n", + "988129/988129 [==============================] - 121s 123us/sample - loss: 3.8313\n", + "Epoch 7/20\n", + "988129/988129 [==============================] - 112s 114us/sample - loss: 3.7889\n", + "Epoch 8/20\n", + "988129/988129 [==============================] - 102s 103us/sample - loss: 3.7559\n", + "Epoch 9/20\n", + "988129/988129 [==============================] - 102s 103us/sample - loss: 3.7315\n", + "Epoch 10/20\n", + "988129/988129 [==============================] - 101s 102us/sample - loss: 3.7082\n", + "Epoch 11/20\n", + "988129/988129 [==============================] - 114s 115us/sample - loss: 3.6901\n", + "Epoch 12/20\n", + "988129/988129 [==============================] - 109s 111us/sample - loss: 3.6750\n", + "Epoch 13/20\n", + "988129/988129 [==============================] - 108s 109us/sample - loss: 3.6606\n", + "Epoch 14/20\n", + "988129/988129 [==============================] - 105s 106us/sample - loss: 3.6482\n", + "Epoch 15/20\n", + "988129/988129 [==============================] - 119s 120us/sample - loss: 3.6363\n", + "Epoch 16/20\n", + "988129/988129 [==============================] - 119s 120us/sample - loss: 3.6288\n", + "Epoch 17/20\n", + "988129/988129 [==============================] - 120s 121us/sample - loss: 3.6193\n", + "Epoch 18/20\n", + "988129/988129 [==============================] - 122s 123us/sample - loss: 3.6123\n", + "Epoch 19/20\n", + "988129/988129 [==============================] - 122s 124us/sample - loss: 3.6049\n", + "Epoch 20/20\n", + "988129/988129 [==============================] - 117s 119us/sample - loss: 3.5990\n", + "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + "(6040, 32)\n", + "(3706, 32)\n" + ] + } + ], + "source": [ + "#data = pd.read_csvdata = pd.read_csv(\"./movielens_sample.txt\")\n", + "sparse_features = ['user_id','movie_id','gender', 'age', 'occupation', 'zip', 'genres']\n", + "SEQ_LEN = 50\n", + "SEQ_LEN_short = 5\n", + "SEQ_LEN_prefer = 50\n", + "negsample = 0\n", + "\n", + "# 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`\n", + "\n", + "feature_max_idx = {}\n", + "for feature in sparse_features:\n", + " lbe = LabelEncoder()\n", + " data[feature] = lbe.fit_transform(data[feature]) + 1\n", + " feature_max_idx[feature] = data[feature].max() + 1\n", + "features = sparse_features\n", + "user_profile = 
data[features].drop_duplicates('user_id')\n", + "\n", + "item_profile = data[[\"movie_id\"]].drop_duplicates('movie_id')\n", + "\n", + "user_profile.set_index(\"user_id\", inplace=True)\n", + "\n", + "user_item_list = data.groupby(\"user_id\")['movie_id'].apply(list)\n", + "\n", + "train_set, test_set = gen_data_set_sdm(data, seq_short_max_len=SEQ_LEN_short, seq_prefer_max_len=SEQ_LEN_prefer)\n", + "\n", + "train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)\n", + "test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)\n", + "\n", + "# 2.count #unique features for each sparse field and generate feature config for sequence feature\n", + "\n", + "embedding_dim = 32\n", + "\n", + "\n", + "user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),\n", + " SparseFeat(\"gender\", feature_max_idx['gender'], 16),\n", + " SparseFeat(\"age\", feature_max_idx['age'], 16),\n", + " SparseFeat(\"occupation\", feature_max_idx['occupation'], 16),\n", + " SparseFeat(\"zip\", feature_max_idx['zip'], 16),\n", + " VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'], embedding_dim,\n", + " embedding_name=\"movie_id\"), SEQ_LEN_short, 'mean',\n", + " 'short_sess_length'),\n", + " VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'], embedding_dim,\n", + " embedding_name=\"movie_id\"), SEQ_LEN_prefer, 'mean',\n", + " 'prefer_sess_length'),\n", + " VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'], embedding_dim,\n", + " embedding_name=\"genres\"), SEQ_LEN_short, 'mean',\n", + " 'short_sess_length'),\n", + " VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'], embedding_dim,\n", + " embedding_name=\"genres\"), SEQ_LEN_prefer, 'mean',\n", + " 'prefer_sess_length'),\n", + " ]\n", + "\n", + "\n", + "item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]\n", + "\n", + "from collections import Counter\n", + "train_counter = Counter(train_model_input['movie_id'])\n", + "item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)]\n", + "sampler_config = NegativeSampler('frequency',num_sampled=255,item_name=\"movie_id\",item_count=item_count)\n", + "\n", + "# 3.Define Model and train\n", + "\n", + "import tensorflow as tf\n", + "if tf.__version__ >= '2.0.0':\n", + " tf.compat.v1.disable_eager_execution()\n", + "else:\n", + " K.set_learning_phase(True)\n", + "\n", + "model = SDM(user_feature_columns, item_feature_columns, history_feature_list=['movie_id','genres'],\n", + " units=embedding_dim, sampler_config=sampler_config )\n", + "\n", + "model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss)\n", + "\n", + "history = model.fit(train_model_input, train_label, # train_label,\n", + " batch_size=512, epochs=20, verbose=1, validation_split=0.0, )\n", + "\n", + "# 4. 
Generate user features for testing and full item features for retrieval\n", + "test_user_model_input = test_model_input\n", + "all_item_model_input = {\"movie_id\": item_profile['movie_id'].values,}\n", + "\n", + "user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)\n", + "item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)\n", + "\n", + "user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)\n", + "# user_embs = user_embs[:, i, :] # i in [0,k_max) if MIND\n", + "item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n", + "\n", + "print(user_embs.shape)\n", + "print(item_embs.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w_G3KWslKmJo" + }, + "source": [ + "# 使用faiss进行ANN查找并评估结果" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5SvyQLNVKkcs" + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j2ZNYNBOOqrN" + }, + "outputs": [], + "source": [ + "! pip install faiss-cpu" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "6TY1l27iJU8U" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "6040it [00:01, 3381.75it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "recall 0.47516556291390727\n", + "hit rate 0.47516556291390727\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "test_true_label = {line[0]:[line[1]] for line in test_set}\n", + "\n", + "import numpy as np\n", + "import faiss\n", + "from tqdm import tqdm\n", + "from deepmatch.utils import recall_N\n", + "\n", + "index = faiss.IndexFlatIP(embedding_dim)\n", + "# faiss.normalize_L2(item_embs)\n", + "index.add(item_embs)\n", + "# faiss.normalize_L2(user_embs)\n", + "D, I = index.search(np.ascontiguousarray(user_embs), 50)\n", + "s = []\n", + "hit = 0\n", + "for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):\n", + " try:\n", + " pred = [item_profile['movie_id'].values[x] for x in I[i]]\n", + " filter_item = None\n", + " recall_score = recall_N(test_true_label[uid], pred, N=50)\n", + " s.append(recall_score)\n", + " if test_true_label[uid] in pred:\n", + " hit += 1\n", + " except:\n", + " print(i)\n", + "print(\"\")\n", + "print(\"recall\", np.mean(s))\n", + "print(\"hit rate\", hit / len(test_user_model_input['user_id']))" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "colab_MovieLen1M_SDM.ipynb", + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/colab_MovieLen1M_YoutubeDNN.ipynb b/examples/colab_MovieLen1M_YoutubeDNN.ipynb index 8587653..57decf2 100644 --- a/examples/colab_MovieLen1M_YoutubeDNN.ipynb +++ b/examples/colab_MovieLen1M_YoutubeDNN.ipynb @@ -22,50 +22,15 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yTl6d6jO1oqf", - "outputId": 
"6f4516af-0f03-4f35-8f0d-80e803021095" + "outputId": "ee7303f1-8970-4726-a9f1-368798077228" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2022-06-17 04:30:51-- http://files.grouplens.org/datasets/movielens/ml-1m.zip\n", - "Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152\n", - "Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 5917549 (5.6M) [application/zip]\n", - "Saving to: ‘./ml-1m.zip’\n", - "\n", - "./ml-1m.zip 100%[===================>] 5.64M 3.47MB/s in 1.6s \n", - "\n", - "2022-06-17 04:30:54 (3.47 MB/s) - ‘./ml-1m.zip’ saved [5917549/5917549]\n", - "\n", - "--2022-06-17 04:30:54-- https://raw.githubusercontent.com/shenweichen/DeepMatch/master/examples/preprocess.py\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 5642 (5.5K) [text/plain]\n", - "Saving to: ‘preprocess.py’\n", - "\n", - "preprocess.py 100%[===================>] 5.51K --.-KB/s in 0s \n", - "\n", - "2022-06-17 04:30:54 (71.1 MB/s) - ‘preprocess.py’ saved [5642/5642]\n", - "\n", - "Archive: ml-1m.zip\n", - " inflating: ml-1m/movies.dat \n", - " inflating: ml-1m/ratings.dat \n", - " inflating: ml-1m/README \n", - " inflating: ml-1m/users.dat \n", - "\u001b[33mWARNING: Skipping tensorflow as it is not installed.\u001b[0m\n" - ] - } - ], + "outputs": [], "source": [ "! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip \n", "! wget https://raw.githubusercontent.com/shenweichen/DeepMatch/master/examples/preprocess.py -O preprocess.py\n", @@ -86,25 +51,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "C_ZR6gzp1E2N", - "outputId": "903724ad-114b-4ea8-d0ce-151f9f6d4cdc" + "id": "C_ZR6gzp1E2N" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:\n", - "DeepCTR version 0.9.1 detected. 
Your version is 0.8.2.\n", - "Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.1\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n", @@ -114,7 +65,7 @@ "from tensorflow.python.keras.models import Model\n", "\n", "from deepmatch.models import *\n", - "from deepmatch.utils import sampledsoftmaxloss\n" + "from deepmatch.utils import sampledsoftmaxloss, NegativeSampler" ] }, { @@ -128,21 +79,25 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lcO29zFb21Od", - "outputId": "ea095585-f5ec-4d1c-9ffa-117531a5ed3b" + "outputId": "bfeed1ac-99f2-425f-dda6-10b83be721fe" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.7/dist-packages/pandas/util/_decorators.py:311: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", - " return func(*args, **kwargs)\n" + "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " after removing the cwd from sys.path.\n", + "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " \n", + "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", + " \n" ] } ], @@ -155,8 +110,9 @@ "ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)\n", "mnames = ['movie_id','title','genres']\n", "movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding=\"unicode_escape\")\n", + "movies['genres'] = list(map(lambda x: x.split('|')[0], movies['genres'].values))\n", "\n", - "data = pd.merge(pd.merge(ratings,movies),user)#.iloc[:10000]\n" + "data = pd.merge(pd.merge(ratings,movies),user)#.iloc[:10000]" ] }, { @@ -170,120 +126,91 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BMOvk_de2ML3", - "outputId": "24448edc-9100-4a01-c13f-c48d0a6632e0" + "outputId": "962afe1c-d387-4345-861f-e9b974a0b495" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 6040/6040 [00:17<00:00, 336.80it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6 6\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/backend.py:435: UserWarning: `tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. 
To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.\n", - " warnings.warn('`tf.keras.backend.set_learning_phase` is deprecated and '\n", - "WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/array_ops.py:5049: calling gather (from tensorflow.python.ops.array_ops) with validate_indices is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.\n" + "100%|██████████| 6040/6040 [00:12<00:00, 488.35it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "8 8\n", "Train on 988129 samples\n", "Epoch 1/20\n", - "988129/988129 [==============================] - 24s 25us/sample - loss: 4.4995\n", + "988129/988129 [==============================] - 38s 39us/sample - loss: 5.6344\n", "Epoch 2/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 4.2307\n", + "988129/988129 [==============================] - 41s 41us/sample - loss: 4.6947\n", "Epoch 3/20\n", - "988129/988129 [==============================] - 25s 25us/sample - loss: 3.8902\n", + "988129/988129 [==============================] - 39s 39us/sample - loss: 4.4681\n", "Epoch 4/20\n", - "988129/988129 [==============================] - 24s 24us/sample - loss: 3.6825\n", + "988129/988129 [==============================] - 38s 38us/sample - loss: 4.3227\n", "Epoch 5/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.5604\n", + "988129/988129 [==============================] - 38s 38us/sample - loss: 4.2224\n", "Epoch 6/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.4642\n", + "988129/988129 [==============================] - 37s 37us/sample - loss: 4.1463\n", "Epoch 7/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.3803\n", + "988129/988129 [==============================] - 37s 37us/sample - loss: 4.0843\n", "Epoch 8/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.3126\n", + "988129/988129 [==============================] - 37s 38us/sample - loss: 4.0339\n", "Epoch 9/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.2583\n", + "988129/988129 [==============================] - 44s 44us/sample - loss: 3.9941\n", "Epoch 10/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.2177\n", + "988129/988129 [==============================] - 38s 38us/sample - loss: 3.9619\n", "Epoch 11/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.1791\n", + "988129/988129 [==============================] - 43s 43us/sample - loss: 3.9349\n", "Epoch 12/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.1472\n", + "988129/988129 [==============================] - 39s 39us/sample - loss: 3.9112\n", "Epoch 13/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.1246\n", + "988129/988129 [==============================] - 39s 39us/sample - loss: 3.8902\n", "Epoch 14/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.0992\n", + "988129/988129 [==============================] - 39s 39us/sample - loss: 3.8712\n", "Epoch 15/20\n", - "988129/988129 [==============================] - 24s 24us/sample - loss: 3.0796\n", + "988129/988129 
[==============================] - 38s 38us/sample - loss: 3.8560\n", "Epoch 16/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.0601\n", + "988129/988129 [==============================] - 39s 40us/sample - loss: 3.8413\n", "Epoch 17/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.0418\n", + "988129/988129 [==============================] - 39s 39us/sample - loss: 3.8285\n", "Epoch 18/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.0265\n", + "988129/988129 [==============================] - 38s 38us/sample - loss: 3.8185\n", "Epoch 19/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 3.0119\n", + "988129/988129 [==============================] - 40s 40us/sample - loss: 3.8069\n", "Epoch 20/20\n", - "988129/988129 [==============================] - 23s 23us/sample - loss: 2.9994\n", + "988129/988129 [==============================] - 40s 41us/sample - loss: 3.7964\n", + "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", "(6040, 32)\n", "(3706, 32)\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:2426: UserWarning: `Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.\n", - " warnings.warn('`Model.state_updates` will be removed in a future version. 
'\n" - ] } ], "source": [ "#data = pd.read_csvdata = pd.read_csv(\"./movielens_sample.txt\")\n", "sparse_features = [\"movie_id\", \"user_id\",\n", - " \"gender\", \"age\", \"occupation\", \"zip\", ]\n", + " \"gender\", \"age\", \"occupation\", \"zip\", \"genres\"]\n", "SEQ_LEN = 50\n", "negsample = 0\n", "\n", "# 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`\n", "\n", - "features = ['user_id', 'gender', 'age', 'occupation', 'zip']\n", "feature_max_idx = {}\n", - "for feature in features:\n", + "for feature in sparse_features:\n", " lbe = LabelEncoder()\n", " data[feature] = lbe.fit_transform(data[feature]) + 1\n", " feature_max_idx[feature] = data[feature].max() + 1\n", "\n", - "id_count = data['movie_id'].value_counts() \n", - "mapdict = {t[0]: i for i, t in\n", - " enumerate(sorted([(k, v) for k, v in id_count.to_dict().items()], key=lambda x: x[1], reverse=True))}\n", - "data['movie_id'] = data['movie_id'].map(mapdict)\n", - "feature_max_idx['movie_id'] = data['movie_id'].max() + 1\n", - "\n", "user_profile = data[[\"user_id\", \"gender\", \"age\", \"occupation\", \"zip\"]].drop_duplicates('user_id')\n", "\n", "item_profile = data[[\"movie_id\"]].drop_duplicates('movie_id')\n", @@ -292,7 +219,7 @@ "\n", "user_item_list = data.groupby(\"user_id\")['movie_id'].apply(list)\n", "\n", - "train_set, test_set = gen_data_set(data, negsample)\n", + "train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)\n", "\n", "train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)\n", "test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)\n", @@ -308,10 +235,17 @@ " SparseFeat(\"zip\", feature_max_idx['zip'], 16),\n", " VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,\n", " embedding_name=\"movie_id\"), SEQ_LEN, 'mean', 'hist_len'),\n", + " VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,\n", + " embedding_name=\"genres\"), SEQ_LEN, 'mean', 'hist_len'),\n", " ]\n", "\n", "item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]\n", "\n", + "from collections import Counter\n", + "train_counter = Counter(train_model_input['movie_id'])\n", + "item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)]\n", + "sampler_config = NegativeSampler('frequency',num_sampled=255,item_name=\"movie_id\",item_count=item_count)\n", + "\n", "# 3.Define Model and train\n", "\n", "import tensorflow as tf\n", @@ -320,10 +254,10 @@ "else:\n", " K.set_learning_phase(True)\n", " \n", - "model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=100, user_dnn_hidden_units=(128,64, embedding_dim))\n", - "# model = MIND(user_feature_columns,item_feature_columns,dynamic_k=True,k_max=2,num_sampled=100,user_dnn_hidden_units=(128,64, embedding_dim))\n", + "model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)\n", + "#model = MIND(user_feature_columns,item_feature_columns,dynamic_k=False,k_max=2, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)\n", "\n", - "model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss) # \"binary_crossentropy\")\n", + "model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss)\n", "\n", "history = model.fit(train_model_input, train_label, # train_label,\n", " batch_size=512, epochs=20, verbose=1, validation_split=0.0, 
)\n", @@ -340,8 +274,7 @@ "item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n", "\n", "print(user_embs.shape)\n", - "print(item_embs.shape)\n", - "\n" + "print(item_embs.shape)" ] }, { @@ -362,13 +295,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "j2ZNYNBOOqrN", - "outputId": "442609e8-f94d-42c3-d945-582374a5fa77" + "outputId": "2eec5e82-2d2b-4fe0-9b83-2a74a4dc52ba" }, "outputs": [ { @@ -386,20 +319,20 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6TY1l27iJU8U", - "outputId": "3070ad94-9f84-4b51-d095-18053b84f5ce" + "outputId": "5a8ccdd3-af70-4c48-b859-84c4befddfdd" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "6040it [00:02, 3004.17it/s]" + "6040it [00:02, 2769.01it/s]" ] }, { @@ -407,8 +340,8 @@ "output_type": "stream", "text": [ "\n", - "recall 0.3033112582781457\n", - "hit rate 0.3033112582781457\n" + "recall 0.33708609271523177\n", + "hit rate 0.33708609271523177\n" ] }, { @@ -420,9 +353,7 @@ } ], "source": [ - "\n", - "\n", - "test_true_label = {line[0]:[line[2]] for line in test_set}\n", + "test_true_label = {line[0]:[line[1]] for line in test_set}\n", "\n", "import numpy as np\n", "import faiss\n", @@ -450,24 +381,14 @@ "print(\"recall\", np.mean(s))\n", "print(\"hit rate\", hit / len(test_user_model_input['user_id']))" ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "a97TB0obOrRe" - }, - "outputs": [], - "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], - "name": "colab_MovieLen1M_YoutubeDNN", - "provenance": [], - "toc_visible": true + "name": "colab_MovieLen1M_YoutubeDNN.ipynb", + "provenance": [] }, "gpuClass": "standard", "kernelspec": { diff --git a/examples/preprocess.py b/examples/preprocess.py index 4803749..7639d81 100644 --- a/examples/preprocess.py +++ b/examples/preprocess.py @@ -1,40 +1,49 @@ -import random import numpy as np -from tqdm import tqdm +import random from tensorflow.python.keras.preprocessing.sequence import pad_sequences +from tqdm import tqdm -def gen_data_set(data, negsample=0): +def gen_data_set(data, seq_max_len=50, negsample=0): data.sort_values("timestamp", inplace=True) item_ids = data['movie_id'].unique() - + item_id_genres_map = dict(zip(data['movie_id'].values, data['genres'].values)) train_set = [] test_set = [] for reviewerID, hist in tqdm(data.groupby('user_id')): pos_list = hist['movie_id'].tolist() + genres_list = hist['genres'].tolist() rating_list = hist['rating'].tolist() if negsample > 0: candidate_set = list(set(item_ids) - set(pos_list)) - neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True) + neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True) for i in range(1, len(pos_list)): hist = pos_list[:i] + genres_hist = genres_list[:i] + seq_len = min(i, seq_max_len) if i != len(pos_list) - 1: - train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]),rating_list[i])) + train_set.append(( + reviewerID, pos_list[i], 1, hist[::-1][:seq_len], seq_len, genres_hist[::-1][:seq_len], + genres_list[i], + rating_list[i])) for negi in range(negsample): - train_set.append((reviewerID, hist[::-1], neg_list[i*negsample+negi], 0,len(hist[::-1]))) + train_set.append((reviewerID, neg_list[i * negsample + negi], 0, 
hist[::-1][:seq_len], seq_len, + genres_hist[::-1][:seq_len], item_id_genres_map[neg_list[i * negsample + negi]])) else: - test_set.append((reviewerID, hist[::-1], pos_list[i],1,len(hist[::-1]),rating_list[i])) + test_set.append((reviewerID, pos_list[i], 1, hist[::-1][:seq_len], seq_len, genres_hist[::-1][:seq_len], + genres_list[i], + rating_list[i])) random.shuffle(train_set) random.shuffle(test_set) - print(len(train_set[0]),len(test_set[0])) + print(len(train_set[0]), len(test_set[0])) - return train_set,test_set + return train_set, test_set -def gen_data_set_sdm(data, seq_short_len=5, seq_prefer_len=50): +def gen_data_set_sdm(data, seq_short_max_len=5, seq_prefer_max_len=50): data.sort_values("timestamp", inplace=True) train_set = [] test_set = [] @@ -45,18 +54,20 @@ def gen_data_set_sdm(data, seq_short_len=5, seq_prefer_len=50): for i in range(1, len(pos_list)): hist = pos_list[:i] genres_hist = genres_list[:i] - if i <= seq_short_len and i != len(pos_list) - 1: - train_set.append((reviewerID, hist[::-1], [0]*seq_prefer_len, pos_list[i], 1, len(hist[::-1]), 0, - rating_list[i], genres_hist[::-1], [0]*seq_prefer_len)) - elif i != len(pos_list) - 1: - train_set.append((reviewerID, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1, seq_short_len, - len(hist[::-1])-seq_short_len, rating_list[i], genres_hist[::-1][:seq_short_len], genres_hist[::-1][seq_short_len:])) - elif i <= seq_short_len and i == len(pos_list) - 1: - test_set.append((reviewerID, hist[::-1], [0] * seq_prefer_len, pos_list[i], 1, len(hist[::-1]), 0, - rating_list[i], genres_hist[::-1], [0]*seq_prefer_len)) + seq_short_len = min(i, seq_short_max_len) + seq_prefer_len = min(max(i - seq_short_len, 0), seq_prefer_max_len) + if i != len(pos_list) - 1: + train_set.append( + (reviewerID, pos_list[i], 1, hist[::-1][:seq_short_len][::-1], + hist[::-1][seq_short_len:seq_short_len + seq_prefer_len], seq_short_len, + seq_prefer_len, genres_hist[::-1][:seq_short_len][::-1], + genres_hist[::-1][seq_short_len:seq_short_len + seq_prefer_len], rating_list[i])) else: - test_set.append((reviewerID, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1, seq_short_len, - len(hist[::-1])-seq_short_len, rating_list[i], genres_hist[::-1][:seq_short_len], genres_hist[::-1][seq_short_len:])) + test_set.append( + (reviewerID, pos_list[i], 1, hist[::-1][:seq_short_len][::-1], + hist[::-1][seq_short_len:seq_short_len + seq_prefer_len], seq_short_len, + seq_prefer_len, genres_hist[::-1][:seq_short_len][::-1], + genres_hist[::-1][seq_short_len:seq_short_len + seq_prefer_len], rating_list[i])) random.shuffle(train_set) random.shuffle(test_set) @@ -65,47 +76,56 @@ def gen_data_set_sdm(data, seq_short_len=5, seq_prefer_len=50): return train_set, test_set -def gen_model_input(train_set,user_profile,seq_max_len): +def gen_model_input(train_set, user_profile, seq_max_len): train_uid = np.array([line[0] for line in train_set]) - train_seq = [line[1] for line in train_set] - train_iid = np.array([line[2] for line in train_set]) - train_label = np.array([line[3] for line in train_set]) + train_iid = np.array([line[1] for line in train_set]) + train_label = np.array([line[2] for line in train_set]) + train_seq = [line[3] for line in train_set] train_hist_len = np.array([line[4] for line in train_set]) - + train_seq_genres = np.array([line[5] for line in train_set]) + train_genres = np.array([line[6] for line in train_set]) train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', 
value=0) + train_seq_genres_pad = pad_sequences(train_seq_genres, maxlen=seq_max_len, padding='post', truncating='post', + value=0) train_model_input = {"user_id": train_uid, "movie_id": train_iid, "hist_movie_id": train_seq_pad, - "hist_len": train_hist_len} + "hist_genres": train_seq_genres_pad, + "hist_len": train_hist_len, "genres": train_genres} for key in ["gender", "age", "occupation", "zip"]: train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values return train_model_input, train_label -def gen_model_input_sdm(train_set, user_profile, seq_short_len, seq_prefer_len): +def gen_model_input_sdm(train_set, user_profile, seq_short_max_len, seq_prefer_max_len): train_uid = np.array([line[0] for line in train_set]) - short_train_seq = [line[1] for line in train_set] - prefer_train_seq = [line[2] for line in train_set] - train_iid = np.array([line[3] for line in train_set]) - train_label = np.array([line[4] for line in train_set]) + train_iid = np.array([line[1] for line in train_set]) + train_label = np.array([line[2] for line in train_set]) + short_train_seq = [line[3] for line in train_set] + prefer_train_seq = [line[4] for line in train_set] train_short_len = np.array([line[5] for line in train_set]) train_prefer_len = np.array([line[6] for line in train_set]) - short_train_seq_genres = np.array([line[8] for line in train_set]) - prefer_train_seq_genres = np.array([line[9] for line in train_set]) + short_train_seq_genres = np.array([line[7] for line in train_set]) + prefer_train_seq_genres = np.array([line[8] for line in train_set]) - train_short_item_pad = pad_sequences(short_train_seq, maxlen=seq_short_len, padding='post', truncating='post', - value=0) - train_prefer_item_pad = pad_sequences(prefer_train_seq, maxlen=seq_prefer_len, padding='post', truncating='post', + train_short_item_pad = pad_sequences(short_train_seq, maxlen=seq_short_max_len, padding='post', truncating='post', value=0) - train_short_genres_pad = pad_sequences(short_train_seq_genres, maxlen=seq_short_len, padding='post', truncating='post', - value=0) - train_prefer_genres_pad = pad_sequences(prefer_train_seq_genres, maxlen=seq_prefer_len, padding='post', truncating='post', - value=0) + train_prefer_item_pad = pad_sequences(prefer_train_seq, maxlen=seq_prefer_max_len, padding='post', + truncating='post', + value=0) + train_short_genres_pad = pad_sequences(short_train_seq_genres, maxlen=seq_short_max_len, padding='post', + truncating='post', + value=0) + train_prefer_genres_pad = pad_sequences(prefer_train_seq_genres, maxlen=seq_prefer_max_len, padding='post', + truncating='post', + value=0) train_model_input = {"user_id": train_uid, "movie_id": train_iid, "short_movie_id": train_short_item_pad, - "prefer_movie_id": train_prefer_item_pad, "prefer_sess_length": train_prefer_len, "short_sess_length": - train_short_len, 'short_genres': train_short_genres_pad, 'prefer_genres': train_prefer_genres_pad} + "prefer_movie_id": train_prefer_item_pad, + "prefer_sess_length": train_prefer_len, + "short_sess_length": train_short_len, 'short_genres': train_short_genres_pad, + 'prefer_genres': train_prefer_genres_pad} for key in ["gender", "age", "occupation", "zip"]: train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values diff --git a/examples/run_dssm_inbatchsoftmax.py b/examples/run_dssm_inbatchsoftmax.py new file mode 100644 index 0000000..2b15335 --- /dev/null +++ b/examples/run_dssm_inbatchsoftmax.py @@ -0,0 +1,120 @@ +import pandas as pd +from deepctr.feature_column 
import SparseFeat, VarLenSparseFeat
+from deepmatch.models import *
+from deepmatch.utils import sampledsoftmaxloss, NegativeSampler
+from preprocess import gen_data_set, gen_model_input
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.python.keras.models import Model
+
+if __name__ == "__main__":
+
+    data = pd.read_csv("./movielens_sample.txt")
+    sparse_features = ["movie_id", "user_id",
+                       "gender", "age", "occupation", "zip", "genres"]
+    SEQ_LEN = 50
+    negsample = 10
+
+    # 1. Label Encoding for sparse features, and process sequence features with `gen_data_set` and `gen_model_input`
+
+    feature_max_idx = {}
+    for feature in sparse_features:
+        lbe = LabelEncoder()
+        data[feature] = lbe.fit_transform(data[feature]) + 1
+        feature_max_idx[feature] = data[feature].max() + 1
+
+    user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')
+
+    item_profile = data[["movie_id", "genres"]].drop_duplicates('movie_id')
+
+    user_profile.set_index("user_id", inplace=True)
+
+    user_item_list = data.groupby("user_id")['movie_id'].apply(list)
+
+    train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)
+
+    train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
+    test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)
+
+    # 2.count #unique features for each sparse field and generate feature config for sequence feature
+
+    embedding_dim = 32
+
+    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
+                            SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
+                            SparseFeat("age", feature_max_idx['age'], embedding_dim),
+                            SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
+                            SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
+                            VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
+                                                        embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
+                            VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,
+                                                        embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
+                            ]
+
+    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim),
+                            SparseFeat('genres', feature_max_idx['genres'], embedding_dim)
+                            ]
+
+    from collections import Counter
+
+    train_counter = Counter(train_model_input['movie_id'])
+    item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)]
+    sampler_config = NegativeSampler('inbatch', num_sampled=5, item_name='movie_id', item_count=item_count)
+
+    # 3.Define Model and train
+
+    import tensorflow as tf
+
+    if tf.__version__ >= '2.0.0':
+        tf.compat.v1.disable_eager_execution()
+    else:
+        K.set_learning_phase(True)
+
+    model = DSSM(user_feature_columns, item_feature_columns, loss_type="softmax", sampler_config=sampler_config)
+    # model = FM(user_feature_columns, item_feature_columns, loss_type="softmax", sampler_config=sampler_config)
+
+    model.compile(optimizer='adagrad', loss=sampledsoftmaxloss)
+
+    history = model.fit(train_model_input, train_label,
+                        batch_size=256, epochs=1, verbose=1, validation_split=0.0, )
+
+    # 4.
Generate user features for testing and full item features for retrieval + test_user_model_input = test_model_input + all_item_model_input = {"movie_id": item_profile['movie_id'].values, "genres": item_profile['genres'].values} + + user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) + item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) + + user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12) + item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12) + + print(user_embs.shape) + print(item_embs.shape) + + # 5. [Optional] ANN search by faiss and evaluate the result + + # test_true_label = {line[0]:[line[1]] for line in test_set} + # + # import numpy as np + # import faiss + # from tqdm import tqdm + # from deepmatch.utils import recall_N + # + # index = faiss.IndexFlatIP(embedding_dim) + # # faiss.normalize_L2(item_embs) + # index.add(item_embs) + # # faiss.normalize_L2(user_embs) + # D, I = index.search(user_embs, 50) + # s = [] + # hit = 0 + # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])): + # try: + # pred = [item_profile['movie_id'].values[x] for x in I[i]] + # filter_item = None + # recall_score = recall_N(test_true_label[uid], pred, N=50) + # s.append(recall_score) + # if test_true_label[uid] in pred: + # hit += 1 + # except: + # print(i) + # print("recall", np.mean(s)) + # print("hr", hit / len(test_user_model_input['user_id'])) diff --git a/examples/run_dssm_negsampling.py b/examples/run_dssm_negsampling.py index 2ed6d83..e82efa0 100644 --- a/examples/run_dssm_negsampling.py +++ b/examples/run_dssm_negsampling.py @@ -1,68 +1,78 @@ import pandas as pd from deepctr.feature_column import SparseFeat, VarLenSparseFeat +from deepmatch.models import * from preprocess import gen_data_set, gen_model_input from sklearn.preprocessing import LabelEncoder from tensorflow.python.keras.models import Model -from deepmatch.models import * - if __name__ == "__main__": data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt") sparse_features = ["movie_id", "user_id", - "gender", "age", "occupation", "zip", ] + "gender", "age", "occupation", "zip", "genres"] SEQ_LEN = 50 - negsample = 3 + negsample = 10 # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input` - features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip'] feature_max_idx = {} - for feature in features: + for feature in sparse_features: lbe = LabelEncoder() data[feature] = lbe.fit_transform(data[feature]) + 1 feature_max_idx[feature] = data[feature].max() + 1 user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') - item_profile = data[["movie_id"]].drop_duplicates('movie_id') + item_profile = data[["movie_id", "genres"]].drop_duplicates('movie_id') user_profile.set_index("user_id", inplace=True) user_item_list = data.groupby("user_id")['movie_id'].apply(list) - train_set, test_set = gen_data_set(data, negsample) + train_set, test_set = gen_data_set(data, SEQ_LEN, negsample) train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) # 2.count #unique features for each sparse field and generate feature config for sequence feature - embedding_dim = 8 + embedding_dim = 32 - user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim), - SparseFeat("gender", 
feature_max_idx['gender'], embedding_dim), - SparseFeat("age", feature_max_idx['age'], embedding_dim), - SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim), - SparseFeat("zip", feature_max_idx['zip'], embedding_dim), + user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16), + SparseFeat("gender", feature_max_idx['gender'], 16), + SparseFeat("age", feature_max_idx['age'], 16), + SparseFeat("occupation", feature_max_idx['occupation'], 16), + SparseFeat("zip", feature_max_idx['zip'], 16), VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim, embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'), + VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim, + embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'), ] - item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)] + item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim), + SparseFeat('genres', feature_max_idx['genres'], embedding_dim) + ] # 3.Define Model and train - model = DSSM(user_feature_columns, item_feature_columns) # FM(user_feature_columns,item_feature_columns) + import tensorflow as tf + + if tf.__version__ >= '2.0.0': + tf.compat.v1.disable_eager_execution() + else: + K.set_learning_phase(True) + + model = DSSM(user_feature_columns, item_feature_columns, loss_type="logistic") + # model = FM(user_feature_columns,item_feature_columns) model.compile(optimizer='adagrad', loss="binary_crossentropy") - history = model.fit(train_model_input, train_label, # train_label, + history = model.fit(train_model_input, train_label, batch_size=256, epochs=1, verbose=1, validation_split=0.0, ) # 4. Generate user features for testing and full item features for retrieval test_user_model_input = test_model_input - all_item_model_input = {"movie_id": item_profile['movie_id'].values} + all_item_model_input = {"movie_id": item_profile['movie_id'].values, "genres": item_profile['genres'].values} user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) @@ -73,9 +83,9 @@ print(user_embs.shape) print(item_embs.shape) - # 5. [Optional] ANN search by faiss and evaluate the result + # 5. 
[Optional] ANN search by faiss and evaluate the result - # test_true_label = {line[0]:[line[2]] for line in test_set} + # test_true_label = {line[0]:[line[1]] for line in test_set} # # import numpy as np # import faiss diff --git a/examples/run_ncf.py b/examples/run_ncf.py index 29d0358..beb801f 100644 --- a/examples/run_ncf.py +++ b/examples/run_ncf.py @@ -1,9 +1,8 @@ import pandas as pd +from deepmatch.models import NCF from preprocess import gen_data_set, gen_model_input from sklearn.preprocessing import LabelEncoder -from deepmatch.models import NCF - if __name__ == "__main__": data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt") sparse_features = ["movie_id", "user_id", @@ -28,7 +27,7 @@ user_item_list = data.groupby("user_id")['movie_id'].apply(list) - train_set, test_set = gen_data_set(data, negsample) + train_set, test_set = gen_data_set(data, SEQ_LEN, negsample) train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) diff --git a/examples/run_sdm.py b/examples/run_sdm.py index 902ea17..acf5ccc 100644 --- a/examples/run_sdm.py +++ b/examples/run_sdm.py @@ -1,16 +1,16 @@ import pandas as pd from deepctr.feature_column import SparseFeat, VarLenSparseFeat +from deepmatch.models import SDM +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler from preprocess import gen_data_set_sdm, gen_model_input_sdm from sklearn.preprocessing import LabelEncoder from tensorflow.python.keras import backend as K -from tensorflow.python.keras import optimizers from tensorflow.python.keras.models import Model -from deepmatch.models import SDM -from deepmatch.utils import sampledsoftmaxloss - if __name__ == "__main__": data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt") + data['genres'] = list(map(lambda x: x.split('|')[0], data['genres'].values)) + sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip", "genres"] SEQ_LEN_short = 5 @@ -18,20 +18,13 @@ # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input` - features = ['user_id', 'gender', 'age', 'occupation', 'zip'] feature_max_idx = {} - for feature in features: + for feature in sparse_features: lbe = LabelEncoder() data[feature] = lbe.fit_transform(data[feature]) + 1 feature_max_idx[feature] = data[feature].max() + 1 - id_count = data['movie_id'].value_counts() - mapdict = {t[0]: i for i, t in - enumerate(sorted([(k, v) for k, v in id_count.to_dict().items()], key=lambda x: x[1], reverse=True))} - data['movie_id'] = data['movie_id'].map(mapdict) - feature_max_idx['movie_id'] = data['movie_id'].max() + 1 - - user_profile = data[["user_id", "gender", "age", "occupation", "zip", "genres"]].drop_duplicates('user_id') + user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') item_profile = data[["movie_id"]].drop_duplicates('movie_id') @@ -39,7 +32,7 @@ # # user_item_list = data.groupby("user_id")['movie_id'].apply(list) - train_set, test_set = gen_data_set_sdm(data, seq_short_len=SEQ_LEN_short, seq_prefer_len=SEQ_LEN_prefer) + train_set, test_set = gen_data_set_sdm(data, seq_short_max_len=SEQ_LEN_short, seq_prefer_max_len=SEQ_LEN_prefer) train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer) test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer) @@ -69,18 +62,26 @@ item_feature_columns = 
[SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)] + from collections import Counter + + train_counter = Counter(train_model_input['movie_id']) + item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] + sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='movie_id', item_count=item_count) + K.set_learning_phase(True) import tensorflow as tf if tf.__version__ >= '2.0.0': tf.compat.v1.disable_eager_execution() + else: + K.set_learning_phase(True) # units must be equal to item embedding dim! model = SDM(user_feature_columns, item_feature_columns, history_feature_list=['movie_id', 'genres'], - units=embedding_dim, num_sampled=100, ) + units=embedding_dim, sampler_config=sampler_config) - model.compile(optimizer='adam', loss=sampledsoftmaxloss) # "binary_crossentropy") + model.compile(optimizer='adam', loss=sampledsoftmaxloss) history = model.fit(train_model_input, train_label, # train_label, batch_size=512, epochs=1, verbose=1, validation_split=0.0, ) @@ -100,7 +101,7 @@ print(user_embs.shape) print(item_embs.shape) - # test_true_label = {line[0]: [line[3]] for line in test_set} + # test_true_label = {line[0]: [line[1]] for line in test_set} # # import numpy as np # import faiss diff --git a/examples/run_youtubednn.py b/examples/run_youtubednn.py index b9881ed..5e85385 100644 --- a/examples/run_youtubednn.py +++ b/examples/run_youtubednn.py @@ -1,7 +1,7 @@ import pandas as pd from deepctr.feature_column import SparseFeat, VarLenSparseFeat from deepmatch.models import * -from deepmatch.utils import sampledsoftmaxloss +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler from preprocess import gen_data_set, gen_model_input from sklearn.preprocessing import LabelEncoder from tensorflow.python.keras import backend as K @@ -10,25 +10,20 @@ if __name__ == "__main__": data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt") + data['genres'] = list(map(lambda x: x.split('|')[0], data['genres'].values)) + sparse_features = ["movie_id", "user_id", - "gender", "age", "occupation", "zip", ] + "gender", "age", "occupation", "zip", "genres"] SEQ_LEN = 50 # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input` - features = ['user_id', 'gender', 'age', 'occupation', 'zip'] feature_max_idx = {} - for feature in features: + for feature in sparse_features: lbe = LabelEncoder() data[feature] = lbe.fit_transform(data[feature]) + 1 feature_max_idx[feature] = data[feature].max() + 1 - id_count = data['movie_id'].value_counts() - mapdict = {t[0]: i for i, t in - enumerate(sorted([(k, v) for k, v in id_count.to_dict().items()], key=lambda x: x[1], reverse=True))} - data['movie_id'] = data['movie_id'].map(mapdict) - feature_max_idx['movie_id'] = data['movie_id'].max() + 1 - user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') item_profile = data[["movie_id"]].drop_duplicates('movie_id') @@ -37,7 +32,7 @@ user_item_list = data.groupby("user_id")['movie_id'].apply(list) - train_set, test_set = gen_data_set(data, 0) + train_set, test_set = gen_data_set(data, SEQ_LEN, 0) train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) @@ -53,10 +48,18 @@ SparseFeat("zip", feature_max_idx['zip'], embedding_dim), VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim, embedding_name="movie_id"), SEQ_LEN, 
'mean', 'hist_len'), + VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim, + embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len') ] item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)] + from collections import Counter + + train_counter = Counter(train_model_input['movie_id']) + item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] + sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='movie_id', item_count=item_count) + # 3.Define Model and train import tensorflow as tf @@ -66,10 +69,12 @@ else: K.set_learning_phase(True) - model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, user_dnn_hidden_units=(64, embedding_dim)) - # model = MIND(user_feature_columns,item_feature_columns,dynamic_k=True,k_max=2,num_sampled=5,user_dnn_hidden_units=(64, embedding_dim)) + model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, embedding_dim), + sampler_config=sampler_config) + # model = MIND(user_feature_columns, item_feature_columns, dynamic_k=False, k_max=2, + # user_dnn_hidden_units=(64, embedding_dim), sampler_config=sampler_config) - model.compile(optimizer="adam", loss=sampledsoftmaxloss) # "binary_crossentropy") + model.compile(optimizer="adam", loss=sampledsoftmaxloss) history = model.fit(train_model_input, train_label, # train_label, batch_size=256, epochs=1, verbose=1, validation_split=0.0, ) @@ -90,7 +95,7 @@ # 5. [Optional] ANN search by faiss and evaluate the result - # test_true_label = {line[0]:[line[2]] for line in test_set} + # test_true_label = {line[0]:[line[1]] for line in test_set} # # import numpy as np # import faiss diff --git a/setup.py b/setup.py index d5fe36e..ad3b8fd 100644 --- a/setup.py +++ b/setup.py @@ -4,12 +4,12 @@ long_description = fh.read() REQUIRED_PACKAGES = [ - 'requests', "deepctr==0.9.1" + 'requests', "deepctr~=0.9.1" ] setuptools.setup( name="deepmatch", - version="0.2.1", + version="0.3.0", author="Weichen Shen", author_email="weichenswc@163.com", description="Deep matching model library for recommendations, advertising. 
It's easy to train models and to **export representation vectors** for user and item which can be used for **ANN search**.", @@ -22,8 +22,8 @@ python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*", # '>=3.4', # 3.4.6 install_requires=REQUIRED_PACKAGES, extras_require={ - "cpu": ["tensorflow>=1.4.0,!=1.7.*,!=1.8.*"], - "gpu": ["tensorflow-gpu>=1.4.0,!=1.7.*,!=1.8.*"], + "cpu": ["tensorflow>=1.9.0"], + "gpu": ["tensorflow-gpu>=1.9.0"], }, entry_points={ }, @@ -35,7 +35,6 @@ 'Intended Audience :: Science/Research', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', diff --git a/tests/models/DSSM_test.py b/tests/models/DSSM_test.py index c4210e8..3de1a1e 100644 --- a/tests/models/DSSM_test.py +++ b/tests/models/DSSM_test.py @@ -1,15 +1,40 @@ +import pytest +import tensorflow as tf from deepmatch.models import DSSM -from ..utils import check_model, get_xy_fd +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler +from tensorflow.python.keras import backend as K +from ..utils import check_model, get_xy_fd -def test_DSSM(): +@pytest.mark.parametrize( + 'loss_type,user_dnn_hidden_units,item_dnn_hidden_units', + [('logistic', [32, 4], []), ('softmax', [64, 32], [32]) + ] +) +def test_DSSM(loss_type, user_dnn_hidden_units, item_dnn_hidden_units): model_name = "DSSM" x, y, user_feature_columns, item_feature_columns = get_xy_fd(False) - model = DSSM(user_feature_columns, item_feature_columns, ) - - model.compile('adam', "binary_crossentropy") + if tf.__version__ >= '2.0.0': + tf.compat.v1.disable_eager_execution() + else: + K.set_learning_phase(True) + if loss_type == "logistic": + model = DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=user_dnn_hidden_units, + item_dnn_hidden_units=item_dnn_hidden_units, + loss_type=loss_type) + model.compile('adam', "binary_crossentropy") + else: + from collections import Counter + item_name = 'item' + train_counter = Counter(x[item_name]) + item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] + sampler_config = NegativeSampler(sampler='inbatch', num_sampled=2, item_name=item_name, item_count=item_count) + model = DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=user_dnn_hidden_units, + item_dnn_hidden_units=item_dnn_hidden_units, + loss_type=loss_type, sampler_config=sampler_config) + model.compile('adam', sampledsoftmaxloss) check_model(model, model_name, x, y) diff --git a/tests/models/FM_test.py b/tests/models/FM_test.py index baa224e..e4597a5 100644 --- a/tests/models/FM_test.py +++ b/tests/models/FM_test.py @@ -1,15 +1,37 @@ +import pytest +import tensorflow as tf from deepmatch.models import FM +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler +from tensorflow.python.keras import backend as K + from ..utils import check_model, get_xy_fd -def test_FM(): +@pytest.mark.parametrize( + 'loss_type', + ['logistic', 'softmax' + ] +) +def test_FM(loss_type): model_name = "FM" x, y, user_feature_columns, item_feature_columns = get_xy_fd(False) - model = FM(user_feature_columns, item_feature_columns, ) - - model.compile('adam', "binary_crossentropy") - check_model(model, model_name, x, y,) + if tf.__version__ >= '2.0.0': + tf.compat.v1.disable_eager_execution() + else: + K.set_learning_phase(True) + if loss_type == "logistic": + model = 
FM(user_feature_columns, item_feature_columns, loss_type=loss_type) + model.compile('adam', "binary_crossentropy") + else: + from collections import Counter + item_name = 'item' + train_counter = Counter(x[item_name]) + item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] + sampler_config = NegativeSampler(sampler='inbatch', num_sampled=2, item_name=item_name, item_count=item_count) + model = FM(user_feature_columns, item_feature_columns, loss_type=loss_type, sampler_config=sampler_config) + model.compile('adam', sampledsoftmaxloss) + check_model(model, model_name, x, y) if __name__ == "__main__": diff --git a/tests/models/MIND_test.py b/tests/models/MIND_test.py index ea9c9a1..537c083 100644 --- a/tests/models/MIND_test.py +++ b/tests/models/MIND_test.py @@ -1,7 +1,7 @@ import pytest import tensorflow as tf from deepmatch.models import MIND -from deepmatch.utils import sampledsoftmaxloss +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler from tensorflow.python.keras import backend as K from ..utils import check_model, get_xy_fd @@ -21,8 +21,9 @@ def test_MIND(dynamic_k, p): tf.compat.v1.disable_eager_execution() else: K.set_learning_phase(True) - model = MIND(user_feature_columns, item_feature_columns, num_sampled=2, p=p, dynamic_k=dynamic_k, - user_dnn_hidden_units=(16, 4)) + sampler_config = NegativeSampler(sampler='uniform', num_sampled=2, item_name='item') + model = MIND(user_feature_columns, item_feature_columns, p=p, dynamic_k=dynamic_k, + user_dnn_hidden_units=(16, 4), sampler_config=sampler_config) model.compile('adam', sampledsoftmaxloss) check_model(model, model_name, x, y) diff --git a/tests/models/SDM_test.py b/tests/models/SDM_test.py index 072caf8..e213a8d 100644 --- a/tests/models/SDM_test.py +++ b/tests/models/SDM_test.py @@ -1,12 +1,11 @@ import tensorflow as tf +from deepmatch.models import SDM +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler from tensorflow.python.keras import backend as K -from deepmatch.models import SDM -from deepmatch.utils import sampledsoftmaxloss from ..utils import check_model, get_xy_fd_sdm - def test_SDM(): model_name = "SDM" x, y, user_feature_columns, item_feature_columns, history_feature_list = get_xy_fd_sdm(False) @@ -15,7 +14,10 @@ def test_SDM(): tf.compat.v1.disable_eager_execution() else: K.set_learning_phase(True) - model = SDM(user_feature_columns, item_feature_columns, history_feature_list, units=8) + + sampler_config = NegativeSampler(sampler='uniform', num_sampled=2, item_name='item') + model = SDM(user_feature_columns, item_feature_columns, history_feature_list, units=8, + sampler_config=sampler_config) # model.summary() model.compile('adam', sampledsoftmaxloss) diff --git a/tests/models/YoutubeDNN_test.py b/tests/models/YoutubeDNN_test.py index a014c16..eebe33c 100644 --- a/tests/models/YoutubeDNN_test.py +++ b/tests/models/YoutubeDNN_test.py @@ -1,12 +1,17 @@ +import pytest import tensorflow as tf -from tensorflow.python.keras import backend as K - from deepmatch.models import YoutubeDNN -from deepmatch.utils import sampledsoftmaxloss -from ..utils import check_model, get_xy_fd +from deepmatch.utils import sampledsoftmaxloss, NegativeSampler +from tensorflow.python.keras import backend as K +from tests.utils import check_model, get_xy_fd -def test_YoutubeDNN(): +@pytest.mark.parametrize( + 'sampler', + ['inbatch', 'uniform', 'frequency', 'adaptive', + ] +) +def test_YoutubeDNN(sampler): model_name = "YoutubeDNN" x, y, user_feature_columns, 
item_feature_columns = get_xy_fd(False) @@ -15,8 +20,12 @@ def test_YoutubeDNN(): tf.compat.v1.disable_eager_execution() else: K.set_learning_phase(True) - - model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=2, user_dnn_hidden_units=(16, 4)) + from collections import Counter + train_counter = Counter(x['item']) + item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] + sampler_config = NegativeSampler(sampler, num_sampled=2, item_name='item', item_count=item_count, distortion=1.0) + model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(16, 4), + sampler_config=sampler_config) model.compile('adam', sampledsoftmaxloss) check_model(model, model_name, x, y, check_model_io=True) diff --git a/tests/utils.py b/tests/utils.py index b58e9db..d2d067a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,19 +1,17 @@ from __future__ import absolute_import, division, print_function import inspect +import numpy as np import os import sys - -import numpy as np import tensorflow as tf from deepctr.feature_column import SparseFeat, DenseFeat, VarLenSparseFeat, DEFAULT_GROUP_NAME +from deepmatch.layers import custom_objects from numpy.testing import assert_allclose from tensorflow.python.keras import backend as K from tensorflow.python.keras.layers import Input, Masking from tensorflow.python.keras.models import Model, load_model, save_model -from deepmatch.layers import custom_objects - SAMPLE_SIZE = 8 VOCABULARY_SIZE = 4 @@ -346,10 +344,10 @@ def check_model(model, model_name, x, y, check_model_io=True): user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) - user_embs = user_embedding_model.predict(x, batch_size=2 ** 12) + _ = user_embedding_model.predict(x, batch_size=2 ** 12) # user_embs = user_embs[:, i, :] i in [0,k_max) if MIND print(model_name + " user_emb pass!") - item_embs = item_embedding_model.predict(x, batch_size=2 ** 12) + _ = item_embedding_model.predict(x, batch_size=2 ** 12) print(model_name + " item_emb pass!")
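
Note on step 5 of the updated example scripts: the faiss-based retrieval evaluation is left commented out in the diff above. The snippet below is a minimal sketch of that evaluation, assuming faiss is installed and that the variables produced earlier in those scripts (user_embs, item_embs, test_set, item_profile, test_user_model_input, embedding_dim) are in scope; with the new gen_data_set tuple layout the held-out positive item id is line[1], not line[2].

import numpy as np
import faiss
from tqdm import tqdm
from deepmatch.utils import recall_N

# New tuple layout from gen_data_set: (user_id, item_id, label, hist, hist_len, ...),
# so the single held-out positive item per test user is line[1].
test_true_label = {line[0]: [line[1]] for line in test_set}

# Build an inner-product index over the item embedding matrix and retrieve top-50 items per user.
index = faiss.IndexFlatIP(embedding_dim)
index.add(item_embs)
D, I = index.search(np.ascontiguousarray(user_embs), 50)

s = []
hit = 0
for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    # Map faiss row indices back to movie ids (item_embs rows follow item_profile order).
    pred = [item_profile['movie_id'].values[x] for x in I[i]]
    s.append(recall_N(test_true_label[uid], pred, N=50))
    if test_true_label[uid][0] in pred:
        hit += 1

print("recall", np.mean(s))
print("hit rate", hit / len(test_user_model_input['user_id']))

With one positive item per test user, recall@50 and hit rate coincide, which is why the notebook above prints the same value for both.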