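"""Sparse Composite Document Vectors (SCDV).

Builds a document embedding by (1) training FastText word vectors,
(2) soft-clustering the word vectors with a Gaussian mixture model,
(3) scaling each word's per-cluster vector by the word's tf-idf weight,
and (4) summing the weighted word-cluster vectors over the document.
"""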
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import fasttext
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy
from scipy.spatial.distance import cosine
#%%
class SCDV(object):
    def __init__(self, n_components=100, min_count=5, epochs=5):
        self.n_components = n_components
        self.min_count = min_count
        self.epochs = epochs

    def __get_sentences_tokens(self, documents):
        # lower-case each document, split it into sentences, and tokenize
        # each sentence into a list of words
        _data = []
        for doc in documents:
            _doc = doc.lower()
            _sentences = sent_tokenize(_doc)
            for sent in _sentences:
                _data.append(word_tokenize(sent))
        return _data
    def fit(self, documents=None):
        self.documents = documents
        _resumes_words_list = self.__get_sentences_tokens(self.documents)
        # tf-idf model over whole documents, used later for word weights
        self.model_tfidf = TfidfVectorizer()
        self.model_tfidf.fit(self.documents)
        # FastText word vectors; gensim >= 4 uses `epochs` (formerly `iter`)
        self.model_word2vec = fasttext.FastText(sentences=_resumes_words_list,
                                                negative=5,
                                                workers=4,
                                                epochs=self.epochs,
                                                min_count=self.min_count)
        # `wv.vectors` replaces the deprecated `wv.syn0`
        self.word_vectors = self.model_word2vec.wv.vectors
        # soft-cluster the word vectors into n_components topics
        self.model_cluster = GaussianMixture(n_components=self.n_components)
        self.model_cluster.fit(self.word_vectors)
    def get_document_vector(self, document):
        if isinstance(document, str):
            document = word_tokenize(document.lower())
        doc1 = [document]
        # compute word vectors for words in the document
        _doc_topic_wv = []
        for i in doc1[0]:
            try:
                val = self.model_word2vec.wv[i]
            except KeyError:
                # fall back to a zero vector for words FastText cannot embed
                val = numpy.zeros(self.model_word2vec.wv.vector_size)
            _doc_topic_wv.append(val)
        # compute topic probabilities for each word vector
        _doc_topic_probs = self.model_cluster.predict_proba(_doc_topic_wv)
        # repeat each word vector n times (n = number of topics) so it can
        # be scaled by the word's probability of belonging to each topic
        _doc_topic_wv = numpy.array(
            [numpy.repeat(
                [_doc_topic_wv[i]],
                len(_doc_topic_probs[i]),
                axis=0
            )
                for i in range(len(_doc_topic_wv))
            ]
        )
        _doc_topic_probs = numpy.array(_doc_topic_probs)
        _doc_topic_probs = _doc_topic_probs.reshape(
            [
                _doc_topic_probs.shape[0],
                _doc_topic_probs.shape[1], 1
            ]
        )
        _prob_mul_matrix = numpy.multiply(_doc_topic_wv, _doc_topic_probs)
        # get the tf-idf weight of each word in the document
        _indexes_weights_tfidf = self._get_tfidf([" ".join(doc1[0])])
        _indexes_weights_tfidf = _indexes_weights_tfidf.reshape(
            [_indexes_weights_tfidf.shape[0], 1])
        _indexes_weights_tfidf = numpy.repeat(
            _indexes_weights_tfidf, repeats=self.n_components, axis=1)
        _indexes_weights_tfidf = _indexes_weights_tfidf.reshape(
            [_indexes_weights_tfidf.shape[0],
             _indexes_weights_tfidf.shape[1],
             1
             ]
        )
        # multiply the tf-idf of each word with _prob_mul_matrix above
        # to get the tf-idf-weighted word-topic vector matrix
        _prob_mul_matrix = numpy.multiply(
            _prob_mul_matrix, _indexes_weights_tfidf)
        # sum along axis 0 (over all words) to get the final vector,
        # then reshape it into a single column
        _prob_mul_matrix = numpy.sum(_prob_mul_matrix, axis=0)
        _prob_mul_matrix = _prob_mul_matrix.reshape([-1, 1])
        # optionally increase sparsity by zeroing values below the
        # 5th percentile:
        # _prob_mul_matrix[_prob_mul_matrix < numpy.percentile(_prob_mul_matrix, 5)] = 0
        return _prob_mul_matrix
    def _get_tfidf(self, document):
        # look up the tf-idf weight of each token; unknown tokens get 0
        _transformed = self.model_tfidf.transform(document)
        _to_return = []
        for word in word_tokenize(document[0]):
            _index = self.model_tfidf.vocabulary_.get(word)
            if _index is None:
                _to_return.append(0)
            else:
                _val = _transformed[0, _index]
                _to_return.append(_val)
        return numpy.array(_to_return)
    def get_ranking(self, documents, query):
        docs = []
        for i in documents:
            docs.append(self.get_document_vector(i))
        q = []
        for i in query:
            q.append(self.get_document_vector(i))
        # scipy's `cosine` returns the cosine *distance* (1 - similarity),
        # so smaller values mean more similar; the column vectors are
        # flattened because `cosine` expects 1-D input
        dists = []
        for i in q:
            for j in docs:
                dists.append(cosine(j.ravel(), i.ravel()))
        return dists
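#%%
# Minimal usage sketch (illustrative only): the toy corpus and query below
# are assumptions, not data from the original project. NLTK's tokenizers
# need the "punkt" models (`nltk.download("punkt")`) before this will run.
if __name__ == "__main__":
    corpus = [
        "Python developer with experience in machine learning.",
        "Accountant familiar with tax law and auditing.",
        "Data scientist working on natural language processing.",
    ]
    # a small n_components so the GMM can fit the few word vectors a toy
    # corpus produces; real corpora would use the default of 100
    scdv = SCDV(n_components=2, min_count=1, epochs=5)
    scdv.fit(corpus)
    vec = scdv.get_document_vector("machine learning engineer")
    print(vec.shape)  # (n_components * word-vector dimension, 1)
    # rank the corpus against a query; smaller cosine distance = closer
    distances = scdv.get_ranking(corpus, ["natural language processing"])
    print(distances)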