Skip to content

Commit

Permalink
Retrieving the 5 most similar docs
Browse files Browse the repository at this point in the history
  • Loading branch information
pedrojlazevedo committed Mar 20, 2020
1 parent fa62f7d commit cc578b7
Showing 1 changed file with 16 additions and 10 deletions.
26 changes: 16 additions & 10 deletions doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
import sys

import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if len(sys.argv)-1 == 1:
if len(sys.argv) - 1 == 1:
max_counter = 1000000 # 1 000 000
else:
max_counter = 10000 # 10 000
max_counter = 10000 # 10 000
print("Max Counter not defined!")
print("Set Default Value: " + str(max_counter))

Expand All @@ -21,7 +22,6 @@

counter = 0


train_text = []
tokens = []
for file in files:
Expand All @@ -34,20 +34,26 @@
tokens = gensim.utils.simple_preprocess(text)
train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, [file]))
counter += 1
print(counter)
if counter % 1000 == 0:
print(counter)

# Doc2Vec model: 50-dimensional document vectors, words with a total
# frequency below 20 are ignored, and training runs for 2 epochs.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=20, epochs=2)
# The vocabulary must be built from the tagged documents before training.
model.build_vocab(train_text)

# Train on the same corpus, reusing the document count and epoch setting
# stored on the model.
model.train(train_text, total_examples=model.corpus_count, epochs=model.epochs)

sentence = "Obama was president of United States of America similar to a Portuguese kind called D. Afonso Henriques"
sentence = "Obama was president of United States of America similar to a Portuguese person called D. Afonso Henriques"
test_sentence = gensim.utils.simple_preprocess(sentence)
inferred_vector = model.infer_vector(test_sentence)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
print(sims)

file_content = jsonlines.open(wiki_folder + "/" + sims[0][0])
file_content = file_content.read()
text = file_content['text']
print(text)
# Number of most-similar documents to display.
STOP = 5

# `sims` holds (document tag, similarity) pairs, best match first, so
# slicing keeps exactly the top STOP entries. The previous countdown checked
# the counter only after printing and therefore showed STOP + 1 documents.
for doc, sim in sims[:STOP]:
    # The tag is the wiki file name the document was loaded from; the context
    # manager closes the reader (and its file handle) after each read.
    with jsonlines.open(wiki_folder + "/" + doc) as reader:
        record = reader.read()
    text = record['text']
    print("\n" + doc + " -- " + str(sim) + ": \n" + text)

0 comments on commit cc578b7

Please sign in to comment.