From cc578b76b5b6d58be9ba116629ea8158cf906fc1 Mon Sep 17 00:00:00 2001 From: pedrojlazevedo Date: Fri, 20 Mar 2020 00:21:39 +0000 Subject: [PATCH] Retrieving the 5 most similar docs --- doc2vec.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/doc2vec.py b/doc2vec.py index 7359f1bd..39a77e8b 100644 --- a/doc2vec.py +++ b/doc2vec.py @@ -5,12 +5,13 @@ import sys import logging + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) -if len(sys.argv)-1 == 1: +if len(sys.argv) - 1 == 1: max_counter = 1000000 # 1 000 000 else: - max_counter = 10000 # 10 000 + max_counter = 10000 # 10 000 print("Max Counter not defined!") print("Set Default Value: " + str(max_counter)) @@ -21,7 +22,6 @@ counter = 0 - train_text = [] tokens = [] for file in files: @@ -34,20 +34,26 @@ tokens = gensim.utils.simple_preprocess(text) train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, [file])) counter += 1 - print(counter) + if counter % 1000 == 0: + print(counter) model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=20, epochs=2) model.build_vocab(train_text) model.train(train_text, total_examples=model.corpus_count, epochs=model.epochs) -sentence = "Obama was president of United States of America similar to a Portuguese kind called D. Afonso Henriques" +sentence = "Obama was president of United States of America similar to a Portuguese person called D. Afonso Henriques" test_sentence = gensim.utils.simple_preprocess(sentence) inferred_vector = model.infer_vector(test_sentence) sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs)) -print(sims) -file_content = jsonlines.open(wiki_folder + "/" + sims[0][0]) -file_content = file_content.read() -text = file_content['text'] -print(text) +STOP = 5 +for doc, sim in sims: + file_content = jsonlines.open(wiki_folder + "/" + doc) + file_content = file_content.read() + text = file_content['text'] + print("\n" + doc + " -- " + str(sim) + ": \n" + text) + if STOP == 0: + break + else: + STOP -= 1