"""Train a Doc2Vec model on the split wiki pages and query it with one sentence.

Contents of ``doc2vec.py`` as introduced by the patch
"Starting Doc2vec generation for every Document"
(commit 8f9565cba2b8e87f35a6e0c209a5953b77ef292c, pedrojlazevedo,
Fri, 20 Mar 2020 00:07:33 +0000).

When run as a script it:

1. lists the files in ``wiki_folder`` and shuffles them,
2. tokenizes the ``'text'`` field of up to ``max_counter`` files and tags each
   token list with its file name,
3. trains a Doc2Vec model on those tagged documents,
4. infers a vector for a sample sentence, prints every document ranked by
   similarity to it, and prints the text of the best match.
"""
import logging
import os
from random import shuffle

import gensim
import jsonlines

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# full text and processed in ['text'] tag
wiki_folder = "../wiki-pages-split"
max_counter = 1000000  # 1 000 000

# How often (in documents) the corpus builder reports progress.
PROGRESS_EVERY = 10000


def read_text(file_name):
    """Return the ``'text'`` field of the first record read from *file_name*.

    *file_name* is relative to ``wiki_folder``. The reader is closed before
    returning, so scanning the whole folder does not accumulate open handles.

    Raises:
        KeyError: if the record has no ``'text'`` key.
    """
    with jsonlines.open(os.path.join(wiki_folder, file_name)) as reader:
        return reader.read()['text']


def build_corpus(files, limit=max_counter):
    """Return at most *limit* ``TaggedDocument`` objects, one per file.

    Each document holds the ``simple_preprocess`` tokens of the file's text and
    is tagged with the file name, so a similarity hit can be mapped back to the
    file it came from.
    """
    train_text = []
    # Slicing enforces the cap exactly and avoids opening a file that would
    # only be discarded once the cap is hit.
    for count, file_name in enumerate(files[:limit], start=1):
        tokens = gensim.utils.simple_preprocess(read_text(file_name))
        train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, [file_name]))
        if count % PROGRESS_EVERY == 0:
            logger.info("tokenized %d documents", count)
    return train_text


def train_model(train_text):
    """Build the vocabulary from *train_text* and train a Doc2Vec model on it."""
    model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=20, epochs=2)
    model.build_vocab(train_text)
    model.train(train_text, total_examples=model.corpus_count, epochs=model.epochs)
    return model


def rank_documents(model, sentence):
    """Return ``(tag, similarity)`` pairs for every trained document.

    The pairs are ordered by similarity to the vector inferred for *sentence*.
    """
    inferred_vector = model.infer_vector(gensim.utils.simple_preprocess(sentence))
    # NOTE(review): gensim 4 renames `docvecs` to `dv`; confirm which gensim
    # version this project pins before switching.
    return model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))


def main():
    """Train on the wiki pages, then print the ranking and the best match."""
    files = os.listdir(wiki_folder)
    shuffle(files)

    train_text = build_corpus(files)
    if not train_text:
        raise SystemExit("no documents found in %s" % wiki_folder)

    model = train_model(train_text)

    # NOTE(review): "kind" looks like a typo for "king"; left untouched because
    # it changes the tokens fed to the model.
    sentence = "Obama was president of United States of America similar to a Portuguese kind called D. Afonso Henriques"
    sims = rank_documents(model, sentence)
    print(sims)

    # Tags are file names, so the top hit can be re-read straight from disk.
    print(read_text(sims[0][0]))


if __name__ == "__main__":
    main()