Skip to content

Commit

Permalink
Starting Doc2vec generation for every Document
Browse files Browse the repository at this point in the history
  • Loading branch information
pedrojlazevedo committed Mar 20, 2020
1 parent 09a1fd8 commit 8f9565c
Showing 1 changed file with 44 additions and 0 deletions.
44 changes: 44 additions & 0 deletions doc2vec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os
import jsonlines
from random import shuffle
import gensim

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Build the Doc2Vec training corpus from the split wiki dump.
# Per the original note, the full, processed article text lives under the
# 'text' key of the record read from each file.
wiki_folder = "../wiki-pages-split"
files = os.listdir(wiki_folder)
shuffle(files)  # randomise which files survive the cap below

counter = 0
max_counter = 1000000  # 1 000 000
train_text = []
# Slice up front so that no file beyond the cap is opened or parsed, and so
# that at most `max_counter` documents are collected (the previous
# `counter > max_counter` check let max_counter + 1 documents through).
for file_name in files[:max_counter]:
    # The context manager closes the reader; without it every iteration
    # leaked an open file handle.
    with jsonlines.open(os.path.join(wiki_folder, file_name)) as reader:
        text = reader.read()['text']  # only the first record of the file is used
    tokens = gensim.utils.simple_preprocess(text)
    # Tag each document with its file name so a similarity hit can be mapped
    # straight back to the file on disk.
    train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, [file_name]))
    counter += 1
print(counter)  # total number of documents collected

# Create the Doc2Vec model, build its vocabulary from the tagged corpus and
# train it for the number of epochs configured on the model itself.
doc2vec_params = {"vector_size": 50, "min_count": 20, "epochs": 2}
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)
model.build_vocab(train_text)
model.train(
    train_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Sanity check: infer a vector for an unseen sentence, rank every trained
# document by similarity to it, and print the text of the best match.
# NOTE(review): "kind" below is probably a typo for "king"; left unchanged
# because editing the query changes the inferred vector -- confirm intent.
sentence = "Obama was president of United States of America similar to a Portuguese kind called D. Afonso Henriques"
test_sentence = gensim.utils.simple_preprocess(sentence)
inferred_vector = model.infer_vector(test_sentence)
# topn=len(model.docvecs) asks for the full ranking rather than a top-N cut.
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
print(sims)

# sims[0][0] is the tag of the closest document; it is used here as a file
# name inside wiki_folder.
# The context manager closes the reader, which was previously left open.
with jsonlines.open(wiki_folder + "/" + sims[0][0]) as reader:
    file_content = reader.read()
text = file_content['text']
print(text)

0 comments on commit 8f9565c

Please sign in to comment.