Creating Vector Representations
pedrojlazevedo committed Mar 24, 2020
1 parent cc578b7 commit 3434d4b
Showing 3 changed files with 275 additions and 254 deletions.
100 changes: 89 additions & 11 deletions doc2vec.py
@@ -3,20 +3,51 @@
from random import shuffle
import gensim
import sys
import spacy
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models.doc2vec import Doc2Vec

fname = "doc2vec.model"

import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

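# spaCy model and stop-word set; the spaCy-based variant of pre_process
# further down is currently commented out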
spacy_nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

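# PTB-style bracket tokens that appear in the wiki parses, marked as stop words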
customize_stop_words = [
    "-LRB-", "-RRB-", "-LSB-", "-RSB-"
]

for w in customize_stop_words:
    spacy_nlp.vocab[w].is_stop = True

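# optional CLI argument caps how many wiki pages are used for training,
# e.g. `python doc2vec.py 50000`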
if len(sys.argv) - 1 == 1:
    max_counter = int(sys.argv[1])
else:
    max_counter = 10000  # 10 000
    print("Max Counter not defined!")
    print("Set Default Value: " + str(max_counter))


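# pre-processing is currently just gensim's stop-word removal; the spaCy
# lemmatisation pipeline is kept commented out (see TODO below)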
def pre_process(doc):
    # doc = spacy_nlp(doc)

    # lemma_tokens = [token.lemma_ for token in doc]
    # doc = ' '.join(map(str, lemma_tokens))
    # doc = spacy_nlp(doc)

    # tokens = [token.text for token in doc if not token.is_stop]

    # text = ' '.join(map(str, tokens))
    text = remove_stopwords(doc)
    return text

# TODO: remove all stop words and lemmatize every token

# full text and processed in ['text'] tag
wiki_folder = "../wiki-pages-split"
wiki_folder = "data/wiki-pages-split"
files = os.listdir(wiki_folder)
shuffle(files)
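# shuffle so the max_counter cap samples a random subset of the wiki pages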

@@ -26,9 +57,24 @@
tokens = []
for file in files:
    file_content = jsonlines.open(wiki_folder + "/" + file)
    doc = file_content.read()['text']
    text = pre_process(doc)

    if counter > max_counter:
        # ensure the documents needed for the FEVER claim tested below are in the corpus
        file_content = jsonlines.open(wiki_folder + "/" + "Telemundo.json")
        doc = file_content.read()['text']
        text = pre_process(doc)
        tokens = gensim.utils.simple_preprocess(text)
        print(tokens)
        train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, ["Telemundo.json"]))

        file_content = jsonlines.open(wiki_folder + "/" + "Hispanic_and_Latino_Americans.json")
        doc = file_content.read()['text']
        text = pre_process(doc)
        tokens = gensim.utils.simple_preprocess(text)
        train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, ["Hispanic_and_Latino_Americans.json"]))

        break
    else:
        tokens = gensim.utils.simple_preprocess(text)
@@ -37,23 +83,55 @@
    if counter % 1000 == 0:
        print(counter)

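# looser min_count and far more epochs than the previous settings
# (min_count=20, epochs=2), so rare entity tokens stay in the vocabulary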
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
# model = Doc2Vec.load(fname)
model.build_vocab(train_text)  # optionally: keep_raw_vocab=True, update=True

model.train(train_text, total_examples=model.corpus_count, epochs=model.epochs)
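# training documents were tagged with their wiki filenames, so
# docvecs.most_similar below returns (filename, similarity) pairs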

sentence = "Obama was president of United States of America similar to a Portuguese person called D. Afonso Henriques"
test_sentence = gensim.utils.simple_preprocess(sentence)
inferred_vector = model.infer_vector(test_sentence)
sentence = "Telemundo is a English-language television network."
text = pre_process(sentence)
tokens = gensim.utils.simple_preprocess(text)
print(tokens)
for token in tokens:
print(token)
inferred_vector = model.infer_vector([token])
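# NOTE: inferred_vector is overwritten on every pass, so the similarity
# query below effectively uses only the last token's vector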
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

STOP = 3
for doc, sim in sims:
    file_content = jsonlines.open(wiki_folder + "/" + doc)
    file_content = file_content.read()
    text = file_content['text']
    print("\n" + doc + " -- " + str(sim) + ": \n")  # + text)
    if STOP == 0:
        break
    else:
        STOP -= 1

for doc, sim in sims:
    if doc != "Hispanic_and_Latino_Americans.json" and doc != "Telemundo.json":
        continue
    print(doc + " -- " + str(sim))
print("\n")

model.save(fname)
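# the saved model can be reloaded later via Doc2Vec.load(fname)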

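# second query: infer one vector for the whole claim instead of per token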
inferred_vector = model.infer_vector(tokens)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

STOP = 3
for doc, sim in sims:
    file_content = jsonlines.open(wiki_folder + "/" + doc)
    file_content = file_content.read()
    text = file_content['text']
    print("\n" + doc + " -- " + str(sim) + ": \n")  # + text)
    if STOP == 0:
        break
    else:
        STOP -= 1

for doc, sim in sims:
    if doc != "Hispanic_and_Latino_Americans.json" and doc != "Telemundo.json":
        continue
    print(doc + " -- " + str(sim))
243 changes: 0 additions & 243 deletions metrics_old.py

This file was deleted.

