Creating Vector Representations
pedrojlazevedo committed Mar 24, 2020
1 parent cc578b7 commit 3434d4b
Showing 3 changed files with 275 additions and 254 deletions.
100 changes: 89 additions & 11 deletions doc2vec.py
@@ -3,20 +3,51 @@
from random import shuffle
import gensim
import sys
import spacy
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models.doc2vec import Doc2Vec

fname = "doc2vec.model"

import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

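# spaCy model and stop-word set; the spaCy-based variant of pre_process
# further down is currently commented out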
spacy_nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

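# PTB-style bracket tokens that appear in the wiki parses, marked as stop words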
customize_stop_words = [
    "-LRB-", "-RRB-", "-LSB-", "-RSB-"
]

for w in customize_stop_words:
    spacy_nlp.vocab[w].is_stop = True

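# optional CLI argument caps how many wiki pages are used for training,
# e.g. `python doc2vec.py 50000`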
if len(sys.argv) - 1 == 1:
    max_counter = int(sys.argv[1])
else:
    max_counter = 10000  # 10 000
    print("Max Counter not defined!")
    print("Set Default Value: " + str(max_counter))


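# pre-processing is currently just gensim's stop-word removal; the spaCy
# lemmatisation pipeline is kept commented out (see TODO below)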
def pre_process(doc):
    # doc = spacy_nlp(doc)

    # lemma_tokens = [token.lemma_ for token in doc]
    # doc = ' '.join(map(str, lemma_tokens))
    # doc = spacy_nlp(doc)

    # tokens = [token.text for token in doc if not token.is_stop]

    # text = ' '.join(map(str, tokens))
    text = remove_stopwords(doc)
    return text

# TODO: remove all stop words and lemmatize every token

# full text and processed in ['text'] tag
wiki_folder = "../wiki-pages-split"
wiki_folder = "data/wiki-pages-split"
files = os.listdir(wiki_folder)
shuffle(files)
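# shuffle so the max_counter cap samples a random subset of the wiki pages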

@@ -26,9 +57,24 @@
tokens = []
for file in files:
    file_content = jsonlines.open(wiki_folder + "/" + file)
    doc = file_content.read()['text']
    text = pre_process(doc)

    if counter > max_counter:
        # ensure the documents needed for the FEVER claim tested below are in the corpus
        file_content = jsonlines.open(wiki_folder + "/" + "Telemundo.json")
        doc = file_content.read()['text']
        text = pre_process(doc)
        tokens = gensim.utils.simple_preprocess(text)
        print(tokens)
        train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, ["Telemundo.json"]))

        file_content = jsonlines.open(wiki_folder + "/" + "Hispanic_and_Latino_Americans.json")
        doc = file_content.read()['text']
        text = pre_process(doc)
        tokens = gensim.utils.simple_preprocess(text)
        train_text.append(gensim.models.doc2vec.TaggedDocument(tokens, ["Hispanic_and_Latino_Americans.json"]))

        break
    else:
        tokens = gensim.utils.simple_preprocess(text)
@@ -37,23 +83,55 @@
    if counter % 1000 == 0:
        print(counter)

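# looser min_count and far more epochs than the previous settings
# (min_count=20, epochs=2), so rare entity tokens stay in the vocabulary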
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
# model = Doc2Vec.load(fname)
model.build_vocab(train_text)  # optionally: keep_raw_vocab=True, update=True

model.train(train_text, total_examples=model.corpus_count, epochs=model.epochs)
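# training documents were tagged with their wiki filenames, so
# docvecs.most_similar below returns (filename, similarity) pairs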

sentence = "Obama was president of United States of America similar to a Portuguese person called D. Afonso Henriques"
test_sentence = gensim.utils.simple_preprocess(sentence)
inferred_vector = model.infer_vector(test_sentence)
sentence = "Telemundo is a English-language television network."
text = pre_process(sentence)
tokens = gensim.utils.simple_preprocess(text)
print(tokens)
for token in tokens:
print(token)
inferred_vector = model.infer_vector([token])
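# NOTE: inferred_vector is overwritten on every pass, so the similarity
# query below effectively uses only the last token's vector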
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

STOP = 3
for doc, sim in sims:
    file_content = jsonlines.open(wiki_folder + "/" + doc)
    file_content = file_content.read()
    text = file_content['text']
    print("\n" + doc + " -- " + str(sim) + ": \n")  # + text)
    if STOP == 0:
        break
    else:
        STOP -= 1

for doc, sim in sims:
    if doc != "Hispanic_and_Latino_Americans.json" and doc != "Telemundo.json":
        continue
    print(doc + " -- " + str(sim))
print("\n")

model.save(fname)
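# the saved model can be reloaded later via Doc2Vec.load(fname)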

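# second query: infer one vector for the whole claim instead of per token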
inferred_vector = model.infer_vector(tokens)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

STOP = 3
for doc, sim in sims:
    file_content = jsonlines.open(wiki_folder + "/" + doc)
    file_content = file_content.read()
    text = file_content['text']
    print("\n" + doc + " -- " + str(sim) + ": \n")  # + text)
    if STOP == 0:
        break
    else:
        STOP -= 1

for doc, sim in sims:
    if doc != "Hispanic_and_Latino_Americans.json" and doc != "Telemundo.json":
        continue
    print(doc + " -- " + str(sim))
243 changes: 0 additions & 243 deletions metrics_old.py

This file was deleted.

