From 6b3b31d0ec658d6f10300f1ef2e96bd5d267138b Mon Sep 17 00:00:00 2001
From: altanner <8190834+altanner@users.noreply.github.com>
Date: Thu, 20 Aug 2020 16:08:03 +0100
Subject: [PATCH] remove redundant files

---
 src/categories_removed_from_LIWC.txt |  22 --
 src/modules/senti_ops.py             | 315 ---------------------------
 2 files changed, 337 deletions(-)
 delete mode 100644 src/categories_removed_from_LIWC.txt
 delete mode 100644 src/modules/senti_ops.py

diff --git a/src/categories_removed_from_LIWC.txt b/src/categories_removed_from_LIWC.txt
deleted file mode 100644
index e727b8c..0000000
--- a/src/categories_removed_from_LIWC.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-kind of    50    54
-to like 20      30      31      91
-(i) like*       20      30      31
-(you) like*     20      30      31
-(we) like*      20      30      31
-(they) like*    20      30      31
-(do) like       30      31
-(don't) like    30      31
-(did) like      30      31
-(didn't) like   30      31
-(will) like     30      31
-(won't) like    30      31
-(does) like     30      31
-(doesn't) like  30      31
-(did not) like  30      31
-(will not) like 30      31
-(do not) like   30      31
-(does not) like 30      31
-(would not) like        30      31
-(should not) like       30      31
-(could not) like        30      31
-(53) like*      30      31
diff --git a/src/modules/senti_ops.py b/src/modules/senti_ops.py
deleted file mode 100644
index fe67f61..0000000
--- a/src/modules/senti_ops.py
+++ /dev/null
@@ -1,315 +0,0 @@
-import pymongo
-import csv
-from collections import namedtuple, Counter
-from tqdm import tqdm
-import sys
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
-from labMTsimple.storyLab import *
-import liwc
-
-
-# Link up the local DB
-db_name = "twitter_db"
-collection_name = "tweets"
-mongodb_port = 27017
-
-try:  # check if mongod is running here
-
-    client = pymongo.MongoClient('localhost', mongodb_port, serverSelectionTimeoutMS = 3000)
-    print("Connected to MongoDB version", client.server_info()["version"], "on port", mongodb_port)
-
-except pymongo.errors.ServerSelectionTimeoutError as e:
-
-    print("MongoDB cannot be reached - is it running?   ", e)
-    exit(1)
-
-db = client[db_name]
-total_records = db[collection_name].estimated_document_count()
-
-
-def main():
-
-    mongo_vader(db)
-    mongo_labMT(db)
-    mongo_liwc(db)
-    # in development :)
-    #mongo_nlp_example(db)
-    #mongo_insert_groundtruth(db)
-    #mongo_groundtruth_delta(db, senti_method)
-
-
-def tweet_or_retweet(db_document_dict):
-
-    """
-    tweet jsons are kind of moronic - if the tweet is a retweet, the full_text field is
-    truncated (ending with single character ellipsis (...)),
-    and the field underneath called 'truncated' says 'false'.
-    I do not know when the 'truncated' field does not say false.
-
-    Anyway, we have to get the *true* full text from the field retweeted_status.full_text
-    in the case that a tweet is a retweet -.-
-
-    tweet_text is a dict created by the pymongo query like
-    db[collection_name].find({}, {"id_str": 1, "full_text": 1, "retweeted_status.full_text": 1})
-    """
-
-    full_text_field = "db_document_dict[\"full_text\"]"
-
-    if "retweeted_status" in db_document_dict:  # if this key exists, it is a retweet
-        full_text_field = "db_document_dict[\"retweeted_status\"][\"full_text\"]"
-
-    return full_text_field
-
-
-def mongo_insert_groundtruth(db):
-
-    """
-    Open up the local MongoDB, and for each record
-    insert values representing groundtruth.
-    These go in new fields, or if the fields already exist
-    they are updated.
-
-    The groundtruth.csv must be in csv format, with two fields
-    user_id and a float. user_id is the twitter id number,
-    and the float for the ground truth is a random number -1 < x < 1
-    """
-
-    print(f"Inserting groundtruth values...")
-
-    # Turn csv into named tuple, for dot notation in pymongo ops
-    with open("groundtruth.csv") as incoming_csv:
-
-        reader = csv.reader(incoming_csv)
-        Data = namedtuple("Data", next(reader))
-        groundtruth_in = [Data(*r) for r in reader]
-
-    # Count users in db and groundtruth for crosschecking
-    total_users_in_db = db[collection_name].distinct("user.id_str")
-    users_with_groundtruth_provided = []
-
-    # Create or update field (epicosm.groundtruth.gt_stat_1) with values
-    for index, user in enumerate(groundtruth_in):
-
-        db[collection_name].update_many({"user.id_str": user.user},
-                              {"$set":
-                              {"epicosm.groundtruth.gt_stat_1": float(user.gt_stat_1)}})
-
-        users_with_groundtruth_provided.append(user.user)
-
-    print(f"OK - Groundtruth appended to {index + 1} users' records.")
-
-    # Cross-checking of groundtruth against users in DB.
-    existing_users_but_no_groundtruth = list(set(total_users_in_db) - set(users_with_groundtruth_provided))
-    existing_groundtruths_but_no_user = list(set(users_with_groundtruth_provided) - set(total_users_in_db))
-
-    # make some log files if there are discrepancies
-    if len(existing_groundtruths_but_no_user) > 0:
-
-        print("Groundtruth was provided for", len(existing_groundtruths_but_no_user), "users not appearing in the DB.",
-              "See groundtruth_but_no_user.log")
-
-        with open("groundtruth_but_no_user.log", "w") as save_file:
-            for user in existing_groundtruths_but_no_user:
-                save_file.write("%s\n" % user)
-
-    if len(existing_users_but_no_groundtruth) > 0:
-
-        print("Groundtruth was not provided for", len(existing_users_but_no_groundtruth), "users appearing in the DB.",
-              "See user_but_no_groundtruth.log")
-
-        with open("user_but_no_groundtruth.log", "w") as save_file:
-            for user in existing_users_but_no_groundtruth:
-                save_file.write("%s\n" % user)
-
-
-def mongo_vader(db):
-
-    """
-    Do Vader (Hutto & Gilbert 2014) analysis on the contents of the DB,
-    appending four fields: epicosm.vader.negative epicosm.vader.neutral
-    epicosm.vader.positive epicosm.vader.compound
-    """
-
-    print(f"Vader sentiment, analysing...")
-
-    # initialise analyser
-    analyser = SentimentIntensityAnalyzer()
-
-    # analyse and insert each vader score for each tweet text
-    with tqdm(total=total_records, file=sys.stdout) as pbar:
-
-        for index, db_document_dict in enumerate(db[collection_name].find({})):
-
-            # decide if it is a tweet or retweet and assign relevant field
-            full_text_field = eval(tweet_or_retweet(db_document_dict))
-
-            vader_negative = analyser.polarity_scores(full_text_field)["neg"]
-            vader_neutral = analyser.polarity_scores(full_text_field)["neu"]
-            vader_positive = analyser.polarity_scores(full_text_field)["pos"]
-            vader_compound = analyser.polarity_scores(full_text_field)["compound"]
-
-            db[collection_name].update_one({"id_str": db_document_dict["id_str"]}, {"$set": {
-                                  "epicosm.vader.negative": vader_negative,
-                                  "epicosm.vader.neutral": vader_neutral,
-                                  "epicosm.vader.positive": vader_positive,
-                                  "epicosm.vader.compound": vader_compound}})
-
-            pbar.update(1)
-
-    print(f"OK - Vader sentiment analysis applied to {index + 1} records.")
-
-
-def mongo_labMT(db):
-
-    """
-    Do labMT (Dodds & Danforth 2011) to contents of DB,
-    appending one field called epicosm.labMT.emotion_valence
-    """
-
-    print(f"labMT sentiment, analysing...")
-
-    lang = 'english'
-    labMT, labMTvector, labMTwordList = emotionFileReader(stopval=0.0, lang=lang, returnVector=True)
-
-    with tqdm(total=total_records, file=sys.stdout) as pbar:
-
-        for index, db_document_dict in enumerate(db[collection_name].find({})):
-
-            # decide if it is a tweet or retweet and assign relevant field
-            full_text_field = eval(tweet_or_retweet(db_document_dict))
-
-            # compute valence score and return frequency vector for generating wordshift
-            valence, frequency_vector = emotion(full_text_field, labMT, shift=True, happsList=labMTvector)
-
-            # assign a stop vector
-            stop_vector = stopper(frequency_vector, labMTvector, labMTwordList, stopVal=1.0)
-
-            # get the emotional valence
-            output_valence = emotionV(stop_vector, labMTvector)
-
-            # insert score into DB
-            db[collection_name].update_one({"id_str": db_document_dict["id_str"]}, {"$set": {
-                                  "epicosm.labMT.emotion_valence": float(format(output_valence, '.4f'))}})
-
-            pbar.update(1)
-
-    print(f"OK - labMT sentiment analysis applied to {index + 1} records.")
-
-
-def mongo_liwc(db):
-
-    """
-    Do LIWC (Pennebaker et al 2015) to contents of DB,
-    appending 78 (?) metric fields to DB.
-
-    Requires an LIWC dictionary, named LIWC.dic, in the run folder.
-
-    Appends fields epicosm.liwc.[category]
-    """
-
-    def tokenize(text):
-
-        """Split each text entry into words (tokens)"""
-
-        for match in re.finditer(r'\w+', text, re.UNICODE):
-            yield match.group(0)
-
-    # Look for an LIWC dictionary
-    if os.path.isfile('./LIWC.dic'):
-        dictionary = "LIWC.dic"
-    else:
-        print(f"Please have your dictionary here, named LIWC.dic")
-        return  # abort LIWC if not dictionary
-
-    print(f"LIWC sentiment, analysing...")
-
-    parse, category_names = liwc.load_token_parser(dictionary)
-
-    with tqdm(total=total_records, file=sys.stdout) as pbar:
-
-        for index, db_document_dict in enumerate(db[collection_name].find({})):
-
-            # decide if it is a tweet or retweet and assign relevant field
-            full_text_field = eval(tweet_or_retweet(db_document_dict))
-
-            word_count = len(re.findall(r'\w+', full_text_field))
-            text_tokens = tokenize(full_text_field)
-            text_counts = Counter(category for token in text_tokens for category in parse(token))
-
-            for count_category in text_counts:  # insert the LIWC values as proportion of word_count
-
-                db[collection_name].update_one({"id_str": db_document_dict["id_str"]},
-                                               {"$set":
-                                               {"epicosm.liwc." + count_category:
-                                               float(format((text_counts[count_category] / word_count),
-                                               '.4f'))}})
-
-            pbar.update(1)
-
-    print(f"OK - LIWC sentiment analysis applied to {index + 1} records.")
-
-
-def mongo_time_of_day(db):
-
-    """Apply a fuzzy time of day field eg early morning/midday/evening etc"""
-
-    pass
-
-
-def mongo_extract_emojis(db):
-
-    """Find emojis used in the post and copy them to an epicosm subfield."""
-
-    pass
-
-
-def mongo_nlp_example(db):
-
-    """
-    This is a trivial placeholder for custom analyses.
-    Outputs the ratio of the letter 'e' to total characters
-    in field epicosm.trivial_nlp.e_ratio
-    """
-
-    print(f"e_ratio, analysing...")
-
-    with tqdm(total=total_records, file=sys.stdout) as pbar:
-
-        for index, db_document_dict in enumerate(db[collection_name].find({})):
-
-            # decide if it is a tweet or retweet and use correct field
-            full_text_field = eval(tweet_or_retweet(db_document_dict))
-
-            count = Counter(full_text_field)
-            db[collection_name].update_one({"id_str": db_document_dict["id_str"]},
-                                           {"$set":
-                                           { "epicosm.trivial_nlp.e_ratio":
-                                           float(format(int(count['e']) / int(len(full_text_field)), '.4f'))}})
-
-            pbar.update(1)
-
-    print(f"OK - e_ratio analysis applied to {index + 1} records.")
-
-
-def mongo_groundtruth_delta(db, candidate_inference):
-
-     """
-     This is a placeholder for ascertaining how well a candidate analysis
-     algorithm is correlating with groundtruth
-     """
-
-     with tqdm(total=total_records, file=sys.stdout) as pbar:
-
-         for index, tweet_text in enumerate(db[collection_name].find({}, {"id_str": 1, interest_field: 1})):
-
-             groundtruth_delta = groundtruthfield - candidate_inference_output_field
-
-             db[collection_name].update_one({"id_str": tweet_text["id_str"]}, {"$set": {
-                 "epicosm." + candidate_inference + ".groundtruth_delta": format(groundtruth_delta, '.4f')
-
-         pbar.update(1)
-
-
-if __name__ == "__main__":
-    main()
-