-
Notifications
You must be signed in to change notification settings - Fork 0
/
textprocessor.py
135 lines (122 loc) · 5.16 KB
/
textprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
from collections import Counter
from math import log
from string import punctuation

import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.stem.snowball import DutchStemmer
class Textprocessor:
    """Lexical analysis, stemming and term weighting for a folder of texts.

    Constructing an instance applies the default settings and builds a
    term weight matrix from every ``*.txt`` file in ``documents_folder``;
    the result is stored in ``term_weight_matrix``.  After changing a
    setting, call ``create_term_weight_matrix()`` to rebuild the matrix
    with the new settings.

    Settings (plain instance attributes):
        documents_folder: folder that is scanned for ``*.txt`` files.
        language: ``"english"`` or ``"dutch"``; selects the stopword file
            and the stemmer.
        unwanted_chars: characters that are stripped from every word.
        enable_stopwords: when true, stopwords are kept in the text; when
            false they are filtered out.
        enable_stemmer: when true, words are stemmed.
        enable_lemmatizer: when true (and the language is English), words
            are lemmatized.
    """

    def __init__(self):
        """Apply the default settings and build the term weight matrix."""
        # The defaults live in one place so that construction and a later
        # reset can never drift apart.
        self.reset_default_settings()

    def create_term_weight_matrix(self):
        """Build ``self.term_weight_matrix`` from the current settings.

        Every ``*.txt`` file in ``self.documents_folder`` is read, cleaned
        and counted; files with any other extension are skipped.  The
        matrix has one column per file, labelled with the file name.
        """
        wordcounts = {}
        # Sorted so the column order does not depend on the order in which
        # the file system happens to list the folder.
        for filename in sorted(os.listdir(self.documents_folder)):
            if not filename.endswith(".txt"):
                continue
            path = os.path.join(self.documents_folder, filename)
            words = self.clean_words(self.open_file(path))
            wordcounts[filename] = self.count_words(words)
        self.term_weight_matrix = self.calculate_term_weights(wordcounts)

    def open_file(self, path):
        """Return the whitespace-separated words of the UTF-8 file *path*."""
        with open(path, "r", encoding="utf8") as f:
            return f.read().split()

    def clean_words(self, wordlist):
        """Run the full cleaning pipeline on *wordlist* and return the result.

        The steps run in this order: strip unwanted characters (and
        lowercase), filter stopwords, lemmatize, stem.  Each step checks
        its own setting and is a no-op when it does not apply.
        """
        words = self.remove_unwanted_characters(wordlist)
        words = self.remove_stopwords(words)
        words = self.lemmatize_words(words)
        words = self.stem_words(words)
        return words

    def remove_unwanted_characters(self, wordlist):
        """Strip ``self.unwanted_chars`` from every word and lowercase it.

        Words that become empty after stripping are dropped, so the
        returned list never contains empty strings.
        """
        unwanted = set(self.unwanted_chars)
        cleanlist = []
        for word in wordlist:
            stripped = "".join(ch for ch in word if ch not in unwanted)
            if stripped:
                cleanlist.append(stripped.lower())
        return cleanlist

    def remove_stopwords(self, wordlist):
        """Return *wordlist* without the stopwords of ``self.language``.

        When ``self.enable_stopwords`` is true, stopwords are allowed in
        the text and *wordlist* is returned unchanged.  Otherwise the
        stopwords are read from ``config/stopwords/<language>`` (relative
        to the current working directory) and a new, filtered list is
        returned; the list passed in is not modified.
        """
        if self.enable_stopwords:
            return wordlist
        stopwords_path = os.path.join("config", "stopwords", self.language)
        # A set gives constant-time membership tests for the filter below.
        stopwords = set(self.open_file(stopwords_path))
        return [word for word in wordlist if word not in stopwords]

    def lemmatize_words(self, wordlist):
        """Lemmatize every word with NLTK's ``WordNetLemmatizer``.

        Only English is supported: for any other language, or when
        ``self.enable_lemmatizer`` is false, *wordlist* is returned
        unchanged.
        """
        if self.language != "english" or not self.enable_lemmatizer:
            return wordlist
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in wordlist]

    def stem_words(self, wordlist):
        """Stem every word with the stemmer that matches ``self.language``.

        English uses ``PorterStemmer`` and Dutch uses ``DutchStemmer``.
        When ``self.enable_stemmer`` is false, or no stemmer exists for
        the configured language, *wordlist* is returned unchanged.
        """
        if not self.enable_stemmer:
            return wordlist
        if self.language == "english":
            stemmer = PorterStemmer()
        elif self.language == "dutch":
            stemmer = DutchStemmer()
        else:
            # No stemmer for this language: skip the step instead of
            # failing on an unbound ``stemmer``, mirroring what
            # lemmatize_words does for unsupported languages.
            return wordlist
        return [stemmer.stem(word) for word in wordlist]

    def count_words(self, wordlist):
        """Return a dict that maps each word in *wordlist* to its count."""
        # Converted back to a plain dict so callers keep getting the same
        # type as before.
        return dict(Counter(wordlist))

    def calculate_term_weights(self, wordcounts):
        """Turn per-document word counts into a term weight matrix.

        *wordcounts* maps a document name to a ``{word: count}`` dict.
        The returned DataFrame has one row per word and one column per
        document; each cell holds

            count(word, document) * log2(N / df(word))

        where ``N`` is the number of documents and ``df(word)`` is the
        number of documents that contain the word.  Empty input yields an
        empty DataFrame.
        """
        # Words that are missing from a document show up as NaN; they
        # simply occur zero times there.
        freq_matrix = pd.DataFrame(wordcounts).fillna(0)
        if freq_matrix.empty:
            return freq_matrix
        n_documents = len(freq_matrix.columns)
        # Per word: in how many documents does it occur at least once?
        document_frequency = (freq_matrix != 0).sum(axis=1)
        idf_list = [log(n_documents / df, 2) for df in document_frequency]
        # Multiply every row (word) by that word's idf value.
        return freq_matrix.mul(idf_list, axis=0)

    def reset_default_settings(self):
        """Restore the default settings and rebuild the term weight matrix.

        Defaults: the ``documents`` folder inside the current working
        directory, English, punctuation and digits as unwanted characters,
        stopwords filtered out, stemming off and lemmatizing on.
        """
        self.documents_folder = os.path.join(os.getcwd(), "documents")
        self.language = "english"
        self.unwanted_chars = punctuation + "1234567890"
        self.enable_stopwords = False
        self.enable_stemmer = False
        self.enable_lemmatizer = True
        self.create_term_weight_matrix()