text_processing.py
import re
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer
def get_words(data, stopwords=[]):
    '''
    Preprocesses and tokenises text data and returns a list of words.
    '''
    # Lower case
    #data = str.lower(data)
    # Strip all HTML tags and attributes and HTML character codes, like &nbsp; and &lt;
    data = re.sub(r'<(.|\n)*?>', ' ', data)
    data = re.sub(r'&\w+;', ' ', data)
    # Handle URLs (look for strings starting with http:// or https://)
    #data = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', data)
    # Handle email addresses (look for strings with @ in the middle)
    #data = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', data)
    # Handle $ sign
    data = re.sub(r'[$]+', ' dollar ', data)
    # Strip out weird '3D' artefacts
    data = re.sub('3D', ' ', data)
    # Replace multiple underscores with a single one
    data = re.sub('_+', '_', data)
    # Remove '=\n' sequences before tokenizing, since these sometimes occur
    # within words to indicate, e.g., line-wrapping
    data = data.replace('=\n', '')
    # Tokenize data
    data_words = wordpunct_tokenize(data)
    # Get rid of stopwords
    #data_words = [w for w in data_words if w not in stopwords]
    # Remove any non-alphanumeric characters
    data_words = [re.sub('[^a-zA-Z0-9]', '', word) for word in data_words]
    # Get rid of punctuation tokens, numbers and single letters
    data_words = [w for w in data_words if re.search('[a-zA-Z]', w) and len(w) > 1]
    # Stemming of words
    #data_words = [PorterStemmer().stem(word) for word in data_words]
    return data_words
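

# A minimal usage sketch (not part of the original module): it assumes NLTK and its
# tokenizer data are installed, and simply exercises get_words() on a small HTML-like
# snippet to show the preprocessing steps above in action.
if __name__ == '__main__':
    sample = '<p>Visit our store &amp; save $$$ on your next_purchase!</p>'
    print(get_words(sample))
    # Expected output (roughly):
    # ['Visit', 'our', 'store', 'save', 'dollar', 'on', 'your', 'nextpurchase']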