-
Notifications
You must be signed in to change notification settings - Fork 33
/
twokenize.py
299 lines (247 loc) · 12.7 KB
/
twokenize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
# -*- coding: utf-8 -*-
"""
Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
This tokenizer code has gone through a long history:
(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
TweetMotif: Exploratory Search and Topic Summarization for Twitter.
Brendan O'Connor, Michel Krieger, and David Ahn.
ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
(2b) Jason Baldridge and David Snyder ported it to Scala
(3) Brendan bugfixed the Scala port and merged with POS-specific changes
for the CMU ARK Twitter POS Tagger
(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)
Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP
There have been at least 2 other Java ports, but they are not in the lineage for the code here.
Ported to Python by Myle Ott <[email protected]>.
"""
from __future__ import print_function
import operator
import re
import HTMLParser
def regex_or(*items):
return '(?:' + '|'.join(items) + ')'
Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
punctChars = r"['\"“”‘’.?!…,:;]"
#punctSeq = punctChars+"+" #'anthem'. => ' anthem '.
punctSeq = r"['\"“”‘’]+|[.?!,…]+|[:;]+" #'anthem'. => ' anthem ' .
entity = r"&(?:amp|lt|gt|quot);"
# URLs
# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
# If you actually empirically test it the results are bad.
# Please see https://github.com/brendano/ark-tweet-nlp/pull/9
urlStart1 = r"(?:https?://|\bwww\.)"
commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
ccTLDs = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
urlEnd = r"(?:\.\.+|[<>]|\s|$)"
url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
# Numeric
timeLike = r"\d+(?::\d+){1,2}"
#numNum = r"\d+\.\d+"
numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
numComb = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?".encode('utf-8')
# Abbreviations
boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
aa1 = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
aa2 = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
separators = "(?:--+|―|—|~|–|=)"
decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)".encode('utf-8')
thingsThatSplitWords = r"[^\s\.,?\"]"
embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"
# Emoticons
# myleott: in Python the (?iu) flags affect the whole expression
#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
normalEyes = "[:=]" # 8 and x are eyes but cause problems
wink = "[;]"
noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
happyMouths = r"[D\)\]\}]+"
sadMouths = r"[\(\[\{]+"
tongue = "[pPd3]+"
otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned
# mouth repetition examples:
# @aliciakeys Put it in a love song :-))
# @hellocalyclops =))=))=)) Oh well
# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfCenter = r"(?:[\.]|[_-]+)"
bfRight = r"\2"
s3 = r"(?:--['\"])"
s4 = r"(?:<|<|>|>)[\._-]+(?:<|<|>|>)"
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
emoticon = regex_or(
# Standard version :) :( :] :D :P
"(?:>|>)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),
# reversed version (: D: use positive lookbehind to remove "(word):"
# because eyes on the right side is more ambiguous with the standard usage of : ;
regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|<)?",
#inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
eastEmote.replace("2", "1", 1), basicface,
# iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
# TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
# myleott: o.O and O.o are two of the biggest sources of differences
# between this and the Java version. One little hack won't hurt...
oOEmote
)
Hearts = "(?:<+/?3+)+" #the other hearts are in decorations
Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))
# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
# "hello (#hashtag)" ==> "hello (#hashtag )" WRONG
# "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT
# "hello (@person)" ==> "hello (@person )" WRONG
# "hello (@person)" ==> "hello ( @person )" RIGHT
# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
# has poor content-symbol detection.
# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
# If you want good hashtag identification, use a different regex.
Hashtag = "#[a-zA-Z0-9_]+" #optional: lookbehind for \b
#optional: lookbehind for \b, max length 15
AtMention = "[@@][a-zA-Z0-9_]+"
# I was worried this would conflict with at-mentions
# but seems ok in sample of 5800: 7 changes all email fixes
# http://www.regular-expressions.info/email.html
Bound = r"(?:\W|^|$)"
Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"
# We will be tokenizing using these regexps as delimiters
# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
Protected = re.compile(
unicode(regex_or(
Hearts,
url,
Email,
timeLike,
#numNum,
numberWithCommas,
numComb,
emoticon,
Arrows,
entity,
punctSeq,
arbitraryAbbrev,
separators,
decorations,
embeddedApostrophe,
Hashtag,
AtMention
).decode('utf-8')), re.UNICODE)
# Edge punctuation
# Want: 'foo' => ' foo '
# While also: don't => don't
# the first is considered "edge punctuation".
# the second is word-internal punctuation -- don't want to mess with it.
# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
# I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.
# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
edgePunct = "[" + edgePunctChars + "]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
def splitEdgePunct(input):
input = EdgePunctLeft.sub(r"\1\2 \3", input)
input = EdgePunctRight.sub(r"\1 \2\3", input)
return input
# The main work of tokenizing a tweet.
def simpleTokenize(text):
# Do the no-brainers first
splitPunctText = splitEdgePunct(text)
textLength = len(splitPunctText)
# BTO: the logic here got quite convoluted via the Scala porting detour
# It would be good to switch back to a nice simple procedural style like in the Python version
# ... Scala is such a pain. Never again.
# Find the matches for subsequences that should be protected,
# e.g. URLs, 1.0, U.N.K.L.E., 12:53
bads = []
badSpans = []
for match in Protected.finditer(splitPunctText):
# The spans of the "bads" should not be split.
if (match.start() != match.end()): #unnecessary?
bads.append( [splitPunctText[match.start():match.end()]] )
badSpans.append( (match.start(), match.end()) )
# Create a list of indices to create the "goods", which can be
# split. We are taking "bad" spans like
# List((2,5), (8,10))
# to create
# List(0, 2, 5, 8, 10, 12)
# where, e.g., "12" here would be the textLength
# has an even length and no indices are the same
indices = [0]
for (first, second) in badSpans:
indices.append(first)
indices.append(second)
indices.append(textLength)
# Group the indices and map them to their respective portion of the string
splitGoods = []
for i in range(0, len(indices), 2):
goodstr = splitPunctText[indices[i]:indices[i+1]]
splitstr = goodstr.strip().split(" ")
splitGoods.append(splitstr)
# Reinterpolate the 'good' and 'bad' Lists, ensuring that
# additonal tokens from last good item get included
zippedStr = []
for i in range(len(bads)):
zippedStr = addAllnonempty(zippedStr, splitGoods[i])
zippedStr = addAllnonempty(zippedStr, bads[i])
zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])
# BTO: our POS tagger wants "ur" and "you're" to both be one token.
# Uncomment to get "you 're"
#splitStr = []
#for tok in zippedStr:
# splitStr.extend(splitToken(tok))
#zippedStr = splitStr
return zippedStr
def addAllnonempty(master, smaller):
for s in smaller:
strim = s.strip()
if (len(strim) > 0):
master.append(strim)
return master
# "foo bar " => "foo bar"
def squeezeWhitespace(input):
return Whitespace.sub(" ", input).strip()
# Final pass tokenization based on special patterns
def splitToken(token):
m = Contractions.search(token)
if m:
return [m.group(1), m.group(2)]
return [token]
# Assume 'text' has no HTML escaping.
def tokenize(text):
return simpleTokenize(squeezeWhitespace(text))
# Twitter text comes HTML-escaped, so unescape it.
# We also first unescape &'s, in case the text has been buggily double-escaped.
def normalizeTextForTagger(text):
text = text.replace("&", "&")
text = HTMLParser.HTMLParser().unescape(text)
return text
# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
#
# This function normalizes the input text BEFORE calling the tokenizer.
# So the tokens you get back may not exactly correspond to
# substrings of the original text.
def tokenizeRawTweetText(text):
tokens = tokenize(normalizeTextForTagger(text))
return tokens