# utils_text_processing.R
# forked from EPINetz/EPINetz-Policy-Parser
drop_quantile <- function(data, # data
tokens, # column with tokens
quantile = 0.1, # quantile to be dropped
ignore_case = TRUE, # ignore casing?
group, # grouping column, e.g. type of word, for report statistics
verbose = TRUE # report statistics
)
{
require(dplyr)
require(stringr)
require(scales)
require(stats)
if (ignore_case == TRUE) {
data <- data %>%
dplyr::mutate(
orig_tokens = !!as.name(tokens), # preserve original tokens
{{tokens}} := stringr::str_to_lower(!!as.name(tokens))) # convert to lower case to ignore case
}
# add count for tokens
res <- data %>%
dplyr::group_by(!!as.name(tokens)) %>%
dplyr::mutate(entity_count = dplyr::n()) %>%
dplyr::ungroup()
# save min and max counts for verbose output
if (verbose == TRUE) {
range <- dplyr::tibble(min = min(res$entity_count),
max = max(res$entity_count))
}
# calculate quantile for filtering
threshold <- res %>%
dplyr::distinct(!!as.name(tokens), .keep_all = T) %>%
dplyr::pull(entity_count) %>%
stats::quantile(probs = quantile) %>%
.[[1]]
# drop quantile
res <- res %>%
dplyr::filter(entity_count > threshold)
if (verbose == TRUE) {
cat(paste(scales::percent(quantile), "Quantile:", threshold, "\n"))
cat(paste("Count Range:", range$min, "-", range$max, "\n"))
original_count <- data %>% # save unprocessed count statistics
dplyr::distinct(!!as.name(tokens), !!as.name(group)) %>%
dplyr::summarise(distinct_unprocessed = n(), .by = {{group}})
res %>%
dplyr::distinct(!!as.name(tokens),!!as.name(group)) %>%
dplyr::group_by(!!as.name(group)) %>%
dplyr::summarise(distinct_processed = n()) %>%
dplyr::left_join(original_count, by = group) %>%
dplyr::arrange(!!as.name(group)) %>%
dplyr::add_row(
{{group}} := "total",
distinct_unprocessed = sum(.$distinct_unprocessed),
distinct_processed = sum(.$distinct_processed)
) %>%
dplyr::mutate(
dropped = distinct_unprocessed - distinct_processed,
reduction = scales::percent(1 - distinct_processed / distinct_unprocessed)
) %>%
print()
}
if (ignore_case == TRUE) {
res <- res %>% dplyr::mutate({{tokens}} := orig_tokens) %>% # restore original tokens
dplyr::select(!orig_tokens)
}
return(res)
}
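# Example usage (illustrative sketch, not part of the pipeline): assumes a
# hypothetical tokens data frame with a "lemma" column holding the tokens and
# a "tag" column holding POS tags; drops the lowest 10% count quantile.
# tokens_df <- dplyr::tibble(
#   lemma = c("europa", "Europa", "steuer", "haus", "haus", "un"),
#   tag = c("NE", "NE", "NN", "NN", "NN", "NE"))
# drop_quantile(tokens_df, tokens = "lemma", quantile = 0.1,
#               ignore_case = TRUE, group = "tag", verbose = TRUE)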
filter_tokens <- function(tokens,
tokens_col = "lemma", # which column should be used for token filtering. Usually "token" or "lemma"
tags = c("NN", "NE"), # tags of a tokens object to keep. Expects a "tag" column. NULL to skip
min_str_length = 2, # minimum string length to keep
tolower = TRUE, # should tokens be set to lower? Ignores URLs and all-caps acronyms. Note that this is executed before stopword filtering, which is case-sensitive
stopword_languages = c("en", "de", "fr"), # stopword dictionaries to be removed. NULL to skip
stopword_dictionary = "snowball", # stopword dictionary to be used. See stopwords::stopwords()
additional_stopwords = c("amp", "&", "RT", "rt", "--", "---"), # provide additional stopwords to drop here
replies = NULL, # should replies be considered for the random walks? Expects a column named is_reply. NULL to skip
keep_mentions = TRUE, # keep @-mentions? FALSE drops all mentions. NULL to skip this check
keep_urls = TRUE # keep URLs? FALSE drops all URLs. NULL to skip this check
) {
require(dplyr)
require(stopwords)
require(stringr)
require(purrr)
# some checks
if (!is.null(tags) & !("tag" %in% colnames(tokens))){
stop("Missing 'tag' column for tag filtering")
}
if (!is.null(replies) & !("is_reply" %in% colnames(tokens))){
stop("Missing 'is_reply' column for reply filtering")
}
if (any(!(stopword_languages %in% stopwords::stopwords_getlanguages(stopword_dictionary)))) {
stop(
paste0("Language '",
paste(stopword_languages[!stopword_languages %in%
stopwords::stopwords_getlanguages(stopword_dictionary)],
collapse = "', '"),
"' not provided in stopword dictionary '",
stopword_dictionary, "'.")
)
}
# Filtering Sequences
if (!is.null(tags)) {
tokens <- tokens %>%
dplyr::filter(tag %in% tags)
}
tokens <- tokens %>%
dplyr::filter(stringr::str_length(!!as.name(tokens_col)) >= min_str_length)
if (tolower) {
tokens <- tokens %>%
dplyr::mutate(!!as.name(tokens_col) := # lower case -
dplyr::case_when(
stringr::str_detect(!!as.name(tokens_col), "http") # except for URLs (so they don't break) ...
~ !!as.name(tokens_col),
!!as.name(tokens_col) == toupper(!!as.name(tokens_col)) # ... and acronyms (all caps), e.g. "UN", "IT" etc.
~ !!as.name(tokens_col),
.default = tolower(!!as.name(tokens_col))))
}
if (!is.null(stopword_languages)) {
stopwords <- stopword_languages %>% # combine stopwords dicts
purrr::map(\(lang) stopwords::stopwords(language = lang,
source = stopword_dictionary)) %>%
unlist()
tokens <- tokens %>%
dplyr::filter(!(!!as.name(tokens_col) %in% stopwords))
}
if (!is.null(additional_stopwords)) {
tokens <- tokens %>%
dplyr::filter(!(!!as.name(tokens_col) %in% additional_stopwords))
}
if (!is.null(replies)) {
tokens <- tokens %>%
dplyr::filter(is_reply == replies | is_reply == FALSE) # filter for reply condition (TRUE includes replies, FALSE does not)
}
if (!is.null(keep_mentions) && keep_mentions == FALSE) {
tokens <- tokens %>%
dplyr::filter(!stringr::str_detect(!!as.name(tokens_col), "@")) # drop all lemmas containing "@" - that is, all mentions
}
if (!is.null(keep_urls) && keep_urls == FALSE) {
tokens <- tokens %>%
dplyr::filter(!stringr::str_detect(!!as.name(tokens_col), "http")) # drop all URLs
}
return(tokens)
}
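# Example usage (illustrative sketch): assumes a hypothetical spaCy-style
# tokens data frame with "lemma" and "tag" columns; keeps nouns and named
# entities, lowercases them, and drops stopwords, short strings and mentions.
# tokens_df <- dplyr::tibble(
#   lemma = c("Europa", "und", "https://example.org", "@someuser", "UN", "Steuer"),
#   tag = c("NE", "KON", "NE", "NE", "NE", "NN"))
# filter_tokens(tokens_df, tokens_col = "lemma", tags = c("NN", "NE"),
#               min_str_length = 2, stopword_languages = c("en", "de"),
#               keep_mentions = FALSE, keep_urls = TRUE)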
get_latest_tokens_file <- function(path,
pattern ="tokens.csv.tar.gz", # regular updates of the tokens file will always be named "tokens.csv.tar.gz"
fallback = "tokens_init") { # fallback pattern to look for, with only the latest returned
# if there are multiple matches according to the pattern "tokens_init_[date]"
require(stringr)
require(dplyr)
files <- list.files(path)
if (pattern %in% files) { # check for latest tokens file and load
latest_tokens <- pattern
} else { # else, fall back to initial tokenization
latest_tokens <- dplyr::tibble(files = files) %>%
dplyr::filter(stringr::str_detect(files, fallback)) %>%
dplyr::arrange(dplyr::desc(files)) %>% dplyr::slice_head(n = 1) %>%
dplyr::pull(files) # pull the filename as a character string
}
return(file.path(path, latest_tokens))
}
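# Example usage (illustrative sketch): assumes a hypothetical directory
# "data/tokens" containing either "tokens.csv.tar.gz" or dated fallback files
# such as "tokens_init_2023-01-01.csv.tar.gz".
# latest_file <- get_latest_tokens_file("data/tokens")
# tokens <- vroom::vroom(latest_file)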
read_timelimited_data <- function(file, # this can handle multiple files, and will pick only the relevant files (if applicable)
..., # arguments to pass to vroom()
filter_var = "_source.created_at", # filtering var as datetime
starting_point, # the starting point, as "YYYY-MM-DD"
timeframe = lubridate::weeks(1), # the timeframe before or after the starting point, as a lubridate period
before_after = c("before", "after"),
check_timeframe = TRUE,
add_file_time = lubridate::years(1), # time to add to the last year in the filename when checking whether the data fits the specified timeframe.
# E.g., when adding years(1), it is assumed the file "data_news_2017-2018" contains data from 2017-01-01 to 2019-01-01
verbose = TRUE # verbosity of warnings
) {
require(rlang)
require(dplyr)
require(lubridate)
require(vroom)
require(purrr)
require(data.table)
require(stringr)
before_after <- rlang::arg_match(before_after) # validate and pick the before/after option
starting_point <- lubridate::ymd(starting_point) # make sure it's a date format
if (length(file) > 1 & check_timeframe) {
# if more than one file is provided, check for each file if the data is within the specified timeframe
# this is helpful for automatically loading data distributed over several files without specifying the file each time and without loading unnecessarily large datasets
data <- file %>%
purrr::map(\(dat)
{
years <- dat %>% stringr::str_extract_all("\\d{4}") %>% .[[1]] # get years in file name
if (length(years) == 0) { # if no years in filename, load regardless and give warning
if (verbose) {
warning(paste0("No year indicators found in ", dat,
". Loading the dataset regardless of necessity."))
}
loaded_data <- vroom::vroom(dat, ...)
}
if (length(years) == 1) { # for one year in a filename, check if the interval matches the specified timeframe
if (before_after == "before") {
if (lubridate::parse_date_time(years[1], # data in the interval of the timeframe?
orders = "y") %within% lubridate::interval(
starting_point - timeframe,
starting_point)
) {
loaded_data <- vroom::vroom(dat, ...)
}
}
if (before_after == "after") {
if (lubridate::parse_date_time(years[1], # data in the interval of the timeframe?
orders = "y") %within% lubridate::interval(
starting_point,
starting_point + timeframe)
) {
loaded_data <- vroom::vroom(dat, ...)
}
}
}
if (length(years) == 2) { # for two years in a filename, check if the interval matches the specified timeframe
interval <-
lubridate::interval(
lubridate::parse_date_time(years[1],
orders = "y"),
lubridate::parse_date_time(years[2],
orders = "y") +
add_file_time
)
if (before_after == "before") {
if (lubridate::interval(starting_point - timeframe,
starting_point) %within% interval) { # period in the interval of the data?
loaded_data <- vroom::vroom(dat, ...)
}
}
if (before_after == "after") {
if (lubridate::interval(starting_point,
starting_point + timeframe) %within% interval) { # period in the interval of the data?
loaded_data <- vroom::vroom(dat, ...)
}
}
}
if (length(years) > 2) { # for more than two years in filename, load regardless and give warning
if (verbose) {
warning(paste0("More than two year indicators found in ", dat,
". Loading the dataset regardless of necessity."))
}
loaded_data <- vroom::vroom(dat, ...)
}
if (exists("loaded_data")){ # return the loaded data only if it was loaded
return(loaded_data)
}
}) %>% purrr::compact() %>% # drops empty list elements for cases where no data was loaded
data.table::rbindlist(fill = TRUE)
if (length(data) == 0) {
stop("No fitting data found for the specified timeframe. Either provide different files or a different timeframe, or skip this check with check_timeframe = FALSE\n")
}
} else { # if only one file is provided, it is loaded directly
data <- vroom::vroom(file, ...)
}
# filter the timeframe within the loaded data
if (before_after == "before") {
result <- data %>%
dplyr::filter(!!as.name(filter_var) <= starting_point &
!!as.name(filter_var) >= (starting_point - timeframe))}
if (before_after == "after") {
result <- data %>%
dplyr::filter(!!as.name(filter_var) >= starting_point &
!!as.name(filter_var) <= (starting_point + timeframe))}
gc(verbose = FALSE)
return(result)
}
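# Example usage (illustrative sketch): assumes hypothetical yearly dumps whose
# filenames carry year indicators and which contain a "_source.created_at"
# datetime column; only the files overlapping the requested window are read.
# docs <- read_timelimited_data(
#   file = c("data/tweets_2021.csv", "data/tweets_2022.csv"),
#   filter_var = "_source.created_at",
#   starting_point = "2022-01-15",
#   timeframe = lubridate::weeks(4),
#   before_after = "before")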
split_timeframes <- function(data, # the dataframe to read in
datetime_var, # the name of the datetime var to use for splitting
timeframe = "weeks", # the timeframe. can be any lubridate period. See lubridate::ceiling_date() for details
before_after = c("before", "after") # split before or after the timeframe's endpoint
){
require(rlang)
require(dplyr)
require(lubridate)
require(data.table)
before_after <- rlang::arg_match(before_after) # validate and pick the before/after option
if (before_after == "before") {
out <- data %>%
dplyr::mutate(period = lubridate::ceiling_date(!!as.name(datetime_var), unit = timeframe)) # end of the week
}
if (before_after == "after") {
out <- data %>%
dplyr::mutate(period = lubridate::floor_date(!!as.name(datetime_var), unit = timeframe)) # beginning of the week
}
out <- out %>%
data.table::as.data.table() %>%
split(by = "period")
return(out)
}
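# Example usage (illustrative sketch, hypothetical data): splits documents into
# a list of data.tables, one per weekly period.
# docs <- dplyr::tibble(
#   created_at = lubridate::ymd_hms(c("2022-01-03 10:00:00", "2022-01-12 18:30:00")),
#   text = c("first doc", "second doc"))
# split_timeframes(docs, datetime_var = "created_at",
#                  timeframe = "weeks", before_after = "before")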