# utils_text_processing.R
# forked from EPINetz/EPINetz-Policy-Parser
drop_quantile <- function(data, # data
tokens, # column with tokens
quantile = 0.1, # quantile to be dropped
ignore_case = TRUE, # ignore casing?
group, # grouping column, e.g. type of word, for report statistics
verbose = TRUE # report statistics
)
{
require(dplyr)
require(stringr)
require(scales)
require(stats)
if (ignore_case == TRUE) {
data <- data %>%
dplyr::mutate(
orig_tokens = !!as.name(tokens), # preserve original tokens
{{tokens}} := stringr::str_to_lower(!!as.name(tokens))) # convert to lower case to ignore case
}
# add count for tokens
res <- data %>%
dplyr::group_by(!!as.name(tokens)) %>%
dplyr::mutate(entity_count = dplyr::n()) %>%
dplyr::ungroup()
# save min and max counts for verbose output
if (verbose == TRUE) {
range <- dplyr::tibble(min = min(res$entity_count),
max = max(res$entity_count))
}
# calculate quantile for filtering
threshold <- res %>%
dplyr::distinct(!!as.name(tokens), .keep_all = T) %>%
dplyr::pull(entity_count) %>%
stats::quantile(probs = quantile) %>%
.[[1]]
# drop quantile
res <- res %>%
dplyr::filter(entity_count > threshold)
if (verbose == TRUE) {
cat(paste(scales::percent(quantile), "Quantile:", threshold, "\n"))
cat(paste("Count Range:", range$min, "-", range$max, "\n"))
original_count <- data %>% # save unprocessed count statistics
dplyr::distinct(!!as.name(tokens), !!as.name(group)) %>%
dplyr::summarise(distinct_unprocessed = n(), .by = {{group}})
res %>%
dplyr::distinct(!!as.name(tokens),!!as.name(group)) %>%
dplyr::group_by(!!as.name(group)) %>%
dplyr::summarise(distinct_processed = n()) %>%
dplyr::left_join(original_count, by = group) %>%
dplyr::arrange(!!as.name(group)) %>%
dplyr::add_row(
{{group}} := "total",
distinct_unprocessed = sum(.$distinct_unprocessed),
distinct_processed = sum(.$distinct_processed)
) %>%
dplyr::mutate(
dropped = distinct_unprocessed - distinct_processed,
reduction = scales::percent(1 - distinct_processed / distinct_unprocessed)
) %>%
print()
}
if (ignore_case == TRUE) {
res <- res %>% dplyr::mutate({{tokens}} := orig_tokens) %>% # restore original tokens
dplyr::select(!orig_tokens)
}
return(res)
}
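# Example usage (illustrative sketch, not part of the pipeline): assumes a
# hypothetical tokens data frame with a "lemma" column holding the tokens and
# a "tag" column holding POS tags; drops the lowest 10% count quantile.
# tokens_df <- dplyr::tibble(
#   lemma = c("europa", "Europa", "steuer", "haus", "haus", "un"),
#   tag = c("NE", "NE", "NN", "NN", "NN", "NE"))
# drop_quantile(tokens_df, tokens = "lemma", quantile = 0.1,
#               ignore_case = TRUE, group = "tag", verbose = TRUE)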
filter_tokens <- function(tokens,
tokens_col = "lemma", # which column should be used for token filtering. Usually "token" or "lemma"
tags = c("NN", "NE"), # tags of a tokens object to keep. Expects a "tag" column. NULL to skip
min_str_length = 2, # minimum string length to keep
tolower = TRUE, # should tokens be set to lower? Ignores URLs and all-caps acronyms. Note that this is executed before stopword filtering, which is case-sensitive
stopword_languages = c("en", "de", "fr"), # stopword dictionaries to be removed. NULL to skip
stopword_dictionary = "snowball", # stopword dictionary to be used. See stopwords::stopwords()
additional_stopwords = c("amp", "&", "RT", "rt", "--", "---"), # provide additional stopwords to drop here
replies = NULL, # should replies be considered for the random walks? Expects a column named is_reply. NULL to skip
keep_mentions = TRUE, # keep @-mentions? FALSE drops all mentions. NULL to skip this check
keep_urls = TRUE # keep URLs? FALSE drops all URLs. NULL to skip this check
) {
require(dplyr)
require(stopwords)
require(stringr)
require(purrr)
# some checks
if (!is.null(tags) & !("tag" %in% colnames(tokens))){
stop("Missing 'tag' column for tag filtering")
}
if (!is.null(replies) & !("is_reply" %in% colnames(tokens))){
stop("Missing 'is_reply' column for reply filtering")
}
if (any(!(stopword_languages %in% stopwords::stopwords_getlanguages(stopword_dictionary)))) {
stop(
paste0("Language '",
paste(stopword_languages[!stopword_languages %in%
stopwords::stopwords_getlanguages(stopword_dictionary)],
collapse = "', '"),
"' not provided in stopword dictionary '",
stopword_dictionary, "'.")
)
}
# Filtering Sequences
if (!is.null(tags)) {
tokens <- tokens %>%
dplyr::filter(tag %in% tags)
}
tokens <- tokens %>%
dplyr::filter(stringr::str_length(!!as.name(tokens_col)) >= min_str_length)
if (tolower) {
tokens <- tokens %>%
dplyr::mutate(!!as.name(tokens_col) := # lower case -
dplyr::case_when(
stringr::str_detect(!!as.name(tokens_col), "http") # except for URLs (so they don't break) ...
~ !!as.name(tokens_col),
!!as.name(tokens_col) == toupper(!!as.name(tokens_col)) # ... and acronyms (all caps), e.g. "UN", "IT" etc.
~ !!as.name(tokens_col),
.default = tolower(!!as.name(tokens_col))))
}
if (!is.null(stopword_languages)) {
stopwords <- stopword_languages %>% # combine stopwords dicts
purrr::map(\(lang) stopwords::stopwords(language = lang,
source = stopword_dictionary)) %>%
unlist()
tokens <- tokens %>%
dplyr::filter(!(!!as.name(tokens_col) %in% stopwords))
}
if (!is.null(additional_stopwords)) {
tokens <- tokens %>%
dplyr::filter(!(!!as.name(tokens_col) %in% additional_stopwords))
}
if (!is.null(replies)) {
tokens <- tokens %>%
dplyr::filter(is_reply == replies | is_reply == FALSE) # filter for reply condition (TRUE includes replies, FALSE does not)
}
if (!is.null(keep_mentions) && keep_mentions == FALSE) {
tokens <- tokens %>%
dplyr::filter(!stringr::str_detect(!!as.name(tokens_col), "@")) # drop all lemmas containing "@" - that is, all mentions
}
if (!is.null(keep_urls) && keep_urls == FALSE) {
tokens <- tokens %>%
dplyr::filter(!stringr::str_detect(!!as.name(tokens_col), "http")) # drop all URLs
}
return(tokens)
}
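# Example usage (illustrative sketch): assumes a hypothetical spaCy-style
# tokens data frame with "lemma" and "tag" columns; keeps nouns and named
# entities, lowercases them, and drops stopwords, short strings and mentions.
# tokens_df <- dplyr::tibble(
#   lemma = c("Europa", "und", "https://example.org", "@someuser", "UN", "Steuer"),
#   tag = c("NE", "KON", "NE", "NE", "NE", "NN"))
# filter_tokens(tokens_df, tokens_col = "lemma", tags = c("NN", "NE"),
#               min_str_length = 2, stopword_languages = c("en", "de"),
#               keep_mentions = FALSE, keep_urls = TRUE)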
get_latest_tokens_file <- function(path,
pattern ="tokens.csv.tar.gz", # regular updates of the tokens file will always be named "tokens.csv.tar.gz"
fallback = "tokens_init") { # fallback pattern to look for, with only the latest returned
# if there are multiple matches according to the pattern "tokens_init_[date]"
require(stringr)
require(dplyr)
files <- list.files(path)
if (pattern %in% files) { # check for latest tokens file and load
latest_tokens <- pattern
} else { # else, fall back to initial tokenization
latest_tokens <- dplyr::tibble(files = files) %>%
dplyr::filter(stringr::str_detect(files, fallback)) %>%
dplyr::arrange(dplyr::desc(files)) %>% dplyr::slice_head(n = 1) %>%
dplyr::pull(files) # pull the filename as a character string
}
return(file.path(path, latest_tokens))
}
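# Example usage (illustrative sketch): assumes a hypothetical directory
# "data/tokens" containing either "tokens.csv.tar.gz" or dated fallback files
# such as "tokens_init_2023-01-01.csv.tar.gz".
# latest_file <- get_latest_tokens_file("data/tokens")
# tokens <- vroom::vroom(latest_file)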
read_timelimited_data <- function(file, # this can handle multiple files, and will pick only the relevant files (if applicable)
..., # arguments to pass to vroom()
filter_var = "_source.created_at", # filtering var as datetime
starting_point, # the starting point, as "YYYY-MM-DD"
timeframe = lubridate::weeks(1), # the timeframe before or after the starting point, as a lubridate period
before_after = c("before", "after"),
check_timeframe = TRUE,
add_file_time = lubridate::years(1), # time to add to the last year in the filename when checking whether the data fits the specified timeframe.
# E.g., when adding years(1), it is assumed the file "data_news_2017-2018" contains data from 2017-01-01 to 2019-01-01
verbose = TRUE # verbosity of warnings
) {
require(rlang)
require(dplyr)
require(lubridate)
require(vroom)
require(purrr)
require(data.table)
require(stringr)
before_after <- rlang::arg_match(before_after) # validate and pick the before/after option
starting_point <- lubridate::ymd(starting_point) # make sure it's a date format
if (length(file) > 1 & check_timeframe) {
# if more than one file is provided, check for each file if the data is within the specified timeframe
# this is helpful for automatically loading data distributed over several files without specifying the file each time and without loading unnecessarily large datasets
data <- file %>%
purrr::map(\(dat)
{
years <- dat %>% stringr::str_extract_all("\\d{4}") %>% .[[1]] # get years in file name
if (length(years) == 0) { # if no years in filename, load regardless and give warning
if (verbose) {
warning(paste0("No year indicators found in ", dat,
". Loading the dataset regardless of necessity."))
}
loaded_data <- vroom::vroom(dat, ...)
}
if (length(years) == 1) { # for one year in a filename, check if the interval matches the specified timeframe
if (before_after == "before") {
if (lubridate::parse_date_time(years[1], # data in the interval of the timeframe?
orders = "y") %within% lubridate::interval(
starting_point - timeframe,
starting_point)
) {
loaded_data <- vroom::vroom(dat, ...)
}
}
if (before_after == "after") {
if (lubridate::parse_date_time(years[1], # data in the interval of the timeframe?
orders = "y") %within% lubridate::interval(
starting_point,
starting_point + timeframe)
) {
loaded_data <- vroom::vroom(dat, ...)
}
}
}
if (length(years) == 2) { # for two years in a filename, check if the interval matches the specified timeframe
interval <-
lubridate::interval(
lubridate::parse_date_time(years[1],
orders = "y"),
lubridate::parse_date_time(years[2],
orders = "y") +
add_file_time
)
if (before_after == "before") {
if (lubridate::interval(starting_point - timeframe,
starting_point) %within% interval) { # period in the interval of the data?
loaded_data <- vroom::vroom(dat, ...)
}
}
if (before_after == "after") {
if (lubridate::interval(starting_point,
starting_point + timeframe) %within% interval) { # period in the interval of the data?
loaded_data <- vroom::vroom(dat, ...)
}
}
}
if (length(years) > 2) { # for more than two years in filename, load regardless and give warning
if (verbose) {
warning(paste0("More than two year indicators found in ", dat,
". Loading the dataset regardless of necessity."))
}
loaded_data <- vroom::vroom(dat, ...)
}
if (exists("loaded_data")){ # return the loaded data only if it was loaded
return(loaded_data)
}
}) %>% purrr::compact() %>% # drops empty list elements for cases where no data was loaded
data.table::rbindlist(fill = TRUE)
if (length(data) == 0) {
stop("No fitting data found for the specified timeframe. Either provide different files or a different timeframe, or skip this check with check_timeframe = FALSE\n")
}
} else { # if only one file is provided, it is loaded directly
data <- vroom::vroom(file, ...)
}
# filter the timeframe within the loaded data
if (before_after == "before") {
result <- data %>%
dplyr::filter(!!as.name(filter_var) <= starting_point &
!!as.name(filter_var) >= (starting_point - timeframe))}
if (before_after == "after") {
result <- data %>%
dplyr::filter(!!as.name(filter_var) >= starting_point &
!!as.name(filter_var) <= (starting_point + timeframe))}
gc(verbose = FALSE)
return(result)
}
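# Example usage (illustrative sketch): assumes hypothetical yearly dumps whose
# filenames carry year indicators and which contain a "_source.created_at"
# datetime column; only the files overlapping the requested window are read.
# docs <- read_timelimited_data(
#   file = c("data/tweets_2021.csv", "data/tweets_2022.csv"),
#   filter_var = "_source.created_at",
#   starting_point = "2022-01-15",
#   timeframe = lubridate::weeks(4),
#   before_after = "before")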
split_timeframes <- function(data, # the dataframe to read in
datetime_var, # the name of the datetime var to use for splitting
timeframe = "weeks", # the timeframe. can be any lubridate period. See lubridate::ceiling_date() for details
before_after = c("before", "after") # split before or after the timeframe's endpoint
){
require(rlang)
require(dplyr)
require(lubridate)
require(data.table)
before_after <- rlang::arg_match(before_after) # validate and pick the before/after option
if (before_after == "before") {
out <- data %>%
dplyr::mutate(period = lubridate::ceiling_date(!!as.name(datetime_var), unit = timeframe)) # end of the week
}
if (before_after == "after") {
out <- data %>%
dplyr::mutate(period = lubridate::floor_date(!!as.name(datetime_var), unit = timeframe)) # beginning of the week
}
out <- out %>%
data.table::as.data.table() %>%
split(by = "period")
return(out)
}
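# Example usage (illustrative sketch, hypothetical data): splits documents into
# a list of data.tables, one per weekly period.
# docs <- dplyr::tibble(
#   created_at = lubridate::ymd_hms(c("2022-01-03 10:00:00", "2022-01-12 18:30:00")),
#   text = c("first doc", "second doc"))
# split_timeframes(docs, datetime_var = "created_at",
#                  timeframe = "weeks", before_after = "before")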