# policy_parser_coding_sample.R

# Pulling Samples for the Policy Parser Evaluation with manually-coded data

{
  library(tidyverse)
  library(vroom)
  library(data.table)
  library(openxlsx)
  library(scales)
}

#' Draw a reproducible sample with `slice_sample()`.
#'
#' The random number generator is re-seeded on every call, so two calls with
#' the same `data`, `seed` and sampling arguments return the same rows.
#' Note that `set.seed()` changes the session-wide RNG state.
#'
#' @param data Data frame to sample from.
#' @param seed Value handed to `set.seed()` immediately before sampling.
#' @param ... Arguments forwarded unchanged to `slice_sample()`
#'   (e.g. `n`, `by`, `weight_by`).
#' @return The rows selected by `slice_sample()`.
seed_sample <- function(data, seed, ...) {
  set.seed(seed)
  slice_sample(data, ...)
}

# Seeds handed to seed_sample() for the Twitter and the news draws
# (both currently use the same value).
seed_twitter <- 20231218
seed_news <- 20231218
#set.seed(seed)

# Minimum score_norm a classification needs to be kept for sampling.
threshold_twitter <- 0.7
threshold_news <- 0.7

## Read data

### Twitter
# Read the tweet export. The ID-like columns are read as character so that
# long numeric IDs are preserved exactly.
tweet_data <- vroom(
  file = "Tokenizer/data_init_tweets_2023-06-22.csv.tar.gz",
  col_types = list(
    `_id` = "c",
    `_source.author_id` = "c",
    `_source.conversation_id` = "c",
    `_source.in_reply_to_user_id` = "c",
    `_source.attachments.poll_ids` = "c",
    `_source.withheld.scope` = "c",
    `_source.withheld.country_codes` = "c",
    `_source.entities.cashtags` = "c"
  ),
  guess_max = 10000
)

# Add the retweet and week indicators used further down.
# A missing text counts as "not a retweet" (same as the former case_when()).
# NOTE(review): "^RT" also matches tweets that merely start with these two
# letters (e.g. "RTL ..."); consider tightening the pattern to "^RT @".
tweet_data <- tweet_data %>%
  mutate(
    is_retweet = coalesce(str_detect(`_source.text`, "^RT"), FALSE),
    week = ceiling_date(`_source.created_at`, unit = "weeks")
  )

# `is_reply` is selected and filtered on when classified_tweets is built, but
# nothing in this script creates it. Derive it when the export does not
# already ship such a column, so the later select() cannot fail.
# NOTE(review): assumes a non-missing `_source.in_reply_to_user_id` marks a
# reply -- confirm this matches the definition used when classifying.
if (!("is_reply" %in% names(tweet_data))) {
  tweet_data <- tweet_data %>%
    mutate(is_reply = !is.na(`_source.in_reply_to_user_id`))
}

tweet_classification <- readRDS("init_classification/init_classified_tweets.RDS")

# tweets_highest <- tweet_classification %>%  # reduce to highest score per document to determine its policy field
#   imap(\(week, date) 
#        {week %>% .[["classified_documents"]]} %>% 
#          slice_max(score_norm, by = doc_id) %>% 
#          mutate(week = as_date(date))) %>% 
#   rbindlist() %>% 
#   rename(`_id` = doc_id) %>% # add variables
#   left_join(tweet_data_full %>% 
#               select(`_id`, `_source.created_at`, 
#                      `_source.real_name`, `_source.text`),
#             by = join_by(`_id`)) 

# Build one table of tweet classifications at or above the score threshold,
# enriched with tweet metadata. A tweet can keep several rows (one per policy
# field above the threshold); tweets with no score above it drop out.
classified_tweets <- local({
  # stack the `classified_documents` table of every list entry
  all_scores <- rbindlist(
    map(tweet_classification, \(entry) entry$classified_documents)
  )

  # metadata columns to attach (text, date, author, indicators)
  tweet_meta <- tweet_data %>%
    select(`_id`, `_source.created_at`, `_source.real_name`,
           `_source.text`, week, is_retweet, is_reply)

  all_scores %>%
    filter(score_norm >= threshold_twitter) %>%
    left_join(tweet_meta, by = join_by(doc_id == `_id`)) %>%
    # Keep original tweets only: replies were not classified, retweets need
    # not be classified. Rows without a metadata match have NA indicators
    # and are dropped by filter() as well.
    filter(!(is_retweet | is_reply))
})


### News

# Read every file in the news data folder and stack them; fill = TRUE pads
# columns that are missing in some files.
news_data <- list.files("news_classification/data", full.names = TRUE) %>%
  map(\(file) vroom(file)) %>%
  rbindlist(fill = TRUE)

# Derive the outlet name from the host by stripping a leading "www." and a
# trailing top-level domain, and add the week indicator.
# The dot inside "co.uk" is escaped so that it only matches a literal dot.
news_data <- news_data %>%
  mutate(
    outlet = str_remove_all(
      `_source.host`,
      paste0(c("^www\\.", "\\.de$", "\\.net$", "\\.co\\.uk$", "\\.com$"),
             collapse = "|")
    ),
    week = ceiling_date(`_source.estimated_date`, unit = "weeks")
  )

# Remove a small number of non-German outlets accidentally in the data /
# only keep select outlets (drops 20 docs).
news_data <- news_data %>%
  filter(outlet %in% c("faz", "welt", "bild", "spiegel",
                       "zeit", "sueddeutsche", "stuttgarter-zeitung"))

news_classification <- readRDS("news_classification/init_classified_news.RDS")

# news_highest <- news_classification %>%  # reduce to highest score per document to determine its policy field
#   imap(\(week, date) 
#        {week %>% .[["classified_documents"]]} %>% 
#          slice_max(score_norm, by = doc_id) %>% 
#          mutate(week = as_date(date))) %>% 
#   rbindlist() %>% 
#   rename(`_id` = doc_id) %>% # add variables
#   left_join(news_data %>% 
#               select(`_id`, `_source.estimated_date`, 
#                      outlet, `_source.title`, `_source.body`),
#             by = join_by(`_id`)) 

# Build one table of news classifications at or above the score threshold,
# enriched with article metadata. A document can keep several rows (one per
# policy field above the threshold); documents with no score above it drop out.
# NOTE(review): this is a left join, so classified documents that are absent
# from news_data (e.g. removed by the outlet filter) stay in the table with
# NA outlet/week/text -- confirm this is intended.
classified_news <- local({
  # stack the `classified_documents` table of every list entry
  all_scores <- rbindlist(
    map(news_classification, \(entry) entry$classified_documents)
  )

  # metadata columns to attach (date, outlet, title, body, week)
  news_meta <- news_data %>%
    select(`_id`, `_source.estimated_date`, outlet,
           `_source.title`, `_source.body`, week)

  all_scores %>%
    filter(score_norm >= threshold_news) %>%
    left_join(news_meta, by = join_by(doc_id == `_id`))
})

## Pull Samples

### Twitter 
#### 1000 tweets per coder, 300 tweets intercoder sample = 2.400 total tweets classified (~0.01%) with 12.5% of tweets coded by all coders
#### 60% of sampled data are pulled over the timeframes (weeks), 40% are pulled over tweets associated with policy fields (highest association) to not underrepresent small fields

# Sample for coder 1: a temporal stratum (2 rows per week) plus a policy
# stratum (24 rows per policy field) drawn from the tweets that are not
# already part of the temporal stratum.
tweet_sample_1 <- local({
  # temporal stratum, drawn once and reused for the exclusion below
  per_week <- seed_sample(classified_tweets, seed_twitter, n = 2, by = week)

  # policy stratum, restricted to tweets outside the temporal stratum
  per_field <- classified_tweets %>%
    filter(!(doc_id %in% per_week$doc_id)) %>%
    seed_sample(seed_twitter, n = 24, by = policy_field)

  bind_rows(
    mutate(per_week, sample = "temporal"),
    mutate(per_field, sample = "policy")
  ) %>%
    select(doc_id, `_source.created_at`, `_source.text`,
           policy_field, sample)
})

summarise(tweet_sample_1, n = n(), .by = sample) # a roughly 60/40 ratio


# Intercoder sample: up to 150 rows from each stratum ("temporal", "policy")
# of sample 1, i.e. even amounts of temporal and policy sourced tweets.
twitter_intercoder_sample <- seed_sample(
  tweet_sample_1, seed_twitter,
  n = 150, by = sample
)


# Sample for coder 2: same design as sample 1, drawn from tweets not used in
# sample 1, reduced to 700 rows and completed with the shared intercoder sample.
#
# Fix: the policy stratum used to exclude a temporal draw recomputed from the
# *unfiltered* data. With the fixed seed that reproduces the temporal stratum
# of sample 1, so tweets of this sample's own temporal stratum could be drawn
# again. The temporal stratum is now drawn once from the remaining pool and
# its doc_ids are excluded directly.
# NOTE: this changes the rows drawn compared to earlier runs of the script.
tweet_sample_2 <- local({
  # candidate pool: everything not already handed out in sample 1
  unused_tweets <- classified_tweets %>%
    filter(!(doc_id %in% tweet_sample_1$doc_id))

  temporal_part <- unused_tweets %>%
    seed_sample(seed_twitter, n = 2, by = week) %>%
    mutate(sample = "temporal")

  policy_part <- unused_tweets %>%
    filter(!(doc_id %in% temporal_part$doc_id)) %>% # not in temporal sample
    seed_sample(seed_twitter, n = 24, by = policy_field) %>%
    mutate(sample = "policy")

  bind_rows(temporal_part, policy_part) %>%
    # Pull 700 of the 1000 documents. weight_by sets per-row weights
    # (0.6 temporal / 0.4 policy), so the 60/40 split is only approximate.
    seed_sample(seed_twitter, n = 700,
                weight_by = ifelse(sample == "temporal", 0.6, 0.4)) %>%
    bind_rows(twitter_intercoder_sample) %>%
    select(doc_id, `_source.created_at`, `_source.text`,
           policy_field, sample)
})

tweet_sample_2 %>% summarise(n = n(), .by = sample)


# Sample for coder 3: same design as sample 1, drawn from tweets not used in
# samples 1 and 2, reduced to 700 rows and completed with the shared
# intercoder sample.
#
# Fix: the policy stratum used to exclude a temporal draw recomputed from the
# *unfiltered* data (i.e. the temporal stratum of sample 1), so tweets of this
# sample's own temporal stratum could be drawn again. The temporal stratum is
# now drawn once from the remaining pool and its doc_ids are excluded directly.
# NOTE: this changes the rows drawn compared to earlier runs of the script.
tweet_sample_3 <- local({
  # candidate pool: everything not already handed out in samples 1 and 2
  unused_tweets <- classified_tweets %>%
    filter(!(doc_id %in% tweet_sample_1$doc_id),
           !(doc_id %in% tweet_sample_2$doc_id))

  temporal_part <- unused_tweets %>%
    seed_sample(seed_twitter, n = 2, by = week) %>%
    mutate(sample = "temporal")

  policy_part <- unused_tweets %>%
    filter(!(doc_id %in% temporal_part$doc_id)) %>% # not in temporal sample
    seed_sample(seed_twitter, n = 24, by = policy_field) %>%
    mutate(sample = "policy")

  bind_rows(temporal_part, policy_part) %>%
    # Pull 700 of the 1000 documents. weight_by sets per-row weights
    # (0.6 temporal / 0.4 policy), so the 60/40 split is only approximate.
    seed_sample(seed_twitter, n = 700,
                weight_by = ifelse(sample == "temporal", 0.6, 0.4)) %>%
    bind_rows(twitter_intercoder_sample) %>%
    select(doc_id, `_source.created_at`, `_source.text`,
           policy_field, sample)
})

tweet_sample_3 %>% summarise(n = n(), .by = sample)

#### export samples

#' Write a Twitter coding sample to an xlsx file.
#'
#' Adds an empty `correct` column for the coders' T/F judgement, flags rows
#' whose `doc_id` occurs in the global `twitter_intercoder_sample`, lists
#' those rows first (each part ordered by `sort_col`) and drops the `sample`
#' indicator before writing.
#'
#' @param data Sample to export; must contain `doc_id` and `sort_col`.
#' @param sort_col Name (string) of the column to order by, e.g. the date.
#' @param filename Path of the xlsx file to write.
#' @return The value returned by `write.xlsx()`.
export_sample <- function(data, sort_col, filename) {
  sheet <- as.data.frame(data)
  sheet[["correct"]] <- NA # empty column for T/F indicators

  sheet <- sheet %>%
    arrange(.data[[sort_col]]) %>% # order by date
    mutate(
      # indicate intercoder sample
      intercoder_sample = doc_id %in% twitter_intercoder_sample$doc_id
    ) %>%
    arrange(desc(intercoder_sample)) %>% # intercoder sample first
    select(!any_of("sample")) %>% # drop sample indicator
    mutate(across(where(is.character), utf8::as_utf8)) # utf8 conversion

  write.xlsx(sheet, file = filename)
}
# Write the intercoder sample and the three coder samples, each ordered by
# the tweet creation date; list names are the target files.
iwalk(
  list(
    "evaluation_samples/twitter_sample_intercoder.xlsx" = twitter_intercoder_sample,
    "evaluation_samples/twitter_sample_1.xlsx" = tweet_sample_1,
    "evaluation_samples/twitter_sample_2.xlsx" = tweet_sample_2,
    "evaluation_samples/twitter_sample_3.xlsx" = tweet_sample_3
  ),
  \(sample_data, path) export_sample(sample_data, "_source.created_at", path)
)


# #### in some cases, the text body of the samples was missing (due to RTs being present in the classified tweets, but not in the join data used)
# 
# tweet_sample_1_missing <- tweet_sample_1 %>% filter(is.na(`_source.text`)) %>% 
#   select(`_id`) %>% 
#   left_join(tweet_data_full %>% select(`_id`, `_source.created_at`, 
#                                        `_source.real_name`, `_source.text`), 
#             by = "_id")
# 
# export_sample(tweet_sample_1_missing, "_source.created_at", 
#               "evaluation_samples/twitter_sample_1_missing.xlsx")
# 
# 
# tweet_sample_2_missing <- tweet_sample_2 %>% filter(is.na(`_source.text`)) %>% 
#   select(`_id`) %>% 
#   left_join(tweet_data_full %>% select(`_id`, `_source.created_at`, 
#                                        `_source.real_name`, `_source.text`), 
#             by = "_id")
# 
# export_sample(tweet_sample_2_missing, "_source.created_at", 
#               "evaluation_samples/twitter_sample_2_missing.xlsx")
# 
# 
# tweet_sample_3_missing <- tweet_sample_3 %>% filter(is.na(`_source.text`)) %>% 
#   select(`_id`) %>% 
#   left_join(tweet_data_full %>% select(`_id`, `_source.created_at`, 
#                                        `_source.real_name`, `_source.text`), 
#             by = "_id")
# 
# export_sample(tweet_sample_3_missing, "_source.created_at", 
#               "evaluation_samples/twitter_sample_3_missing.xlsx")


### News
#### 437 articles per coder, 129 docs intercoder sample = 1.049 total docs classified (~0.001%) with 12.5% of docs coded by all coders
#### 66% of sampled data are pulled over the timeframes (weeks), 
####  20% are pulled over docs associated with policy fields (highest association) to not underrepresent small fields,
####  14% are pulled over outlet, to represent all outlets

# Sample for coder 1: a temporal stratum (1 row per week), a policy stratum
# (5 rows per policy field) and an outlet stratum (9 rows per outlet). Each
# later stratum is drawn from documents not contained in the earlier ones.
#
# Fix: the outlet stratum tested `doc_id %in% <data frame>` for the policy
# exclusion (the pull(doc_id) was missing). That comparison does not match
# the ids, so documents of the policy stratum were not excluded. The strata
# are now drawn once and excluded through their doc_id vectors.
# NOTE: this can change the outlet rows drawn compared to earlier runs.
news_sample_1 <- local({
  temporal_part <- classified_news %>%
    seed_sample(seed_news, n = 1, by = week) %>%
    mutate(sample = "temporal")

  policy_part <- classified_news %>%
    filter(!(doc_id %in% temporal_part$doc_id)) %>% # not in temporal sample
    seed_sample(seed_news, n = 5, by = policy_field) %>%
    mutate(sample = "policy")

  outlet_part <- classified_news %>%
    filter(!(doc_id %in% temporal_part$doc_id), # not in temporal sample
           !(doc_id %in% policy_part$doc_id)) %>% # not in policy sample
    seed_sample(seed_news, n = 9, by = outlet) %>%
    mutate(sample = "outlet")

  bind_rows(temporal_part, policy_part, outlet_part) %>%
    select(doc_id, `_source.estimated_date`, outlet, `_source.title`,
           `_source.body`, policy_field, sample)
})

news_sample_1 %>% summarise(n = n(), .by = sample) %>%
  mutate(percent = percent(n/sum(n), accuracy = 0.01)) # a roughly 66/19/15 ratio


# Intercoder sample: up to 43 rows from each stratum ("temporal", "policy",
# "outlet") of sample 1, i.e. even amounts from every source.
news_intercoder_sample <- seed_sample(
  news_sample_1, seed_news,
  n = 43, by = sample
)


# Sample for coder 2: same design as sample 1, drawn from documents not used
# in sample 1, reduced to 308 rows and completed with the shared intercoder
# sample.
#
# Fixes:
#  * The "not in temporal/policy sample" exclusions were recomputed from the
#    *unfiltered* data, which reproduces the strata of sample 1 instead of
#    this sample's own strata, so the strata could overlap.
#  * The policy exclusion tested `doc_id %in% <data frame>` (missing
#    pull(doc_id)) and therefore excluded nothing.
# Each stratum is now drawn once from the remaining pool and excluded through
# its doc_id vector.
# NOTE: this changes the rows drawn compared to earlier runs of the script.
news_sample_2 <- local({
  # candidate pool: everything not already handed out in sample 1
  unused_news <- classified_news %>%
    filter(!(doc_id %in% news_sample_1$doc_id))

  temporal_part <- unused_news %>%
    seed_sample(seed_news, n = 1, by = week) %>%
    mutate(sample = "temporal")

  policy_part <- unused_news %>%
    filter(!(doc_id %in% temporal_part$doc_id)) %>% # not in temporal sample
    seed_sample(seed_news, n = 5, by = policy_field) %>%
    mutate(sample = "policy")

  outlet_part <- unused_news %>%
    filter(!(doc_id %in% temporal_part$doc_id), # not in temporal sample
           !(doc_id %in% policy_part$doc_id)) %>% # not in policy sample
    seed_sample(seed_news, n = 9, by = outlet) %>%
    mutate(sample = "outlet")

  bind_rows(temporal_part, policy_part, outlet_part) %>%
    # Pull 308 of the 437 documents. weight_by sets per-row weights
    # (0.66 temporal / 0.20 policy / 0.14 outlet), so the split is only
    # approximate.
    seed_sample(seed_news, n = 308,
                weight_by = ifelse(sample == "temporal", 0.66,
                                   ifelse(sample == "policy", 0.20, 0.14))) %>%
    bind_rows(news_intercoder_sample) %>%
    select(doc_id, `_source.estimated_date`, outlet, `_source.title`,
           `_source.body`, policy_field, sample)
})

news_sample_2 %>% summarise(n = n(), .by = sample) %>%
  mutate(percent = percent(n/sum(n), accuracy = 0.01)) # a roughly 66/19/15 ratio


# Sample for coder 3: same design as sample 1, drawn from documents not used
# in samples 1 and 2, reduced to 308 rows and completed with the shared
# intercoder sample.
#
# Fixes:
#  * The "not in temporal/policy sample" exclusions were recomputed from the
#    *unfiltered* data, which reproduces the strata of sample 1 instead of
#    this sample's own strata, so the strata could overlap.
#  * The policy exclusion tested `doc_id %in% <data frame>` (missing
#    pull(doc_id)) and therefore excluded nothing.
# Each stratum is now drawn once from the remaining pool and excluded through
# its doc_id vector.
# NOTE: this changes the rows drawn compared to earlier runs of the script.
news_sample_3 <- local({
  # candidate pool: everything not already handed out in samples 1 and 2
  unused_news <- classified_news %>%
    filter(!(doc_id %in% news_sample_1$doc_id),
           !(doc_id %in% news_sample_2$doc_id))

  temporal_part <- unused_news %>%
    seed_sample(seed_news, n = 1, by = week) %>%
    mutate(sample = "temporal")

  policy_part <- unused_news %>%
    filter(!(doc_id %in% temporal_part$doc_id)) %>% # not in temporal sample
    seed_sample(seed_news, n = 5, by = policy_field) %>%
    mutate(sample = "policy")

  outlet_part <- unused_news %>%
    filter(!(doc_id %in% temporal_part$doc_id), # not in temporal sample
           !(doc_id %in% policy_part$doc_id)) %>% # not in policy sample
    seed_sample(seed_news, n = 9, by = outlet) %>%
    mutate(sample = "outlet")

  bind_rows(temporal_part, policy_part, outlet_part) %>%
    # Pull 308 of the 437 documents. weight_by sets per-row weights
    # (0.66 temporal / 0.20 policy / 0.14 outlet), so the split is only
    # approximate.
    seed_sample(seed_news, n = 308,
                weight_by = ifelse(sample == "temporal", 0.66,
                                   ifelse(sample == "policy", 0.20, 0.14))) %>%
    bind_rows(news_intercoder_sample) %>%
    select(doc_id, `_source.estimated_date`, outlet, `_source.title`,
           `_source.body`, policy_field, sample)
})

news_sample_3 %>% summarise(n = n(), .by = sample) %>%
  mutate(percent = percent(n/sum(n), accuracy = 0.01)) # a roughly 66/19/15 ratio


#### export samples

#' Write a news coding sample to an xlsx file.
#'
#' Replaces the earlier `export_sample()` definition for the news exports.
#' Adds an empty `correct` column for the coders' T/F judgement, shortens
#' article bodies longer than 32767 characters, flags rows whose `doc_id`
#' occurs in the global `news_intercoder_sample`, lists those rows first
#' (each part ordered by `sort_col`) and drops the `sample` indicator
#' before writing.
#'
#' @param data Sample to export; must contain `doc_id`, `_source.body` and
#'   `sort_col`.
#' @param sort_col Name (string) of the column to order by, e.g. the date.
#' @param filename Path of the xlsx file to write.
#' @return The value returned by `write.xlsx()`.
export_sample <- function(data, sort_col, filename) {
  max_chars <- 32767 # bodies longer than this are truncated

  sheet <- as.data.frame(data)
  sheet[["correct"]] <- NA # empty column for T/F indicators

  sheet <- sheet %>%
    mutate(
      # truncate long documents
      `_source.body` = case_when(
        str_length(`_source.body`) > max_chars ~
          str_trunc(`_source.body`, width = max_chars, side = "right"),
        .default = `_source.body`
      )
    ) %>%
    arrange(.data[[sort_col]]) %>% # order by date
    mutate(
      # indicate intercoder sample
      intercoder_sample = doc_id %in% news_intercoder_sample$doc_id
    ) %>%
    arrange(desc(intercoder_sample)) %>% # intercoder sample first
    select(!any_of("sample")) %>% # drop sample indicator
    mutate(across(where(is.character), utf8::as_utf8)) # utf8 conversion

  write.xlsx(sheet, file = filename)
}

# Write the intercoder sample and the three coder samples, each ordered by
# the estimated publication date; list names are the target files.
iwalk(
  list(
    "evaluation_samples/news_sample_intercoder.xlsx" = news_intercoder_sample,
    "evaluation_samples/news_sample_1.xlsx" = news_sample_1,
    "evaluation_samples/news_sample_2.xlsx" = news_sample_2,
    "evaluation_samples/news_sample_3.xlsx" = news_sample_3
  ),
  \(sample_data, path) export_sample(sample_data, "_source.estimated_date", path)
)