-
Notifications
You must be signed in to change notification settings - Fork 0
/
prep.R
29 lines (23 loc) · 1.66 KB
/
prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
library(tidyverse)
library(here)
library(tidymodels)
# Load the tweet data from the project's data/ folder.
d <- here("data", "ngsschat-data.rds") %>%
  read_rds()

# Load the qualitative codes and keep only the identifier and code columns,
# renaming them (ID -> id_string, Code -> code) so the variable names are the
# same as in the tweet data.
codes <- here("data", "ngsschat-qualitative-codes.csv") %>%
  read_csv() %>%
  select(id_string = ID, code = Code)
# Attach the qualitative code to each tweet and keep only the tweets whose
# code is used for modeling.
dd <- d %>%
  # join the codes onto the tweets; the key is named explicitly so the join
  # does not depend on which column names the two tables happen to share
  left_join(codes, by = "id_string") %>%
  # drop tweets that are missing a code, as well as off-topic (OT) and
  # retweet (RT) tweets; transformational (TF) codes are excluded because
  # they are so rare
  filter(!is.na(code), !code %in% c("OT", "RT", "TF"))
# Build the modeling data set: one row per thread (id_string) holding summary
# features of the thread's tweets together with the thread's qualitative code,
# then write it to data/ngsschat-processed-data.csv.
ddd <- dd %>%
  # candidate variables for the supervised machine learning model
  # NOTE(review): followers_count, friends_count and statuses_count are
  # selected here but not summarized below, so they never reach the output
  select(
    favorite_count, retweet_count, followers_count, friends_count,
    statuses_count, display_text_width, code, id_string
  ) %>%
  filter(!is.na(favorite_count)) %>%
  group_by(id_string) %>% # group the tweets by thread
  summarize(
    # average number of favorites each tweet in the thread received
    # (this and the next are means; a sum would be an alternative)
    mean_favorite_count = mean(favorite_count),
    # average number of retweets each tweet in the thread received; only
    # favorite_count is filtered for NA above, so a missing retweet_count
    # makes this mean NA for the whole thread
    mean_retweet_count = mean(retweet_count),
    # total display width of the thread's tweets (sum seems more sensible
    # than mean here)
    sum_display_text_width = sum(display_text_width),
    # the number of tweets in the thread
    n = n()
  ) %>%
  # join the codes back on, as summarize() dropped them; the key is named
  # explicitly. A thread with more than one distinct code in dd yields one
  # output row per code.
  left_join(distinct(dd, id_string, code), by = "id_string") %>%
  # the thread identifier is not part of the written data set
  select(-id_string)

write_csv(ddd, here("data", "ngsschat-processed-data.csv"))