forked from pablobarbera/social-media-workshop
-
Notifications
You must be signed in to change notification settings - Fork 0
/
04-facebook-data-analysis.r
249 lines (195 loc) · 7.66 KB
/
04-facebook-data-analysis.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
################################################################
## Workshop: Collecting and Analyzing Social Media Data with R
## February 2nd, 2015
## Script 4: Analyzing Facebook data
## Author: Pablo Barbera, NYU, @p_barbera
################################################################
# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path used below (functions.r, backup/, data/, img/) is resolved against it,
# so adjust this line to wherever the workshop repository lives.
setwd("~/Dropbox/git/social-media-workshop")

###############################################
### SCRAPING LIST OF MEMBERS OF CONGRESS ###
###############################################

# reading additional functions
# (scrape_nytimes_congress_api() is not defined in this script; presumably it
# is provided by functions.r -- confirm)
source("functions.r")

# get your API key here: http://developer.nytimes.com
api_key <- 'XXXXXXXX'

# scraping NYT API: one call per chamber, then stack the two results
house <- scrape_nytimes_congress_api(api_key, chamber = "house")
senate <- scrape_nytimes_congress_api(api_key, chamber = "senate")
congress <- rbind(house, senate)

# how many members have no Facebook account listed?
# A missing value (NA) is treated the same as an empty string; comparing NA
# with "" yields NA, which would otherwise produce all-NA rows when subsetting.
no_fb_account <- is.na(congress$facebook_account) |
  congress$facebook_account == ""
table(no_fb_account)

# keeping only members of congress with FB accounts
congress <- congress[!no_fb_account, ]
write.csv(congress, file = "backup/congress.csv", row.names = FALSE)
###########################################################
### COLLECTING FB PAGE DATA FOR MEMBERS OF CONGRESS ###
###########################################################
library(Rfacebook)

# list of accounts
accounts <- congress$facebook_account

# removing those already downloaded: a page counts as done when a file named
# <account>.csv exists in data/facebook (compared case-insensitively).
# The extension is stripped with an escaped, anchored pattern; a bare ".csv"
# is a regex in which "." matches any character, anywhere in the name.
accounts.done <- list.files("data/facebook")
done.names <- tolower(sub("\\.csv$", "", accounts.done))
accounts.left <- accounts[!(tolower(accounts) %in% done.names)]

# loop over accounts
# NOTE(review): fb_oauth is not created in this script; it must already exist
# in the session (presumably a Facebook OAuth token -- confirm).
for (account in accounts.left) {
  cat(account, "\n")

  # download page data; a failed download is reported and the account is
  # skipped, so one bad page does not abort the whole run
  df <- tryCatch(
    getPage(account, token = fb_oauth, n = 5000, since = '2013/01/01'),
    error = function(e) {
      message("Skipping ", account, ": ", conditionMessage(e))
      NULL
    }
  )
  if (is.null(df)) {
    next
  }

  ## cleaning text from \n
  df$message <- gsub("\n", " ", df$message)

  ## save page data in R and csv format
  save(df, file = paste0("data/raw_fb/", account, ".Rdata"))
  write.csv(df, file = paste0("data/facebook/", account, ".csv"),
            row.names = FALSE)
}
###########################################################
### MERGING AND CLEANING TEXT FROM FACEBOOK PAGES ###
###########################################################
# full paths of the per-page files stored in data/facebook (one per MC)
accounts <- list.files(path = "data/facebook", full.names = TRUE)
# Read one page's CSV file and collapse all of its posts into one cleaned
# string.
#
# Arguments:
#   f  path to a CSV file that has a `message` column (one row per post)
#
# Returns:
#   a length-one character vector: every non-empty message, lower-cased, with
#   URLs, digits and punctuation removed, joined by single spaces. Returns ""
#   when the file contains no usable message.
clean_fb_text <- function(f) {
  # read file
  d <- read.csv(f, stringsAsFactors = FALSE)

  # remove posts with no message; working on the column itself (instead of
  # subsetting rows of `d`) also behaves correctly for one-column files
  msg <- as.character(d$message)
  msg <- msg[!is.na(msg) & msg != ""]

  # avoid encoding issues: bytes that cannot be converted are replaced by
  # their hex code. Plain "UTF-8" is used because "UTF-8-MAC" is only
  # available in the macOS iconv.
  utf8text <- iconv(msg, to = "UTF-8", sub = "byte")

  text <- tolower(utf8text)

  # URLs are removed BEFORE punctuation, while they can still be delimited;
  # only the URL itself (up to the next whitespace) is dropped, so the rest
  # of the post is preserved
  text <- gsub("http[^[:space:]]*", "", text)

  # line breaks become spaces so that adjacent words are not glued together
  text <- gsub("\n", " ", text, fixed = TRUE)

  # same patterns that tm::removeNumbers() / tm::removePunctuation() apply
  # with their default arguments, written in base R so this function has no
  # package dependency
  text <- gsub("[[:digit:]]+", "", text)
  text <- gsub("[[:punct:]]+", "", text)

  # putting it all together
  paste0(text, collapse = " ")
}
# run the cleaning function over every file and flatten the per-file results
# into a single vector (this can take a few minutes)
txt <- unlist(lapply(accounts, clean_fb_text))

# keep a copy on disk so the cleaning step does not have to be repeated
save(txt, file="backup/congress-fb-data.rdata")
###########################################################
### RUNNING TOPIC MODEL ###
###########################################################
# restores `txt`, the vector of cleaned text saved by the previous section
load("backup/congress-fb-data.rdata")

# loading library to convert text into matrix
library(tm); library(stringr)

# reading text into Corpus object
txt <- Corpus(VectorSource(txt))
txt

# and now converting to matrix (sparse representation)
# - wordLengths = c(3, Inf) drops words shorter than 3 characters; it replaces
#   the older `minWordLength` option (see the tm termFreq() documentation)
# - stopwords: English stopwords, a list of generic/political filler words,
#   and every word that appears in a US state name
dtm <- DocumentTermMatrix(
  txt,
  control = list(
    wordLengths = c(3, Inf),
    stopwords = c(
      stopwords("en"),
      "can", "will", "get", "make", "must", "new", "one", "said",
      "like", "think", "great", "american", "americans", "congress",
      "today", "sen", "house", "senate",
      unlist(str_split(tolower(state.name), " "))
    )
  )
)
dtm

# remove sparse terms (infrequent words):
# keep words that appear in more than 10% of pages
dtm <- removeSparseTerms(dtm, 0.90)
dtm

# loading library to run the topic model
library(topicmodels)

# running LDA
# A fixed seed makes the Gibbs sampler reproducible. Topic numbers are
# arbitrary and change from run to run otherwise, which would invalidate any
# hard-coded topic index used to interpret the results.
# NOTE(review): topic labels assigned from an earlier, unseeded run will not
# match this model and need to be re-checked against terms(lda).
lda <- LDA(dtm, k = 50, method = "Gibbs",
           control = list(verbose = 1L, iter = 50, seed = 20150202L))
save(lda, file = "backup/lda-output.Rdata")
###########################################################
### INTERPRETING RESULTS OF TOPIC MODEL ###
###########################################################
# restore the fitted model saved by the previous section
load("backup/lda-output.Rdata")

# single most likely word for each topic
terms(lda)

# ten most likely words per topic, transposed so each row is one topic
trms <- t(terms(lda, k = 10))

# Topics with a clear interpretation.
# NOTE(review): the row numbers and labels below refer to one particular
# fitted model; presumably they must be revisited whenever the model is refit.
trms[4, ]   # TV appearances
trms[7, ]   # economic prosperity
trms[8, ]   # bipartisan issues
trms[9, ]   # natural resources
trms[15, ]  # civil rights
trms[17, ]  # spending cuts
trms[31, ]  # foreign policy
trms[40, ]  # community events
trms[42, ]  # "spread the word"
trms[43, ]  # small business
trms[45, ]  # military
trms[49, ]  # Health care (ACA)

# two distinct framings of the same issue
trms[35, ]  # immigration reform
trms[22, ]  # "illegal" immigration
# merging with Congress data:
# 1) reading congress data
congress <- read.csv("backup/congress.csv", stringsAsFactors = FALSE)

# 2) reading list of files; the account name is the file name without its
#    ".csv" extension (escaped and anchored so only the extension is removed)
accounts <- tolower(sub("\\.csv$", "", list.files("data/facebook")))

# 3) one row of `congress` per file, in the same order as the files.
#    The documents fed to the topic model were built from the same directory
#    listing, so row i of `congress` now describes row i of the topic matrix.
#    match() makes that alignment explicit instead of relying on two separate
#    sorts agreeing; a file with no matching member yields a row of NAs.
congress <- congress[match(accounts, tolower(congress$facebook_account)), ]

# matrix with topic distributions (one row per document, one column per topic)
mat <- lda@gamma
dim(mat)

# guard against the files on disk having changed since the model was fitted,
# which would silently attribute topics to the wrong member
stopifnot(nrow(mat) == nrow(congress))

mat[1:5, 1:5]
# each cell is the share of a page's text assigned to a topic,
# e.g. a value of 0.034 in mat[1, 1] means Member of Congress 1 is talking
# 3.4% of the time about Topic 1

# who is the Member of Congress that talks more about...
# immigration?
congress[which.max(mat[, 35]), ]  # immigration reform
congress[which.max(mat[, 22]), ]  # "illegal" immigration
# agriculture?
congress[which.max(mat[, 23]), ]
# spending cuts?
congress[which.max(mat[, 17]), ]
# looking at differences across parties
# which() drops members whose party is missing
republicans <- which(congress$party == "R")
democrats <- which(congress$party == "D")

# average topic share within each party (one value per topic).
# drop = FALSE keeps the selection a matrix even if a party has one member.
rep.props <- colMeans(lda@gamma[republicans, , drop = FALSE])
dem.props <- colMeans(lda@gamma[democrats, , drop = FALSE])

# Republican share of each topic: 0.5 means both parties use it equally,
# values above 0.5 mean Republicans use it more
ratio <- rep.props / (dem.props + rep.props)

# % of topic usage by party
ratio[35]  # immigration reform
ratio[22]  # "illegal" immigration
ratio[8]   # bipartisan
ratio[15]  # civil rights
ratio[45]  # military
ratio[31]  # foreign policy
# bar plots for a selection of topics
# `sbs` holds the topic numbers and `topic.labels` their names, position by
# position
sbs <- c(4, 8, 9, 15, 49, 7, 31, 40, 43, 45, 35, 22)
topic.labels <- c(
  "TV appearances", "Bipartisan issues", "Natural resources",
  "Civil rights", "Health care", "Economic prosperity",
  "Foreign policy", "Community events", "Small business",
  "Military", "Immigration reform", "Illegal immigration")
stopifnot(length(topic.labels) == length(sbs))
n.topics <- length(sbs)

# long format: first the Republican share of every topic, then the
# Democratic share (its complement)
df <- data.frame(
  prop = c(ratio[sbs], 1 - ratio[sbs]),
  party = rep(c("Republicans", "Democrats"), each = n.topics),
  label = rep(topic.labels, 2),
  stringsAsFactors = FALSE)

# sorting by % use by republicans (same ordering applied to both halves)
by.rep.share <- order(ratio[sbs])
df <- df[c(by.rep.share, by.rep.share + n.topics), ]
# factor levels fix the order of the bars in the plot
df$label <- factor(df$label, levels = rev(df$label[seq_len(n.topics)]))

library(ggplot2)
library(scales)

# plot code, modified from code in:
# https://solomonmessing.wordpress.com/2014/10/11/when-to-use-stacked-barcharts/
pq <- ggplot(df, aes(x = label, y = prop, fill = party)) +
  scale_y_continuous(labels = percent_format(), expand = c(0, 0)) +
  geom_bar(stat = 'identity') +
  geom_hline(yintercept = .5, linetype = 'dashed') +
  coord_flip() +
  theme_bw() +
  ylab('Democrat/Republican share of page posts') +
  xlab('') +
  # colours are tied to party names rather than to factor order
  scale_fill_manual(values = c(Democrats = 'blue', Republicans = 'red')) +
  theme(legend.position = 'none',
        panel.border = element_blank(),
        axis.ticks.y = element_blank(),
        panel.grid = element_blank()) +
  ggtitle('Political Issues Discussed by Party\n')

pq

# arguments are named in full; `file=` only worked through partial matching
# of `filename`
ggsave(filename = "img/congress-lda.png", plot = pq, width = 6, height = 4)