-
Notifications
You must be signed in to change notification settings - Fork 3
/
bigClusterAnalysis.R
121 lines (87 loc) · 4.17 KB
/
bigClusterAnalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Massive combined files. The second export is the one that supplies the
# `date` column joined onto the classified clusters further down.
allParts <- read_csv("~/Documents/Northeastern/Viral Texts Project/Viral-Texts-R/data/combined_short.csv")
allPartsDates <- read_csv("~/Documents/Northeastern/Viral Texts Project/Viral-Texts-R/data/combined_short2.csv")

# Select the longest witness of each cluster.
# Rows are ordered by descending character count, so the first row taken from
# each cluster is its longest text. The helper-added `count` column is kept.
# Grouping is dropped at the end so later steps do not silently run per
# cluster.
longestWitness <- function(witnesses) {
  witnesses %>%
    group_by(cluster) %>%
    mutate(count = nchar(text)) %>%
    arrange(-count) %>%
    slice(1) %>%
    ungroup()
}
allParts <- longestWitness(allParts)
allPartsDates <- longestWitness(allPartsDates)

# None of these clusters has a genre label yet.
allParts <- allParts[, c("cluster", "text")] %>% mutate(genre = "unknown")
allParts$cluster <- as.character(allParts$cluster)

# Work on a random sample of at most 5000 clusters. Capping the size at
# nrow(allParts) keeps the sampling from failing on smaller inputs.
# NOTE(review): no seed is set, so the sample differs between runs.
allParts_short <- allParts[sample.int(nrow(allParts), min(5000, nrow(allParts))), ] %>%
  as_tibble()
# Read in the classified clusters from the big data run, keeping only the
# cluster id, its text, the predicted genre and the classifier probability.
bigClustersClassRaw <-
  read_csv("~/Documents/Northeastern/Viral Texts Project/Viral-Texts-R/output/genreClass-2-19-17.csv") %>%
  select(cluster, text, classified_genre, probability)

# Merge in classified clusters from PCA by stacking them under the sample.
# NOTE(review): `newGenres_slim` is not created anywhere in this script; it
# must already exist in the session, with the same columns as
# `allParts_short`, for rbind() to succeed.
allDataBig <- rbind(allParts_short, newGenres_slim)
# Merge dates from allPartsDates into the classified clusters and visualize.
# Only clusters present in both tables survive the inner join. Both tables
# carry a `text` column, so the classified table's copy (`text.x`) is the one
# kept, and `classified_genre` is exposed as `genre` from here on.
bigClustersClass <- inner_join(bigClustersClassRaw, allPartsDates, by = "cluster") %>%
  select(cluster, text = text.x, genre = classified_genre, date)

# Genre counts per year as a line chart.
# The year is pulled out of `date` with a regex: the leading `.*` is greedy,
# so the last run of four digits in the string is what gets captured.
# mutate() (not summarise()) is used, so every row keeps its group's count.
gTime <- bigClustersClass %>%
  mutate(year = as.numeric(gsub(".*(\\d{4}).*", "\\1", date))) %>%
  group_by(year, genre) %>%
  mutate(count = n()) %>%
  ggplot() +
  geom_line(aes(x = year, y = count, color = genre))

# Interactive version, then upload to the plotly account.
# NOTE(review): plotly_POST() is deprecated in recent plotly releases in
# favour of api_create() - confirm against the installed version.
ggplotly(gTime)
chart_link <- plotly_POST(gTime, filename = "vt-genres-over-time")
chart_link
# Smoothed (loess) view of genre counts per year.
# `bigClustersClass` no longer has a `classified_genre` column at this point:
# it was renamed to `genre` when the dates were merged in, so grouping and
# colouring must use `genre` (the old name fails with "column not found").
# This overwrites the line-chart `gTime` built earlier; the plot object is
# only stored, not printed.
gTime <- bigClustersClass %>%
  mutate(year = as.numeric(gsub(".*(\\d{4}).*", "\\1", date))) %>%
  group_by(year, genre) %>%
  mutate(count = n()) %>%
  ggplot() +
  geom_smooth(mapping = aes(x = year, y = count, color = genre), method = "loess")
# Read in pre-classified clusters.
allDataGenres <- read_csv("output/genreClass-10-30-16.csv")

# Normalise the column names to `genre` and `text`.
# This should be Primary.Genre in most cases.
names(allDataGenres)[names(allDataGenres) == "classified_genre"] <- "genre"
names(allDataGenres)[names(allDataGenres) == "Text"] <- "text"
allDataGenres <- allDataGenres[, c("cluster", "text", "genre")]
allDataGenres$cluster <- as.character(allDataGenres$cluster)

# Balance the genres: draw up to 750 random rows from each one.
# The size is capped at the number of rows available, because sampling
# without replacement errors out when a genre has fewer than 750 rows; such
# genres simply contribute all of their rows.
allDataGenres <- ddply(allDataGenres, "genre", function(genreRows) {
  genreRows[sample.int(nrow(genreRows), min(750, nrow(genreRows))), ]
})
# Merge hand-tagged clusters into allData.
# Both tables have `cluster`, `text` and `genre` columns. The join key has to
# be given explicitly: without `by`, full_join() matches on all three shared
# columns, creates no `.x`/`.y` suffixed columns, and the column selection
# below fails.
# The genre always comes from the tagged table (`genre.y`). The text comes
# from the sample and falls back to the tagged table's text for clusters that
# are not in the sample, so those rows keep their real text.
allData <- allParts_short %>%
  full_join(allDataGenres, by = "cluster") %>%
  mutate(text = coalesce(text.x, text.y), genre = genre.y) %>%
  select(cluster, text, genre)
#allData <- allData[sample(1:nrow(allData), 2000,replace=FALSE),]

# Clusters without a tagged genre (and any remaining missing value) are
# labelled "unknown".
allData <- replace(allData, is.na(allData), "unknown")
# Split the classification results into one table per classified genre.
# NOTE(review): `genreClass` is not created anywhere in this script; it must
# already exist in the session and carry a `classified_genre` column.
poetry <- filter(genreClass, classified_genre == "poetry")
prose <- filter(genreClass, classified_genre == "prose")
ads <- filter(genreClass, classified_genre == "advertisement")
news <- filter(genreClass, classified_genre == "news")
# Create SQL backend (marked by the author as not working).
# Opens (or creates) the SQLite file `vt.sqlite` in the working directory and
# lists the tables it currently holds.
db <- dbConnect(SQLite(), dbname = "vt.sqlite")
dbListTables(db)
# overwrite = TRUE lets the script be re-run: without it dbWriteTable() fails
# as soon as a `poetry` table already exists in the file.
dbWriteTable(db, "poetry", poetry, overwrite = TRUE)
# Nothing below uses the connection, so release it instead of leaking it.
dbDisconnect(db)
# For David: get the dates of the first hand-tagged genres.
# NOTE(review): `beginData` is not created anywhere in this script; it must
# already exist in the session and share a `cluster` column with the
# hand-tagged file.
handTagged <- read_csv("~/Documents/Northeastern/Viral Texts Project/Viral-Texts-R/data/newdata-5-11-handtagged-genres.csv")
handTaggedJoin <- inner_join(handTagged, beginData, by = "cluster")
write.csv(handTaggedJoin, file = "output/200handTaggedwDates-3-29-17.csv")

# Line chart of hand-tagged genre counts per year. The genre is taken from
# the `Top Level` column; the year is the four-digit run captured from `date`.
# mutate() (not summarise()) is used, so every row keeps its group's count.
handTaggedJoin %>%
  mutate(
    year = as.numeric(gsub(".*(\\d{4}).*", "\\1", date)),
    genre = `Top Level`
  ) %>%
  group_by(year, genre) %>%
  mutate(count = n()) %>%
  ggplot() +
  geom_line(aes(x = year, y = count, color = genre))
# Work with new genres from PCA.
newGenres <- read_csv("output/newGenres-6-23-17.csv")
# Drop the unnamed row-index column left over from the CSV export. Older readr
# names it `X1`, readr >= 2.0 names it `...1`; removing only `X1` would leave
# an extra column behind and make the rbind() below fail.
newGenres <- newGenres[, !(names(newGenres) %in% c("X1", "...1"))]
newGenres$cluster <- as.character(newGenres$cluster)
# Stack the PCA genres on top of the unlabelled sample. This replaces any
# `allData` built earlier in the script.
allData <- rbind(newGenres, allParts_short)
# Go topic model from here