-
Notifications
You must be signed in to change notification settings - Fork 3
/
Wright Data.R
122 lines (91 loc) · 4.77 KB
/
Wright Data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Working with Wright Data
# Collect the full paths of every Wright corpus text file.
WRIGHT <- list.files("data/wright-txt", full.names = TRUE)
#function to read them in
# Read one Wright corpus file and return a one-row data frame with
# columns (filename, text). Lines are whitespace-trimmed, blank lines
# dropped, and the survivors joined with the literal marker "/n",
# which downstream cleanup strips back out.
readWRIGHT <- function(file) {
  message(file)
  # readLines() instead of scan(): scan() applies quote processing
  # (quote = "'\"") to character fields and can silently merge or
  # mangle lines of literary text containing apostrophes/quotes.
  lines <- trimws(readLines(file))
  # scan(..., blank.lines.skip = TRUE) dropped empty lines; keep parity.
  lines <- lines[nzchar(lines)]
  data.frame(filename = file,
             text = paste(lines, collapse = "/n"),
             stringsAsFactors = FALSE)
}
#run the function
# Read every file and stack the per-file one-row data frames into one
# table (columns: filename, text). dplyr::do() is deprecated; a plain
# lapply() + bind_rows() does the same join without the group_by()
# machinery, and list.files() already returns the files sorted.
allWRIGHT <- lapply(WRIGHT, readWRIGHT) %>% bind_rows()
# strip the literal "/n" join markers inserted by readWRIGHT
allWRIGHT$text <- gsub("/n", "", allWRIGHT$text, fixed = TRUE)
# this removes line breaks; the replacement must be a literal space --
# the original "\\s" inserted the letter "s" into the text, because \s
# is only a regex class in patterns, never in replacement strings
allWRIGHT$text <- gsub("\r?\n|\r", " ", allWRIGHT$text)
# remove everything above the ----FULL TEXT---- marker (header metadata)
allWRIGHT$text <- gsub(".*?----FULL TEXT----", " ", allWRIGHT$text)
#sample 8 in an effort to get a smaller corpus
#allWRIGHT <- allWRIGHT[sample(1:nrow(allWRIGHT), 8,replace=FALSE),]
#convert text to one string and chunk by character count. Convert to df
allWRIGHT_string <- paste(unlist(allWRIGHT$text), collapse = " ")
chunk_starts <- seq(1, nchar(allWRIGHT_string), by = 2000)
# pmin keeps the final (possibly short) chunk; the original paired
# seq(2000, ..., 2000) end points silently dropped any trailing
# remainder shorter than 2000 characters
chunk_ends <- pmin(chunk_starts + 1999, nchar(allWRIGHT_string))
allWRIGHT_string <- substring(allWRIGHT_string, chunk_starts, chunk_ends)
allWRIGHT2 <- allWRIGHT_string %>% as.data.frame()
# Use Lincoln's update of tokenizers to chunk text
# NOTE(review): this immediately overwrites the character-count chunking
# built just above -- only this tokenizers-based version survives.
# chunk_text() presumably returns one ~300-word chunk per element keyed
# by doc_id -- TODO confirm against the tokenizers fork in use.
allWRIGHT2 = chunk_text(allWRIGHT$text, chunk_size = 300, doc_id = allWRIGHT$filename) %>% as_data_frame()
# reshape the wide chunk columns into long (id, text) pairs
allWRIGHT2 = allWRIGHT2 %>% gather(id, text)
allWRIGHT2 = allWRIGHT2 %>% as.data.frame()
# label every Wright chunk as fiction (the known-genre training class)
allWRIGHT2 = allWRIGHT2 %>% mutate(genre="fiction")
# give each chunk a unique cluster id: wright_1, wright_2, ...
allWRIGHT2 = allWRIGHT2 %>% mutate(cluster=paste("wright",1:nrow(allWRIGHT2), sep = "_"))
# rename a leftover "." column to "text" if present (no-op otherwise)
names(allWRIGHT2)[names(allWRIGHT2)=="."] <- "text"
allWRIGHT2 = allWRIGHT2[,c("cluster","text","genre")]
# downsample to 200 chunks so genres are comparably sized below
allWRIGHT2 <- allWRIGHT2[sample(1:nrow(allWRIGHT2), 200,replace=FALSE),]
# gather() can leave text as factor/list depending on input; force character
allWRIGHT2$text = allWRIGHT2$text %>% as.character()
#add genre column and convert filename to cluster
# Label the full-text table as fiction and rename its key column so it
# lines up with the (cluster, text, genre) tables built below.
allWRIGHT <- allWRIGHT %>%
  mutate(genre = "fiction") %>%
  rename(cluster = filename) %>%
  as.data.frame()
#allWRIGHT <- allWRIGHT[sample(1:nrow(allWRIGHT), 8,replace=FALSE),]
#read in fiction found with "by"
# Load the byline-detected fiction, tag it, and keep only the three
# columns shared by every genre table.
allFiction <- read.csv("data/fiction-with-by.csv", header = TRUE, fill = TRUE,
                       sep = ",", row.names = NULL, stringsAsFactors = FALSE) %>%
  mutate(genre = "fiction") %>%
  select(cluster, text, genre)
#read in news
# Load classified news chunks and normalize them to (cluster, text, genre).
allNews <- read.csv("output/genres/news.csv", header = TRUE, fill = TRUE,
                    sep = ",", row.names = NULL, stringsAsFactors = FALSE)
names(allNews)[names(allNews) == "classified_genre"] <- "genre"
names(allNews)[names(allNews) == "Text"] <- "text"
allNews <- allNews[, c("cluster", "text", "genre")]
# downsample to 200 rows to match the other genre tables
allNews <- allNews[sample(seq_len(nrow(allNews)), 200, replace = FALSE), ]
# replace line breaks with a literal space; the original "\\s"
# replacement inserted the letter "s" (\s means nothing in replacements)
allNews$text <- gsub("\r?\n|\r", " ", allNews$text)
#read in vignettes
# Load the vignette candidates; their genre is what we want the
# classifier to decide, so mark them all "unknown".
allVignettes <- read.csv("data/200vignettes.csv", header = TRUE, fill = TRUE,
                         sep = ",", row.names = NULL, stringsAsFactors = FALSE)
allVignettes <- allVignettes[, c("cluster", "text", "genre")]
allVignettes <- allVignettes %>% mutate(genre = "unknown")
# (fixed the commented sample to draw from allVignettes, not allNews)
#allVignettes <- allVignettes[sample(1:nrow(allVignettes), 75,replace=FALSE),]
# replace line breaks with a literal space; the original "\\s"
# replacement inserted the letter "s" (\s means nothing in replacements)
allVignettes$text <- gsub("\r?\n|\r", " ", allVignettes$text)
#combine above
# NOTE(review): this allData is dead code -- it is overwritten two lines
# below; the script keeps both exploratory alternatives side by side.
allData = rbind(allWRIGHT2,allNews,allVignettes)
# What about vignettes classified against the general population of genres
# newGenres_slim is presumably built in another script in this project
# (not defined in this file) -- verify before running standalone.
allData <- rbind(allVignettes, newGenres_slim)
# normalize the key column: rbind of mixed sources can leave cluster
# as factor in one table and character in another
allData <- allData %>% mutate(cluster = as.character(cluster))
# LET'S COMPARE THE RESULTS WITH OTHER GENRES (ads, poetry)
#poetry
# Load classified chunks, keep only poetry, normalize the columns, and
# relabel as "unknown" so the classifier treats it as test data.
allPoetry <- read.csv("data/genreClass-4-13-16.csv", header = TRUE, fill = TRUE,
                      sep = ",", row.names = NULL, stringsAsFactors = FALSE)
allPoetry <- allPoetry %>% filter(classified_genre == "poetry")
names(allPoetry)[names(allPoetry) == "classified_genre"] <- "genre"
names(allPoetry)[names(allPoetry) == "Text"] <- "text"
allPoetry <- allPoetry[, c("cluster", "text", "genre")]
allPoetry <- allPoetry %>% mutate(genre = "unknown")
# BUG FIX: the original drew sample indices from 1:nrow(allNews) but
# used them to subset allPoetry -- if the tables differ in size this
# produces NA rows or a skewed sample. Sample allPoetry's own rows.
allPoetry <- allPoetry[sample(seq_len(nrow(allPoetry)), 200, replace = FALSE), ]
#combine above
allData <- rbind(allWRIGHT2, allNews, allPoetry)
#ads
# Load the advertisement chunks and relabel as "unknown" test data.
allAds <- read.csv("output/justAds-5-23-16.csv", header = TRUE, fill = TRUE,
                   sep = ",", row.names = NULL, stringsAsFactors = FALSE)
allAds <- allAds[, c("cluster", "text", "genre")]
allAds <- allAds %>% mutate(genre = "unknown")
# BUG FIX: the original sampled 1:nrow(allNews) but subset allAds --
# sample over allAds' own row count instead.
allAds <- allAds[sample(seq_len(nrow(allAds)), 200, replace = FALSE), ]
#combine above
allData <- rbind(allWRIGHT2, allNews, allAds)
#Trying to find other fiction. Instead of reading in vignettes, read in everything
# Start from the full corpus (beginData comes from ClusterAnalysis.R),
# keep one row per cluster, and mark everything as unclassified.
newData <- beginData
newData <- newData %>%
  mutate(cluster = as.character(cluster)) %>%
  group_by(cluster) %>%
  slice(1) %>%
  mutate(genre = "unknown")
newData <- newData[, c("cluster", "text", "genre")]
newData <- newData %>% as.data.frame()
#Sample data down to 5000
#newData <- newData[sample(1:nrow(newData), 5000,replace=FALSE),]
#combine above
allData <- rbind(allWRIGHT2, allNews, allVignettes)