-
Notifications
You must be signed in to change notification settings - Fork 3
/
PCA.Rmd
129 lines (82 loc) · 4.27 KB
/
PCA.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
---
title: "PCA"
output: html_document
---
This document walks through the process for performing PCA on clusters. It still requires some tweaking, as all clusters currently seem to group together. One of the things I still need to figure out is how to eliminate stop words from the bag of words.
```{r}
# Streamlined PCA code for WORDS
# NOTE(review): relies on objects defined outside this chunk -- `allData`,
# allDataWordsIntoTD(), the dplyr verbs, unnest_tokens() and stopwords().
# Confirm the packages providing them are attached before knitting.

# English stop-word list and a frequency cutoff. Neither value is applied
# anywhere in this chunk yet (stop-word removal is still an open TODO).
stopWords <- stopwords("en")
wordCutoff <- 100

# One row per token taken from the `text` column.
allDataWords <- allData %>%
  unnest_tokens(word, text)

# Token counts per (cluster, word, genre). Not consumed further down in this
# chunk; kept so it can be inspected interactively.
counts <- allDataWords %>%
  filter(word != "") %>%
  mutate(word = tolower(word)) %>%
  group_by(cluster, word, genre) %>%
  summarize(count = n()) %>%
  ungroup()

# Build the table that feeds the PCA. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
td <- allDataWordsIntoTD(allDataWords, normalized = TRUE)

# Drop the identifier columns so that only the numeric columns remain, then
# log-transform; the +.5 offset keeps zero entries finite.
justaMatrix <- td %>% select(-cluster, -genre)
justaMatrix <- log(justaMatrix + .5)

# Fit the PCA and keep the first three component scores next to the
# cluster/genre labels taken from `td`.
model <- prcomp(justaMatrix)
prediction <- predict(model)
prediction <- prediction %>%
  as.data.frame() %>%
  mutate(
    cluster = td$cluster %>% as.character(),
    genre = td$genre %>% as.character()
  ) %>%
  select(cluster, genre, PC1, PC2, PC3)
# PCA code for TOPICS (this assumes topic modelling has been completed in
# Topic Modelling Generic.r)
# NOTE(review): `doc.topics`, `input` and `newGenres` are defined outside this
# chunk -- confirm they are in scope before knitting.

# Create a data frame with all topics for clusters that have known genres
topicsDF <- doc.topics %>%
  as.data.frame() %>%
  mutate(cluster = input$Cluster, primary_genre = input$Genre) %>%
  filter(primary_genre != "unknown")
# topicsDF <- topicsDF[sample(nrow(topicsDF), 1000), ]

# Keep only the topic columns and log-transform them; the +.5 offset keeps
# zero entries finite.
modeling_matrix <- topicsDF %>% select(-primary_genre, -cluster)
modeling_matrix <- log(modeling_matrix + .5)

# PCA: fit, then keep the first three component scores next to the labels.
model <- prcomp(modeling_matrix)
prediction <- predict(model)
prediction <- prediction %>%
  as.data.frame() %>%
  mutate(
    cluster = topicsDF$cluster %>% as.character(),
    genre = topicsDF$primary_genre %>% as.character()
  ) %>%
  select(cluster, genre, PC1, PC2, PC3)

# vtData$cluster <- vtData$cluster %>% as.character()
# Attach the columns of `newGenres` (including `text`) by cluster id.
prediction <- prediction %>% inner_join(newGenres, by = "cluster")

# NOTE(review): "genre.x" only exists when `newGenres` also carries a `genre`
# column (the join then suffixes both). After this line `prediction` has no
# plain `genre` column, which the plotting chunk further down refers to --
# confirm which name downstream consumers expect.
prediction <- prediction[, c("cluster", "text", "genre.x", "PC1", "PC2", "PC3")]

# Truncate the text to its first 300 characters (R strings are 1-indexed).
prediction$text <- substr(prediction$text, 1, 300)
class(prediction$text)

# The output file is passed positionally: readr deprecated the `path =`
# argument name in favour of `file =`, and the positional form works with both.
write_csv(prediction, "VT-Data/data/pca4plotly-6-23-17.csv")
```
Inspect the principal components: the variables with the largest positive and negative weights on PC1 and PC2.
```{r}
# Show the weights (loadings) stored in the most recently fitted `model`.
# The rotation matrix is converted to a data frame once, with its row names
# copied into a `word` column, and then reused for every listing below.
rotation_df <- model$rotation %>%
  as.data.frame() %>%
  mutate(word = rownames(model$rotation))

# First principal component: highest weights, then lowest weights.
rotation_df %>%
  select(word, PC1) %>%
  arrange(desc(PC1)) %>%
  head()
rotation_df %>%
  select(word, PC1) %>%
  arrange(PC1) %>%
  head()

# Second principal component: highest weights, then lowest weights.
rotation_df %>%
  select(word, PC2) %>%
  arrange(desc(PC2)) %>%
  head()
rotation_df %>%
  select(word, PC2) %>%
  arrange(PC2) %>%
  head()
```
```{r}
# Visualize using a variety of methods
# Visualize (need to get V1 cluster number into the mix)
# NOTE(review): every plot reads a `genre` column from `prediction`, and the
# last three also need `url`, `df`, `username` and `Date`, none of which are
# defined in this chunk -- confirm they exist before knitting.
#
# Constant sizes are passed outside aes(): inside aes() a constant is treated
# as a mapped variable, which adds a meaningless legend and does not set the
# requested size. geom_text() has no shape aesthetic, so `shape` is only
# mapped on point layers.

# PC1 vs PC2 for rows with a known genre.
prediction %>%
  filter(genre != "unknown") %>%
  ggplot() +
  geom_point(aes(x = PC1, y = PC2, color = genre), size = 5)

# Only genres that occur more than once.
ggplot(prediction %>% group_by(genre) %>% filter(n() > 1)) +
  geom_point(aes(x = PC1, y = PC2, color = genre, shape = genre))
ggplot(prediction %>% group_by(genre) %>% filter(n() > 1)) +
  geom_point(aes(x = PC2, y = PC3, color = genre, shape = genre))

# Cluster ids as labels, coloured by genre.
ggplot(prediction) +
  geom_text(aes(x = PC1, y = PC2, label = cluster, color = genre))
ggplot(prediction) +
  geom_text(aes(x = PC2, y = PC3, label = cluster, color = genre))

ggplot(prediction) +
  geom_point(aes(x = PC2, y = PC3, color = genre))

# Per-user views. NOTE(review): inner_join(df) has no explicit `by`, so it
# joins on every shared column name -- presumably `url`; verify.
ggplot(prediction %>% mutate(url = url) %>% inner_join(df)) +
  geom_point(aes(x = PC2, y = PC3, shape = username, color = username),
             size = 5)

# Users with more than five rows: PC1 over time, then PC1 against PC2.
ggplot(
  prediction %>%
    mutate(url = url) %>%
    inner_join(df) %>%
    group_by(username) %>%
    filter(n() > 5)
) +
  geom_text(aes(y = PC1, x = as.Date(Date), color = username,
                label = username), size = 5)
ggplot(
  prediction %>%
    mutate(url = url) %>%
    inner_join(df) %>%
    group_by(username) %>%
    filter(n() > 5)
) +
  geom_text(aes(y = PC1, x = PC2, color = username, label = username),
            size = 5)

# Print df header
head(prediction)

# Select a row to print
df[10, ]
```
This requires more work, but here is the ggbiplot-based alternative to the plain ggplot scatterplots that Abby suggested. It seems promising, but I have not yet been able to get it to work.
```{r}
# Visualize with ggbiplot (???)
# NOTE(review): assumes `halfCluster` is a data frame whose `V1` column holds
# the grouping label and whose remaining numeric columns are the features --
# TODO confirm against wherever `halfCluster` is built.

# data() only loads datasets shipped with packages, so it is attempted only
# when no `halfCluster` object is already in scope.
if (!exists("halfCluster")) {
  data(halfCluster)
}

# Run the PCA on the numeric feature columns only; the grouping column is
# left out so it does not influence the components.
is_feature <- vapply(halfCluster, is.numeric, logical(1))
feature_cols <- setdiff(names(halfCluster)[is_feature], "V1")
clusters.pca <- prcomp(halfCluster[, feature_cols, drop = FALSE], scale. = TRUE)

# `halfCluster.V1` is not a column reference in R; the column is reached with
# `$`. It is converted to a factor because the colour scale below is discrete.
ggbiplot(clusters.pca, obs.scale = 1, var.scale = 1,
         groups = as.factor(halfCluster$V1), ellipse = TRUE, circle = TRUE) +
  scale_color_discrete(name = "") +
  theme(legend.direction = "horizontal", legend.position = "top")
```