PCA.Rmd

---
title: "PCA"
output: html_document
---

This document walks through performing principal component analysis (PCA) on clusters. It still needs some tweaking, because at present all of the clusters seem to group together. One of the things I need to figure out is how to eliminate stop words from the bag of words.

```{r}

# Streamlined PCA code for WORDS
# Tokenizes allData$text into words, builds a term-document table with the
# project helper allDataWordsIntoTD(), log-transforms it and runs PCA on it.
# NOTE(review): stopWords, wordCutoff and counts are assigned here but never
# referenced again in this chunk, so stop words are not removed unless
# allDataWordsIntoTD() reads these globals -- TODO confirm.
# NOTE: model and prediction are reassigned by the topic PCA further down in
# this chunk; run the two sections separately to inspect the word results.
stopWords <- stopwords("en")
wordCutoff <- 100
allDataWords <- allData %>%
  unnest_tokens(word, text)

# Token counts per (cluster, word, genre), ignoring empty tokens.
counts <- allDataWords %>%
  filter(word != "") %>%
  mutate(word = tolower(word)) %>%
  group_by(cluster, word, genre) %>%
  summarize(count = n()) %>%
  ungroup

# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
td <- allDataWordsIntoTD(allDataWords, normalized = TRUE)

# Every column except the two identifier columns is used as PCA input.
justaMatrix <- td %>% select(-cluster, -genre)
# The 0.5 offset keeps log() finite for entries that are zero.
justaMatrix <- log(justaMatrix + .5)

model <- prcomp(justaMatrix)
# Without newdata, predict() returns the component scores of the fitted rows.
prediction <- predict(model)
# Re-attach the row labels and keep only the first three components.
prediction <- prediction %>%
  as.data.frame %>%
  mutate(
    cluster = as.character(td$cluster),
    genre = as.character(td$genre)
  ) %>%
  select(cluster, genre, PC1, PC2, PC3)

# PCA code for TOPICS (this assumes topic modelling has been completed in
# Topic Modelling Generic.r; presumably that script creates doc.topics and
# input -- TODO confirm)


# Create a data frame with all topics for clusters that have known genres
topicsDF <- doc.topics %>%
  as.data.frame() %>%
  mutate(cluster = input$Cluster, primary_genre = input$Genre) %>%
  filter(primary_genre != "unknown")


#topicsDF <- topicsDF[sample(nrow(topicsDF), 1000),]


# Keep only the topic columns as PCA input; the 0.5 offset keeps log() finite
# for entries that are zero.
modeling_matrix <- topicsDF %>% select(-primary_genre, -cluster)
modeling_matrix <- log(modeling_matrix + .5)

# PCA
model <- prcomp(modeling_matrix)
# Without newdata, predict() returns the component scores of the fitted rows.
prediction <- predict(model)
# Re-attach the row labels and keep only the first three components.
prediction <- prediction %>%
  as.data.frame %>%
  mutate(
    cluster = as.character(topicsDF$cluster),
    genre = as.character(topicsDF$primary_genre)
  ) %>%
  select(cluster, genre, PC1, PC2, PC3)


#vtData$cluster <- vtData$cluster %>% as.character()
# Attach the columns of newGenres (including text) by cluster. The "genre.x"
# selected below is the genre column that came from prediction; it only gets
# that suffix because newGenres has a genre column as well.
prediction <- prediction %>% inner_join(newGenres, by = "cluster")
prediction <- prediction[, c("cluster", "text", "genre.x", "PC1", "PC2", "PC3")]


# Keep only the first 300 characters of each text.
prediction$text <- substr(prediction$text, 1, 300)

class(prediction$text)

# The output path is passed positionally: readr 1.4.0 deprecated the `path`
# argument name in favour of `file`, and the positional form works with both.
write_csv(prediction, "VT-Data/data/pca4plotly-6-23-17.csv")

# The CSV keeps the "genre.x" header written above. In memory the column is
# renamed back to "genre", because the plotting code later in this document
# refers to prediction$genre and would otherwise fail.
names(prediction)[names(prediction) == "genre.x"] <- "genre"
```

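Inspect the principal components: the rows below show the largest positive and the largest negative weights on the first two components.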

```{r}

# Show the weights of the fitted PCA: the rotation matrix as a data frame,
# with its row names copied into a regular `word` column.
loadings <- model$rotation %>%
  as.data.frame %>%
  mutate(word = rownames(model$rotation))

# The first principal component: largest weights first, then smallest first.
loadings %>%
  select(word, PC1) %>%
  arrange(desc(PC1)) %>%
  head
loadings %>%
  select(word, PC1) %>%
  arrange(PC1) %>%
  head
# The second "Principal Component"
loadings %>%
  select(word, PC2) %>%
  arrange(desc(PC2)) %>%
  head
loadings %>%
  select(word, PC2) %>%
  arrange(PC2) %>%
  head

```


```{r}
# Visualize the PCA scores held in `prediction` using a variety of methods
# (Need to get V1 Cluster number into the mix)

# PC1 vs PC2 for rows with a known genre. The fixed point size is set outside
# aes(): inside aes() the constant 5 is treated as a mapped variable, which
# adds a meaningless legend and rescales the size.
prediction %>%
  filter(genre != "unknown") %>%
  ggplot() +
  geom_point(aes(x = PC1, y = PC2, color = genre), size = 5)

# Only genres that occur on more than one row.
repeated_genres <- prediction %>%
  group_by(genre) %>%
  filter(n() > 1)
ggplot(repeated_genres) +
  geom_point(aes(x = PC1, y = PC2, color = genre, shape = genre))
ggplot(repeated_genres) +
  geom_point(aes(x = PC2, y = PC3, color = genre, shape = genre))


# Cluster labels instead of points. geom_text() has no shape aesthetic (it was
# ignored with a warning), so genre is encoded by color only.
ggplot(prediction) +
  geom_text(aes(x = PC1, y = PC2, label = cluster, color = genre))
ggplot(prediction) +
  geom_text(aes(x = PC2, y = PC3, label = cluster, color = genre))


ggplot(prediction) + geom_point(aes(x = PC2, y = PC3, color = genre))

# NOTE(review): `url` and `df` are not defined in the visible part of this
# document; presumably `url` is a vector aligned with the rows of prediction
# and `df` carries url, username and Date columns -- TODO confirm.
# inner_join() has no `by`, so it joins on all shared column names.
prediction_users <- prediction %>%
  mutate(url = url) %>%
  inner_join(df)

ggplot(prediction_users) +
  geom_point(aes(x = PC2, y = PC3, shape = username, color = username), size = 5)

# Only usernames that occur on more than five rows.
frequent_users <- prediction_users %>%
  group_by(username) %>%
  filter(n() > 5)

ggplot(frequent_users) +
  geom_text(
    aes(y = PC1, x = as.Date(Date), color = username, label = username),
    size = 5
  )

ggplot(frequent_users) +
  geom_text(
    aes(y = PC1, x = PC2, color = username, label = username),
    size = 5
  )

# Print df header
head(prediction)

# Select a row to print
df[10, ]


```

This requires more work, but here is the alternative to plain ggplot that Abby suggested: `ggbiplot`. It seems promising, but I have not yet been able to get it to work.

```{r}

# Visualize with ggbiplot (???)
# NOTE(review): assumes halfCluster is a data frame whose V1 column holds the
# grouping label and whose remaining columns are numeric -- TODO confirm.
# NOTE(review): data() only loads datasets shipped with packages; if
# halfCluster is an object already in the workspace this call just warns and
# can be dropped -- TODO confirm.
data(halfCluster)

# Take the group labels from the V1 column. The previous `halfCluster.V1` is
# not defined in the visible part of this document.
cluster_groups <- halfCluster$V1
# Keep the label column out of the PCA input so it is not treated as a feature.
cluster_features <- halfCluster[, setdiff(names(halfCluster), "V1"), drop = FALSE]

clusters.pca <- prcomp(cluster_features, scale. = TRUE)
ggbiplot(clusters.pca, obs.scale = 1, var.scale = 1,
         groups = cluster_groups, ellipse = TRUE, circle = TRUE) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')


```