.Rhistory

# Interactive history fragment: rebuild the filtered training/testing sets,
# reload the raw CSVs and predict on the testing set.
# colRm_train, training_filter_na, testing_filter_na and rfFit are not assigned
# in this fragment -- presumably left over from earlier in the session.
colRm_test <- c("user_name","raw_timestamp_part_1","raw_timestamp_part_2","cvtd_timestamp","num_window","problem_id")
# Drop the listed bookkeeping columns from each filtered set.
newtraining <- training_filter_na[,!(names(training_filter_na) %in% colRm_train)]
newtesting <- testing_filter_na[,!(names(testing_filter_na) %in% colRm_test)]
dim(newtraining)
dim(newtesting)
str(newtesting)
str(testing)
# Re-read both CSVs; "", " " and "NA" are all treated as missing values.
training <- read.csv("./data/pml-training.csv",header=T,sep=",",na.strings=c(""," ","NA"))
testing <- read.csv("./data/pml-testing.csv",header=T,sep=",",na.strings=c(""," ","NA"))
colnames(testing)
# Source URLs and local destinations of the two data files.
Url1 <-"https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
Url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
trainFile <- "./data/pml-training.csv"
testFile  <- "./data/pml-testing.csv"
# Create ./data and download each file only when it is not already present.
if (!file.exists("./data")) {
dir.create("./data")
}
if (!file.exists(trainFile)) {
download.file(Url1, destfile=trainFile)
}
if (!file.exists(testFile)) {
download.file(Url2, destfile=testFile)
}
# Load the data
training <- read.csv("./data/pml-training.csv",header=T,sep=",",na.strings=c(""," ","NA"))
testing <- read.csv("./data/pml-testing.csv",header=T,sep=",",na.strings=c(""," ","NA"))
# Inspect the classe column of each set.
training$classe
testing$classe
dim(testing)
# Predict on the filtered testing set with the fitted model rfFit.
testing_pred <- predict(rfFit, newdata=newtesting)
# NOTE(review): testing holds the data frame read above, so it cannot be called
# as a function -- this looks like a mistyped attempt at the table() calls below.
testing(table)
table(testing)
# Frequency table of the predicted values.
table(testing_pred)
---
title: "Predicting the Quality of Weight Lifting Exercise Using Random Forest"
author: "Nilrey Jim Cornites"
date: "November 19, 2017"
output:
  html_document: default
  pdf_document: default
---
```{r setup, include=FALSE, cache=TRUE}
# Global chunk option: echo the source code of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
# Evaluate every chunk relative to the project directory. A setwd() call made
# inside a chunk is reverted by knitr once that chunk finishes, so the root
# directory is set through the package option instead.
knitr::opts_knit$set(root.dir = "D:/Stat and Data Science References/datasciencecoursera/08_Practical Machine Learning/practicalMLfinalproject")
```
## Executive Summary
This is the final project for the Practical Machine Learning class on coursera.com, one of the 9 courses in the Data Science Specialization.
People commonly quantify how much of a particular activity they do, but rarely how well they do it. In this project, the goal is to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants to predict how well they perform a weight lifting exercise.
Using the **random forest** technique, the model predicted the quality of the weight lifting with `96%` accuracy, i.e. an estimated out-of-sample error of only `4%`. The most important predictors are _________________ .
## Introduction
Using devices such as Jawbone Up, Nike FuelBand, and Fitbit it is now possible to collect a large amount of data about personal activity relatively inexpensively. These type of devices are part of the quantified self movement - a group of enthusiasts who take measurements about themselves regularly to improve their health, to find patterns in their behavior, or because they are tech geeks. One thing that people regularly do is quantify how much of a particular activity they do, but they rarely quantify how well they do it. See http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har for more details about the data.
## The Data
This project utilized the **Weight Lifting Exercises Dataset** from http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har. The goal is to use data from accelerometers on the belt, forearm, arm, and dumbell of 6 participants. They were asked to perform barbell lifts correctly and incorrectly in 5 different ways.
The 6 young, healthy participants were asked to perform one set of 10 repetitions of the Unilateral Dumbbell Biceps Curl in five different fashions: exactly according to the specification (Class A), throwing the elbows to the front (Class B), lifting the dumbbell only halfway (Class C), lowering the dumbbell only halfway (Class D) and throwing the hips to the front (Class E).
The training data for this project are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv
The test data are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv
## Loading Data and Some Exploratory Analyses
The following code loads the caret package, which is used to build the models, and ggplot2 for visualization, and then downloads the training and test sets.
``` {r}
# Packages: caret for model building, ggplot2 for plotting.
library(caret)
library(ggplot2)

# Remote locations of the two CSV files and their local destinations.
Url1 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
Url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
trainFile <- "./data/pml-training.csv"
testFile <- "./data/pml-testing.csv"

# Create the data directory and download each file only when it is missing,
# so re-running the chunk does not fetch the files again.
if (!dir.exists("./data")) {
  dir.create("./data")
}
if (!file.exists(trainFile)) {
  download.file(Url1, destfile = trainFile)
}
if (!file.exists(testFile)) {
  download.file(Url2, destfile = testFile)
}

# Load the data from the paths defined above; empty strings, single blanks and
# the literal "NA" are all read as missing values.
training <- read.csv(trainFile, header = TRUE, sep = ",",
                     na.strings = c("", " ", "NA"))
testing <- read.csv(testFile, header = TRUE, sep = ",",
                    na.strings = c("", " ", "NA"))
```
### Exploratory Analyses
``` {r}
# Amelia is loaded here; its missmap() is used in a later chunk.
library(Amelia)
# Dimensions of both sets. Succeeding exploration uses only the training set,
# to avoid bias when tuning the selected model.
dim(training)
dim(testing)
```
We see that there are 19,622 rows and 160 columns in the training set, while the testing set has only 20 rows with the same number of columns. The “classe” variable in the training set is the outcome to predict (the testing set has a `problem_id` column in its place).
``` {r echo=FALSE}
# Print the structure of the training set (one line per column).
str(training)
```
We can see that, of the 160 columns, many variables have a lot of missing values (`NA`). We might also want to check whether any variables (except the outcome variable) have near-zero variance.
### Data Preprocessing
We will identify the variables that have near-zero variance and remove those predictors. We will also remove the predictors with missing values. If the prediction model performs poorly, we will put the removed variables back with imputed values.
```{r}
# Near-zero-variance diagnostics for every column of the training set
# (saveMetrics = TRUE returns the per-column metrics table, not just indices).
nzvariance <- nearZeroVar(training, saveMetrics = TRUE)
# Rows of the metrics table flagged as near-zero variance: the columns to drop.
toremove <- nzvariance[nzvariance$nzv, ]
print(toremove)
nrow(toremove) ## number of removed predictors
# Drop the flagged columns. The mask is computed on the training set only and
# applied by position to both sets -- assumes both have the same column layout.
training <- training[, !nzvariance$nzv]
testing <- testing[, !nzvariance$nzv]
```
There are now `r 159 - nrow(toremove)` remaining predictor variables. The graph below shows the remaining predictors with missing values, which will be removed.
```{r}
library(Amelia)
# Map of missing values across the training set.
missmap(training)

# Keep only the columns without any missing value in the training set; the
# same column mask is applied to the testing set.
nomissing <- colSums(is.na(training)) == 0
training_filter_na <- training[, nomissing]
testing_filter_na <- testing[, nomissing]

# Remove bookkeeping columns that are not sensor measurements. "X" (the first
# column, presumably the row index of the CSV export) is dropped as well so the
# models cannot key on row order.
colRm_train <- c("X", "user_name", "raw_timestamp_part_1",
                 "raw_timestamp_part_2", "cvtd_timestamp", "num_window")
# The testing set additionally drops its problem_id column.
colRm_test <- c(colRm_train, "problem_id")
newtraining <- training_filter_na[, !(names(training_filter_na) %in% colRm_train)]
newtesting <- testing_filter_na[, !(names(testing_filter_na) %in% colRm_test)]
dim(newtraining)
dim(newtesting)
```
The code below splits the training set into two subsets: a training set and a validation set.
``` {r}
# Fixed seed so the split is reproducible.
set.seed(526)
# Row indices for the 70% training portion, computed on the same data frame
# (newtraining) that is subset below.
inTrain <- createDataPartition(y = newtraining$classe, p = 0.7, list = FALSE)
training_clean <- newtraining[inTrain, ]
# The remaining rows form the validation set.
validation_clean <- newtraining[-inTrain, ]
```
## Model Building
Since the outcome variable is categorical, we will consider Linear Discriminant Analysis (LDA) and random forest (RF) and see whether the latter model is more accurate.
Caveat: LDA has strict assumptions, similar to those of the linear regression model, e.g. no outliers and normality of the data. One could use logistic regression (or multinomial logistic regression in this case, since the outcome variable has more than 2 categories), but we will use LDA just to have a comparison with RF, as in most cases RF tends to perform better than linear modeling approaches.
``` {r}
set.seed(123)
# Fit the LDA model with 4-fold cross-validation. The random-forest-specific
# `importance` argument is not passed here because LDA does not define it.
ldaFit <- train(classe ~ ., data = training_clean, method = "lda",
                trControl = trainControl(method = "cv", number = 4))
# Predicted classes for the held-out validation set.
lda_validation_pred <- predict(ldaFit, newdata = validation_clean)
```
The code below fits a model based on the random forest algorithm.
``` {r}
set.seed(123)
# Fit the random forest model with 4-fold cross-validation; importance = TRUE
# is passed through train() so that variable importance can be inspected later.
rfFit <- train(classe ~ ., data = training_clean, method = "rf",
               importance = TRUE,
               trControl = trainControl(method = "cv", number = 4))
# Console replay of the report's chunks, from setup through prediction on the testing set.
knitr::opts_chunk$set(echo = TRUE)
setwd("D:/Stat and Data Science References/datasciencecoursera/08_Practical Machine Learning/practicalMLfinalproject")
library(caret)
library(ggplot2)
# Download the data (only when the directory/files are not already present)
Url1 <-"https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
Url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
trainFile <- "./data/pml-training.csv"
testFile  <- "./data/pml-testing.csv"
if (!file.exists("./data")) {
dir.create("./data")
}
if (!file.exists(trainFile)) {
download.file(Url1, destfile=trainFile)
}
if (!file.exists(testFile)) {
download.file(Url2, destfile=testFile)
}
# Load the data; "", " " and "NA" are read as missing values
training <- read.csv("./data/pml-training.csv",header=T,sep=",",na.strings=c(""," ","NA"))
testing <- read.csv("./data/pml-testing.csv",header=T,sep=",",na.strings=c(""," ","NA"))
library(Amelia)
dim(training) ; dim(testing) ## succeeding exploration will use only training set to avoid bias on tuning the selected model
str(training)
# Near-zero-variance metrics on the training set; flagged columns are dropped from both sets
nzvariance<-nearZeroVar(training, saveMetrics = T)
toremove<-nzvariance[nzvariance[,"nzv"],]
print(toremove)
nrow(toremove) ## number of removed predictors
training <- training[,!nzvariance$nzv]
testing <- testing[,!nzvariance$nzv]
library(Amelia)
# Plot the map of missing values in the training set
missmap(training)
# Remove variables with missing values (mask computed on the training set, applied to both)
nomissing<-colSums(is.na(training)) == 0
training_filter_na <- training[,nomissing]
testing_filter_na <- testing[,nomissing]
# Remove unnecessary columns
colRm_train <- c("user_name","raw_timestamp_part_1","raw_timestamp_part_2","cvtd_timestamp","num_window")
colRm_test <- c("user_name","raw_timestamp_part_1","raw_timestamp_part_2","cvtd_timestamp","num_window","problem_id")
newtraining <- training_filter_na[,!(names(training_filter_na) %in% colRm_train)]
newtesting <- testing_filter_na[,!(names(testing_filter_na) %in% colRm_test)]
dim(newtraining)
dim(newtesting)
# 70/30 split of newtraining into training and validation rows
set.seed(526)
# NOTE(review): the partition is computed from training$classe but applied to
# newtraining; later entries in this history use newtraining$classe instead.
inTrain <- createDataPartition(y=training$classe, p=0.7, list=FALSE)
training_clean <- newtraining[inTrain,]
validation_clean <- newtraining[-inTrain,]
set.seed(123)
# Fit LDA model (4-fold cross-validation) and predict on the validation set
ldaFit <- train(classe ~ ., method = "lda", data = training_clean, importance = T, trControl = trainControl(method = "cv", number = 4))
lda_validation_pred <- predict(ldaFit, newdata=validation_clean)
set.seed(123)
# Fit rf model (4-fold cross-validation) and predict on the validation set
rfFit <- train(classe ~ ., method = "rf", data = training_clean, importance = T, trControl = trainControl(method = "cv", number = 4))
rf_validation_pred <- predict(rfFit, newdata=validation_clean)
# Check model performance: confusion matrices of predictions vs. validation labels
confusionMatrix(lda_validation_pred,validation_clean$classe)
confusionMatrix(rf_validation_pred,validation_clean$classe)
# Variable importance of the random forest fit, as a table and as a plot
imp <- varImp(rfFit)$importance
varImpPlot(rfFit$finalModel, sort = TRUE, type = 1, pch = 19, col = 1, cex = 1, main = "Importance of the Predictors")
# Predict the testing set with the random forest model and print the predictions
testing_pred <- predict(rfFit, newdata=newtesting)
testing_pred
dim(validation_clean)
```{r setup, include=FALSE, cache=TRUE}
# Global chunk option: echo the source code of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
# Evaluate every chunk relative to the project directory. A setwd() call made
# inside a chunk is reverted by knitr once that chunk finishes, so the root
# directory is set through the package option instead.
knitr::opts_knit$set(root.dir = "D:/Stat and Data Science References/datasciencecoursera/08_Practical Machine Learning/practicalMLfinalproject")
```
## Executive Summary
This is the final project for the Practical Machine Learning class on coursera.com, one of the 9 courses in the Data Science Specialization.
People commonly quantify how much of a particular activity they do, but rarely how well they do it. In this project, the goal is to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants to predict how well they perform a weight lifting exercise.
Using the **random forest** technique, the model predicted the quality of the weight lifting with `96%` accuracy, i.e. an estimated out-of-sample error of only `4%`. The most important predictors are _________________ .
## Introduction
Using devices such as Jawbone Up, Nike FuelBand, and Fitbit it is now possible to collect a large amount of data about personal activity relatively inexpensively. These type of devices are part of the quantified self movement - a group of enthusiasts who take measurements about themselves regularly to improve their health, to find patterns in their behavior, or because they are tech geeks. One thing that people regularly do is quantify how much of a particular activity they do, but they rarely quantify how well they do it. See http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har for more details about the data.
## The Data
This project utilized the **Weight Lifting Exercises Dataset** from http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har. The goal is to use data from accelerometers on the belt, forearm, arm, and dumbell of 6 participants. They were asked to perform barbell lifts correctly and incorrectly in 5 different ways.
The 6 young, healthy participants were asked to perform one set of 10 repetitions of the Unilateral Dumbbell Biceps Curl in five different fashions: exactly according to the specification (Class A), throwing the elbows to the front (Class B), lifting the dumbbell only halfway (Class C), lowering the dumbbell only halfway (Class D) and throwing the hips to the front (Class E).
The training data for this project are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv
The test data are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv
## Loading Data and Some Exploratory Analyses
The following code loads the caret package, which is used to build the models, and ggplot2 for visualization, and then downloads the training and test sets.
``` {r}
# Packages: caret for model building, ggplot2 for plotting.
library(caret)
library(ggplot2)

# Remote locations of the two CSV files and their local destinations.
Url1 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
Url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
trainFile <- "./data/pml-training.csv"
testFile <- "./data/pml-testing.csv"

# Create the data directory and download each file only when it is missing,
# so re-running the chunk does not fetch the files again.
if (!dir.exists("./data")) {
  dir.create("./data")
}
if (!file.exists(trainFile)) {
  download.file(Url1, destfile = trainFile)
}
if (!file.exists(testFile)) {
  download.file(Url2, destfile = testFile)
}

# Load the data from the paths defined above; empty strings, single blanks and
# the literal "NA" are all read as missing values.
training <- read.csv(trainFile, header = TRUE, sep = ",",
                     na.strings = c("", " ", "NA"))
testing <- read.csv(testFile, header = TRUE, sep = ",",
                    na.strings = c("", " ", "NA"))
```
### Exploratory Analyses
``` {r}
# Amelia is loaded here; its missmap() is used in a later chunk.
library(Amelia)
# Dimensions of both sets. Succeeding exploration uses only the training set,
# to avoid bias when tuning the selected model.
dim(training)
dim(testing)
```
We see that there are 19,622 rows and 160 columns in the training set, while the testing set has only 20 rows with the same number of columns. The “classe” variable in the training set is the outcome to predict (the testing set has a `problem_id` column in its place).
``` {r echo=FALSE}
# Print the structure of the training set (one line per column).
str(training)
```
We can see that, of the 160 columns, many variables have a lot of missing values (`NA`). We might also want to check whether any variables (except the outcome variable) have near-zero variance.
### Data Preprocessing
We will identify the variables that have near-zero variance and remove those predictors. We will also remove the predictors with missing values. If the prediction model performs poorly, we will put the removed variables back with imputed values.
```{r}
# Near-zero-variance diagnostics for every column of the training set
# (saveMetrics = TRUE returns the per-column metrics table, not just indices).
nzvariance <- nearZeroVar(training, saveMetrics = TRUE)
# Rows of the metrics table flagged as near-zero variance: the columns to drop.
toremove <- nzvariance[nzvariance$nzv, ]
print(toremove)
nrow(toremove) ## number of removed predictors
# Drop the flagged columns. The mask is computed on the training set only and
# applied by position to both sets -- assumes both have the same column layout.
training <- training[, !nzvariance$nzv]
testing <- testing[, !nzvariance$nzv]
```
There are now `r 159 - nrow(toremove)` remaining predictor variables. The graph below shows the remaining predictors with missing values, which will be removed.
```{r}
library(Amelia)
# Map of missing values across the training set.
missmap(training)

# Keep only the columns without any missing value in the training set; the
# same column mask is applied to the testing set.
nomissing <- colSums(is.na(training)) == 0
training_filter_na <- training[, nomissing]
testing_filter_na <- testing[, nomissing]

# Remove bookkeeping columns that are not sensor measurements. "X" (the first
# column, presumably the row index of the CSV export) is dropped as well so the
# models cannot key on row order.
colRm_train <- c("X", "user_name", "raw_timestamp_part_1",
                 "raw_timestamp_part_2", "cvtd_timestamp", "num_window")
# The testing set additionally drops its problem_id column.
colRm_test <- c(colRm_train, "problem_id")
newtraining <- training_filter_na[, !(names(training_filter_na) %in% colRm_train)]
newtesting <- testing_filter_na[, !(names(testing_filter_na) %in% colRm_test)]
dim(newtraining)
dim(newtesting)
```
The code below further splits the training set into two subsets: a training set and a validation set.
``` {r}
# Fixed seed so the split is reproducible.
set.seed(526)
# Row indices for the 70% training portion, drawn on the outcome column.
inTrain <- createDataPartition(y = newtraining[["classe"]], p = 0.7, list = FALSE)
# Selected rows form the training set; all remaining rows form the validation set.
training_clean <- newtraining[inTrain, ]
validation_clean <- newtraining[-inTrain, ]
```
## Model Building
Since the outcome variable is categorical, we will consider Linear Discriminant Analysis (LDA) and random forest (RF) and see whether the latter model is more accurate.
Caveat: LDA has strict assumptions, similar to those of the linear regression model, e.g. no outliers and normality of the data. One could use logistic regression (or multinomial logistic regression in this case, since the outcome variable has more than 2 categories), but we will use LDA just to have a comparison with RF, as in most cases RF tends to perform better than linear modeling approaches.
``` {r}
set.seed(123)
# Fit the LDA model with 4-fold cross-validation. The random-forest-specific
# `importance` argument is not passed here because LDA does not define it.
ldaFit <- train(classe ~ ., data = training_clean, method = "lda",
                trControl = trainControl(method = "cv", number = 4))
# Predicted classes for the held-out validation set.
lda_validation_pred <- predict(ldaFit, newdata = validation_clean)
```
# Interactive sanity checks on the intermediate data frames.
dim(training)
dim(testing)
# Here each set is filtered with its own missing-value mask (columns with no NA).
training_filter_na <- training[,(colSums(is.na(training)) == 0)]
testing_filter_na <- testing[,(colSums(is.na(testing)) == 0)]
dim(training_filter_na)
dim(testing_filter_na)
dim(newtraining)
dim(newtesting)
# Compare the column names of the two cleaned sets position by position.
colnames(newtesting)
colnames(newtraining)
colnames(newtesting)==colnames(newtraining)
# Check that the training and validation rows add up to the rows of newtraining.
dim(training_clean)
dim(validation_clean)
dim(training_clean)+dim(validation_clean)
dim(newtraining)
# Check model performance of the LDA predictions on the validation set
confusionMatrix(lda_validation_pred,validation_clean$classe)
dim(training_clean)
# Console replay of the report's chunks, from setup through the random forest fit.
knitr::opts_chunk$set(echo = TRUE)
setwd("D:/Stat and Data Science References/datasciencecoursera/08_Practical Machine Learning/practicalMLfinalproject")
library(caret)
library(ggplot2)
# Download the data (only when the directory/files are not already present)
Url1 <-"https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
Url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
trainFile <- "./data/pml-training.csv"
testFile  <- "./data/pml-testing.csv"
if (!file.exists("./data")) {
dir.create("./data")
}
if (!file.exists(trainFile)) {
download.file(Url1, destfile=trainFile)
}
if (!file.exists(testFile)) {
download.file(Url2, destfile=testFile)
}
# Load the data; "", " " and "NA" are read as missing values
training <- read.csv("./data/pml-training.csv",header=T,sep=",",na.strings=c(""," ","NA"))
testing <- read.csv("./data/pml-testing.csv",header=T,sep=",",na.strings=c(""," ","NA"))
library(Amelia)
dim(training) ; dim(testing) ## succeeding exploration will use only training set to avoid bias on tuning the selected model
str(training)
# Near-zero-variance metrics on the training set; flagged columns are dropped from both sets
nzvariance<-nearZeroVar(training, saveMetrics = T)
toremove<-nzvariance[nzvariance[,"nzv"],]
print(toremove)
nrow(toremove) ## number of removed predictors
training <- training[,!nzvariance$nzv]
testing <- testing[,!nzvariance$nzv]
library(Amelia)
# Plot the map of missing values in the training set
missmap(training)
# Remove variables with missing values (mask computed on the training set, applied to both)
nomissing<-colSums(is.na(training)) == 0
training_filter_na <- training[,nomissing]
testing_filter_na <- testing[,nomissing]
# Remove unnecessary columns
colRm_train <- c("user_name","raw_timestamp_part_1","raw_timestamp_part_2","cvtd_timestamp","num_window")
colRm_test <- c("user_name","raw_timestamp_part_1","raw_timestamp_part_2","cvtd_timestamp","num_window","problem_id")
newtraining <- training_filter_na[,!(names(training_filter_na) %in% colRm_train)]
newtesting <- testing_filter_na[,!(names(testing_filter_na) %in% colRm_test)]
dim(newtraining)
dim(newtesting)
# 70/30 split of newtraining into training and validation rows
set.seed(526)
inTrain <- createDataPartition(y=newtraining$classe, p=0.7, list=FALSE)
training_clean <- newtraining[inTrain,]
validation_clean <- newtraining[-inTrain,]
set.seed(123)
# Fit LDA model (4-fold cross-validation) and predict on the validation set
ldaFit <- train(classe ~ ., method = "lda", data = training_clean, importance = T, trControl = trainControl(method = "cv", number = 4))
lda_validation_pred <- predict(ldaFit, newdata=validation_clean)
set.seed(123)
# Fit rf model (4-fold cross-validation)
rfFit <- train(classe ~ ., method = "rf", data = training_clean, importance = T, trControl = trainControl(method = "cv", number = 4))
# List the columns that went into the fits
colnames(training_clean)
# Console replay of the setup and data-loading steps.
knitr::opts_chunk$set(echo = TRUE)
setwd("D:/Stat and Data Science References/datasciencecoursera/08_Practical Machine Learning/practicalMLfinalproject")
library(caret)
library(ggplot2)
# Download the data (only when the directory/files are not already present)
Url1 <-"https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
Url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
trainFile <- "./data/pml-training.csv"
testFile  <- "./data/pml-testing.csv"
if (!file.exists("./data")) {
dir.create("./data")
}
if (!file.exists(trainFile)) {
download.file(Url1, destfile=trainFile)
}
if (!file.exists(testFile)) {
download.file(Url2, destfile=testFile)
}
# Load the data; "", " " and "NA" are read as missing values
training <- read.csv("./data/pml-training.csv",header=T,sep=",",na.strings=c(""," ","NA"))
testing <- read.csv("./data/pml-testing.csv",header=T,sep=",",na.strings=c(""," ","NA"))
# Drop the first column from both data sets. The testing set must be derived
# from `testing` itself: building it from the already-trimmed `training` would
# replace the test rows with training data and drop a second column.
training <- training[, -1]
testing <- testing[, -1]
# Continue the replay: inspect the data, then drop near-zero-variance columns.
library(Amelia)
dim(training) ; dim(testing) ## succeeding exploration will use only training set to avoid bias on tuning the selected model
str(training)
# Near-zero-variance metrics on the training set
nzvariance<-nearZeroVar(training, saveMetrics = T)
toremove<-nzvariance[nzvariance[,"nzv"],]
print(toremove)
nrow(toremove) ## number of removed predictors
# The training-derived mask is applied by position to both sets
training <- training[,!nzvariance$nzv]
testing <- testing[,!nzvariance$nzv]
# Console replay of the report's chunks; this run also removes the "X" column.
knitr::opts_chunk$set(echo = TRUE)
setwd("D:/Stat and Data Science References/datasciencecoursera/08_Practical Machine Learning/practicalMLfinalproject")
library(caret)
library(ggplot2)
# Download the data (only when the directory/files are not already present)
Url1 <-"https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
Url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
trainFile <- "./data/pml-training.csv"
testFile  <- "./data/pml-testing.csv"
if (!file.exists("./data")) {
dir.create("./data")
}
if (!file.exists(trainFile)) {
download.file(Url1, destfile=trainFile)
}
if (!file.exists(testFile)) {
download.file(Url2, destfile=testFile)
}
# Load the data; "", " " and "NA" are read as missing values
training <- read.csv("./data/pml-training.csv",header=T,sep=",",na.strings=c(""," ","NA"))
testing <- read.csv("./data/pml-testing.csv",header=T,sep=",",na.strings=c(""," ","NA"))
library(Amelia)
dim(training) ; dim(testing) ## succeeding exploration will use only training set to avoid bias on tuning the selected model
str(training)
# Near-zero-variance metrics on the training set; flagged columns are dropped from both sets
nzvariance<-nearZeroVar(training, saveMetrics = T)
toremove<-nzvariance[nzvariance[,"nzv"],]
print(toremove)
nrow(toremove) ## number of removed predictors
training <- training[,!nzvariance$nzv]
testing <- testing[,!nzvariance$nzv]
library(Amelia)
# Plot the map of missing values in the training set
missmap(training)
# Remove variables with missing values (mask computed on the training set, applied to both)
nomissing<-colSums(is.na(training)) == 0
training_filter_na <- training[,nomissing]
testing_filter_na <- testing[,nomissing]
# Remove unnecessary columns, now including "X"
colRm_train <- c("X","user_name","raw_timestamp_part_1","raw_timestamp_part_2","cvtd_timestamp","num_window")
colRm_test <- c("X","user_name","raw_timestamp_part_1","raw_timestamp_part_2","cvtd_timestamp","num_window","problem_id")
newtraining <- training_filter_na[,!(names(training_filter_na) %in% colRm_train)]
newtesting <- testing_filter_na[,!(names(testing_filter_na) %in% colRm_test)]
dim(newtraining)
dim(newtesting)
# 70/30 split of newtraining into training and validation rows
set.seed(526)
inTrain <- createDataPartition(y=newtraining$classe, p=0.7, list=FALSE)
training_clean <- newtraining[inTrain,]
validation_clean <- newtraining[-inTrain,]
set.seed(123)
# Fit LDA model (4-fold cross-validation) and predict on the validation set
ldaFit <- train(classe ~ ., method = "lda", data = training_clean, importance = T, trControl = trainControl(method = "cv", number = 4))
lda_validation_pred <- predict(ldaFit, newdata=validation_clean)
set.seed(123)
# Fit rf model (4-fold cross-validation)
rfFit <- train(classe ~ ., method = "rf", data = training_clean, importance = T, trControl = trainControl(method = "cv", number = 4))
# Interactive exploration: missing-value plots and pairwise plots of the
# cleaned training set. Several entries are failed attempts that are retried
# on the following lines.
dim(training)
View(training)
# Copy of training without columns 1 and 117, relabelled 1..115, for the missingness map.
tmp<-training[, c(-1,-117)]
colnames(tmp)<-1:115
missmap(tmp)
missmap(tmp)
View(tmp)
# Fraction of missing values per column of training.
percentmissing<-colSums(is.na(training))/nrow(training)
qplot(percentmissing, 1:ncol(training))
qplot
percentmissing
qplot(y=percentmissing, x=1:ncol(training))
library(ggplot2)
install.packages("labeling")
qplot(y=percentmissing, x=1:ncol(training))
?qplot
plot(y=percentmissing, x=1:ncol(training))
# Same plot, with the values and the column index wrapped in a data frame.
qplot(y=percentmissing, x=index, data=data.frame(percentmissing=percentmissing, index=1:ncol(training)))
data=data.frame(percentmissing=percentmissing, index=1:ncol(training))
qplot(y=percentmissing, x=index, data=data.frame(percentmissing=percentmissing, index=1:ncol(training)), col="red")
qplot(y=percentmissing, x=index, data=data.frame(percentmissing=percentmissing, index=1:ncol(training)), color="red")
qplot(y=percentmissing, x=index, data=data.frame(percentmissing=percentmissing, index=1:ncol(training)), color=as.factor(colnames(training)))
qplot(y=percentmissing, x=index, data=data.frame(percentmissing=percentmissing, index=1:ncol(training)))
qplot(y=percentmissing, x=index, data=data.frame(percentmissing=percentmissing, index=1:ncol(training)))
dim(training) ; dim(testing)
training_clean
View(training_clean)
nzvariance
qplot(training_clean)
install.packages("GGally")
# NOTE(review): ggpairs() is called here before library(GGally) is run on the
# next line; the same call is repeated after loading the package.
ggpairs(training_clean, aes(color=classe, alpha =0.4))
library(GGally)
ggpairs(training_clean, aes(color=classe, alpha =0.4))
ggpairs(training_clean[,-(1:30)])
ggpairs(training_clean[,-(1:50)])
# NOTE(review): traing_clean / trainging_clean are not assigned anywhere in
# this history -- apparent typos for training_clean, used on the third line.
class(traing_clean)
class(trainging_clean)
class(training_clean)
# NOTE(review): sapply() is called without a function argument here; the next
# line supplies FUN=class.
sapply(class(training_clean))
sapply(training_clean, FUN=class)
length(sapply(training_clean, FUN=class))
# Flag and count the columns whose class is "factor".
sapply(training_clean, FUN=class) == "factor"
x<-sapply(training_clean, FUN=class) == "factor"
sum(x)
# Pairwise plots of the columns left after dropping 1:26 and 53, coloured by column 53.
ggpairs(training_clean[,-c(1:26,53)], aes(color=training_clean[,53], alpha=.4))
ggpairs(training_clean[,-c(1:26,53)], aes(color=training_clean[,53], alpha=.4))
training_clean[,53]
x<-training_clean[,53]
names(x)
colnames(x)
# NOTE(review): as.factpr is presumably a typo for as.factor (retried on the
# next line, still with a named argument `classe=`).
ggpairs(training_clean[,-c(1:26,53)], aes(color=as.factpr(classe=training_clean[,53]), alpha=.4))
ggpairs(training_clean[,-c(1:26,53)], aes(color=as.factor(classe=training_clean[,53]), alpha=.4))
ggscatmat(training_clean[,-53])
ggscatmat(training_clean[,-(26:53)])
# NOTE(review): the next entry has an unbalanced parenthesis and a doubled
# `column=columns =`; it is retried on the two following lines.
ggscatmat(training_clean, column=columns = c(1,2)
ggscatmat(training_clean, column=columns = c(1,2))
ggscatmat(training_clean, column=c(1,2))
plot(training_clean[,1:10])
# Pairwise plots of five selected predictors, coloured by classe.
ggpairs(training_clean, columns = c('roll_belt', 'yaw_belt', 'pitch_forearm', 'pitch_belt' , 'magnet_dumbbell_z'), aes(color=classe, alpha = 0.5))
ggpairs(training_clean, columns = c('roll_belt', 'yaw_belt', 'pitch_forearm', 'pitch_belt' , 'magnet_dumbbell_z'), aes(color=classe, alpha = 0.5))