-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_analysis.R
111 lines (89 loc) · 4.88 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
## Getting and Cleaning Data Course Project
## Filename: run_analysis.R
## Author: [email protected]
## Date: April 2015
## Comment: Script to prepare tidy data that can be used for later analysis:
## 1. Merges the training and the test sets to create one data set.
## 2. Extracts only the measurements on the mean and standard deviation for
## each measurement.
## 3. Uses descriptive activity names to name the activities in the data set
## 4. Appropriately labels the data set with descriptive variable names.
## 5. From the data set in step 4, creates a second, independent tidy data
## set with the average of each variable for each activity and each subject.
## Load libraries
## plyr provides ddply() and numcolwise() (Step 5).
## It is attached BEFORE dplyr on purpose: plyr's startup message warns that
## attaching it after dplyr "is likely to cause problems", because plyr would
## then mask the dplyr verbs that share a name (summarise, mutate, arrange,
## rename, ...).
library(plyr)
## dplyr provides select() (Step 3)
library(dplyr)
## Define variables
## Directory holding the unzipped data set, relative to the working directory
DestDirectory <- "UCI HAR Dataset"
## Fail early with a clear message when the data set is not where we expect it
if (!file.exists(DestDirectory)) {
  stop("Data directory not found: '", DestDirectory,
       "' (looked in working directory '", getwd(), "')", call. = FALSE)
}
## Get and load names of "Features"
## stringsAsFactors = FALSE keeps the names as plain character strings on
## every R version (before R 4.0, read.table() turned strings into factors).
feature_names <- read.table(file.path(DestDirectory, "features.txt"),
                            header = FALSE, stringsAsFactors = FALSE)
####################################################################
## 1. Merges the training and the test sets to create one data set.
####################################################################
## Every path is built from DestDirectory so that the location of the data
## set is defined in exactly one place.
## Read training data files
train_activities <- read.table(
  file.path(DestDirectory, "train", "y_train.txt"), header = FALSE
)
train_subject <- read.table(
  file.path(DestDirectory, "train", "subject_train.txt"), header = FALSE
)
train_set <- read.table(
  file.path(DestDirectory, "train", "X_train.txt"), header = FALSE
)
## Read test data files
test_activities <- read.table(
  file.path(DestDirectory, "test", "y_test.txt"), header = FALSE
)
test_subject <- read.table(
  file.path(DestDirectory, "test", "subject_test.txt"), header = FALSE
)
test_set <- read.table(
  file.path(DestDirectory, "test", "X_test.txt"), header = FALSE
)
## Merge the training and the test sets of activities, subject and measurements
## (training rows first, test rows appended below them)
merged_activities <- rbind(train_activities, test_activities)
merged_subject <- rbind(train_subject, test_subject)
merged_set <- rbind(train_set, test_set)
## Guard against a features file that does not describe this measurement set:
## assigning too few names would silently leave NA column names behind.
if (ncol(merged_set) != nrow(feature_names)) {
  stop("Number of feature names (", nrow(feature_names),
       ") does not match number of measurement columns (",
       ncol(merged_set), ")", call. = FALSE)
}
## Set features names to columns of merged data set
colnames(merged_activities) <- "ActivityID"
colnames(merged_subject) <- "Subject"
## as.character() makes the assignment independent of whether the names were
## read as character strings or as a factor.
colnames(merged_set) <- as.character(feature_names$V2)
## Create one data set for training and test sets
## Column order: ActivityID, Subject, then all measurement columns
HAR_data <- cbind(merged_activities, merged_subject, merged_set)
####################################################################
## 2. Extracts only the data on the mean and standard deviation for
## each measurement.
####################################################################
## Positions of the columns whose name contains "-mean()" or "-std()"
subset_index_features <- which(
  grepl("-mean\\(\\)|-std\\(\\)", names(HAR_data))
)
## Keep the first two columns (activity id and subject) followed by the
## mean/std measurement columns located above
HAR_meanstd <- HAR_data[, c(1:2, subset_index_features)]
####################################################################
## 3. Uses descriptive activity names to name them in the data set
####################################################################
## Get and load names of "Activity Labels" (activity id -> activity name).
## The path is built from DestDirectory, like the location of the data set
## defined at the top of the script.
activity_labels <- read.table(
  file.path(DestDirectory, "activity_labels.txt"), header = FALSE
)
colnames(activity_labels) <- c("ActivityID", "Activity")
## Attach the activity name to every observation by joining on ActivityID.
## merge() sorts its result on the key by default, so rows come back ordered
## by activity rather than in their original order; each row still carries
## its own Subject value, so no information is lost.
HAR_meanstd <- merge(activity_labels, HAR_meanstd, by = "ActivityID")
## Drop the numeric id now that the descriptive name is present.
## select() is namespace-qualified so a same-named function from another
## attached package cannot be picked up by mistake.
HAR_meanstd <- dplyr::select(HAR_meanstd, -ActivityID)
####################################################################
## 4. Appropriately labels data set variables with descriptive names
####################################################################
## The column names are rewritten by an ordered list of
## (pattern, replacement) rules. Each rule is applied with gsub() to the
## output of the previous rule, so the order of the list matters.
## local() keeps the rule table out of the global workspace.
names(HAR_meanstd) <- local({
  rewrite_rules <- list(
    c("^f", "Frequency"),                  # initial "f" -> "Frequency"
    c("^t", "Time"),                       # initial "t" -> "Time"
    c("-mean\\(\\)", "Mean"),              # "-mean()"   -> "Mean"
    c("-std\\(\\)", "StandardDeviation"),  # "-std()"    -> "StandardDeviation"
    c("Acc", "LinearAcceleration"),        # "Acc"       -> "LinearAcceleration"
    c("Gyro", "AngularVelocity"),          # "Gyro"      -> "AngularVelocity"
    c("Mag", "Magnitude"),                 # "Mag"       -> "Magnitude"
    c("BodyBody", "Body"),                 # "BodyBody"  -> "Body"
    c("-", "")                             # remove every remaining "-"
  )
  Reduce(
    function(labels, rule) gsub(rule[1], rule[2], labels),
    rewrite_rules,
    names(HAR_meanstd)
  )
})
####################################################################
## 5. Creates a second, independent tidy data set with the average
## of each variable for each activity and each subject.
####################################################################
## Calculate average of each variable for Activity-Subject combinations.
## The data is split by Activity and Subject, and mean() is applied to the
## numeric columns of every piece. Both helpers are namespace-qualified so
## this step does not depend on the order in which packages were attached.
HAR_activity_subject <- plyr::ddply(
  HAR_meanstd,
  c("Activity", "Subject"),
  plyr::numcolwise(mean)
)
## Write tidy data set to file (no row names, into the working directory)
write.table(HAR_activity_subject,
            file = "HAR_average_by_act_sub.txt",
            row.names = FALSE)