-
Notifications
You must be signed in to change notification settings - Fork 0
/
BIke_Rcode.R
342 lines (199 loc) · 8.4 KB
/
BIke_Rcode.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# ---- Setup: working directory, libraries and raw data ----
# NOTE: the original `rm(list = ls())` was dropped. Wiping the caller's
# workspace is a surprising side effect and nothing below relies on it.

# Machine-specific project folder. Only switch to it when it exists, so the
# script also runs from any working directory that already holds "day.csv".
project_dir <- "C:/Users/ARON/Desktop/edwisor projects/bike rental"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
getwd()

# Load Libraries
# x = c("ggplot2", "corrgram", "DMwR", "caret", "randomForest", "unbalanced", "C50", "dummies", "e1071", "Information",
# "MASS", "rpart", "gbm", "ROSE", 'sampling', 'DataCombine', 'inTrees')
#
# install.packages(x)
# lapply(x, require, character.only = TRUE)
# rm(x)
library(xlsx)
library(rlang)
library(ggplot2)

# Keep an untouched copy of the raw file; all cleaning is done on `data`.
data_original <- read.csv("day.csv")
data <- data_original
######## Analysing the dataset ################################################
# Look at the structure, column names, dimensions and the date column's class.
View(data)
str(data)
colnames(data)
dim(data)
class(data$dteday)
# Number of distinct non-missing values in each column.
# Computed column by column: apply() would first coerce the whole data frame
# to a character matrix, which can merge numerically distinct values.
vapply(data, function(x) length(unique(x[!is.na(x)])), integer(1))
unique(data$weekday)
unique(data$mnth)
# season, yr, mnth, holiday, weekday, workingday and weathersit are to be
# converted into factors for proper analysis.
########################### EDA ###########################################
# Convert the categorical columns to factors, printing each name as it is done.
fact_names <- c("season", "yr", "mnth", "holiday", "weekday", "workingday", "weathersit")
for (i in fact_names) {
  print(i)
  data[[i]] <- as.factor(data[[i]])
}
str(data)

# Convert the integer count/index columns to numeric (double).
data$instant <- as.numeric(data$instant)
data$casual <- as.numeric(data$casual)
data$registered <- as.numeric(data$registered)
data$cnt <- as.numeric(data$cnt)

# Date conversion is intentionally left disabled.
#data$dteday=as.Date(data$dteday)
str(data)

# Collect the numeric columns. vapply() always returns a logical vector, and
# drop = FALSE keeps a data frame even when only one column matches, so
# colnames() below stays valid.
num_index <- vapply(data, is.numeric, logical(1))
num_data <- data[, num_index, drop = FALSE]
num_col <- colnames(num_data) # names of all numeric columns

# Collect the categorical (factor) columns the same way.
cat_ind <- vapply(data, is.factor, logical(1))
cat_data <- data[, cat_ind, drop = FALSE]
cat_col <- colnames(cat_data)

str(data)
num_col
cat_col
# ##### Univariate analysis ##########################
# Histogram of the target variable `cnt`, to inspect its distribution.
hist(data$cnt)
# Author's observation: `cnt` looks approximately normal (visual judgement
# only; no normality test is run here).
# Histograms of the `temp`, `atemp` and `windspeed` columns.
hist(data$temp)
hist(data$atemp)
hist(data$windspeed)
# Author's observation: most of these look approximately normal -- TODO confirm.
############### Bivariate analysis ##########################
# Boxplots of rental counts against each categorical column, followed by
# scatter plots against the continuous weather columns.
# Fixes: `mail = ""` was a typo for `main = ""` (it produced "not a graphical
# parameter" warnings), and the month plot was labelled "year".
# The comments after each plot record the author's reading of the figure.
names(data)

# Does the rental count depend on the season?
boxplot(cnt ~ season, data = data, xlab = "seasons", ylab = "count", main = "")
# Rentals are highest in season 3 (fall), followed by season 2 (summer) and
# season 4 (winter); they are lowest in season 1 (spring).

# Which year had more bikes rented?
boxplot(cnt ~ yr, data = data, xlab = "year", ylab = "count", main = "")
# Compared to 2011, more bikes were rented in 2012.

# Which month has the most bikes rented?
boxplot(cnt ~ mnth, data = data, xlab = "month", ylab = "count", main = "")
# September had the most rentals, which agrees with the season analysis.

# Are bikes rented more on working days or on non-working days?
boxplot(cnt ~ workingday, data = data, xlab = "workingday", ylab = "count", main = "")
# Rentals are almost the same for both kinds of day.

# On which day of the week are bikes rented the most?
boxplot(cnt ~ weekday, data = data, xlab = "weekday", ylab = "count", main = "")
# The distribution is similar on all days, so check casual and registered
# users separately.
boxplot(casual ~ weekday, data = data, xlab = "weekday", ylab = "casual users", main = "")
# Casual users rent mostly at the weekend (Saturday and Sunday, then Friday).
boxplot(registered ~ weekday, data = data, xlab = "weekday", ylab = "registered users", main = "")
# Registered users are almost uniform across the week.

# How does the number of rentals depend on the weather condition?
boxplot(cnt ~ weathersit, data = data, xlab = "weather", ylab = " count", main = "")
# Most rentals happen in clear weather and fewer in rainy weather; the worst
# weather category has no observations.

# Does temperature affect the rental count? (count on x, temperature on y)
plot(data$cnt, data$temp)
# As the temperature increases, the count increases.
plot(data$cnt, data$atemp)
# As the "feels like" temperature increases, the count increases.

# Does humidity affect the rental count?
plot(data$cnt, data$hum)
# Humidity does not visibly affect the count.
####### Missing value analysis and outlier analysis ##############
# Tally the NA entries of every column.
vapply(data, function(column) sum(is.na(column)), integer(1))
# Author's finding: the data contains no missing values.
library(DMwR)
library(lattice)
library(grid)
# Let us first check for outliers.
library(ggplot2)
# Build one boxplot per numeric column and store it in the global variables
# gn1, gn2, ... (one per entry of `num_col`).
# seq_along() is used instead of 1:length() so that an empty `num_col`
# results in zero iterations rather than iterating over c(1, 0).
for (i in seq_along(num_col)) {
  assign(
    paste0("gn", i),
    ggplot(data = data, mapping = aes_string(y = num_col[i], x = "cnt")) +
      stat_boxplot(geom = "errorbar", width = 0.5) +
      geom_boxplot(
        outlier.colour = "blue", fill = "skyblue",
        outlier.shape = 18, outlier.size = 1, notch = FALSE
      ) +
      labs(y = num_col[i], x = "cnt") +
      ggtitle(paste("box plot for count", num_col[i]))
  )
}
## Plotting plots together
# NOTE(review): gn1..gn8 are hard-coded, so this assumes `num_col` holds
# exactly eight columns -- confirm if the input columns change.
gridExtra::grid.arrange(gn1, gn2, gn3, ncol = 3)
gridExtra::grid.arrange(gn4, gn5, gn6, ncol = 3)
gridExtra::grid.arrange(gn7, gn8, ncol = 2)
# Author's reading of the boxplots: hum, windspeed and casual contain
# outliers. casual is dropped later anyway, because casual and registered
# together add up to the target variable.

# Humidity: show the boxplot statistics, then blank out the flagged values.
# (Author's note: two outlier values, 0.187917 and 0.)
boxplot.stats(data$hum, coef = 1.5)
data$hum <- replace(data$hum, data$hum %in% boxplot.stats(data$hum)$out, NA)

# Windspeed: same treatment. (Author's note: 13 outlier values.)
boxplot.stats(data$windspeed, coef = 1.5)
data$windspeed <- replace(
  data$windspeed,
  data$windspeed %in% boxplot.stats(data$windspeed)$out,
  NA
)

# Total number of values that are now missing.
library(DMwR)
sum(is.na(data))

# Fill the blanked-out values by k-nearest-neighbour imputation (k = 3).
data <- knnImputation(data, k = 3)

# Per-column count of missing values that remain, and the data dimensions.
vapply(data, function(column) sum(is.na(column)), integer(1))
dim(data)

# Re-draw the boxplots to check whether the outliers are gone.
boxplot(data$hum)
boxplot(data$windspeed)
# Author's conclusion: the outliers have been removed.
####### Feature selection ###########################
library(corrgram)
# Correlation plot of the numeric columns selected by `num_index`.
corrgram(
  data[, num_index],
  order = FALSE, # keep the original column order (FALSE, not the reassignable F)
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  text.panel = panel.txt,
  main = "CORRELATION PLOT"
)
# Highly correlated variables are marked dark blue in the plot.
# Dark blue means a strong positive correlation.
# temp is highly correlated with atemp, so one of the pair is redundant;
# atemp is the one dropped in the column-removal step below.
## --------- ANOVA ----------------------------------
# Show the current column names and the categorical column names.
names(data)
colnames(cat_data)

# One-way ANOVA of the rental count against all categorical predictors.
library("lsr")
anova_formula <- cnt ~ season + yr + mnth + holiday + weekday + workingday + weathersit
anova_test <- aov(anova_formula, data = data)
summary(anova_test)
#################### Removing highly correlated and uninformative variables ####
# - instant and dteday are unique per row and do not explain the target.
# - casual and registered add up to the target variable (cnt).
# - atemp is highly correlated with temp, so atemp is removed.
# Columns are dropped by name with setdiff(), so re-running this step after
# the columns are already gone is harmless.
drop_cols <- c("instant", "dteday", "atemp", "casual", "registered")
data <- data[, setdiff(names(data), drop_cols), drop = FALSE]
colnames(data)
str(data)
# (A stray token `da` used to follow here; it raised "object 'da' not found"
# and aborted the script when sourced, so it was removed.)

##### Feature scaling ######################
View(data)
# Author's observation: the data is already normalized, so it is ready to be
# fitted in the models below.
############ Model development ##########################
#### Decision tree ########
library(MASS)
library(rpart)

# Fix the RNG seed so that the train/test split, and every metric derived
# from it, is reproducible between runs.
set.seed(123)

# 80/20 train/test split on row indices.
train_index <- sample(seq_len(nrow(data)), floor(0.8 * nrow(data)))
train <- data[train_index, ]
test <- data[-train_index, ]

# Regression tree on all remaining predictors.
DT_regression <- rpart(cnt ~ ., data = train, method = "anova")
summary(DT_regression)

# Predict on the test predictors. The target is addressed by name rather
# than by column position, so a change in column order cannot silently
# select the wrong column.
DT_predict <- predict(DT_regression, test[, names(test) != "cnt", drop = FALSE])

# Evaluate
View(test[, "cnt"])
#install.packages("DMwR")
library(DMwR)
regr.eval(test[, "cnt"], DT_predict, stats = c("mae", "mape", "rmse"))
# Values recorded from the author's original (unseeded) run:
#mae mape rmse
#670.8904240 0.2206328 965.9852495
#### Random forest ######
library(randomForest)

# Random forests are stochastic; seed the RNG so the fit is reproducible.
set.seed(123)
rf_model <- randomForest(cnt ~ ., data = train, importance = TRUE, ntree = 100)
summary(rf_model)

# Predict on the test predictors and evaluate; the target column is selected
# by name instead of by the hard-coded position 11.
rf_predict <- predict(rf_model, test[, names(test) != "cnt", drop = FALSE])
regr.eval(test[, "cnt"], rf_predict, stats = c("mae", "mape", "rmse"))
# Values recorded from the author's original (unseeded) run:
#mae mape rmse
#496.1101195 0.1649831 694.3410299
#### Linear regression #######
library(usdm)

# Ordinary least squares on all remaining predictors.
lm_model <- lm(cnt ~ ., data = train)
summary(lm_model)

# Predict on the test predictors and evaluate; the target column is selected
# by name instead of by the hard-coded position 11.
lm_predict <- predict(lm_model, test[, names(test) != "cnt", drop = FALSE])
regr.eval(test[, "cnt"], lm_predict, stats = c("mae", "mape", "rmse"))
# Values recorded from the author's original run:
#mae mape rmse
#598.3067621 0.1812669 807.3537860