-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_exploration.R
144 lines (109 loc) · 5.69 KB
/
data_exploration.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Working directory: the folder that holds both the data and this R script
wd <- "~/Documents/Artificial Intelligence/Master/2122 Sem. 1/Bayesian Networks/Assignment1/Assignment_1"
setwd(wd)
# Fix the random seed
set.seed(123)
# Load packages
library(car)
# Read the data: the first two columns as integers, the next two as factors
# and the remaining nine as numerics
column_types <- c(rep("integer", 2), rep("factor", 2), rep("numeric", 9))
d <- read.csv("forestfires.csv", colClasses = column_types)
# Print the first rows of the data
head(d)
# Two of the variables are categorical: month and day.
# Start with month and recode it as a number (jan = 1, ..., dec = 12).
month_names <- c("jan", "feb", "mar", "apr", "may", "jun",
                 "jul", "aug", "sep", "oct", "nov", "dec")
d$month <- ordered(d$month, levels = month_names)
month_codes <- unclass(d$month)
# Remove the factor column first and then add the numeric version, so that
# month ends up as the last column of d
d$month <- NULL
d$month <- as.numeric(month_codes)
# Check whether day is informative by plotting the number of datapoints that
# fall on each day of the week.
days <- c("mon", "tue", "wed", "thu", "fri", "sat", "sun")
d$day <- ordered(d$day, levels = days)
# Build the output path from wd instead of repeating the absolute project path,
# so the plot follows the project folder whenever wd is changed. Create the
# plot folder first: png() cannot write into a directory that does not exist.
plot_dir <- file.path(wd, "plts")
dir.create(plot_dir, showWarnings = FALSE, recursive = TRUE)
# Spell out `filename` rather than relying on partial matching of `file`
png(filename = file.path(plot_dir, "days.png"), width = 600, height = 350)
# las = 2 draws the axis labels perpendicular to the axes
par(las = 2)
barplot(table(d$day),
        main = "Amount of Datapoints per Day",
        xlab = "Day",
        ylab = "Count")
dev.off()
# We don't want to use day, so drop it together with X and Y.
d <- subset(d, select = -c(X, Y, day))
# d now only holds the variables we want to use.
head(d)
## A closer look at the individual variables
# Area: density, largest values and QQ-plot
area_density <- density(d$area)
plot(area_density, main = "Density of Area")
head(sort(d$area, decreasing = TRUE))
qqPlot(d$area)
# The distribution is heavily skewed to the right, but this is not necessarily
# caused by outliers. A log transform of the variable reduces the skewness.
d$area <- log(d$area + 1)
log_area_density <- density(d$area)
plot(log_area_density, main = "Density of log-transformed Area")
outliers <- qqPlot(d$area, main = "QQ-plot of Area", ylab = "Area (in ha)")
# Drop the rows that the QQ-plot flagged
d <- d[-outliers, ]
plot(density(d$area),
     main = "Density of log-transformed Area after outlier removal")
qqPlot(d$area)
# Rain: density, largest values and QQ-plot
rain_density <- density(d$rain)
plot(rain_density, main = "Density of Rain")
head(sort(d$rain, decreasing = TRUE))
outliers <- qqPlot(d$rain, main = "QQ-plot of Rain", ylab = "Rain (in mm/m2)")
# Again heavily skewed to the right, but after inspecting the values there
# definitely is an outlier. Remove the first point flagged by the QQ-plot.
rain_outlier <- outliers[1]
d <- d[-rain_outlier, ]
plot(density(d$rain), main = "Density of Rain after Outlier Removal")
qqPlot(d$rain, main = "QQ-plot of Rain", ylab = "Rain (in mm/m2)")
# Wind: density and QQ-plot
wind_density <- density(d$wind)
plot(wind_density, main = "Density of Wind")
outliers <- qqPlot(d$wind, main = "QQ-plot of Wind", ylab = "Wind (in km/h)")
# This looks reasonably suited to fitting a Gaussian.
# Relative humidity (RH): density and QQ-plot
rh_density <- density(d$RH)
plot(rh_density, main = "Density of Relative Humidity")
# This also looks reasonably suited to fitting a Gaussian.
qqPlot(d$RH)
# Temperature: density and QQ-plot
temp_density <- density(d$temp)
plot(temp_density, main = "Density of Temperature")
qqPlot(d$temp, main = "QQ-plot of Temperature")
# This also looks reasonably suited to fitting a Gaussian.
# Initial Spread Index (ISI): density, largest values and QQ-plot
isi_density <- density(d$ISI)
plot(isi_density, main = "Density of Initial Spread Index")
head(sort(d$ISI, decreasing = TRUE))
outliers <- qqPlot(d$ISI, main = "QQ-plot of Initial Spread Index")
# Once more a heavily right-skewed distribution, but after inspecting the ISI
# values one of them is extremely large compared to the others.
# Remove that one as well: the first point flagged by the QQ-plot.
isi_outlier <- outliers[1]
d <- d[-isi_outlier, ]
plot(density(d$ISI),
     main = "Density of Initial Spread Index after Outlier Removal")
outliers <- qqPlot(d$ISI,
                   main = "QQ-plot of Initial Spread Index after Outlier Removal")
# Drought Code (DC): density and QQ-plot
dc_density <- density(d$DC)
plot(dc_density, main = "Density of Drought Code")
outliers <- qqPlot(d$DC, main = "QQ-plot of Drought Code")
# This distribution is problematic because it looks bimodal.
# However, there is nothing we can do about that.
# Duff Moisture Code (DMC): density and QQ-plot
dmc_density <- density(d$DMC)
plot(dmc_density, main = "Density of Duff Moisture Code")
outliers <- qqPlot(d$DMC, main = "QQ-plot of Duff Moisture Code")
# Again, this looks bimodal.
# Density of the Fine Fuel Moisture Code (FFMC) variable:
# Build the output paths from wd instead of repeating the absolute project
# path, so the plots follow the project folder whenever wd is changed. Create
# the plot folder first: png() cannot write into a directory that does not
# exist.
plot_dir <- file.path(wd, "plts")
dir.create(plot_dir, showWarnings = FALSE, recursive = TRUE)
# Spell out `filename` rather than relying on partial matching of `file`
png(filename = file.path(plot_dir, "density_ffmc.png"),
    width = 600, height = 350)
plot(density(d$FFMC), main = "Density of FFMC")
dev.off()
# Inspect the smallest values
head(sort(d$FFMC, decreasing = FALSE))
outliers <- qqPlot(d$FFMC, main = "QQ-plot of Fine Fuel Moisture Code")
# Now we have a left-skewed distribution, with an outlier (18.7).
# Remove the rows that the QQ-plot flagged.
d <- d[-outliers, ]
plot(density(d$FFMC),
     main = "Density of Fine Fuel Moisture Code after Outlier Removal")
outliers <- qqPlot(d$FFMC)
# Second pass: remove only the first point flagged this time
d <- d[-outliers[1], ]
plot(density(d$FFMC),
     main = "Density of Fine Fuel Moisture Code after Outlier Removal")
qqPlot(d$FFMC)
# Reflect FFMC around its maximum, then log-transform; the + 1 keeps the
# argument of log() at 1 or above.
d$FFMC <- log(max(d$FFMC) - d$FFMC + 1)
png(filename = file.path(plot_dir, "density_processed_ffmc.png"),
    width = 600, height = 350)
plot(density(d$FFMC), main = "Density of FFMC after Preprocessing")
dev.off()
qqPlot(d$FFMC)
## Summary: outliers were removed with the help of QQ-plots for these variables:
# Area - 2 outliers removed; the highest areas were substantially larger than
#        the other values.
# Rain - 1 outlier removed; the highest value (6.4), again substantially larger
#        than the other values.
# ISI  - 1 outlier removed; the highest value (around 60), same reasoning.
# FFMC - 3 outliers removed; the three lowest values.
# Save the resulting dataset to a new .csv file inside wd
output_file <- paste0(wd, "/explored_forestfires.csv")
write.csv(d, file = output_file, row.names = FALSE)