# CitiBike_nn_app.py
# Forked from nameyeh/project-three.
# IMPORT DEPENDENCIES
import warnings
warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import *
import time
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
def _clean_trips(df, reference_year):
    """Filter raw trip rows and derive the modelling columns.

    Keeps only rows whose ``usertype`` is ``"Subscriber"``, whose ``gender``
    code is non-zero and whose start/end station latitudes are below 41.5,
    then drops every row that still contains a missing value.

    Args:
        df: Raw trip table as read from the CSV. Must provide the columns
            ``usertype``, ``gender``, ``starttime``, ``tripduration``,
            ``birth year`` and the start/end station latitude/longitude
            columns.
        reference_year: Year from which ``birth year`` is subtracted to
            obtain the rider's age.

    Returns:
        A new DataFrame (indexed like the surviving input rows) with the
        columns ``duration``, ``weekend``, ``hour``, ``start_lat``,
        ``start_long``, ``end_lat``, ``end_long``, ``gender``, ``age`` and
        ``twenties``.
    """
    # Customers' rows don't include their age, so only subscribers are kept.
    # NOTE(review): gender code 0 presumably means "unknown" and the 41.5
    # latitude cut presumably removes out-of-area stations -- confirm.
    keep = (
        (df['usertype'] == "Subscriber")
        & (df['gender'] != 0)
        & (df['start station latitude'] < 41.5)
        & (df['end station latitude'] < 41.5)
    )
    df = df[keep].dropna()

    # Vectorized parse: one call instead of a per-row strptime, and it accepts
    # timestamps with or without fractional seconds.
    started = pd.to_datetime(df['starttime'])
    age = reference_year - df['birth year']

    return pd.DataFrame({
        "duration": df['tripduration'],
        # dayofweek is Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
        "weekend": (started.dt.dayofweek >= 5).astype(int),
        "hour": started.dt.hour,
        "start_lat": df['start station latitude'],
        "start_long": df['start station longitude'],
        "end_lat": df['end station latitude'],
        "end_long": df['end station longitude'],
        # Remaining gender codes are shifted down by one so they start at 0.
        "gender": df['gender'] - 1,
        "age": age,
        # Binary target: 1 when the rider is aged 20-29, otherwise 0.
        "twenties": ((age > 19) & (age < 30)).astype(int),
    })


def _build_classifier(n_features):
    """Build and compile the dense network used to classify riders.

    Args:
        n_features: Number of input columns the first layer should accept.

    Returns:
        A compiled ``Sequential`` model with three 64-unit ReLU layers and a
        2-unit softmax output.
    """
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=n_features))
    model.add(Dense(units=64, activation='relu'))
    model.add(Dense(units=64, activation='relu'))
    model.add(Dense(units=2, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


def predict_20s(filename, reference_year=2018):
    """Train a network predicting whether a rider is in their twenties.

    Reads a trip CSV, cleans it, groups rides into 10 K-Means clusters based
    on their start/end coordinates, trains a small dense network on trip
    duration, weekend flag, gender, hour of day and trip cluster, prints the
    loss and accuracy measured on a held-out split, and finally saves both
    the cleaned data and the trained model.

    Side effects:
        * Prints progress and evaluation messages.
        * Writes ``cleaned_<filename>`` (CSV) and ``<filename>_nn.h5``.
          ``filename`` is embedded verbatim in both names.

    Args:
        filename: Path of the trip CSV to read.
        reference_year: Year used to turn ``birth year`` into an age.
            Defaults to 2018, the value that used to be hard-coded.

    Returns:
        None.
    """
    clean_df = _clean_trips(pd.read_csv(filename), reference_year)
    print(f'{filename} successfully cleaned.')

    # Cluster rides on their beginning and ending coordinates.
    coords = clean_df[['start_lat', 'start_long', 'end_lat', 'end_long']]
    kmeans = KMeans(n_clusters=10)
    kmeans.fit(coords)
    predicted_clusters = kmeans.predict(coords)

    # Coordinates are left out of the modelling data because they would be
    # collinear with the trip clusters built from them; age and the target
    # itself are left out as well.
    features = clean_df[['duration', 'weekend', 'hour', 'gender']].copy()
    features['trip_cluster'] = predicted_clusters
    print("Trip clustering complete")
    print("Beginning neural network modelling")

    # Dummy columns for trip cluster and hour of the day.
    features_encoded = pd.get_dummies(features, columns=['trip_cluster', 'hour'])
    y = clean_df['twenties'].values.reshape(-1, 1)

    # Split into training and testing samples; the scaler is fitted on the
    # training input only and then applied to the testing input.
    X_train, X_test, y_train, y_test = train_test_split(
        features_encoded, y, random_state=1, stratify=y)
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # num_classes is pinned so the one-hot width always matches the 2-unit
    # output layer, even when a split happens to contain a single class.
    y_train_categorical = to_categorical(y_train, num_classes=2)
    y_test_categorical = to_categorical(y_test, num_classes=2)

    # The input width follows the encoded data instead of assuming that every
    # hour and every cluster produced a dummy column.
    model = _build_classifier(X_train_scaled.shape[1])
    model.fit(
        X_train_scaled,
        y_train_categorical,
        epochs=5,
        shuffle=True,
        verbose=2
    )

    # Test the model on the held-out data.
    model_loss, model_accuracy = model.evaluate(
        X_test_scaled, y_test_categorical, verbose=2)
    print(
        f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

    # Save cleaned data and nn model.
    clean_df['predicted_cluster'] = predicted_clusters
    clean_df.to_csv(f'cleaned_{filename}')
    model.save(f'{filename}_nn.h5')
# Example invocation (left disabled):
# predict_20s('201802_citibikenyc_tripdata.csv')