-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_data.py
191 lines (146 loc) · 6.55 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/env python
"""Cleans up a directory of speed data into a single CSV file ready for analysis
Author: Andrew Clarry
Date: 15/06/16
"""
import os
import datetime as dt
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.neighbors import KernelDensity
DATA_DIR = "Data"
EMME_VOLUME_DATA = os.path.join(DATA_DIR, "EMME 2011 VOLUME SUMMARY.OCT2015.v2.csv")
EMME_LINK_DATA = os.path.join(DATA_DIR, "EMME_link_data.csv")
RAW_DATA_DIR = os.path.join(DATA_DIR, "Geoprocessed Data", "Processed CSVs")
CLEANED_DATA_DIR = os.path.join(DATA_DIR, "Cleaned Data")
def clean_trips(directory_name, cache_file=None, clean_users=True):
"""Returns a Pandas dataframe of cleaned, aggregated trips from a directory.
"""
csv_list = [os.path.join(directory_name, f) for f in os.listdir(directory_name)
if os.path.splitext(f)[1] == ".csv"]
print("Reading in data")
df_list = [pd.read_csv(csv, parse_dates=["STARTED_AT"])
for csv in csv_list]
print("Cleaning trips")
data = pd.concat([clean_trip(d) for d in df_list])
data = clean_data(data, clean_users=clean_users)
print("Writing cleaned data to %s" % cache_file)
data.to_csv(cache_file, encoding="utf8")
print("Successfully wrote data to csv")
def clean_trip(df):
"""Cleans a pandas dataframe corresponding to a single trip"""
df = df[df["CUMUL_METE"] != 0]
trip_len = df['CUMUL_METE'].max()
df['trip_length'] = trip_len
df.loc[df["SIG_DIST"] < 0, "SIG_DIST"] = trip_len
df.loc[df["STOP_DIST"] < 0, "STOP_DIST"] = trip_len
return df
def add_bike_code(df):
"""Adds bike facility information to the data frame"""
df['bike_lanes'] = df['BIKE_CODE'] == 11
df['sharrows'] = df['BIKE_CODE'].isin((3, 4))
df['bike_path'] = df['BIKE_CODE'].isin((6, 7))
return df
def add_route_stats(df):
"""Adds information about the individual link"""
df['SLOPE_TF'] = df['SLOPE_TF'] * df['LINK_DIR'] * (-1)
return df
def add_emme_volume_stats(df, emme_volume_csv):
volume_df = pd.read_csv(emme_volume_csv)
same_dir = df["LINK_DIR"] == 1
df.loc[same_dir, "EMME_ID"] = df.loc[same_dir, "EMME_MATCH"]
df.loc[~same_dir, "EMME_ID"] = df.loc[~same_dir, "EMME_CONTR"]
# Add volume info for each point
df = pd.merge(df, volume_df, left_on="EMME_ID", right_on="LINK_ID", how="left")
time_categories = (("AM_VOL", 6, 10),
("MID_VOL", 10, 16),
("PM_VOL", 16, 19),
("EVE_VOL", 19, 6))
dt_index = pd.DatetimeIndex(df["RECORDED_A"])
for period, start, stop in time_categories:
df[period] = df[period].fillna(0)
if start < stop:
index = (start <= dt_index.hour) & (dt_index.hour < stop)
else:
index = (start <= dt_index.hour) | (dt_index.hour < stop)
df.loc[index, "volume"] = df.loc[index, period]
return df
def add_emme_stats(df, emme_volume_csv=EMME_VOLUME_DATA, emme_link_csv=EMME_LINK_DATA):
"""Adds information from the corresponding EMME link"""
print("Reading emme volume csv")
df = add_emme_volume_stats(df, emme_volume_csv)
print("Reading emme link csv")
link_df = pd.read_csv(emme_link_csv)
LINK_INFO_COLS = ["ID", "DATA2", "LANES", "VDF"]
# Add link info for each point
df = pd.merge(df, link_df.loc[:, LINK_INFO_COLS], left_on="EMME_ID", right_on="ID", how="left")
df["speed_limit"] = df["DATA2"]
df.drop("DATA2", axis=1, inplace=True)
df.loc[df.volume == 0, "volume"] = 0
df.loc[df.VDF == 0, "VDF"] = 90
df.loc[df.LANES == 0, "LANES"] = 1
df.loc[df.speed_limit == 0, "speed_limit"] = 40
df.loc[df["EMME_ID"].isin(("", " ")), ("volume", "speed_limit", "LANES", "VDF")] = (0, 40, 2, 90)
return df
def estimate_user_age_dist(df, bandwidth=0.2):
"""Converts the users age to a continuous variable
Uses a gaussian kernel to create a kernel density estimate of the age
distribution, so that ages can be sampled from the age ranges more correctly.
Returns a numpy array representing the distribution.
"""
ranges = ((1, 0, 18), (2, 18, 25), (3, 25, 35),
(4, 35, 50), (5, 50, 65), (6, 65, 85))
age_samples = np.concatenate(
[np.random.uniform(lower, upper, (df['AGE'] == ind).sum())
for ind, lower, upper in ranges])
age_samples = np.random.choice(age_samples, size=2000)[:, np.newaxis]
kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(age_samples)
for ind, lower, upper in ranges:
sample = kde.sample(len(df))
range_sample = sample[(sample >= lower) & (sample <= upper)]
df.loc[df['AGE'] == ind, 'age_sample'] = np.random.choice(range_sample,
size=(df['AGE'] == ind).sum())
return df
def add_user_stat(df):
"""Adds traits for the user"""
df['is_male'] = df['GENDER'] == 1
df.loc[df["GENDER"] == " ", "GENDER"] = 0
df.loc[df["AGE"] == " ", "AGE"] = 0
df.loc[df["CYCLING_LE"] == " ", "CYCLING_LE"] = 0
df['road_comfortable'] = df['CYCLING_LE'] == 2
df['traffic_comfortable'] = df['CYCLING_LE'] == 3
return df
def filter_missing_survey_vals(df):
"""Filters out rows of data without user survey responses
Filters out rows from the input dataframe where one of the user survey
responses in the model has no data (has a zero value)
Many of the users using the app did not fill out the whole user survey.
As a first approximation of a regression on the full dataset, we can just
filter out any rows that don't have full data.
Note that this leaves us with about half the original dataset.
"""
df = df[(df.AGE != 0) & (df.CYCLING_LE != 0) & (df.GENDER != 0)]
return df
def clean_data(data, clean_users=True):
"""Cleans the input point speed dataset
"""
print("Ading bike codes")
data = add_bike_code(data)
print("Adding route stats")
data = add_route_stats(data)
print("Adding EMME data")
data = add_emme_stats(data)
print("Cleaning speed data")
data.loc[data["SPEED"] < 0, "SPEED"] = 0
if clean_users:
print("Adding user stats")
data = add_user_stat(data)
print("Performing user age estimate")
data = estimate_user_age_dist(data, bandwidth=5.0)
print("Filtering out missing values from user survey")
#data = filter_missing_survey_vals(data)
return data
if __name__ == "__main__":
cached_cleaned = os.path.join(CLEANED_DATA_DIR, "cleaned_data.csv")
clean_trips(RAW_DATA_DIR, cache_file=cached_cleaned)