# -*- coding: utf-8 -*-
"""
Created on Mon Apr 29 15:15:59 2019
@author: jmatt
"""
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle as pkl
# JEM functions
from PerPersVar_condense import PerPersVar_condense
from drop_illogical import drop_illogical
from make_hist_fig import make_hist_fig
from load_data import load_data
from data_exploration import data_exploration
from extract_areas import extract_areas
from condense_deprivation import condense_deprivation

def load_and_preprocess(data_fn, variables_fn, VariableSet, URL, Drop_extra):
    """
    Loads the data, performs pre-processing actions to prepare the data for
    analysis, then saves the relevant data in a pickle file.

    INPUTS
        data_fn - The location of the data file
        variables_fn - The location of the variables metadata file
        VariableSet - the (user-selected) subset of variables to use in the analysis
        URL - the web address of the data (used if the data are not saved locally)
        Drop_extra - flag to: drop records with a missing response variable and
                     drop columns not in the selected set of variables

    OUTPUTS
        Pickle file saved in the same directory as data_fn
    """
    # Load the data and begin processing
    DataSubset, variables, responseVar = load_data(data_fn, variables_fn, URL,
                                                   VariableSet, Drop_extra)
    # Variables of 'discretionary' consumption (cigarettes, alcohol, meals out, etc.)
    DiscVars = variables['Variable'][variables['Disc'] == 'y']
    # Build a 'discretionary' consumption variable and add it to the dataset
    DataSubset['total_disc'] = DataSubset[DiscVars].sum(axis=1)
    # List of clothing spending variables
    ClothVars = variables['Variable'][variables['Clothing'] == 'y']
    # Build a clothing spending variable and add it to the dataset
    DataSubset['total_clothing'] = DataSubset[ClothVars].sum(axis=1)
    # Find flags (values less than 0) and set all flags to -1
    DataSubset[DataSubset < 0] = -1

    DoExplore = True
    if DoExplore:
        data_exploration(DataSubset, responseVar, DiscVars)
        plt.figure()
        sns.pairplot(DataSubset[['total_income', 'disposable_income',
                                 'total_disc', 'total_clothing', responseVar]])
    # Candidate outlier filter kept for reference (not currently applied):
    # t = DataSubset[DataSubset['total_income'] == 1457066]
    # todrop = t.index
    # DataSubset.drop(todrop, inplace=True)

    NumRecords = DataSubset.shape[0]
    # Drop the records with untrustworthy disposable income data
    t = DataSubset[DataSubset['disposable_income'] == 837797]
    todrop = t.index
    DataSubset.drop(todrop, inplace=True)
    print('\n{} records dropped due to unexpected disposable income of exactly '
          '$837,797 over multiple households'.format(NumRecords - DataSubset.shape[0]))
    plt.figure()
    sns.pairplot(DataSubset[['total_income', 'disposable_income',
                             'total_disc', 'total_clothing', responseVar]])
"""
Drop source variables used to create other variables
"""
#Drop the discretionary spending source variables
DataSubset.drop(DiscVars,axis='columns',inplace=True)
#Drop the clothing spending source variables
DataSubset.drop(ClothVars,axis='columns',inplace=True)
#Drop total income and discretionary income source columns
DataSubset.drop(['nhifeftp','nhifeftn','nhifditp','nhifditn'],axis='columns',inplace=True)
#Drop records where economic data is questionable
DataSubset = drop_illogical(DataSubset,'disposable_income','total_income')
DataSubset = drop_illogical(DataSubset,'total_clothing','total_income')
DataSubset = drop_illogical(DataSubset,'total_disc','total_income')
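    # (Inferred from the name and usage, not verified here: drop_illogical
    # presumably removes rows where the first column exceeds the second, e.g.,
    # disposable income reported as greater than total income; see
    # drop_illogical.py for the exact rule.)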
    # Set negative values to NaN (these entries may be flags); .loc avoids
    # pandas chained-assignment warnings
    DataSubset.loc[DataSubset['total_disc'] < 0, 'total_disc'] = np.nan          # negative spending on alc/cig/meals
    DataSubset.loc[DataSubset['total_clothing'] < 0, 'total_clothing'] = np.nan  # negative spending on clothing
    DataSubset.loc[DataSubset['njbhruw'] > 7*24, 'njbhruw'] = np.nan             # hours worked per week exceeds hours in a week
    # Name of the location variable
    LocVar = 'nhhsgcc'
    # Dict mapping area names to their numeric designations
    LocVals = {'Sydney': 11,
               'Melbourne': 21,
               'Brisbane': 31,
               'Adelaide': 41,
               'Perth': 51}
    LocList = list(LocVals.values())
    LocName = list(LocVals.keys())
    # Dict mapping numeric area designations to area names
    LocNames = dict(zip(LocList, LocName))
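    # LocNames is the inverted mapping, i.e., {11: 'Sydney', 21: 'Melbourne', ...};
    # a dict comprehension, {num: name for name, num in LocVals.items()}, would
    # build the same thing directly.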
    # Keep only the data from urban areas
    AreaData = extract_areas(DataSubset, LocNames, LocVar)
    plt.figure()
    sns.pairplot(AreaData[['total_income', 'disposable_income',
                           'total_disc', 'total_clothing', responseVar]])
    # Extract the per-person variables in the current data set
    PerPersVars = variables['Variable'][(variables[VariableSet] == 'y')
                                        & (variables['per_person'] == 'y')]
    # Find the roots of the per-person variables (the name up to the first digit)
    VarRoots = set([val[:re.search(r'\d', val).start(0)] for val in PerPersVars])
    per_num_var = 'nhhpno'
    for root in VarRoots:
        AreaData = PerPersVar_condense(AreaData, root, per_num_var, DropSource=True)
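    # Illustration (hypothetical column names): per-person columns such as
    # 'xwlep1', 'xwlep2', ... all share the root 'xwlep', and PerPersVar_condense
    # presumably collapses each such group into a single household-level variable,
    # using 'nhhpno' (the per-household person count) in the process.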
    # Make histograms of each variable
    makeplots = False
    if makeplots:
        save_dir = 'Figures/simple_hist/'
        var_descriptions = dict(zip(list(variables['Variable']),
                                    list(variables['Label'])))
        for var_name in AreaData.keys()[4:]:
            data = list(AreaData[var_name])
            num_bins = np.sqrt(len(data)).round().astype(int)
            make_hist_fig(data, num_bins, save_dir, var_name, var_descriptions)
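    # Bin-count note: np.sqrt(len(data)).round() above is the square-root rule
    # of thumb, e.g., 10,000 observations -> 100 bins.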
    # Variables that have a very high proportion of missingness and/or are
    # duplicated by another variable that appears to have less missingness
    NoInfoToDrop = [
        'ncnsc_gu',
        'ncnsc_ge',
        'ncnsc_au',
        'ncnsc_ae',
        'ncnsc_fo',
        'ncnsc_ft',
        'ncnsc_ps',
        'ncnsc_fd',
        'ncnsc_pd',
        'ncnsc_fc',
        'nmdhagt',
        'nmdhamt',
        'nmdhafu',
        'nmdhash',
        'nmdhapm',
        'nmdhawc',
        'nmdhatv',
        'nmdhasm',
        'nmdhawh',
        'nmdharg',
        'nmdhaph',
        'nmdhahci',
        'nmdhawm',
        'nmdhaai',
        'nmdhamv',
        'nmdhamvi',
        'nmdhasa',
        'nmdhasl',
        'nmdhadt',
        'nmdhabp',
        'nmdhahaw']
    Questionable_todrop = [
        'nchctc']
    vars_to_drop = [val for val in list(AreaData.keys()) if val in NoInfoToDrop]
    if len(vars_to_drop) > 0:
        # Drop variables with no usable info
        AreaData.drop(vars_to_drop, axis='columns', inplace=True)
    AreaData = condense_deprivation(AreaData, variables, drop_vars=True)
    # Build the save file string
    save_file = '{}_{}_processed.pkl'.format(data_fn.split('.')[0], VariableSet)
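    # e.g., data_fn='survey.csv' (illustrative name) with VariableSet='set1'
    # yields save_file='survey_set1_processed.pkl'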
    # Make a tuple of the variables to retain
    ToSave = (AreaData, variables, LocVals, LocNames, LocVar)
    # Save the processed data; write-binary mode ('wb') overwrites any existing
    # pickle so a rerun does not append a second copy to the file
    with open(save_file, 'wb') as file:
        pkl.dump(ToSave, file)
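

if __name__ == '__main__':
    # Minimal usage sketch: the file names, variable-set name, and URL below are
    # placeholder assumptions, not values shipped with this module
    load_and_preprocess(data_fn='household_data.csv',
                        variables_fn='variables_metadata.csv',
                        VariableSet='set1',
                        URL='https://example.com/household_data.csv',
                        Drop_extra=True)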