-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_exploration.py
88 lines (68 loc) · 3.14 KB
/
data_exploration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 28 22:54:56 2019
@author: jmatt
"""
import matplotlib.pyplot as plt
def _scatter_figure(x, y, x_label, y_label):
    """Open a new figure and plot y against x as unconnected dots with axis labels."""
    plt.figure()
    plt.plot(x, y, '.')
    plt.xlabel(x_label)
    plt.ylabel(y_label)


def data_exploration(DataSubset, responseVar, DiscVars):
    """
    Performs preliminary exploration of the dataset.

    Draws six matplotlib figures (two histograms, four scatter plots), prints
    a few sanity checks to stdout, and adds two derived columns to DataSubset.
    The figures are created but plt.show() is not called here.

    INPUTS
        DataSubset - pandas dataframe of the data. Besides responseVar and the
            DiscVars columns it must contain 'nhifeftp', 'nhifeftn',
            'nhifditp', 'nhifditn', 'total_disc' and 'total_clothing', plus an
            '<name>i' column for each of the checked DiscVars.
        responseVar - the column name of the response variable
        DiscVars - the column names of the discretionary spending variables
    OUTPUT
        None. As a side effect the columns 'total_income' and
        'disposable_income' are added to DataSubset in place.
    """
    # Column names of the split income variables. The 'p' column holds the
    # positive part and the 'n' column the negative part of each amount.
    total_p = 'nhifeftp'  # total income
    total_n = 'nhifeftn'  # total income
    disp_p = 'nhifditp'  # disposable income
    disp_n = 'nhifditn'  # disposable income

    # Net incomes: the 'negative' columns are stored as positive magnitudes
    # (see the note further down), hence the subtraction.
    t_inc = DataSubset[total_p] - DataSubset[total_n]
    d_inc = DataSubset[disp_p] - DataSubset[disp_n]

    # Distribution of the response variable
    plt.figure()
    plt.hist(DataSubset[responseVar], bins=11)
    plt.xlabel('Subjective Wellbeing')

    # Distribution of net total income (log-scaled counts)
    plt.figure()
    plt.hist(t_inc, log=True, bins=100)
    plt.xlabel('Total Income')

    # Check for differences between imputed and non-imputed variables.
    # Only the first three variables are checked, as before; slicing (rather
    # than indexing with range(3)) avoids an IndexError when fewer are given.
    max_checked = 3
    dv1 = list(DiscVars)[:max_checked]  # non-imputed
    dv2 = [val + 'i' for val in dv1]  # imputed
    print('\nCheck for differences between imputed and non-imputed variables')
    for raw_name, imputed_name in zip(dv1, dv2):
        # Compare the raw column against its imputed counterpart (the previous
        # code subtracted the raw column from itself, which is always zero).
        test = DataSubset[raw_name] - DataSubset[imputed_name]
        print('{} minus {}==> min:{} max:{}'.format(
            raw_name, imputed_name, test.min(), test.max()))

    # The backslashes are escaped explicitly so the literal no longer relies
    # on the invalid '\l' escape; the printed text is unchanged.
    print('\nConfirm that the positive\\negative income & disposable income '
          'variables are greater-than\\less-than zero respectively')
    print('min of positive total income: {}'.format(DataSubset[total_p].min()))
    # note values in 'negative income' columns are provided as positive values
    # NOTE(review): .min() of the magnitude column is printed under a "max"
    # label - presumably because the largest negative income corresponds to
    # the smallest magnitude; confirm this is the intended check.
    print('max of negative total income: {}'.format(DataSubset[total_n].min()))
    print('min of positive disposable income: {}'.format(DataSubset[disp_p].min()))
    # note values in 'negative income' columns are provided as positive values
    print('max of negative disposable income: {}'.format(DataSubset[disp_n].min()))

    # Add the combined income variables to the dataset
    DataSubset['total_income'] = t_inc
    DataSubset['disposable_income'] = d_inc

    disc_label = 'Annual spending on alcohol/cigarettes/meals-eaten-out ($)'
    clothing_label = 'Annual spending on clothing ($)'
    total_label = 'Annual total income ($)'

    _scatter_figure(d_inc, t_inc, 'Annual disposable income ($)', total_label)
    _scatter_figure(t_inc, DataSubset['total_disc'], total_label, disc_label)
    _scatter_figure(t_inc, DataSubset['total_clothing'], total_label, clothing_label)
    _scatter_figure(DataSubset['total_disc'], DataSubset['total_clothing'],
                    disc_label, clothing_label)