-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_processing.py
182 lines (127 loc) · 6.13 KB
/
data_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
"""
data_processing: Script to preprocess the harmonised data
=========================================================
.. moduleauthor:: Lilian MAREY <[email protected]>
"""
import pathlib
from datetime import date
from sys import exit

import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
from dash.dependencies import Input, Output

# Local star imports stay last and in their original order: names they
# export may override earlier ones, so their relative order matters.
from src.mysettings import *
from src.helpers import *
from src.preprocess import *
import src.plots as plt
##########################################
# Load the input data
# Harmonised data on cases, deaths and tests: rows 0, 1 and 2 of the CSV file
# are skipped, then only the eight columns listed below are kept.
print('Importation of input CSV files...')
df_harmonised = pd.read_csv(
    HARMONISED_DATA_PATH,
    skiprows=[0, 1, 2],
)[['Country', 'Region', 'Date', 'Sex', 'Age', 'Cases', 'Deaths', 'Tests']]
print('Done')
###########################################
# Drop the rows of a source detected as unreliable (Country == 'UK').
# `~` is the negation operator for boolean masks; unary minus is rejected by
# numpy on boolean arrays, so it should not be relied upon here.
df_harmonised = df_harmonised[~df_harmonised['Country'].isin(['UK'])]
###########################################
# Taking subset of data for experimentation : different sets depending on what you want to process
print('Taking subset of data for experimentation...')
# for processing the whole data : no selection (comment out the selection below)
# active selection : one region of the USA
# Both conditions are combined in a single mask, and the result is copied so
# that the columns added later are written to an independent frame rather
# than to a slice (avoids pandas' SettingWithCopyWarning).
df_harmonised = df_harmonised[
    df_harmonised['Country'].isin(['USA'])
    & df_harmonised['Region'].isin(['Ohio'])
].copy()
# 'North Dakota', 'Ohio', 'Oklahoma'
# for quick tests
# df_harmonised = df_harmonised[df_harmonised['Region'].isin(['Corse'])]
# df_harmonised = df_harmonised[df_harmonised['Sex'].isin(['b'])]
# df_harmonised = df_harmonised[df_harmonised['Age'].isin([80])]
# for an histogram
# df_harmonised = df_harmonised[df_harmonised['Country'].isin(['France'])]
# df_harmonised = df_harmonised[df_harmonised['Region'].isin(['Ile-de-France'])]
# df_harmonised = df_harmonised[df_harmonised['Sex'].isin(['b'])]
# a quite complete set, quite fast to process, for a complete demonstration
# df1 = df_harmonised[df_harmonised['Region'].isin(['All'])].copy()
# df1 = df1[df1['Sex'].isin(['b'])]
# df1 = df1[df1['Age'].isin([80])]
# df2 = df_harmonised[df_harmonised['Country'] == 'France'].copy()
# df3 = df_harmonised[df_harmonised['Country'].isin(['USA'])]
# df3 = df3[df3['Sex'].isin(['b'])]
# df_harmonised = df3.merge(df1.merge(df2, 'outer'), 'outer')
# for a world map
# df_harmonised = df_harmonised[df_harmonised['Region'].isin(['All'])]
# df_harmonised = df_harmonised[df_harmonised['Sex'].isin(['b'])]
# df_harmonised = df_harmonised[df_harmonised['Age'].isin([80])]
# for a USA map
# df_harmonised = df_harmonised[df_harmonised['Country'].isin(['USA'])]
# df_harmonised = df_harmonised[df_harmonised['Sex'].isin(['b'])]
# df_harmonised = df_harmonised[df_harmonised['Age'].isin([80])]
###########################################
# Adding CFR column to the Harmonised data
# computeRatio is called with the Deaths column, then the Cases column.
print('Adding CFR column to the Harmonised data')
df_harmonised['CFR'] = computeRatio(
    df_harmonised['Deaths'],
    df_harmonised['Cases'],
)
print('Done')
###########################################
# Adding Tests by cases column to the Harmonised data
# computeRatio is called with the Tests column, then the Cases column.
# The progress message names the column that is actually created.
print('Adding Tests by cases column to the Harmonised data')
df_harmonised['Tests by cases'] = computeRatio(
    df_harmonised['Tests'],
    df_harmonised['Cases'],
)
print('Done')
###########################################
# Sorting data
# (no 'Done' is printed here any more: nothing has been done yet at this point)
print('Data shape : ', df_harmonised.shape)
print('Creating date_code column...')
# computeDatecode is applied to each row (axis=1); its result is the sort key.
df_harmonised['Date_code'] = df_harmonised.apply(computeDatecode, axis=1)
print('Done')
print('Sorting Dataframe...')
df_harmonised = df_harmonised.sort_values(by=['Date_code'])
print('Done')
###########################################
# Computing daily/weekly/monthly data
# NOTE(review): description kept from the original comment; meltDataframe is
# defined outside this script, confirm what it actually produces.
print('Melting dataframe...')
df_harmonised = meltDataframe(df_harmonised)
print('Done')
###########################################
# Adding gap_in_day and label column to the harmonised data
def _age_band_label(row):
    """Build the 'Country - Region - Age - Gender' label of one row.

    The age part is written as '<Age>-<Age + 4>' when Age equals 100 and as
    '<Age>-<Age + 9>' otherwise; the gender part is looked up in
    label_gender with the row's Sex value.
    """
    band_end = row.Age + (4 if row.Age == 100 else 9)
    return (
        str(row.Country) + ' - '
        + row.Region + ' - '
        + str(row.Age) + '-' + str(band_end)
        + ' ans - ' + label_gender[row.Sex]
    )


print('Creating gap_in_day column...')
# Only the Date column is needed, so map over it instead of building a full
# row object for every line of the frame.
df_harmonised['gap_in_day'] = df_harmonised['Date'].map(
    lambda day: time_delta('01/01/2020', day)
)
print('Done')
print('Creating Country - Region - Age - Gender column...')
df_harmonised['Country - Region - Age - Gender'] = df_harmonised.apply(
    _age_band_label,
    axis=1,
)
print('Done')
###########################################
# Time metrics (delegated to build_time_metrics)
print('Building time metrics...')
df_harmonised = build_time_metrics(df_harmonised)
print('Time metrics built')
###########################################
# Date label column: computeDateFormat is evaluated on every row
date_labels = df_harmonised.apply(computeDateFormat, axis=1)
df_harmonised['Date_format'] = date_labels
###################################
# Region names of the original dataset are aligned with the other datasets.
# NOTE(review): the 'Country - Region - Age - Gender' label column is built
# earlier in the script, so it still carries the uncorrected names - confirm
# this is intended.
df_harmonised['Region'] = df_harmonised['Region'].replace(
    {'Lousiana': 'Louisiana', 'NYC': 'New York'}
)
###################################
# Values by population: computeValuebyPop is evaluated on every row
pop_values = df_harmonised.apply(computeValuebyPop, axis=1)
df_harmonised['Value_by_pop'] = pop_values
###################################
# Creation of file
# The CSV is written to the 'data' directory located next to this script.
# The path is joined with pathlib rather than by string concatenation, so the
# separator is handled by the library.
PATH = pathlib.Path(__file__).parent
df_harmonised.to_csv(PATH / 'data' / 'preprocessed_data_update_6.csv')
print('File created')