-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyze.py
118 lines (99 loc) · 4.88 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Sam Lindsay and Peter Xu
CSE 163
Top level program that is used to analyze the cancer data set. Loads the data,
cleans it, then creates visualizations.
"""
import pandas as pd
import geopandas as gpd
from utilities import Utils
from plots import CancerPlots
# Locations of the input files, relative to the working directory.
# Forward slashes are used on purpose: Python's file APIs accept them on
# Windows as well as on POSIX systems, whereas backslash-separated paths only
# resolve on Windows.
AREA_DATA_PATH = "data/USCS-1999-2018-ASCII/BYAREA.TXT"
SITE_DATA_PATH = "data/USCS-1999-2018-ASCII/BYSITE.TXT"
COUNTY_DATA_PATH = "data/USCS-1999-2018-ASCII/BYAREA_COUNTY.TXT"
SHP_DATA_PATH = "data/2020_us_county_shp/cb_2020_us_county_20m.shp"
def state_change(data):
    """
    Computes the relative change in MIR for each AREA in the DataFrame data
    and plots the MIR of every AREA over time.

    The data is first narrowed with Utils.filter_sex_site_race using its
    default arguments (described by the authors as all races, sexes, and
    types of cancer combined, so that states can be compared fairly), then
    reduced to the AREA, AGE_ADJUSTED_RATE, EVENT_TYPE and YEAR columns.
    Rows are dropped by Utils.remove_rows for the characters '~', '+', '.'
    and '-', and Utils.get_mir derives an MIR value per AREA and YEAR.

    The year-sorted result is handed to CancerPlots.make_state_plot, which
    according to the authors saves the plot to "state_improvement_plot.html".

    Returns, for each AREA, (last MIR - first MIR) / first MIR taken in YEAR
    order. The value is a fraction (it is not multiplied by 100) and the
    result is indexed by AREA.
    """
    flag_chars = ['~', '+', '.', '-']
    rate_columns = ['AREA', 'AGE_ADJUSTED_RATE', 'EVENT_TYPE', 'YEAR']

    combined = Utils.filter_sex_site_race(data)
    rates = Utils.remove_rows(data=combined[rate_columns], chars=flag_chars)
    mir = Utils.get_mir(data=rates,
                        on=['AREA', 'YEAR'],
                        rate_col='AGE_ADJUSTED_RATE')

    # Sort by year so that first()/last() below pick the earliest and latest
    # MIR of every group.
    mir_by_year = mir[["AREA", "YEAR", "MIR"]].sort_values(["YEAR"])
    CancerPlots.make_state_plot(mir_by_year)

    # NOTE(review): pandas first()/last() return the first/last non-null
    # entry, so an AREA with a missing MIR in its boundary year is compared
    # using the nearest available year instead.
    mir_per_area = mir_by_year.groupby(by="AREA")['MIR']
    earliest = mir_per_area.first()
    latest = mir_per_area.last()
    return (latest - earliest) / earliest
def cancer_change(data):
    """
    Computes the relative change in MIR for each SITE (type of cancer) in the
    DataFrame data and plots the MIR of every SITE over time.

    The data is narrowed with Utils.filter_sex_site_race called with
    site=None (described by the authors as keeping all races and all sexes
    combined while retaining every type of cancer). Rows are then dropped by
    Utils.remove_rows for the characters '~', '+', '.' and '-', the SITE,
    AGE_ADJUSTED_RATE, EVENT_TYPE and YEAR columns are kept, and
    Utils.get_mir derives an MIR value per SITE and YEAR.

    The year-sorted result is handed to CancerPlots.make_cancer_plot, which
    according to the authors saves the plot to "cancer_type_plot.html".

    Returns, for each SITE, (last MIR - first MIR) / first MIR taken in YEAR
    order. The value is a fraction (it is not multiplied by 100) and the
    result is indexed by SITE.
    """
    flag_chars = ['~', '+', '.', '-']
    rate_columns = ['SITE', 'AGE_ADJUSTED_RATE', 'EVENT_TYPE', 'YEAR']

    combined = Utils.filter_sex_site_race(data, site=None)
    # Rows are removed before the column selection here, so remove_rows sees
    # every column of the filtered data.
    cleaned = Utils.remove_rows(data=combined, chars=flag_chars)
    mir = Utils.get_mir(data=cleaned[rate_columns],
                        on=['SITE', 'YEAR'],
                        rate_col='AGE_ADJUSTED_RATE')

    # Sort by year so that first()/last() below pick the earliest and latest
    # MIR of every group.
    mir_by_year = mir[["YEAR", "MIR", "SITE"]].sort_values(["YEAR"])
    CancerPlots.make_cancer_plot(mir_by_year)

    # NOTE(review): pandas first()/last() return the first/last non-null
    # entry, so a SITE with a missing MIR in its boundary year is compared
    # using the nearest available year instead.
    mir_per_site = mir_by_year.groupby(by="SITE")['MIR']
    earliest = mir_per_site.first()
    latest = mir_per_site.last()
    return (latest - earliest) / earliest
def create_interactive(by_county, counties):
    """
    Takes in two data sets: "by_county" is a DataFrame containing cancer data
    that is broken down by county and "counties" is geospatial data defining
    the shape of each county. Does not return anything; it passes the
    per-county MIR data and the county shapes to CancerPlots.generate_map,
    which according to the authors creates an interactive visualization of
    the MIR in each county filtered by race and saves it to
    "state_race_map.html".

    Both inputs are first passed through Utils.filter_alaska_hawaii (keyed on
    the "STATE" and "STUSPS" columns respectively) and then trimmed to the
    columns used downstream. Neither input object is reassigned for the
    caller; the trimmed frames are copies.
    """
    by_county = Utils.filter_alaska_hawaii(by_county, "STATE")
    counties = Utils.filter_alaska_hawaii(counties, "STUSPS")

    by_county_c = by_county[["AREA", "RACE", "SITE", "YEAR", "EVENT_TYPE",
                             "AGE_ADJUSTED_RATE", "SEX", "STATE"]].copy()
    # Filter the trimmed copy. Filtering the full "by_county" frame here
    # would silently discard the column selection made just above.
    by_county_c = Utils.filter_sex_site_race(by_county_c, race=None)
    counties_c = counties[["GEOID", "NAMELSAD", "STUSPS", "STATE_NAME",
                           "geometry"]].copy()

    by_county_c = Utils.remove_rows(data=by_county_c,
                                    chars=['+', '~', '.', '-'])
    # MIR is derived per county, race, sex, site and year so the map can be
    # broken down by race.
    by_county_c = Utils.get_mir(data=by_county_c,
                                on=['AREA', 'RACE', 'SEX', 'SITE', 'YEAR'],
                                rate_col='AGE_ADJUSTED_RATE')
    CancerPlots.generate_map(by_county_c, counties_c)
def main():
    """
    Loads the three pipe-delimited cancer data files and the county
    shapefile, then runs the three analyses in order: MIR change by state,
    MIR change by type of cancer, and the interactive county map. The largest
    and smallest changes for the first two analyses are printed.
    """
    read_options = {"sep": "|", "low_memory": False}
    by_area = pd.read_csv(AREA_DATA_PATH, **read_options)
    by_site = pd.read_csv(SITE_DATA_PATH, **read_options)
    by_county = pd.read_csv(COUNTY_DATA_PATH, **read_options)
    counties = gpd.read_file(SHP_DATA_PATH)

    # Question 1 - Which state had the greatest change?
    change_by_state = state_change(by_area)
    print(f"Most positive change_by_state in MIR: "
          f"{change_by_state.idxmax()!s} {change_by_state.max()!s}")
    print(f"Most negative change_by_state in MIR: "
          f"{change_by_state.idxmin()!s} {change_by_state.min()!s}")

    # Question 2 - Which type of cancer had the greatest change?
    change_by_cancer = cancer_change(by_site)
    print(f"Most positive change_by_cancer in MIR: "
          f"{change_by_cancer.idxmax()!s} {change_by_cancer.max()!s}")
    print(f"Most negative change_by_cancer in MIR: "
          f"{change_by_cancer.idxmin()!s} {change_by_cancer.min()!s}")

    # Question 3 - In the same county, what is the difference in racial
    # groups?
    create_interactive(by_county, counties)
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()