-
Notifications
You must be signed in to change notification settings - Fork 0
/
utilities.py
77 lines (69 loc) · 2.69 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
Sam Lindsay and Peter Xu
CSE 163
This file defines the Utils class, which contains methods related to cleaning
and preparing the cancer data set.
"""
import pandas as pd
class Utils:
    """
    Helpers that clean and prepare the cancer data for analysis and
    visualization.

    Every method is a static method: none of them reads instance state, so
    they can be called either as ``Utils.method(...)`` or on an instance.
    """

    @staticmethod
    def get_mir(data: pd.DataFrame, on, rate_col: str) -> pd.DataFrame:
        """
        Compute a mortality-to-incidence ratio (MIR) for already cleaned data.

        The rows are split by the "EVENT_TYPE" column into "Mortality" and
        "Incidence" subsets, which are inner-merged on the key column(s)
        given by `on`. `rate_col` is cast to float32 first and must not be
        one of the merge keys, because the ratio is read from the suffixed
        columns the merge produces.

        Returns the merged frame with an added "MIR" column. The caller's
        `data` is not modified (the cast works on a copy).
        """
        typed = data.astype({rate_col: 'float32'})
        is_mortality = typed["EVENT_TYPE"] == "Mortality"
        is_incidence = typed["EVENT_TYPE"] == "Incidence"
        joined = pd.merge(typed[is_mortality], typed[is_incidence],
                          how='inner', on=on)
        # pandas suffixes overlapping non-key columns with "_x" (left frame,
        # mortality here) and "_y" (right frame, incidence here).
        joined["MIR"] = joined[rate_col + "_x"] / joined[rate_col + "_y"]
        return joined

    @staticmethod
    def remove_rows(data: pd.DataFrame, chars) -> pd.DataFrame:
        """
        Drop every row in which any cell equals one of the values in `chars`.

        Used to clean out the placeholder characters that stand for a
        missing value. `chars` may be any sized iterable of values. If it is
        None or empty, the original data set is returned unchanged.
        """
        # Test for None first: len(None) would raise a TypeError.
        if chars is None or len(chars) == 0:
            return data
        keep = pd.Series(True, index=data.index)
        for marker in chars:
            # A row survives only if none of its cells equals the marker.
            keep &= (data != marker).all(axis=1)
        return data[keep]

    @staticmethod
    def filter_sex_site_race(data: pd.DataFrame,
                             site="All Cancer Sites Combined",
                             sex="Male and Female",
                             race="All Races") -> pd.DataFrame:
        """
        Filter cancer data by site, sex, and race.

        Each of `site`, `sex`, and `race` that is not None restricts the
        "SITE", "SEX", or "RACE" column, respectively, to that exact value.
        All active filters are combined with a logical AND. If all three
        are None, the original data is returned unchanged.
        """
        requested = (("SITE", site), ("SEX", sex), ("RACE", race))
        mask = None
        for column, value in requested:
            if value is None:
                continue
            matches = data[column] == value
            mask = matches if mask is None else mask & matches
        if mask is None:
            return data
        return data[mask]

    @staticmethod
    def filter_alaska_hawaii(data: pd.DataFrame, colname: str) -> pd.DataFrame:
        """
        Return the cancer data without the rows for Alaska or Hawaii.

        `colname` names the column that holds the state as its USPS
        abbreviation; rows whose value there is "AK" or "HI" are dropped.
        """
        excluded = data[colname].isin(["AK", "HI"])
        return data[~excluded]