-
Notifications
You must be signed in to change notification settings - Fork 0
/
quarto.qmd
121 lines (106 loc) · 3.89 KB
/
quarto.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
---
title: "Heart Failure Survival"
format: html
editor_options:
chunk_output_type: console
execute:
echo: false
warning: false
error: false
---
This report uses the [Heart Failure Clinical Records](https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records) dataset to compare clinical metrics across patient groups, focusing on younger patients' survival outcomes and diabetes status.
## Connect to Data
We start by connecting to the `HEART_FAILURE` database in Snowflake, standardizing column names, and filtering to our age range of interest.
```{python}
import ibis
from plotnine import ggplot, aes, geom_boxplot, labs, theme
from great_tables import GT
# Connect to Snowflake with Ibis
con = ibis.snowflake.connect(
warehouse="DEFAULT_WH",
database="HEART_FAILURE",
schema="PUBLIC",
connection_name="workbench"
)
# Connect to the table and filter the data
heart_failure = con.table("HEART_FAILURE")
# Standardize column names and filter the data
heart_failure_filtered = (
heart_failure.filter(heart_failure.AGE < 50) # Filter to age < 50
.rename(
{
"age": "AGE",
"diabetes": "DIABETES",
"serum_sodium": "SERUM_SODIUM",
"serum_creatinine": "SERUM_CREATININE",
"sex": "SEX",
"death_event": "DEATH_EVENT",
}
)
.select(
["age", "diabetes", "serum_sodium", "serum_creatinine", "sex", "death_event"]
)
)
```
## Age Distribution by Diabetes Status and Survival
The box plot below illustrates the distribution of ages among heart failure patients, grouped by diabetes status and survival outcome (0 = Survived, 1 = Died).
```{python}
heart_failure_plot = (
heart_failure_filtered
.mutate(
death_event=heart_failure_filtered["death_event"].cast("string"),
diabetes=heart_failure_filtered["diabetes"].cast("string")
)
)
(
ggplot(heart_failure_plot, aes(x="death_event", y="serum_sodium", color="diabetes")) +
geom_boxplot() +
labs(
title="Serum Sodium Levels by Diabetes Status and Survival Outcome",
x="Survival Outcome (0 = Survived, 1 = Died)",
y="Serum Sodium (mEq/L)",
color="Diabetes"
) +
theme(legend_position="bottom")
)
```
## Compare Metrics in a Table
In the table below, we compare clinical metrics across patient groups categorized by survival outcome ("Survived" or "Died") and diabetes status ("Yes" or "No"). This breakdown provides insights into how these clinical characteristics differ based on patient health conditions.
```{python}
# Summarize clinical metrics for table display
comparison = (
heart_failure_filtered
.group_by(["death_event", "diabetes"])
.aggregate(
median_age=heart_failure_filtered["age"].median(),
median_serum_creatinine=heart_failure_filtered["serum_creatinine"].median(),
median_serum_sodium=heart_failure_filtered["serum_sodium"].median()
)
.mutate(
death_event=ibis.ifelse(heart_failure_filtered["death_event"] == 1, "Died", "Survived"),
diabetes=ibis.ifelse(heart_failure_filtered["diabetes"] == 1, "Yes", "No"),
median_serum_creatinine=heart_failure_filtered["serum_creatinine"].median().cast("float64")
)
.rename(
{
"Survival Outcome": "death_event",
"Diabetes Status": "diabetes",
"Median Age": "median_age",
"Median Serum Creatinine (mg/dL)": "median_serum_creatinine",
"Median Serum Sodium (mEq/L)": "median_serum_sodium"
}
)
)
(
GT(comparison.execute())
.tab_header(title="Clinical Metrics by Survival Outcome and Diabetes Status")
.fmt_number(
columns=["Median Age", "Median Serum Creatinine (mg/dL)", "Median Serum Sodium (mEq/L)"],
decimals=1
)
.data_color(
columns=["Median Serum Creatinine (mg/dL)", "Median Serum Sodium (mEq/L)"],
palette=["white", "blue"]
)
)
```