-
Notifications
You must be signed in to change notification settings - Fork 0
/
cov_stackplot.py
132 lines (104 loc) · 5.21 KB
/
cov_stackplot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
def complete_dataframe(df1=None, df2=None, with_nonzero=False):
"""
This function ensures that both DataFrames have the same columns and indexes.
If a column or index in df1 is missing in df2, it will be added to df2.
If with_nonzero is True, missing rows are filled with the previous row's data, otherwise, they are filled with None.
Parameters:
df1: The first DataFrame, serving as the reference.
df2: The second DataFrame, which will be completed.
with_nonzero: If True, rows missing from df2 will be filled with the previous row's values.
Returns:
df2: The modified DataFrame, with missing columns and indexes from df1 added and NaNs filled with 0.
"""
# Include columns from df1 in df2 if they don't exist in df2
COLUMNS1 = df1.columns
COLUMNS2 = df2.columns
for C1 in COLUMNS1:
if C1 not in COLUMNS2:
df2[C1] = None # Add missing columns to df2
# Include indexes from df1 in df2 if they don't exist in df2
INDEX1 = df1.index
INDEX2 = df2.index
for I1 in INDEX1:
if I1 not in INDEX2:
if with_nonzero:
# If the index is missing, add it and fill it with previous row's data
df2.loc[I1] = None
df2 = df2.sort_index()
Idx = np.where(df2.index == I1)[0][0]
if Idx != 0:
df2.loc[I1] = df2.iloc[Idx-1, :] # Fill missing row with previous row
else:
df2.loc[I1] = df2.iloc[Idx+1, :] # If it's the first row, use the next row
else:
df2.loc[I1] = None # If with_nonzero is False, fill with None
# Sort the index of df2 and fill NaN values with 0
df2 = df2.sort_index()
df2 = df2.fillna(0)
return df2
# Update the plot settings with custom parameters for better readability
params = {'legend.fontsize': 14,
'figure.figsize': (20, 16),
'axes.labelsize': 12,
'axes.titlesize': 12,
'xtick.labelsize': 10,
'ytick.labelsize': 10}
plt.rcParams.update(params)
# Load the CSV file containing COVID-19 data from São Paulo
df = pd.read_csv("cov_SP2024-jan-26-jun.csv", sep=",", index_col=False)
# Filter only the relevant columns (country, date of collection, and clade)
df = df[['pais', 'collection_date', 'clado']]
# Reset the DataFrame index
df.reset_index(drop=True, inplace=True)
# Convert the 'collection_date' column to datetime format
df['collection_date'] = pd.to_datetime(df['collection_date'])
# Create a list of unique values from the 'pais' column (locations)
UPA = list(set(df['pais'].values))
UPA = [i for i in UPA if i == i] # Filter out NaN values
UPA.sort() # Sort the list alphabetically
# Group the data by month and clade and count occurrences
df1 = df[['collection_date', 'clado']]
df1['month'] = df1['collection_date'].dt.to_period('M') # Convert dates to periods by month
df1 = df1.groupby(['month', 'clado']).size().unstack(fill_value=0) # Group by month and clade, fill missing values with 0
# Count occurrences of each clade in the dataset
contagem_clado = df['clado'].value_counts()
print(contagem_clado)
# Re-group data by month and clade (duplicate of above code, may not be necessary)
df['collection_date'] = pd.to_datetime(df['collection_date'])
df1 = df[['collection_date', 'clado']]
df1['month'] = df1['collection_date'].dt.to_period('M')
df1 = df1.groupby(['month', 'clado']).size().unstack(fill_value=0)
# Define the colors for each clade in the plot
unique_clades = set(df['clado'])
clade_color_dict = {"23I": "#8dd3c7", "recombinant": "#ffffb3", "23A": "#bebada", "23G": "#fb8072",
"23F": "#80b1d3", "23E": "#fdb462", "23H": "#b3de69", "22E": "#fccde5", "21K": "#d9d9d9"}
# Create a figure for the plot
fig, ax = plt.subplots(figsize=(16, 12))
# Calculate the frequency of each clade per month as a percentage
freq_por_mes = df1.T / df1.sum(axis=1) * 100 # Calculate percentages
freq_por_mes = freq_por_mes.T
# Prepare the labels for the x-axis (dates) and colors for the plot
days = [str(month) for month in freq_por_mes.index] # Convert month periods to strings for plotting
clade_colors = [clade_color_dict.get(c, "#333333") for c in freq_por_mes.columns] # Use clade colors, default to grey if not found
# Plot a stacked area plot showing the frequency of clades over time
lines = ax.stackplot(days, freq_por_mes.T.values, labels=freq_por_mes.columns, colors=clade_colors)
# Get handles and labels for the legend
h, l = ax.get_legend_handles_labels()
handle_dict = {label: handle for handle, label in zip(h, l)}
# Set plot titles and labels
ax.set_title("São Paulo")
ax.set_xlabel('Date')
ax.tick_params(axis='x', rotation=45) # Rotate x-axis labels for better readability
ax.set_ylabel('Frequency (%)')
# Set y-axis limits to ensure the percentage scale goes from 0 to 100
ax.set_ylim(0, 100)
# Add a legend below the plot
fig.legend(handles=list(handle_dict.values()), labels=list(handle_dict.keys()),
loc='lower center', bbox_to_anchor=(0.5, -0.05), ncol=len(unique_clades), fontsize='14')
# Save the figure as a PDF
fig.savefig("cov_SP_2024.pdf", dpi=300, bbox_inches='tight')
# Display the plot
plt.show()