-
Notifications
You must be signed in to change notification settings - Fork 0
/
flu_state_stackplot_no_fill.py
119 lines (97 loc) · 5.16 KB
/
flu_state_stackplot_no_fill.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Import necessary libraries
import pandas as pd # For data manipulation and analysis
import matplotlib.pyplot as plt # For plotting graphs
import numpy as np # For numerical operations
def complete_dataframe(df1=None, df2=None, with_nonzero=False):
    """Return a copy of ``df2`` extended to cover every column and index label of ``df1``.

    Args:
        df1: Reference DataFrame; all of its columns and index labels must
            end up present in the result.
        df2: DataFrame to extend. It is copied first, so the caller's object
            is never modified.
        with_nonzero: When True, each added row copies the row that sorts
            immediately before it (or, if it sorts first, the row immediately
            after it). When False, added rows are left empty.

    Returns:
        A new DataFrame sorted by index in which every remaining missing
        value has been replaced by 0.
    """
    # Work on a copy: the column and row insertions below would otherwise
    # leak into the DataFrame owned by the caller.
    df2 = df2.copy()

    # Add the columns that df1 has and df2 lacks (empty for now).
    original_columns = df2.columns
    for column in df1.columns:
        if column not in original_columns:
            df2[column] = None

    # Add the index labels that df1 has and df2 lacks.
    original_index = df2.index
    for label in df1.index:
        if label in original_index:
            continue

        df2.loc[label] = None  # append an empty row for the missing label
        if not with_nonzero:
            continue

        # Sort so that "previous"/"next" refer to index order, then locate
        # the freshly inserted row.
        df2 = df2.sort_index()
        position = np.where(df2.index == label)[0][0]
        if position != 0:
            df2.loc[label] = df2.iloc[position - 1, :]
        elif len(df2) > 1:
            # The new row sorts first: borrow from the row after it.
            df2.loc[label] = df2.iloc[position + 1, :]
        # Otherwise the new row is the only row, so there is nothing to
        # copy from; it stays empty and becomes 0 below.

    df2 = df2.sort_index()
    df2 = df2.fillna(0)
    return df2
# Matplotlib defaults applied to every figure created after this point
params = {
    "figure.figsize": (20, 10),  # default figure size
    "legend.fontsize": 14,       # legend text
    "axes.titlesize": 12,        # axes title
    "axes.labelsize": 12,        # x/y axis labels
    "xtick.labelsize": 10,       # x-axis tick labels
    "ytick.labelsize": 10,       # y-axis tick labels
}
plt.rcParams.update(params)
# Load the sample metadata. index_col=False tells pandas not to use the
# first CSV column as the index.
df = pd.read_csv("result_metadata_final_graficos_estado_com_UPAS.csv", sep=",", index_col=False)

# Keep only the columns used below. The explicit copy makes the later column
# assignments act on an independent frame rather than on a slice of the raw
# table, which would trigger pandas' SettingWithCopyWarning.
df = df[['estado', 'date', 'short-clade']].copy()
df.reset_index(drop=True, inplace=True)
df['date'] = pd.to_datetime(df['date'])

# Monthly window covered by the plot: April 2023 through May 2024, inclusive.
start_date = pd.Period('2023-04', freq='M')
end_date = pd.Period('2024-05', freq='M')
all_months = pd.period_range(start=start_date, end=end_date, freq='M')

# Count samples per (month, clade); after unstack each clade is a column.
df['month'] = df['date'].dt.to_period('M')
df1 = df.groupby(['month', 'short-clade']).size().unstack(fill_value=0)

# Keep exactly the months of the window; months without samples get 0 counts.
df1 = df1.reindex(all_months, fill_value=0)

# Make sure every clade present in the data has a column. Missing clade
# values are skipped so they do not become a column of their own, and any
# column added here is filled with 0 (not NaN), consistent with the month
# reindex above.
all_clades = df1.columns.union(df['short-clade'].dropna().unique())
df1 = df1.reindex(columns=all_clades, fill_value=0)
# Colour used for each known clade; any clade not listed here is drawn in
# dark grey (see the fallback passed to .get() below).
clade_color_dict = {
    "5a.2a.1": "#8dd3c7",  # light teal
    "5a.2a": "#fccde5",    # pink
    "6B": "#bebada",       # lavender
    "6B.1A": "#fb8072",    # coral
    "5a.1": "#80b1d3",     # light blue
}

# This explicit size overrides the 'figure.figsize' default for this figure.
fig, ax = plt.subplots(figsize=(16, 8))

# Share of each clade within every month, in percent. A month with no samples
# divides 0 by 0 and therefore stays NaN; it is intentionally not filled.
monthly_totals = df1.sum(axis=1)
freq_por_mes = df1.div(monthly_totals, axis=0) * 100

# x positions are the month labels as strings (e.g. "2023-04"); colours follow
# the column order of the frequency table.
days = [str(month) for month in freq_por_mes.index]
clade_colors = [clade_color_dict.get(clade, "#333333") for clade in freq_por_mes.columns]

# Stacked area plot: one layer per clade, one x position per month.
ax.stackplot(days, freq_por_mes.T.values, labels=freq_por_mes.columns, colors=clade_colors)

# Collect one legend handle per label (a repeated label keeps its last handle).
handles, labels = ax.get_legend_handles_labels()
handle_dict = dict(zip(labels, handles))

# Title, axis labels and tick formatting.
ax.set_title("São Paulo State", fontsize=20)
ax.set_xlabel('Date', fontsize=16)
ax.tick_params(axis='x', rotation=45)  # rotated so the month labels do not overlap
ax.set_ylabel('Frequency (%)', fontsize=16)

# Percentages: fix the y-axis to the 0-100 range.
ax.set_ylim(0, 100)

# Single-row legend below the axes. The column count comes from the entries
# actually shown, not from the raw data column, which may contain missing
# values that are never plotted.
fig.legend(
    handles=list(handle_dict.values()),
    labels=list(handle_dict.keys()),
    loc='lower center',
    bbox_to_anchor=(0.5, -0.09),
    ncol=len(handle_dict),
    fontsize=14,
)

# bbox_inches='tight' keeps the legend, which sits outside the axes, inside
# the saved PDF.
fig.savefig("flu_sp_state_2024.pdf", dpi=400, bbox_inches='tight')

# Display the plot.
plt.show()