From c380ea2ed54c59733308292d540d9344a41c1ad3 Mon Sep 17 00:00:00 2001 From: johaGL Date: Fri, 6 Oct 2023 16:34:21 +0200 Subject: [PATCH 1/2] processing & visualization: np.around(ARR, decimals=6) --- .../method/isotopologue_proportions_plot.yaml | 2 +- src/dimet/processing/differential_analysis.py | 38 +++++++++++++++++-- .../fit_statistical_distribution.py | 5 ++- src/dimet/processing/pca_analysis.py | 5 +++ src/dimet/visualization/abundance_bars.py | 8 +++- .../visualization/isotopologue_proportions.py | 21 ++++++++-- .../mean_enrichment_line_plot.py | 15 +++++++- src/dimet/visualization/metabologram.py | 7 ++++ tests/test_differential_analysis.py | 23 +++++++++++ 9 files changed, 112 insertions(+), 12 deletions(-) diff --git a/src/dimet/config/analysis/method/isotopologue_proportions_plot.yaml b/src/dimet/config/analysis/method/isotopologue_proportions_plot.yaml index e16bf99..65aa5dd 100644 --- a/src/dimet/config/analysis/method/isotopologue_proportions_plot.yaml +++ b/src/dimet/config/analysis/method/isotopologue_proportions_plot.yaml @@ -5,7 +5,7 @@ name: Generate isotopologues stack plots # here put 'hidden' advanced defaults figure_format: svg -max_nb_carbons_possible: 12 +max_nb_carbons_possible: 18 # up to 30 are supported appearance_separated_time: True # adds a space between timepoints, conditions stay comparative split_plots_by_condition: False # prints each condition in independent plots height_each_stack: !!float 4.9 diff --git a/src/dimet/processing/differential_analysis.py b/src/dimet/processing/differential_analysis.py index 6b2cd1e..7b1bf21 100644 --- a/src/dimet/processing/differential_analysis.py +++ b/src/dimet/processing/differential_analysis.py @@ -278,7 +278,8 @@ def run_distribution_fitting(df: pd.DataFrame): autoset_tailway = auto_detect_tailway(df, best_distribution, args_param) logger.info(f"auto, best pvalues calculated : {autoset_tailway}") df = compute_p_value(df, autoset_tailway, best_distribution, args_param) - + df["zscore"] = np.around( + df["zscore"].astype(float).to_numpy(), decimals=6) return df @@ -345,6 +346,31 @@ def reorder_columns_diff_end(df: pd.DataFrame) -> pd.DataFrame: return df +def round_result_float_columns(df: pd.DataFrame) -> pd.DataFrame: + result_float_columns = [ + "log2FC", + "pvalue", + "padj", + "distance/span", + "FC", + "distance", + "span_allsamples"] + + columns_gmean = [column for column in list(df.columns) if + column.startswith("gmean_")] # also gmean columns + result_float_columns = list( + set(result_float_columns).union(set(columns_gmean))) + + for column in result_float_columns: + if column in list(df.columns): + df[column] = np.around( + df[column].astype(float).to_numpy(), + decimals=6 + ) + + return df + + def pairwise_comparison( df: pd.DataFrame, dataset: Dataset, cfg: DictConfig, comparison: List[str], test: availtest_methods_type @@ -368,17 +394,19 @@ def pairwise_comparison( df4c = df4c[(df4c.T != 0).any()] # delete rows being zero everywhere df4c = df4c.dropna(axis=0, how="all") df4c = row_wise_nanstd_reduction(df4c) + df4c = df4c.round(decimals=6) df4c = countnan_samples(df4c, this_comparison) df4c = distance_or_overlap(df4c, this_comparison) df4c = compute_span_incomparison(df4c, this_comparison) df4c["distance/span"] = df4c.distance.div(df4c.span_allsamples) df4c = calculate_gmean(df4c, this_comparison) - print(this_comparison) + df_good, df_bad = select_rows_with_sufficient_non_nan_values( df4c, groups=this_comparison) if test == "disfit": df_good = run_distribution_fitting(df_good) + else: result_test_df = run_statistical_test(df_good, this_comparison, test) assert result_test_df.shape[0] == df_good.shape[0] @@ -424,6 +452,7 @@ def differential_comparison( result = pairwise_comparison(df, dataset, cfg, comparison, test) result["compartment"] = compartment result = reorder_columns_diff_end(result) + result = round_result_float_columns(result) result = result.sort_values(["padj", "distance/span"], ascending=[True, False]) comp = "-".join(map(lambda x: "-".join(x), comparison)) @@ -471,6 +500,7 @@ def multi_group_compairson( df4c = df4c.dropna(axis=0, how="all") df4c = row_wise_nanstd_reduction(df4c) + df4c = df4c.round(decimals=6) this_comparison = [list(filter(lambda x: x in columns, sublist)) for sublist in conditions_list] df4c = apply_multi_group_kruskal_wallis(df4c, this_comparison) @@ -514,13 +544,13 @@ def time_course_analysis(file_name: data_files_keys_type, cfg: DictConfig, test: availtest_methods_type, out_table_dir: str): - ''' + """ Time-course comparison is performed on compartmentalized versions of data files Attention: we replace zero values using the provided method Writes the table(s) with computed statistics in the relevant output directory - ''' + """ assert_literal(test, availtest_methods_type, "Available test") assert_literal(file_name, data_files_keys_type, "file name") diff --git a/src/dimet/processing/fit_statistical_distribution.py b/src/dimet/processing/fit_statistical_distribution.py index 223f3c8..9d269d8 100644 --- a/src/dimet/processing/fit_statistical_distribution.py +++ b/src/dimet/processing/fit_statistical_distribution.py @@ -12,6 +12,8 @@ logger = logging.getLogger(__name__) +np.random.seed(123) + def compute_z_score(df: pd.DataFrame, column_name: str) -> pd.DataFrame: """ @@ -26,7 +28,8 @@ def find_best_distribution(df: pd.DataFrame): Find the best distribution among all the scipy.stats distributions and return it together with its parameters - The input dataframe df has to have a "zscore" column as the fitting is done on the zscores + The input dataframe df has to have a "zscore" column + as the fitting is done on the zscores """ logger.info("Fitting a distribution") dist = np.around(np.array((df["zscore"]).astype(float)), 5) diff --git a/src/dimet/processing/pca_analysis.py b/src/dimet/processing/pca_analysis.py index 168f3f3..abbd0ec 100644 --- a/src/dimet/processing/pca_analysis.py +++ b/src/dimet/processing/pca_analysis.py @@ -52,6 +52,7 @@ def compute_pca( X = np.transpose(np.array(quantitative_df)) pca = PCA(n_components=dims) pc = pca.fit_transform(X) + pc = np.around(pc, decimals=6) pc_df = pd.DataFrame(data=pc, columns=['PC' + str(i) for i in range(1, dims + 1)]) pc_df = pc_df.assign(name_to_plot=quantitative_df.columns) @@ -60,6 +61,10 @@ def compute_pca( var_explained_df = pd.DataFrame({ 'Explained Variance %': pca.explained_variance_ratio_ * 100, 'PC': ['PC' + str(i) for i in range(1, dims + 1)]}) + var_explained_df['Explained Variance %'] = np.around( + var_explained_df['Explained Variance %'].astype(float).to_numpy(), + decimals=6 + ) return pc_df, var_explained_df diff --git a/src/dimet/visualization/abundance_bars.py b/src/dimet/visualization/abundance_bars.py index 20116c0..5fa7eb2 100644 --- a/src/dimet/visualization/abundance_bars.py +++ b/src/dimet/visualization/abundance_bars.py @@ -51,7 +51,7 @@ def plot_one_metabolite(df: pd.DataFrame, do_stripplot: bool) -> matplotlib.axes: """" returns a single object of type matplotlib.axes - with all the individual metabolite plot + with the individual metabolite plot """ plt.rcParams.update({"font.size": 21}) sns.barplot( @@ -237,7 +237,7 @@ def run_plot_abundance_bars(dataset: Dataset, out_plot_dir, metadata_df.loc[metadata_df['compartment'] == compartment, :] compartment_df = dataset.compartmentalized_dfs[ "abundances"][compartment] - # metadata and abundances time of interest + # metadata and abundances: slice of timepoints of interest metadata_slice = metadata_compartment_df.loc[ metadata_compartment_df[ "timepoint"].isin(timepoints), :] @@ -245,6 +245,10 @@ def run_plot_abundance_bars(dataset: Dataset, out_plot_dir, # total piled-up data: piled_sel = pile_up_abundance(values_slice, metadata_slice) + piled_sel['abundance'] = np.around( + piled_sel['abundance'].astype(float).to_numpy(), + decimals=6 + ) piled_sel["condition"] = pd.Categorical( piled_sel["condition"], conditions) piled_sel["timepoint"] = pd.Categorical( diff --git a/src/dimet/visualization/isotopologue_proportions.py b/src/dimet/visualization/isotopologue_proportions.py index aeaa9e1..0372434 100644 --- a/src/dimet/visualization/isotopologue_proportions.py +++ b/src/dimet/visualization/isotopologue_proportions.py @@ -11,6 +11,7 @@ import matplotlib.patches as mpatches import matplotlib.pyplot as plt import matplotlib.ticker as mticker +import numpy as np import pandas as pd import seaborn as sns from dimet.data import Dataset @@ -54,6 +55,11 @@ def isotopologue_proportions_2piled_df( piled_df['timenum'] = piled_df['timenum'].astype(str) piled_df['Isotopologue Contribution (%)'] = \ piled_df['Isotopologue Contribution (%)'] * 100 + + piled_df['Isotopologue Contribution (%)'] = np.around( + piled_df['Isotopologue Contribution (%)'].astype(float).to_numpy(), + decimals=6 + ) return piled_df @@ -85,15 +91,24 @@ def massage_isotopologues(piled_df) -> pd.DataFrame: def prepare_means_replicates(piled_df, metaboli_selected) -> Dict: """ - returns a dictionary of dataframes, keys are metabolites + returns a dictionary of dataframes, keys are metabolites. + for each dataframe: + - the mean over the biological replicates, by isotopologue, is computed + and is saved in the column 'Isotopologue Contribution (%)' + - has columns: + condition, metabolite, m+x, timenum, Isotopologue Contribution (%) """ dfcopy = piled_df.copy() # instead groupby isotopologue_name, using m+x and metabolite works better dfcopy = dfcopy.groupby( ["condition", "metabolite", "m+x", "timenum"]) \ - .mean("Isotopologue Contribution %") # df.mean skips nan by default - dfcopy = dfcopy.reset_index() + .mean("Isotopologue Contribution (%)") # df.mean skips nan by default + dfcopy["Isotopologue Contribution (%)"] = np.around( + dfcopy["Isotopologue Contribution (%)"].astype(float).to_numpy(), + decimals=6 + ) + dfcopy = dfcopy.reset_index() dfs_dict = dict() for i in metaboli_selected: tmp = dfcopy.loc[dfcopy["metabolite"] == i, ].reset_index(drop=True) diff --git a/src/dimet/visualization/mean_enrichment_line_plot.py b/src/dimet/visualization/mean_enrichment_line_plot.py index 45b6052..7d2ed78 100644 --- a/src/dimet/visualization/mean_enrichment_line_plot.py +++ b/src/dimet/visualization/mean_enrichment_line_plot.py @@ -4,6 +4,7 @@ import matplotlib import matplotlib.pyplot as plt +import numpy as np import pandas as pd import seaborn as sns from hydra.core.config_store import ConfigStore @@ -38,6 +39,10 @@ def melt_data_metadata_2df(compartment_df: pd.DataFrame, value_name="Fractional Contribution (%)") melted_df["Fractional Contribution (%)"] = \ melted_df["Fractional Contribution (%)"] * 100 + melted_df["Fractional Contribution (%)"] = np.around( + melted_df["Fractional Contribution (%)"].astype(float).to_numpy(), + decimals=6 + ) return melted_df @@ -79,6 +84,14 @@ def metabolite_df__mean_and_sd( one_metabolite_result = mean_df.merge(std_df, how='inner', on=["condition", "timenum", "metabolite"]) + one_metabolite_result['mean'] = np.around( + one_metabolite_result['mean'] .astype(float).to_numpy(), + decimals=6 + ) + one_metabolite_result['sd'] = np.around( + one_metabolite_result['sd'] .astype(float).to_numpy(), + decimals=6 + ) return one_metabolite_result @@ -406,7 +419,7 @@ def generate_metabolites_numbered_dict(cfg: DictConfig, { 0: ['Pyr', 'Cit'], 1: ['Asn', 'Asp']} will result in one plot by couple of metabolites, totalling 2 plots. If option 'plot_grouped_by_dict' not specified, one single metabolite - per numeric key is set. + per numeric key is set (default behaviour). """ if cfg.analysis.method.plot_grouped_by_dict is not None: metabolites_numbered_dict = cfg.analysis.method.plot_grouped_by_dict diff --git a/src/dimet/visualization/metabologram.py b/src/dimet/visualization/metabologram.py index 40665a3..877cdf8 100644 --- a/src/dimet/visualization/metabologram.py +++ b/src/dimet/visualization/metabologram.py @@ -100,6 +100,9 @@ def pile_dfs_by_contexts(contexts_dict: Dict[int, Dict[str, pd.DataFrame]], df = df.assign(context=context, typemol=molecule_type) df['context'] = df['context'].astype(int) df_out = pd.concat([df_out, df], axis=0) + df_out['VALUES'] = np.around( + df_out['VALUES'].astype(float).to_numpy(), decimals=6 + ) data_cleaned_dict[molecule_type] = df_out return data_cleaned_dict @@ -328,9 +331,13 @@ def donut_outer(curr_pathway_context_df, genecircportion = 50 / curr_pathway_context_df.loc[ curr_pathway_context_df.typemol == "transcripts", :].shape[0] + genecircportion = np.around(np.array(genecircportion), decimals=6) + metabocircportion = 50 / curr_pathway_context_df.loc[ curr_pathway_context_df.typemol == "metabolites", :].shape[0] + metabocircportion = np.around(np.array(metabocircportion), decimals=6) + curr_pathway_context_df.loc[ curr_pathway_context_df.typemol == "transcripts", "circportion"] = genecircportion diff --git a/tests/test_differential_analysis.py b/tests/test_differential_analysis.py index 8275c1c..a170096 100644 --- a/tests/test_differential_analysis.py +++ b/tests/test_differential_analysis.py @@ -125,6 +125,29 @@ def test_reorder_columns_diff_end(self): ))) self.assertEqual(result.shape, (2, 11)) + def test_round_result_float_columns(self): + data = { + "distance": [2, 1.5], "span_allsamples": [4, 6], + "distance/span": [0.59595959595959, 0.25], + "count_nan_samples_group1": [0, 0], + "count_nan_samples_group2": [0, 0], + "pvalue": [0, 0], + "padj": [1.54354343434e-3, 1.3543434354335e-4], + "log2FC": [4.063030512104, 5.0202235], + "FC": [8, 23], "compartment": ["med", "med"], + } + df = pd.DataFrame(data) + df.index = ['met1', 'met3'] + result = differential_analysis.round_result_float_columns(df) + self.assertTrue( + any(np.array(result["padj"]) == np.array([0.001544, 0.000135]) + ) + ) + self.assertTrue( + any(np.array(result["log2FC"]) == np.array([4.063031, 5.020224]) + ) + ) + def test_time_course_auto_list_comparisons(self): metadata = pd.DataFrame({ 'condition': ['cond1', 'cond1', 'cond1', 'cond1', From 05e1233a8b7bfb8528ece1182ae6105a6d437b09 Mon Sep 17 00:00:00 2001 From: johaGL Date: Fri, 6 Oct 2023 16:52:10 +0200 Subject: [PATCH 2/2] fix(differential analysis): np.around decimals=6 --- src/dimet/processing/differential_analysis.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/dimet/processing/differential_analysis.py b/src/dimet/processing/differential_analysis.py index 7b1bf21..01504e9 100644 --- a/src/dimet/processing/differential_analysis.py +++ b/src/dimet/processing/differential_analysis.py @@ -424,6 +424,7 @@ def pairwise_comparison( # re-integrate the "bad" sub-dataframes to the full dataframe result = concatenate_dataframes(df_good, df_bad, df_no_padj) + result = round_result_float_columns(result) return result @@ -452,7 +453,7 @@ def differential_comparison( result = pairwise_comparison(df, dataset, cfg, comparison, test) result["compartment"] = compartment result = reorder_columns_diff_end(result) - result = round_result_float_columns(result) + result = result.sort_values(["padj", "distance/span"], ascending=[True, False]) comp = "-".join(map(lambda x: "-".join(x), comparison)) @@ -506,6 +507,7 @@ def multi_group_compairson( df4c = apply_multi_group_kruskal_wallis(df4c, this_comparison) df4c = compute_padj(df4c, 0.05, cfg.analysis.method.correction_method) + df4c = round_result_float_columns(df4c) base_file_name = dataset.get_file_for_label(file_name) base_file_name += f"--{compartment}--multigroup" output_file_name = os.path.join(out_table_dir,