Merge pull request #17 from cbib/oct_2023

Oct 2023
cbib · Oct 6, 2023 · f7caab9 · f7caab9
2 parents 814a4d7 + 05e1233
commit f7caab9
Show file tree

Hide file tree

Showing 9 changed files with 114 additions and 12 deletions.
diff --git a/src/dimet/config/analysis/method/isotopologue_proportions_plot.yaml b/src/dimet/config/analysis/method/isotopologue_proportions_plot.yaml
@@ -5,7 +5,7 @@ name: Generate isotopologues stack plots
 
 # here put 'hidden' advanced defaults
 figure_format: svg
-max_nb_carbons_possible: 12
+max_nb_carbons_possible: 18  # up to 30 are supported
 appearance_separated_time: True # adds a space between timepoints, conditions stay comparative
 split_plots_by_condition: False # prints each condition in independent plots
 height_each_stack: !!float 4.9

diff --git a/src/dimet/processing/differential_analysis.py b/src/dimet/processing/differential_analysis.py
@@ -278,7 +278,8 @@ def run_distribution_fitting(df: pd.DataFrame):
     autoset_tailway = auto_detect_tailway(df, best_distribution, args_param)
     logger.info(f"auto, best pvalues calculated : {autoset_tailway}")
     df = compute_p_value(df, autoset_tailway, best_distribution, args_param)
-
+    df["zscore"] = np.around(
+        df["zscore"].astype(float).to_numpy(), decimals=6)
     return df
 
 
@@ -345,6 +346,31 @@ def reorder_columns_diff_end(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
+def round_result_float_columns(df: pd.DataFrame) -> pd.DataFrame:
+    result_float_columns = [
+        "log2FC",
+        "pvalue",
+        "padj",
+        "distance/span",
+        "FC",
+        "distance",
+        "span_allsamples"]
+
+    columns_gmean = [column for column in list(df.columns) if
+                     column.startswith("gmean_")]  # also gmean columns
+    result_float_columns = list(
+        set(result_float_columns).union(set(columns_gmean)))
+
+    for column in result_float_columns:
+        if column in list(df.columns):
+            df[column] = np.around(
+                df[column].astype(float).to_numpy(),
+                decimals=6
+            )
+
+    return df
+
+
 def pairwise_comparison(
         df: pd.DataFrame, dataset: Dataset, cfg: DictConfig,
         comparison: List[str], test: availtest_methods_type
@@ -368,17 +394,19 @@ def pairwise_comparison(
     df4c = df4c[(df4c.T != 0).any()]  # delete rows being zero everywhere
     df4c = df4c.dropna(axis=0, how="all")
     df4c = row_wise_nanstd_reduction(df4c)
+    df4c = df4c.round(decimals=6)
     df4c = countnan_samples(df4c, this_comparison)
     df4c = distance_or_overlap(df4c, this_comparison)
     df4c = compute_span_incomparison(df4c, this_comparison)
     df4c["distance/span"] = df4c.distance.div(df4c.span_allsamples)
     df4c = calculate_gmean(df4c, this_comparison)
-    print(this_comparison)
+
     df_good, df_bad = select_rows_with_sufficient_non_nan_values(
         df4c, groups=this_comparison)
 
     if test == "disfit":
         df_good = run_distribution_fitting(df_good)
+
     else:
         result_test_df = run_statistical_test(df_good, this_comparison, test)
         assert result_test_df.shape[0] == df_good.shape[0]
@@ -396,6 +424,7 @@ def pairwise_comparison(
 
     # re-integrate the "bad" sub-dataframes to the full dataframe
     result = concatenate_dataframes(df_good, df_bad, df_no_padj)
+    result = round_result_float_columns(result)
     return result
 
 
@@ -424,6 +453,7 @@ def differential_comparison(
             result = pairwise_comparison(df, dataset, cfg, comparison, test)
             result["compartment"] = compartment
             result = reorder_columns_diff_end(result)
+
             result = result.sort_values(["padj", "distance/span"],
                                         ascending=[True, False])
             comp = "-".join(map(lambda x: "-".join(x), comparison))
@@ -471,11 +501,13 @@ def multi_group_compairson(
         df4c = df4c.dropna(axis=0, how="all")
 
         df4c = row_wise_nanstd_reduction(df4c)
+        df4c = df4c.round(decimals=6)
         this_comparison = [list(filter(lambda x: x in columns, sublist)) for
                            sublist in conditions_list]
         df4c = apply_multi_group_kruskal_wallis(df4c, this_comparison)
         df4c = compute_padj(df4c, 0.05,
                             cfg.analysis.method.correction_method)
+        df4c = round_result_float_columns(df4c)
         base_file_name = dataset.get_file_for_label(file_name)
         base_file_name += f"--{compartment}--multigroup"
         output_file_name = os.path.join(out_table_dir,
@@ -514,13 +546,13 @@ def time_course_analysis(file_name: data_files_keys_type,
                          cfg: DictConfig,
                          test: availtest_methods_type,
                          out_table_dir: str):
-    '''
+    """
     Time-course comparison is performed on compartmentalized versions of
     data files
     Attention: we replace zero values using the provided method
     Writes the table(s) with computed statistics in the relevant
      output directory
-    '''
+    """
 
     assert_literal(test, availtest_methods_type, "Available test")
     assert_literal(file_name, data_files_keys_type, "file name")

diff --git a/src/dimet/processing/fit_statistical_distribution.py b/src/dimet/processing/fit_statistical_distribution.py
@@ -12,6 +12,8 @@
 
 logger = logging.getLogger(__name__)
 
+np.random.seed(123)
+
 
 def compute_z_score(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
     """
@@ -26,7 +28,8 @@ def find_best_distribution(df: pd.DataFrame):
     Find the best distribution among all the scipy.stats distributions
     and return it together with its parameters
 
-    The input dataframe df has to have a "zscore" column as the fitting is done on the zscores
+    The input dataframe df has to have a "zscore" column
+    as the fitting is done on the zscores
     """
     logger.info("Fitting a distribution")
     dist = np.around(np.array((df["zscore"]).astype(float)), 5)

diff --git a/src/dimet/processing/pca_analysis.py b/src/dimet/processing/pca_analysis.py
@@ -52,6 +52,7 @@ def compute_pca(
     X = np.transpose(np.array(quantitative_df))
     pca = PCA(n_components=dims)
     pc = pca.fit_transform(X)
+    pc = np.around(pc, decimals=6)
     pc_df = pd.DataFrame(data=pc,
                          columns=['PC' + str(i) for i in range(1, dims + 1)])
     pc_df = pc_df.assign(name_to_plot=quantitative_df.columns)
@@ -60,6 +61,10 @@ def compute_pca(
     var_explained_df = pd.DataFrame({
         'Explained Variance %': pca.explained_variance_ratio_ * 100,
         'PC': ['PC' + str(i) for i in range(1, dims + 1)]})
+    var_explained_df['Explained Variance %'] = np.around(
+        var_explained_df['Explained Variance %'].astype(float).to_numpy(),
+        decimals=6
+    )
 
     return pc_df, var_explained_df
 

diff --git a/src/dimet/visualization/abundance_bars.py b/src/dimet/visualization/abundance_bars.py
@@ -51,7 +51,7 @@ def plot_one_metabolite(df: pd.DataFrame,
                         do_stripplot: bool) -> matplotlib.axes:
     """"
     returns a single object of type matplotlib.axes
-    with all the individual metabolite plot
+    with the individual metabolite plot
     """
     plt.rcParams.update({"font.size": 21})
     sns.barplot(
@@ -237,14 +237,18 @@ def run_plot_abundance_bars(dataset: Dataset, out_plot_dir,
             metadata_df.loc[metadata_df['compartment'] == compartment, :]
         compartment_df = dataset.compartmentalized_dfs[
             "abundances"][compartment]
-        # metadata and abundances time of interest
+        # metadata and abundances: slice of timepoints of interest
         metadata_slice = metadata_compartment_df.loc[
                          metadata_compartment_df[
                              "timepoint"].isin(timepoints), :]
         values_slice = compartment_df[list(metadata_slice["name_to_plot"])]
 
         # total piled-up data:
         piled_sel = pile_up_abundance(values_slice, metadata_slice)
+        piled_sel['abundance'] = np.around(
+            piled_sel['abundance'].astype(float).to_numpy(),
+            decimals=6
+        )
         piled_sel["condition"] = pd.Categorical(
             piled_sel["condition"], conditions)
         piled_sel["timepoint"] = pd.Categorical(

diff --git a/src/dimet/visualization/isotopologue_proportions.py b/src/dimet/visualization/isotopologue_proportions.py
@@ -11,6 +11,7 @@
 import matplotlib.patches as mpatches
 import matplotlib.pyplot as plt
 import matplotlib.ticker as mticker
+import numpy as np
 import pandas as pd
 import seaborn as sns
 from dimet.data import Dataset
@@ -54,6 +55,11 @@ def isotopologue_proportions_2piled_df(
     piled_df['timenum'] = piled_df['timenum'].astype(str)
     piled_df['Isotopologue Contribution (%)'] = \
         piled_df['Isotopologue Contribution (%)'] * 100
+
+    piled_df['Isotopologue Contribution (%)'] = np.around(
+        piled_df['Isotopologue Contribution (%)'].astype(float).to_numpy(),
+        decimals=6
+    )
     return piled_df
 
 
@@ -85,15 +91,24 @@ def massage_isotopologues(piled_df) -> pd.DataFrame:
 
 def prepare_means_replicates(piled_df, metaboli_selected) -> Dict:
     """
-    returns a dictionary of dataframes, keys are metabolites
+    returns a dictionary of dataframes, keys are metabolites.
+    for each dataframe:
+       - the mean over the biological replicates, by isotopologue, is computed
+          and is saved in the column 'Isotopologue Contribution (%)'
+       - has columns:
+           condition, metabolite, m+x, timenum, Isotopologue Contribution (%)
     """
     dfcopy = piled_df.copy()
     # instead groupby isotopologue_name, using m+x and metabolite works better
     dfcopy = dfcopy.groupby(
         ["condition", "metabolite", "m+x", "timenum"]) \
-        .mean("Isotopologue Contribution %")  # df.mean skips nan by default
-    dfcopy = dfcopy.reset_index()
+        .mean("Isotopologue Contribution (%)")  # df.mean skips nan by default
 
+    dfcopy["Isotopologue Contribution (%)"] = np.around(
+        dfcopy["Isotopologue Contribution (%)"].astype(float).to_numpy(),
+        decimals=6
+    )
+    dfcopy = dfcopy.reset_index()
     dfs_dict = dict()
     for i in metaboli_selected:
         tmp = dfcopy.loc[dfcopy["metabolite"] == i, ].reset_index(drop=True)

diff --git a/src/dimet/visualization/mean_enrichment_line_plot.py b/src/dimet/visualization/mean_enrichment_line_plot.py
@@ -4,6 +4,7 @@
 
 import matplotlib
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
 import seaborn as sns
 from hydra.core.config_store import ConfigStore
@@ -38,6 +39,10 @@ def melt_data_metadata_2df(compartment_df: pd.DataFrame,
                         value_name="Fractional Contribution (%)")
     melted_df["Fractional Contribution (%)"] = \
         melted_df["Fractional Contribution (%)"] * 100
+    melted_df["Fractional Contribution (%)"] = np.around(
+        melted_df["Fractional Contribution (%)"].astype(float).to_numpy(),
+        decimals=6
+    )
     return melted_df
 
 
@@ -79,6 +84,14 @@ def metabolite_df__mean_and_sd(
     one_metabolite_result = mean_df.merge(std_df, how='inner',
                                           on=["condition", "timenum",
                                               "metabolite"])
+    one_metabolite_result['mean'] = np.around(
+        one_metabolite_result['mean'] .astype(float).to_numpy(),
+        decimals=6
+    )
+    one_metabolite_result['sd'] = np.around(
+        one_metabolite_result['sd'] .astype(float).to_numpy(),
+        decimals=6
+    )
     return one_metabolite_result
 
 
@@ -406,7 +419,7 @@ def generate_metabolites_numbered_dict(cfg: DictConfig,
     { 0: ['Pyr', 'Cit'], 1: ['Asn', 'Asp']}
     will result in one plot by couple of metabolites, totalling 2 plots.
     If option 'plot_grouped_by_dict' not specified, one single metabolite
-    per numeric key is set.
+    per numeric key is set (default behaviour).
     """
     if cfg.analysis.method.plot_grouped_by_dict is not None:
         metabolites_numbered_dict = cfg.analysis.method.plot_grouped_by_dict

diff --git a/src/dimet/visualization/metabologram.py b/src/dimet/visualization/metabologram.py
@@ -100,6 +100,9 @@ def pile_dfs_by_contexts(contexts_dict: Dict[int, Dict[str, pd.DataFrame]],
             df = df.assign(context=context, typemol=molecule_type)
             df['context'] = df['context'].astype(int)
             df_out = pd.concat([df_out, df], axis=0)
+            df_out['VALUES'] = np.around(
+                df_out['VALUES'].astype(float).to_numpy(), decimals=6
+            )
 
         data_cleaned_dict[molecule_type] = df_out
     return data_cleaned_dict
@@ -328,9 +331,13 @@ def donut_outer(curr_pathway_context_df,
     genecircportion = 50 / curr_pathway_context_df.loc[
                            curr_pathway_context_df.typemol == "transcripts",
                            :].shape[0]
+    genecircportion = np.around(np.array(genecircportion), decimals=6)
+
     metabocircportion = 50 / curr_pathway_context_df.loc[
                              curr_pathway_context_df.typemol == "metabolites",
                              :].shape[0]
+    metabocircportion = np.around(np.array(metabocircportion), decimals=6)
+
     curr_pathway_context_df.loc[
         curr_pathway_context_df.typemol == "transcripts",
         "circportion"] = genecircportion

diff --git a/tests/test_differential_analysis.py b/tests/test_differential_analysis.py
@@ -125,6 +125,29 @@ def test_reorder_columns_diff_end(self):
             )))
         self.assertEqual(result.shape, (2, 11))
 
+    def test_round_result_float_columns(self):
+        data = {
+            "distance": [2, 1.5], "span_allsamples": [4, 6],
+            "distance/span": [0.59595959595959, 0.25],
+            "count_nan_samples_group1": [0, 0],
+            "count_nan_samples_group2": [0, 0],
+            "pvalue": [0, 0],
+            "padj": [1.54354343434e-3, 1.3543434354335e-4],
+            "log2FC": [4.063030512104, 5.0202235],
+            "FC": [8, 23], "compartment": ["med", "med"],
+        }
+        df = pd.DataFrame(data)
+        df.index = ['met1', 'met3']
+        result = differential_analysis.round_result_float_columns(df)
+        self.assertTrue(
+            any(np.array(result["padj"]) == np.array([0.001544, 0.000135])
+                )
+        )
+        self.assertTrue(
+            any(np.array(result["log2FC"]) == np.array([4.063031, 5.020224])
+                )
+        )
+
     def test_time_course_auto_list_comparisons(self):
         metadata = pd.DataFrame({
             'condition': ['cond1', 'cond1', 'cond1', 'cond1',