Merge pull request #14 from cbib/str_var_update

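Update pyproject.toml (version, description, authors). Corrections: rename the metadata column `short_comp` to `compartment`, and switch the default `correction_method` from `bonferroni` to `fdr_bh`. flake8 passes. Add optional `color` and `style` settings to the PCA plot config.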
cbib · Aug 24, 2023 · 4b693de · 4b693de
2 parents 6c3884e + 79c80b9
commit 4b693de
Show file tree

Hide file tree

Showing 24 changed files with 76 additions and 113 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,16 +1,17 @@
 [tool.poetry]
 name="DIMet"
-version="0.1.0"
-description="A tool for Differential Isotope-labeled targeted Metabolomics"
+version="0.1.1"
+description="A tool for Differential analysis of Isotope-labeled targeted Metabolomics data"
 readme="README.md"
 license = "MIT"
 authors = [ 
   "Johanna Galvis Rodriguez <[email protected]>",
-  "Joris Guyon <[email protected]>",
   "Benjamin Dartigues <[email protected]>",
   "Florian Specque <[email protected]>",
-  "Thomas Daubon <[email protected]>",
   "Slim Karkar <[email protected]>",
+  "Helge Hecht <[email protected]>",
+  "Bjorn Gruening <[email protected]>",
+  "Hayssam Soueidan <[email protected]>",
   "Macha Nikolski <[email protected]>" 
 
 ]

diff --git a/src/dimet/config/analysis/method/differential_analysis.yaml b/src/dimet/config/analysis/method/differential_analysis.yaml
@@ -8,7 +8,7 @@ grouping :
   - timepoint
 
 qualityDistanceOverSpan : -0.3
-correction_method : "bonferroni"
+correction_method : "fdr_bh"
 
 impute_values:
   abundances: "min"

diff --git a/src/dimet/config/analysis/method/metabologram_integration.yaml b/src/dimet/config/analysis/method/metabologram_integration.yaml
@@ -10,7 +10,7 @@ grouping :
   - timepoint
 
 qualityDistanceOverSpan : -0.3
-correction_method : "bonferroni"
+correction_method : "fdr_bh"
 
 impute_values:
   abundances: "min"

diff --git a/src/dimet/config/analysis/method/multi_group_comparison.yaml b/src/dimet/config/analysis/method/multi_group_comparison.yaml
@@ -7,7 +7,7 @@ grouping :
   - condition
   - timepoint
 
-correction_method : "bonferroni"
+correction_method : "fdr_bh"
 
 datatypes:
   - abundances

diff --git a/src/dimet/config/analysis/method/pca_plot.yaml b/src/dimet/config/analysis/method/pca_plot.yaml
@@ -3,6 +3,9 @@ _target_: dimet.method.PcaPlotConfig
 label: pca-plot
 name: Generate Principal Component Analysis plots
 
+color: condition   # color dots using this variable
+style: timepoint   # style of the dots' shapes, using this variable
+
 pca_split_further:
   - timepoint
   # - condition

diff --git a/src/dimet/config/analysis/method/time_course_analysis.yaml b/src/dimet/config/analysis/method/time_course_analysis.yaml
@@ -8,7 +8,7 @@ grouping:
   - timepoint
 
 qualityDistanceOverSpan : -0.3
-correction_method : "bonferroni"
+correction_method : "fdr_bh"
 
 impute_values:
   abundances: "min"

diff --git a/src/dimet/helpers.py b/src/dimet/helpers.py
@@ -244,8 +244,8 @@ def df_to_dict_by_compartment(df: pd.DataFrame,
     splits df into a dictionary of dataframes, each for one compartment
     """
     output_dict = dict()
-    for compartment in metadata["short_comp"].unique():
-        sample_names = metadata[metadata["short_comp"] == compartment][
+    for compartment in metadata['compartment'].unique():
+        sample_names = metadata[metadata['compartment'] == compartment][
             "original_name"]
         compartment_df = df[list(sample_names)]
         output_dict[compartment] = compartment_df
@@ -435,7 +435,7 @@ def absolute_geommean_diff(b_values: np.array, a_values: np.array):
 def drop_all_nan_metabolites_on_comp_frames(frames_dict: Dict,
                                             metadata: pd.DataFrame) -> Dict:
     """ metabolites must be in rows """
-    compartments = metadata["short_comp"].unique().tolist()
+    compartments = metadata['compartment'].unique().tolist()
     for dataset in frames_dict.keys():
         for compartment in compartments:
             tmp = frames_dict[dataset][compartment]
@@ -454,9 +454,9 @@ def set_samples_names(frames_dict: Dict, metadata: pd.DataFrame) -> Dict:
     """
     for dataset, compartments_dict in frames_dict.items():
         for compartment, df in compartments_dict.items():
-            original_names = metadata[metadata["short_comp"] == compartment][
+            original_names = metadata[metadata['compartment'] == compartment][
                 "original_name"]
-            new_names = metadata[metadata["short_comp"] == compartment][
+            new_names = metadata[metadata['compartment'] == compartment][
                 "name_to_plot"]
             renamed_columns = {old: new for old, new in
                                zip(original_names, new_names)

diff --git a/src/dimet/method/__init__.py b/src/dimet/method/__init__.py
@@ -127,6 +127,8 @@ def build(self) -> "PcaAnalysis":
 
 
 class PcaPlotConfig(MethodConfig):
+    color: Union[str, None] = "condition"
+    style: Union[str, None] = "timepoint"
     pca_split_further: Union[ListConfig, None] = ["timepoint"]
     draw_ellipses: Union[str, None] = "condition"
     run_iris_demo: bool = False
@@ -205,7 +207,7 @@ def run(self, cfg: DictConfig, dataset: Dataset) -> None:
                 "No selected metabolites provided, plotting for all")
             with open_dict(cfg):
                 cfg.analysis["metabolites"] = {}
-                for c in set(dataset.metadata_df["short_comp"]):
+                for c in set(dataset.metadata_df['compartment']):
                     metabolites_compartment = \
                         dataset.compartmentalized_dfs[
                             'abundances'][c].index.to_list()
@@ -220,7 +222,7 @@ def check_expectations(self, cfg: DictConfig, dataset: Dataset) -> None:
         # check that necessary information is provided in the analysis config
         try:
             if not set(cfg.analysis.metabolites.keys()).issubset(
-                    dataset.metadata_df["short_comp"]):
+                    dataset.metadata_df['compartment']):
                 raise ValueError(
                     "[Analysis > Metabolites > compartments] are missing "
                     "from [Metadata > Compartments]"
@@ -378,7 +380,7 @@ def run(self, cfg: DictConfig, dataset: Dataset) -> None:
             logger.warning(
                 "No selected metabolites provided, plotting for all")
             with open_dict(cfg):
-                compartments = list(set(dataset.metadata_df["short_comp"]))
+                compartments = list(set(dataset.metadata_df['compartment']))
                 cfg.analysis["metabolites"] = dict()
                 for c in compartments:
                     isotopologues_names = \
@@ -397,7 +399,7 @@ def run(self, cfg: DictConfig, dataset: Dataset) -> None:
     def check_expectations(self, cfg: DictConfig, dataset: Dataset) -> None:
         try:
             if not set(cfg.analysis.metabolites.keys()).issubset(
-                    dataset.metadata_df['short_comp']):
+                    dataset.metadata_df['compartment']):
                 raise ValueError(
                     "[Analysis > Metabolites > compartments] "
                     "are missing from [Metadata > Compartments]"
@@ -440,7 +442,7 @@ def run(self, cfg: DictConfig, dataset: Dataset) -> None:
                 "No selected metabolites provided, plotting for all")
             with open_dict(cfg):
                 cfg.analysis["metabolites"] = {}
-                for c in set(dataset.metadata_df["short_comp"]):
+                for c in set(dataset.metadata_df['compartment']):
                     cfg.analysis["metabolites"][c] = \
                         dataset.compartmentalized_dfs[
                             'mean_enrichment'][c].index.to_list()
@@ -453,7 +455,7 @@ def run(self, cfg: DictConfig, dataset: Dataset) -> None:
     def check_expectations(self, cfg: DictConfig, dataset: Dataset) -> None:
         try:
             if not set(cfg.analysis.metabolites.keys()).issubset(
-                    dataset.metadata_df['short_comp']):
+                    dataset.metadata_df['compartment']):
                 raise ValueError(
                     "[Analysis > Metabolites > compartments] are missing "
                     "from [Metadata > Compartments]"
@@ -728,11 +730,11 @@ def check_expectations(self, cfg: DictConfig,
             if not isinstance(cfg.analysis.compartment, str):
                 raise ValueError("compartment must be string in config file")
             if cfg.analysis.compartment not in \
-                    set(data_integration.metadata_df['short_comp']):
+                    set(data_integration.metadata_df['compartment']):
                 raise ValueError(
                     f"the compartment '{cfg.analysis.compartment}' "
                     f"in the config file does not exist. Must be one of: "
-                    f"{set(data_integration.metadata_df['short_comp'])}"
+                    f"{set(data_integration.metadata_df['compartment'])}"
                 )
             if not len(cfg.analysis.statistical_test.keys()) == 1:
                 raise ValueError(

diff --git a/src/dimet/processing/differential_analysis.py b/src/dimet/processing/differential_analysis.py
@@ -392,7 +392,7 @@ def pairwise_comparison(
         df_good, "distance/span", cfg.analysis.method.qualityDistanceOverSpan
     )
     df_good = compute_padj(df_good, 0.05,
-                                   cfg.analysis.method.correction_method)
+                           cfg.analysis.method.correction_method)
 
     # re-integrate the "bad" sub-dataframes to the full dataframe
     result = concatenate_dataframes(df_good, df_bad, df_no_padj)
@@ -475,7 +475,7 @@ def multi_group_compairson(
                            sublist in conditions_list]
         df4c = apply_multi_group_kruskal_wallis(df4c, this_comparison)
         df4c = compute_padj(df4c, 0.05,
-                                    cfg.analysis.method.correction_method)
+                            cfg.analysis.method.correction_method)
         base_file_name = dataset.get_file_for_label(file_name)
         base_file_name += f"--{compartment}--multigroup"
         output_file_name = os.path.join(out_table_dir,

diff --git a/src/dimet/processing/pca_analysis.py b/src/dimet/processing/pca_analysis.py
@@ -73,7 +73,7 @@ def pca_on_split_dataset(compartment_df: pd.DataFrame,
     and computes PCA on each subset.
     The results are added to the dictionary of results.
     """
-    assert len(metadata_co_df['short_comp'].unique()) == 1
+    assert len(metadata_co_df['compartment'].unique()) == 1
     assert chosen_column in ["condition", "timepoint"]
     unique_nominal_values = metadata_co_df[chosen_column].unique().tolist()
     pca_tables_dict = {}
@@ -149,7 +149,7 @@ def run_pca_analysis(file_name: data_files_keys_type,
         val_instead_zero = arg_repl_zero2value(impute_value, df)
         df = df.replace(to_replace=0, value=val_instead_zero)
 
-        metadata_co_df = metadata_df[metadata_df['short_comp'] == compartment]
+        metadata_co_df = metadata_df[metadata_df['compartment'] == compartment]
 
         pca_compartment_dict = pca_global_compartment_dataset(
             df, metadata_co_df, description=[file_name, compartment]

diff --git a/src/dimet/visualization/abundance_bars.py b/src/dimet/visualization/abundance_bars.py
@@ -214,10 +214,10 @@ def run_plot_abundance_bars(dataset: Dataset, out_plot_dir,
     width_each_subfig = cfg.analysis.width_each_subfig
     height_each_subfig = cfg.analysis.method.height_each_subfig
 
-    compartments = set(metadata_df["short_comp"])
+    compartments = set(metadata_df['compartment'])
     for compartment in compartments:
         metadata_compartment_df: pd.DataFrame = \
-            metadata_df.loc[metadata_df["short_comp"] == compartment, :]
+            metadata_df.loc[metadata_df['compartment'] == compartment, :]
         compartment_df = dataset.compartmentalized_dfs[
             "abundances"][compartment]
         # metadata and abundances time of interest

diff --git a/src/dimet/visualization/distr_fit_plot.py b/src/dimet/visualization/distr_fit_plot.py
@@ -163,7 +163,7 @@ def run_distr_fit_plot(
         df = compartmentalized_df
         df = df[(df.T != 0).any()]
         val_instead_zero = arg_repl_zero2value(impute_value,
-                                                       df)
+                                               df)
         df = df.replace(to_replace=0, value=val_instead_zero)
         if mode == "pairwise":
             for comparison in cfg.analysis.comparisons:

diff --git a/src/dimet/visualization/isotopologue_proportions.py b/src/dimet/visualization/isotopologue_proportions.py
@@ -45,7 +45,7 @@ def isotopologue_proportions_2piled_df(
         combined_isos_metadata_df, metada_df, on='name_to_plot')
 
     combined_isos_metadata_df = combined_isos_metadata_df.drop(
-        columns=['short_comp', 'original_name', 'name_to_plot', 'timepoint'])
+        columns=['compartment', 'original_name', 'name_to_plot', 'timepoint'])
     piled_df = pd.melt(combined_isos_metadata_df,
                        id_vars=['timenum', 'condition'],
                        var_name="isotopologue_name",
@@ -455,11 +455,11 @@ def run_isotopologue_proportions_plot(dataset: Dataset,
     time_levels_list: List[str] = [
         str(i) for i in sorted(metadata_df['timenum'].unique())]
 
-    compartments = list(metadata_df['short_comp'].unique())
+    compartments = list(metadata_df['compartment'].unique())
 
     for compartment in compartments:
         metadata_compartment_df: pd.DataFrame = \
-            metadata_df.loc[metadata_df["short_comp"] == compartment, :]
+            metadata_df.loc[metadata_df['compartment'] == compartment, :]
         compartment_df = dataset.compartmentalized_dfs[
             "isotopologue_proportions"][compartment]
 

diff --git a/src/dimet/visualization/mean_enrichment_line_plot.py b/src/dimet/visualization/mean_enrichment_line_plot.py
@@ -30,7 +30,7 @@ def melt_data_metadata_2df(compartment_df: pd.DataFrame,
                               on='name_to_plot')
     compartment_df = compartment_df.drop(columns=['name_to_plot',
                                                   'timepoint',
-                                                  'short_comp',
+                                                  'compartment',
                                                   'original_name'])
     melted_df = pd.melt(compartment_df,
                         id_vars=['timenum', 'condition'],
@@ -346,7 +346,7 @@ def line_plot_by_compartment(dataset: Dataset,
                              cfg: DictConfig) -> None:
     """ calls function to construct and save plot """
     metadata_df = dataset.metadata_df
-    compartments = list(metadata_df['short_comp'].unique())
+    compartments = list(metadata_df['compartment'].unique())
     width_subplot = cfg.analysis.width_subplot
     height_subplot = cfg.analysis.method.height_subplot
     xaxis_title = cfg.analysis.method.xaxis_title
@@ -355,7 +355,7 @@ def line_plot_by_compartment(dataset: Dataset,
     alpha_conf = cfg.analysis.method.alpha
 
     for co in compartments:
-        metadata_co_df = metadata_df.loc[metadata_df['short_comp'] == co, :]
+        metadata_co_df = metadata_df.loc[metadata_df['compartment'] == co, :]
         compartment_df = dataset.compartmentalized_dfs["mean_enrichment"][co]
 
         melted_co_df = melt_data_metadata_2df(compartment_df, metadata_co_df)

diff --git a/src/dimet/visualization/pca_plot.py b/src/dimet/visualization/pca_plot.py
@@ -145,15 +145,19 @@ def run_pca_plot(pca_results_dict: dict,  cfg: DictConfig,
         name_plot_var = f"{'--'.join(tup)}_var.pdf"
         figure_var.savefig(os.path.join(out_plot_dir, name_plot_var))
         plt.close()
+
+        color_dot = cfg.analysis.method.color
+        style_dot = cfg.analysis.method.style
         options_labels = {'label-y': "name_to_plot",
                           'label-n': ""}  # when empty string, no dot labels
+
         # scatter: save both versions, labeled dots and unlabeled dots:
         for choice in options_labels.keys():
             labels_column = options_labels[choice]
             name_elements = list(tup) + [choice]
             scatter_fig: figure.Figure = pca_scatter_plot(
-                pc_df,  var_explained_df, "condition",
-                "condition", labels_column,
+                pc_df,  var_explained_df, color_dot,
+                style_dot, labels_column,
                 ellipses_column=cfg.analysis.method.draw_ellipses)
             pca_scatter_2_pdf(scatter_fig, name_elements, out_plot_dir)
             plt.close()

diff --git a/tests/test_abundance_bars.py b/tests/test_abundance_bars.py
@@ -27,7 +27,7 @@ def test_pile_up_abundance(self):
             'name_to_plot': ['beta-1', 'beta-2', 'ctrl-1', 'ctrl-2'],
             'condition': ['beta-glu', 'beta-glu', 'control', 'control'],
             'timepoint': ['t0', 't0', 't0', 't0'],
-            'short_comp': ['ex', 'ex', 'ex', 'ex']
+            'compartment': ['ex', 'ex', 'ex', 'ex']
         })
         result = abundance_bars.pile_up_abundance(df, metadata_df)
         self.assertTrue(result.shape == (12, 4))
@@ -46,7 +46,7 @@ def test_plot_one_metabolite(self):
             'timepoint': ['t0', 't0'],
             'condition': ['beta', 'alpha'],
             'metabolite': ['m1', 'm1'],
-            'abundance' : [200, 700]
+            'abundance': [200, 700]
         })
         fig_this_metabolite, axs_k = plt.subplots(
             nrows=1, ncols=1,
@@ -67,7 +67,7 @@ def test_plot_abundance_bars_no_grid(self):
             'timepoint': ['t0', 't0'],
             'condition': ['beta', 'alpha'],
             'metabolite': ['m1', 'm1'],
-            'abundance' : [200, 700]
+            'abundance': [200, 700]
         })
         try:
             os.makedirs("../__pycache__/")
@@ -84,4 +84,3 @@ def test_plot_abundance_bars_no_grid(self):
             height_each_subfig=2.4,
             cfg=cfg_m)
         self.assertTrue(result is None)
-
diff --git a/tests/test_differential_analysis.py b/tests/test_differential_analysis.py
@@ -50,9 +50,7 @@ def test_select_rows_with_sufficient_non_nan_values(self):
         groups = [["c1", "c2", "c3"], ["c4", "c5", "c6"]]
         df = countnan_samples(df, groups)
         result_good, result_bad = differential_analysis. \
-            select_rows_with_sufficient_non_nan_values(
-            df, groups
-        )
+            select_rows_with_sufficient_non_nan_values(df, groups)
         self.assertEqual(result_good.shape, (2, 8))
         self.assertEqual(result_bad.shape, (2, 8))
         self.assertTrue(np.any(np.array(result_good.loc[0, :]) ==
@@ -141,5 +139,3 @@ def test_time_course_auto_list_comparisons(self):
         self.assertListEqual(result[1], [['cond1', '3h'], ['cond1', '2.7h']])
         self.assertListEqual(result[2], [['cond2', '3h'], ['cond2', '2.7h']])
         self.assertListEqual(result[3], [['cond1', '2.7h'], ['cond1', '1h']])
-
-
diff --git a/tests/test_distr_fit_plot.py b/tests/test_distr_fit_plot.py
@@ -14,11 +14,10 @@
 class TestDistrFitPlot(TestCase):
     def test_make_pdf(self):
         dist = getattr(stats, "gennorm")
-        params_dict =  {'beta': 1.09, 'loc': 0.01, 'scale': 1.77}
+        params_dict = {'beta': 1.09, 'loc': 0.01, 'scale': 1.77}
         params = list(params_dict.values())
         result = distr_fit_plot.make_pdf(dist, params, size=10000)
-        self.assertAlmostEqual(result.index[0],-5.9147, 2 )
-        self.assertAlmostEqual(result.index[-3], 5.932, 2 )
-        self.assertAlmostEqual(result.to_list()[1],  0.007, 3 )
+        self.assertAlmostEqual(result.index[0], -5.9147, 2)
+        self.assertAlmostEqual(result.index[-3], 5.932, 2)
+        self.assertAlmostEqual(result.to_list()[1],  0.007, 3)
         self.assertAlmostEqual(result.to_list()[-1], 0.00699, 3)
-