
Commit

Merge pull request #26 from cbib/disfit-no-padj
disfit fixes & unit test improvements
johaGL authored Feb 27, 2024
2 parents 8e52d6d + b2e794a commit 1f45165
Showing 14 changed files with 227 additions and 143 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -11,7 +11,7 @@ DIMet: Differential analysis of Isotope-labeled targeted Metabolomics data
DIMet is a bioinformatics pipeline for **differential and time-course analysis of targeted isotope-labeled metabolomics data**.

DIMet supports the analysis of full metabolite abundances and isotopologue contributions,
and allows to perform it either in the differential comparison mode or as a time-series analysis.
and allows performing it in the differential comparison mode, as a time-series analysis, or even by processing entire labelling profiles.
As input, DIMet accepts three types of measures: a) isotopologues’ contributions, b) fractional contributions (also known as mean enrichment), c) full metabolites’ abundances.
DIMet also offers a _pathway-based omics integration_ through **Metabolograms**.

6 changes: 6 additions & 0 deletions src/dimet/config/analysis/method/differential_analysis.yaml
@@ -21,3 +21,9 @@ statistical_test:
mean_enrichment: KW
isotopologues: KW
isotopologue_proportions: KW

disfit_tail_option: "auto"

# Note: the best disfit_tail_option depends on the comparison and on the data:
# with advanced knowledge of both, set "two-sided" or "right-tailed";
# otherwise leave "auto" as the default
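For orientation, here is a minimal sketch (not DIMet code) of what the two explicit tail choices mean in practice; `scipy.stats.norm` stands in for whatever distribution the fitting step actually selects, and the z-scores are invented:

```python
# Illustrative sketch only (not DIMet code): how the two explicit tail options
# translate into p-values, with scipy.stats.norm standing in for the fitted
# distribution.
import numpy as np
from scipy import stats

zscores = np.array([-2.3, -0.4, 0.1, 1.9])  # z-scored log2 fold changes

p_right = 1 - stats.norm.cdf(zscores)              # "right-tailed": only positive z can score low
p_two = 2 * (1 - stats.norm.cdf(np.abs(zscores)))  # "two-sided": extremes in both directions score low
print(p_right)
print(p_two)
```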
@@ -45,4 +45,10 @@ display_label_and_value : False # if False, display labels of the molecules alon

color_nan_elements : "gray"

disfit_tail_option: "two-sided"

# Note: the best disfit_tail_option depends on the comparison and on the data:
# with advanced knowledge of both, set "two-sided" or "right-tailed";
# otherwise leave "auto" as the default


6 changes: 6 additions & 0 deletions src/dimet/config/analysis/method/time_course_analysis.yaml
@@ -21,3 +21,9 @@ statistical_test:
mean_enrichment: KW
isotopologues: KW
isotopologue_proportions: KW

disfit_tail_option: "auto"

# Note: the best disfit_tail_option depends on the comparison and on the data:
# with advanced knowledge of both, set "two-sided" or "right-tailed";
# otherwise leave "auto" as the default
5 changes: 2 additions & 3 deletions src/dimet/constants.py
@@ -49,10 +49,9 @@ def assert_literal(value: str, lit_type, check: Optional[str] = None):
"fdr_bh", "fdr_by", "fdr_tsbh", "fdr_tsbky"
]

comparison_modes = ["pairwise", "multigroup"] # TODO: verify if ever used
# comparison_modes = ["pairwise", "multigroup"] # unused to date

comparison_modes_types = Literal[
"pairwise", "multigroup"] # TODO: verify if ever used
# comparison_modes_types = Literal["pairwise", "multigroup"] # unused to date

overlap_methods = ["symmetric", "asymmetric"]

7 changes: 7 additions & 0 deletions src/dimet/helpers.py
@@ -481,3 +481,10 @@ def message_bad_separator_input(df: pd.DataFrame, type_df: str) -> None:
if df.empty:
logger.info(f"{e}. {error_message}")
raise ValueError(error_message)


def msg_correction_method_not_suitable(filename: str, test: str) -> str:
message = (f"Using '{test}' for {filename}: the method"
f" for multiple tests correction (e.g. Bonferroni, "
f" B-H, or other), is unsuitable and will be omitted")
return message
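A usage sketch for the new helper; the file label below is hypothetical, and the import path is the one added to `src/dimet/method/__init__.py` in this same commit:

```python
# Usage sketch; "AbundanceCorrected" is a hypothetical file label.
from dimet.helpers import msg_correction_method_not_suitable

msg = msg_correction_method_not_suitable("AbundanceCorrected", "disfit")
print(msg)
# roughly: Using 'disfit' for AbundanceCorrected: the method for multiple
# tests correction (e.g. Bonferroni, B-H, or other) is unsuitable and will
# be omitted
```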
6 changes: 5 additions & 1 deletion src/dimet/method/__init__.py
@@ -14,7 +14,7 @@
data_types_suitable_for_metabologram,
metabolites_values_for_metabologram)
from dimet.data import DataIntegration, Dataset
from dimet.helpers import flatten
from dimet.helpers import flatten, msg_correction_method_not_suitable
from dimet.processing.bivariate_analysis import bivariate_comparison
from dimet.processing.differential_analysis import (differential_comparison,
multi_group_compairson,
@@ -289,6 +289,8 @@ def run(self, cfg: DictConfig, dataset: Dataset) -> None:
tmp = dataset.get_file_for_label(file_name) # current file name
logger.info(
f"Running differential analysis of {tmp} using {test} test")
if test == "disfit":
logger.info(msg_correction_method_not_suitable(tmp, test))
differential_comparison(file_name, dataset, cfg, test,
out_table_dir=out_table_dir)

@@ -619,6 +621,8 @@ def run(self, cfg: DictConfig, dataset: Dataset) -> None:
tmp = dataset.get_file_for_label(file_name) # current file
logger.info(
f"Running time-course analysis of {tmp} using {test} test")
if test == "disfit":
logger.info(msg_correction_method_not_suitable(tmp, test))
time_course_analysis(file_name, dataset, cfg, test,
out_table_dir=out_table_dir)

160 changes: 82 additions & 78 deletions src/dimet/processing/differential_analysis.py
@@ -239,68 +239,55 @@ def run_statistical_test(df: pd.DataFrame, comparison: List,
def auto_detect_tailway(good_df, best_distribution, args_param):
min_pval_ = list()
for tail_way in ["two-sided", "right-tailed"]:
tmp = compute_p_value(good_df, tail_way, best_distribution,
args_param)
tmp = fit_statistical_distribution.compute_p_value(
good_df, tail_way, best_distribution, args_param)

min_pval_.append(tuple([tail_way, tmp["pvalue"].min()]))

return min(min_pval_, key=lambda x: x[1])[0]


def run_distribution_fitting(df: pd.DataFrame):
def run_distribution_fitting(df: pd.DataFrame,
disfit_tail_option: str) -> pd.DataFrame:
recognized_tail_options = ["auto", "two-sided", "right-tailed"]
assert disfit_tail_option in recognized_tail_options, ("unrecognized "
"disfit_tail_option")
df = fit_statistical_distribution.compute_z_score(df, "FC")
best_distribution, args_param = \
fit_statistical_distribution.find_best_distribution(df)
autoset_tailway = auto_detect_tailway(df, best_distribution, args_param)
logger.info(f"auto, best pvalues calculated : {autoset_tailway}")
df = compute_p_value(df, autoset_tailway, best_distribution, args_param)
if disfit_tail_option == "auto":
autoset_tailway = auto_detect_tailway(
df, best_distribution, args_param)
logger.info(f"auto, best pvalues calculated : {autoset_tailway}")
df = fit_statistical_distribution.compute_p_value(
df, autoset_tailway, best_distribution, args_param)
else: # two-sided or right-tailed
logger.info(f"the disfit_tail_option is: '{disfit_tail_option}'")
df = fit_statistical_distribution.compute_p_value(
df, disfit_tail_option, best_distribution, args_param)

df["zscore"] = np.around(
df["zscore"].astype(float).to_numpy(), decimals=6)
return df
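To make the flow concrete, here is an illustrative stand-in (not DIMet's `fit_statistical_distribution` module, and assuming a plain normal fit) that mirrors the z-scoring, the fitting, and the "auto" tail selection performed above:

```python
# Illustration only: z-score the log2 fold changes, fit one candidate
# distribution (normal here), and mimic the "auto" behaviour by keeping the
# tail that yields the smallest p-value overall.
import numpy as np
import pandas as pd
from scipy import stats

df = pd.DataFrame({"FC": [0.4, 0.8, 1.0, 1.6, 3.2]})
df["log2FC"] = np.log2(df["FC"])
df["zscore"] = (df["log2FC"] - df["log2FC"].mean()) / df["log2FC"].std(ddof=0)
loc, scale = stats.norm.fit(df["zscore"])  # stand-in for find_best_distribution

def p_values(z, tail):
    if tail == "right-tailed":
        return 1 - stats.norm.cdf(z, loc=loc, scale=scale)
    return 2 * (1 - stats.norm.cdf(np.abs(z), loc=loc, scale=scale))

best_tail = min(["two-sided", "right-tailed"],
                key=lambda t: p_values(df["zscore"], t).min())
df["pvalue"] = p_values(df["zscore"], best_tail)
print(best_tail, df["pvalue"].round(4).tolist())
```

The real pipeline presumably scans several candidate distributions (as the name `find_best_distribution` suggests), but the tail logic is the same.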


def compute_p_value(df: pd.DataFrame, test: str, best_dist,
args_param) -> pd.DataFrame:
if test == "right-tailed":
df["pvalue"] = 1 - best_dist.cdf(df["zscore"], **args_param)
elif test == "two-sided":
df["pvalue"] = 2 * (
1 - best_dist.cdf(abs(df["zscore"]), **args_param))
else:
print("WARNING [compute_p_value]: only 'right-tailed' or "
"'two-sided' as test argument supported")
return df


def filter_diff_results(ratiosdf, padj_cutoff, log2FC_abs_cutoff):
ratiosdf["abslfc"] = ratiosdf["log2FC"].abs()
ratiosdf = ratiosdf.loc[(ratiosdf["padj"] <= padj_cutoff) & (
ratiosdf["abslfc"] >= log2FC_abs_cutoff), :]
ratiosdf = ratiosdf.sort_values(["padj", "pvalue", "distance/span"],
ascending=[True, True, False])
ratiosdf = ratiosdf.drop(columns=["abslfc"])

return ratiosdf


def reorder_columns_diff_end(df: pd.DataFrame) -> pd.DataFrame:
def reorder_columns_diff_end(df: pd.DataFrame, test: str) -> pd.DataFrame:
standard_cols = [
"count_nan_samples_group1",
"count_nan_samples_group2",
"distance",
"span_allsamples",
"distance/span",
# 'stat',
"pvalue",
"padj",
"log2FC",
"FC",
"compartment",
"compartment"
]

desired_order = [
"log2FC",
# 'stat',
"pvalue",
"padj",
"distance/span",
@@ -309,9 +296,12 @@ def reorder_columns_diff_end(df: pd.DataFrame) -> pd.DataFrame:
"count_nan_samples_group2",
"distance",
"span_allsamples",
"compartment",
"compartment"
]

if test == "disfit": # exclude the senseless column when this test ran
standard_cols = [i for i in standard_cols if i != "padj"]
desired_order = [i for i in desired_order if i != "padj"]
standard_df = df[standard_cols]
df = df.drop(columns=standard_cols)
# reorder the standard part
@@ -323,28 +313,59 @@ def reorder_columns_diff_end(df: pd.DataFrame) -> pd.DataFrame:


def round_result_float_columns(df: pd.DataFrame) -> pd.DataFrame:
result_float_columns = [
"log2FC",
"pvalue",
"padj",
"distance/span",
"FC",
"distance",
"span_allsamples"]
for column in list(df.columns):
try:
df[column] = np.around(
df[column].astype(float).to_numpy(),
decimals=6
)
except ValueError:
continue
except TypeError:
continue
except Exception as e:
print(e)
continue
return df

columns_gmean = [column for column in list(df.columns) if
column.startswith("gmean_")] # also gmean columns
result_float_columns = list(
set(result_float_columns).union(set(columns_gmean)))

for column in result_float_columns:
if column in list(df.columns):
df[column] = np.around(
df[column].astype(float).to_numpy(),
decimals=6
)
def compute_current_comparison_test(
df_good: pd.DataFrame, df_bad: pd.DataFrame,
this_comparison: List[List], test: str, cfg: DictConfig
) -> pd.DataFrame:
"""
Wraps the functions that apply the chosen parametric or non-parametric test.
Note that 'this_comparison' is a list of exactly
two lists of sample names, e.g.:
[['Tr_cell_0h-1', 'Tr_cell_0h-2'], ['Ct_cell_0h-1', 'Ct_cell_0h-2']]
where the first list holds the treated samples
and the second list holds the control samples.
"""
# log transform, in base 2, the Fold Change
df_good = df_good.assign(log2FC=np.log2(df_good["FC"]))

return df
if test == "disfit":
result = run_distribution_fitting(
df_good,
disfit_tail_option=cfg.analysis.method.disfit_tail_option)
else:
result_test_df = run_statistical_test(df_good, this_comparison, test)
assert result_test_df.shape[0] == df_good.shape[0]
result_test_df.set_index("metabolite", inplace=True)
df_good = pd.merge(df_good, result_test_df, left_index=True,
right_index=True)
df_good, df_no_quality_d_s = split_rows_by_threshold(
df_good, "distance/span",
cfg.analysis.method.qualityDistanceOverSpan
)
df_good = compute_padj(df_good, 0.05,
cfg.analysis.method.correction_method)
# re-integrate the "bad" sub-dataframes to the full dataframe
result = concatenate_dataframes(df_good, df_bad, df_no_quality_d_s)

result = round_result_float_columns(result)

return result


def pairwise_comparison(
@@ -380,27 +401,9 @@ def pairwise_comparison(
df_good, df_bad = select_rows_with_sufficient_non_nan_values(
df4c, groups=this_comparison)

if test == "disfit":
df_good = run_distribution_fitting(df_good)
result = compute_current_comparison_test(df_good, df_bad,
this_comparison, test, cfg)

else:
result_test_df = run_statistical_test(df_good, this_comparison, test)
assert result_test_df.shape[0] == df_good.shape[0]
result_test_df.set_index("metabolite", inplace=True)
df_good = pd.merge(df_good, result_test_df, left_index=True,
right_index=True)

df_good["log2FC"] = np.log2(df_good["FC"])

df_good, df_no_padj = split_rows_by_threshold(
df_good, "distance/span", cfg.analysis.method.qualityDistanceOverSpan
)
df_good = compute_padj(df_good, 0.05,
cfg.analysis.method.correction_method)

# re-integrate the "bad" sub-dataframes to the full dataframe
result = concatenate_dataframes(df_good, df_bad, df_no_padj)
result = round_result_float_columns(result)
return result


@@ -428,9 +431,9 @@ def differential_comparison(
for comparison in cfg.analysis.comparisons:
result = pairwise_comparison(df, dataset, cfg, comparison, test)
result["compartment"] = compartment
result = reorder_columns_diff_end(result)
result = reorder_columns_diff_end(result, test)

result = result.sort_values(["padj", "distance/span"],
result = result.sort_values(["pvalue", "distance/span"],
ascending=[True, False])
comp = "-".join(map(lambda x: "-".join(x), comparison))
base_file_name = dataset.get_file_for_label(file_name)
Expand All @@ -441,7 +444,7 @@ def differential_comparison(
output_file_name,
index_label="metabolite",
header=True,
sep="\t",
sep="\t"
)
logger.info(f"Saved the result in {output_file_name}")

@@ -550,8 +553,9 @@ def time_course_analysis(file_name: data_files_keys_type,
result = pairwise_comparison(df, dataset, cfg, comparison,
test)
result["compartment"] = compartment
result = reorder_columns_diff_end(result)
result = result.sort_values(["padj", "distance/span"],
result = reorder_columns_diff_end(result, test)

result = result.sort_values(["pvalue", "distance/span"],
ascending=[True, False])
comp = "-".join(map(lambda x: "-".join(x), comparison))
base_file_name = dataset.get_file_for_label(file_name)