Skip to content

Commit

Permalink
Merge pull request #17 from cbib/oct_2023
Browse files Browse the repository at this point in the history
Oct 2023
  • Loading branch information
johaGL authored Oct 6, 2023
2 parents 814a4d7 + 05e1233 commit f7caab9
Show file tree
Hide file tree
Showing 9 changed files with 114 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: Generate isotopologues stack plots

# here put 'hidden' advanced defaults
figure_format: svg
max_nb_carbons_possible: 12
max_nb_carbons_possible: 18 # up to 30 are supported
appearance_separated_time: True # adds a space between timepoints, conditions remain comparable
split_plots_by_condition: False # prints each condition in independent plots
height_each_stack: !!float 4.9
Expand Down
40 changes: 36 additions & 4 deletions src/dimet/processing/differential_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,8 @@ def run_distribution_fitting(df: pd.DataFrame):
autoset_tailway = auto_detect_tailway(df, best_distribution, args_param)
logger.info(f"auto, best pvalues calculated : {autoset_tailway}")
df = compute_p_value(df, autoset_tailway, best_distribution, args_param)

df["zscore"] = np.around(
df["zscore"].astype(float).to_numpy(), decimals=6)
return df


Expand Down Expand Up @@ -345,6 +346,31 @@ def reorder_columns_diff_end(df: pd.DataFrame) -> pd.DataFrame:
return df


def round_result_float_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Round the float result columns of a results table to 6 decimals.

    The columns that get rounded, when present in df, are: "log2FC",
    "pvalue", "padj", "distance/span", "FC", "distance",
    "span_allsamples", and every column whose name starts with "gmean_".
    Each of them is cast to float before rounding.
    Any other column is left untouched.

    Attention: df is modified in place; the same object is returned.
    """
    result_float_columns = {
        "log2FC",
        "pvalue",
        "padj",
        "distance/span",
        "FC",
        "distance",
        "span_allsamples",
    }

    # Single pass over the columns actually present in df.
    # The isinstance guard prevents an AttributeError on .startswith
    # when a column label is not a string (e.g. an integer label).
    columns_to_round = [
        column for column in df.columns
        if column in result_float_columns
        or (isinstance(column, str) and column.startswith("gmean_"))
    ]

    for column in columns_to_round:
        df[column] = np.around(
            df[column].astype(float).to_numpy(),
            decimals=6
        )

    return df


def pairwise_comparison(
df: pd.DataFrame, dataset: Dataset, cfg: DictConfig,
comparison: List[str], test: availtest_methods_type
Expand All @@ -368,17 +394,19 @@ def pairwise_comparison(
df4c = df4c[(df4c.T != 0).any()] # delete rows being zero everywhere
df4c = df4c.dropna(axis=0, how="all")
df4c = row_wise_nanstd_reduction(df4c)
df4c = df4c.round(decimals=6)
df4c = countnan_samples(df4c, this_comparison)
df4c = distance_or_overlap(df4c, this_comparison)
df4c = compute_span_incomparison(df4c, this_comparison)
df4c["distance/span"] = df4c.distance.div(df4c.span_allsamples)
df4c = calculate_gmean(df4c, this_comparison)
print(this_comparison)

df_good, df_bad = select_rows_with_sufficient_non_nan_values(
df4c, groups=this_comparison)

if test == "disfit":
df_good = run_distribution_fitting(df_good)

else:
result_test_df = run_statistical_test(df_good, this_comparison, test)
assert result_test_df.shape[0] == df_good.shape[0]
Expand All @@ -396,6 +424,7 @@ def pairwise_comparison(

# re-integrate the "bad" sub-dataframes to the full dataframe
result = concatenate_dataframes(df_good, df_bad, df_no_padj)
result = round_result_float_columns(result)
return result


Expand Down Expand Up @@ -424,6 +453,7 @@ def differential_comparison(
result = pairwise_comparison(df, dataset, cfg, comparison, test)
result["compartment"] = compartment
result = reorder_columns_diff_end(result)

result = result.sort_values(["padj", "distance/span"],
ascending=[True, False])
comp = "-".join(map(lambda x: "-".join(x), comparison))
Expand Down Expand Up @@ -471,11 +501,13 @@ def multi_group_compairson(
df4c = df4c.dropna(axis=0, how="all")

df4c = row_wise_nanstd_reduction(df4c)
df4c = df4c.round(decimals=6)
this_comparison = [list(filter(lambda x: x in columns, sublist)) for
sublist in conditions_list]
df4c = apply_multi_group_kruskal_wallis(df4c, this_comparison)
df4c = compute_padj(df4c, 0.05,
cfg.analysis.method.correction_method)
df4c = round_result_float_columns(df4c)
base_file_name = dataset.get_file_for_label(file_name)
base_file_name += f"--{compartment}--multigroup"
output_file_name = os.path.join(out_table_dir,
Expand Down Expand Up @@ -514,13 +546,13 @@ def time_course_analysis(file_name: data_files_keys_type,
cfg: DictConfig,
test: availtest_methods_type,
out_table_dir: str):
'''
"""
Time-course comparison is performed on compartmentalized versions of
data files
Attention: we replace zero values using the provided method
Writes the table(s) with computed statistics in the relevant
output directory
'''
"""

assert_literal(test, availtest_methods_type, "Available test")
assert_literal(file_name, data_files_keys_type, "file name")
Expand Down
5 changes: 4 additions & 1 deletion src/dimet/processing/fit_statistical_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

logger = logging.getLogger(__name__)

np.random.seed(123)


def compute_z_score(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
"""
Expand All @@ -26,7 +28,8 @@ def find_best_distribution(df: pd.DataFrame):
Find the best distribution among all the scipy.stats distributions
and return it together with its parameters
The input dataframe df has to have a "zscore" column as the fitting is done on the zscores
The input dataframe df has to have a "zscore" column
as the fitting is done on the zscores
"""
logger.info("Fitting a distribution")
dist = np.around(np.array((df["zscore"]).astype(float)), 5)
Expand Down
5 changes: 5 additions & 0 deletions src/dimet/processing/pca_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def compute_pca(
X = np.transpose(np.array(quantitative_df))
pca = PCA(n_components=dims)
pc = pca.fit_transform(X)
pc = np.around(pc, decimals=6)
pc_df = pd.DataFrame(data=pc,
columns=['PC' + str(i) for i in range(1, dims + 1)])
pc_df = pc_df.assign(name_to_plot=quantitative_df.columns)
Expand All @@ -60,6 +61,10 @@ def compute_pca(
var_explained_df = pd.DataFrame({
'Explained Variance %': pca.explained_variance_ratio_ * 100,
'PC': ['PC' + str(i) for i in range(1, dims + 1)]})
var_explained_df['Explained Variance %'] = np.around(
var_explained_df['Explained Variance %'].astype(float).to_numpy(),
decimals=6
)

return pc_df, var_explained_df

Expand Down
8 changes: 6 additions & 2 deletions src/dimet/visualization/abundance_bars.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def plot_one_metabolite(df: pd.DataFrame,
do_stripplot: bool) -> matplotlib.axes:
""""
returns a single object of type matplotlib.axes
with all the individual metabolite plot
with the individual metabolite plot
"""
plt.rcParams.update({"font.size": 21})
sns.barplot(
Expand Down Expand Up @@ -237,14 +237,18 @@ def run_plot_abundance_bars(dataset: Dataset, out_plot_dir,
metadata_df.loc[metadata_df['compartment'] == compartment, :]
compartment_df = dataset.compartmentalized_dfs[
"abundances"][compartment]
# metadata and abundances time of interest
# metadata and abundances: slice of timepoints of interest
metadata_slice = metadata_compartment_df.loc[
metadata_compartment_df[
"timepoint"].isin(timepoints), :]
values_slice = compartment_df[list(metadata_slice["name_to_plot"])]

# total piled-up data:
piled_sel = pile_up_abundance(values_slice, metadata_slice)
piled_sel['abundance'] = np.around(
piled_sel['abundance'].astype(float).to_numpy(),
decimals=6
)
piled_sel["condition"] = pd.Categorical(
piled_sel["condition"], conditions)
piled_sel["timepoint"] = pd.Categorical(
Expand Down
21 changes: 18 additions & 3 deletions src/dimet/visualization/isotopologue_proportions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
from dimet.data import Dataset
Expand Down Expand Up @@ -54,6 +55,11 @@ def isotopologue_proportions_2piled_df(
piled_df['timenum'] = piled_df['timenum'].astype(str)
piled_df['Isotopologue Contribution (%)'] = \
piled_df['Isotopologue Contribution (%)'] * 100

piled_df['Isotopologue Contribution (%)'] = np.around(
piled_df['Isotopologue Contribution (%)'].astype(float).to_numpy(),
decimals=6
)
return piled_df


Expand Down Expand Up @@ -85,15 +91,24 @@ def massage_isotopologues(piled_df) -> pd.DataFrame:

def prepare_means_replicates(piled_df, metaboli_selected) -> Dict:
"""
returns a dictionary of dataframes, keys are metabolites
returns a dictionary of dataframes, keys are metabolites.
for each dataframe:
- the mean over the biological replicates, by isotopologue, is computed
and is saved in the column 'Isotopologue Contribution (%)'
- has columns:
condition, metabolite, m+x, timenum, Isotopologue Contribution (%)
"""
dfcopy = piled_df.copy()
# grouping by m+x and metabolite works better than grouping by isotopologue_name
dfcopy = dfcopy.groupby(
["condition", "metabolite", "m+x", "timenum"]) \
.mean("Isotopologue Contribution %") # df.mean skips nan by default
dfcopy = dfcopy.reset_index()
.mean("Isotopologue Contribution (%)") # df.mean skips nan by default

dfcopy["Isotopologue Contribution (%)"] = np.around(
dfcopy["Isotopologue Contribution (%)"].astype(float).to_numpy(),
decimals=6
)
dfcopy = dfcopy.reset_index()
dfs_dict = dict()
for i in metaboli_selected:
tmp = dfcopy.loc[dfcopy["metabolite"] == i, ].reset_index(drop=True)
Expand Down
15 changes: 14 additions & 1 deletion src/dimet/visualization/mean_enrichment_line_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hydra.core.config_store import ConfigStore
Expand Down Expand Up @@ -38,6 +39,10 @@ def melt_data_metadata_2df(compartment_df: pd.DataFrame,
value_name="Fractional Contribution (%)")
melted_df["Fractional Contribution (%)"] = \
melted_df["Fractional Contribution (%)"] * 100
melted_df["Fractional Contribution (%)"] = np.around(
melted_df["Fractional Contribution (%)"].astype(float).to_numpy(),
decimals=6
)
return melted_df


Expand Down Expand Up @@ -79,6 +84,14 @@ def metabolite_df__mean_and_sd(
one_metabolite_result = mean_df.merge(std_df, how='inner',
on=["condition", "timenum",
"metabolite"])
one_metabolite_result['mean'] = np.around(
one_metabolite_result['mean'] .astype(float).to_numpy(),
decimals=6
)
one_metabolite_result['sd'] = np.around(
one_metabolite_result['sd'] .astype(float).to_numpy(),
decimals=6
)
return one_metabolite_result


Expand Down Expand Up @@ -406,7 +419,7 @@ def generate_metabolites_numbered_dict(cfg: DictConfig,
{ 0: ['Pyr', 'Cit'], 1: ['Asn', 'Asp']}
will result in one plot per pair of metabolites, totalling 2 plots.
If option 'plot_grouped_by_dict' not specified, one single metabolite
per numeric key is set.
per numeric key is set (default behaviour).
"""
if cfg.analysis.method.plot_grouped_by_dict is not None:
metabolites_numbered_dict = cfg.analysis.method.plot_grouped_by_dict
Expand Down
7 changes: 7 additions & 0 deletions src/dimet/visualization/metabologram.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ def pile_dfs_by_contexts(contexts_dict: Dict[int, Dict[str, pd.DataFrame]],
df = df.assign(context=context, typemol=molecule_type)
df['context'] = df['context'].astype(int)
df_out = pd.concat([df_out, df], axis=0)
df_out['VALUES'] = np.around(
df_out['VALUES'].astype(float).to_numpy(), decimals=6
)

data_cleaned_dict[molecule_type] = df_out
return data_cleaned_dict
Expand Down Expand Up @@ -328,9 +331,13 @@ def donut_outer(curr_pathway_context_df,
genecircportion = 50 / curr_pathway_context_df.loc[
curr_pathway_context_df.typemol == "transcripts",
:].shape[0]
genecircportion = np.around(np.array(genecircportion), decimals=6)

metabocircportion = 50 / curr_pathway_context_df.loc[
curr_pathway_context_df.typemol == "metabolites",
:].shape[0]
metabocircportion = np.around(np.array(metabocircportion), decimals=6)

curr_pathway_context_df.loc[
curr_pathway_context_df.typemol == "transcripts",
"circportion"] = genecircportion
Expand Down
23 changes: 23 additions & 0 deletions tests/test_differential_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,29 @@ def test_reorder_columns_diff_end(self):
)))
self.assertEqual(result.shape, (2, 11))

def test_round_result_float_columns(self):
    """
    round_result_float_columns must round the float result columns
    to 6 decimals, in every row.
    """
    data = {
        "distance": [2, 1.5], "span_allsamples": [4, 6],
        "distance/span": [0.59595959595959, 0.25],
        "count_nan_samples_group1": [0, 0],
        "count_nan_samples_group2": [0, 0],
        "pvalue": [0, 0],
        "padj": [1.54354343434e-3, 1.3543434354335e-4],
        "log2FC": [4.063030512104, 5.0202235],
        "FC": [8, 23], "compartment": ["med", "med"],
    }
    df = pd.DataFrame(data)
    df.index = ['met1', 'met3']
    result = differential_analysis.round_result_float_columns(df)

    expected_by_column = {
        "padj": [0.001544, 0.000135],
        "log2FC": [4.063031, 5.020224],
    }
    for column, expected in expected_by_column.items():
        observed = result[column].to_numpy()
        # np.allclose checks every row, so a single matching element is
        # not enough to pass. The tolerance allows one unit in the 6th
        # decimal because 5.0202235 sits on a rounding tie.
        self.assertTrue(
            np.allclose(observed, expected, rtol=0, atol=1.5e-6)
        )
        # the values must already be rounded: rounding again is a no-op
        self.assertTrue(
            np.array_equal(observed, np.around(observed, decimals=6))
        )

def test_time_course_auto_list_comparisons(self):
metadata = pd.DataFrame({
'condition': ['cond1', 'cond1', 'cond1', 'cond1',
Expand Down

0 comments on commit f7caab9

Please sign in to comment.