diff --git a/q2_fmt/_peds.py b/q2_fmt/_peds.py index ccec7e0..5e9bbba 100644 --- a/q2_fmt/_peds.py +++ b/q2_fmt/_peds.py @@ -25,7 +25,7 @@ def peds(ctx, table, metadata, peds_metric, time_column, reference_column, subject_column, filter_missing_references=False, - drop_incomplete_subjects=False, drop_incomplete_timepoint=None, + drop_incomplete_subjects=False, drop_incomplete_timepoints=None, level_delimiter=None): peds_heatmap = ctx.get_action('fmt', 'peds_heatmap') @@ -38,14 +38,14 @@ def peds(ctx, table, metadata, peds_metric, time_column, reference_column, table=table, metadata=metadata, time_column=time_column, subject_column=subject_column, reference_column=reference_column, drop_incomplete_subjects=drop_incomplete_subjects, - drop_incomplete_timepoint=drop_incomplete_timepoint, + drop_incomplete_timepoints=drop_incomplete_timepoints, filter_missing_references=filter_missing_references) else: - if drop_incomplete_subjects or drop_incomplete_timepoint: + if drop_incomplete_subjects or drop_incomplete_timepoints: warnings.warn('Feature PEDS was selected as the PEDS metric, which' ' does not accept `drop_incomplete_subjects` or' - ' `drop_incomplete_timepoint` as parameters. One' + ' `drop_incomplete_timepoints` as parameters. One' ' (or both) of these parameters were detected in' ' your input, and will be ignored.') @@ -65,9 +65,11 @@ def peds_heatmap(output_dir: str, data: pd.DataFrame, per_subject_stats: pd.DataFrame = None, global_stats: pd.DataFrame = None): _rename_features(data=data, level_delimiter=level_delimiter) + gstats = None + table1 = None + psstats = None if global_stats is not None: gstats = global_stats.to_html(index=False) - # table2, gstats = _make_stats(global_stats) if per_subject_stats is not None: table1, psstats = _make_stats(per_subject_stats) J_ENV = jinja2.Environment( @@ -117,12 +119,17 @@ def sample_peds(table: pd.DataFrame, metadata: qiime2.Metadata, time_column: str, reference_column: str, subject_column: str, filter_missing_references: bool = False, drop_incomplete_subjects: bool = False, - drop_incomplete_timepoint: list = None) -> (pd.DataFrame): + drop_incomplete_timepoints: list = None) -> (pd.DataFrame): + ids_with_data = table.index metadata = metadata.filter_ids(ids_to_keep=ids_with_data) column_properties = metadata.columns - # TODO: Make incomplete samples possible move this to heatmap metadata = metadata.to_dataframe() + if drop_incomplete_timepoints is not None: + metadata = _drop_incomplete_timepoints(metadata, time_column, + drop_incomplete_timepoints) + table.filter(items=metadata.index) + # TODO: Make incomplete samples possible move this to heatmap num_timepoints = _check_for_time_column(metadata, time_column) _check_column_type(column_properties, "time", time_column, "numeric") @@ -139,10 +146,7 @@ def sample_peds(table: pd.DataFrame, metadata: qiime2.Metadata, subject_column, "categorical") _check_duplicate_subject_timepoint(subject_series, metadata, subject_column, time_column) - if drop_incomplete_timepoint is not None: - metadata = _drop_incomplete_timepoints(metadata, time_column, - drop_incomplete_timepoint) - table.filter(items=metadata.index) + # return things that should be removed metadata, used_references = \ _check_subjects_in_all_timepoints(subject_series, num_timepoints, @@ -357,7 +361,6 @@ def _check_reference_column(metadata, reference_column): def _filter_associated_reference(reference_series, metadata, time_column, filter_missing_references, reference_column): used_references = reference_series[~metadata[time_column].isna()] - if used_references.isna().any(): if filter_missing_references: metadata = metadata.dropna(subset=[reference_column]) @@ -393,8 +396,8 @@ def _check_duplicate_subject_timepoint(subject_series, metadata, def _drop_incomplete_timepoints(metadata, time_column, - drop_incomplete_timepoint): - for time in drop_incomplete_timepoint: + drop_incomplete_timepoints): + for time in drop_incomplete_timepoints: try: assert (float(time) in metadata[time_column].unique()) @@ -492,7 +495,7 @@ def peds_simulation(table: pd.DataFrame, metadata: qiime2.Metadata, subject_column: str, filter_missing_references: bool = False, drop_incomplete_subjects: bool = False, - drop_incomplete_timepoint: list = None, + drop_incomplete_timepoints: list = None, num_iterations: int = 999) -> (pd.DataFrame, pd.DataFrame): metadata_df = metadata.to_dataframe() @@ -527,7 +530,8 @@ def peds_simulation(table: pd.DataFrame, metadata: qiime2.Metadata, subject_column=subject_column, filter_missing_references=filter_missing_references, drop_incomplete_subjects=drop_incomplete_subjects, - drop_incomplete_timepoint=drop_incomplete_timepoint).set_index("id") + drop_incomplete_timepoints=drop_incomplete_timepoints + ).set_index("id") actual_peds = peds["measure"] # Mismatch simulation: diff --git a/q2_fmt/plugin_setup.py b/q2_fmt/plugin_setup.py index fd19fdc..255d086 100644 --- a/q2_fmt/plugin_setup.py +++ b/q2_fmt/plugin_setup.py @@ -63,11 +63,11 @@ drop_incomplete_subjects = ('Filter out subjects that do not have a sample at' ' every timepoint. Default behavior is to raise an' ' error if any subject is missing a timepoint.') -drop_incomplete_timepoint = ('Filter out specified timepoints. This is useful' - ' for removing frequently missing timepoints' - ' which cause many subjects to be dropped.' - ' Default behavior is to raise an error if any' - ' subject is missing a timepoint.') +drop_incomplete_timepoints = ('Filter out multiple specified timepoints.' + ' This is useful for removing frequently missing' + ' timepoints which cause many subjects to be' + ' dropped. Default behavior is to raise an error' + ' if any subject is missing a timepoint.') level_delimiter = 'delimiter to split taxonomic label on' control_column = ('The column within `metadata` that contains any relevant' ' control group IDs. Actual treatment samples should not' @@ -233,7 +233,7 @@ 'subject_column': Str, 'filter_missing_references': Bool, 'drop_incomplete_subjects': Bool, - 'drop_incomplete_timepoint': List[Str], + 'drop_incomplete_timepoints': List[Str], 'level_delimiter': Str}, outputs=[('peds_heatmap', Visualization)], input_descriptions={'table': peds_table}, @@ -245,7 +245,7 @@ 'subject_column': subject_column, 'filter_missing_references': filter_missing_references, 'drop_incomplete_subjects': drop_incomplete_subjects, - 'drop_incomplete_timepoint': drop_incomplete_timepoint, + 'drop_incomplete_timepoints': drop_incomplete_timepoints, 'level_delimiter': level_delimiter}, output_descriptions={'peds_heatmap': 'PEDS heatmap visualization'}, name='PEDS pipeline to calculate feature or sample PEDS', @@ -276,7 +276,7 @@ 'reference_column': Str, 'subject_column': Str, 'filter_missing_references': Bool, 'drop_incomplete_subjects': Bool, - 'drop_incomplete_timepoint': List[Str]}, + 'drop_incomplete_timepoints': List[Str]}, outputs=[('peds_dists', Dist1D[Ordered, Matched] % Properties("peds"))], input_descriptions={'table': peds_table}, parameter_descriptions={ @@ -286,7 +286,7 @@ 'subject_column': subject_column, 'filter_missing_references': filter_missing_references, 'drop_incomplete_subjects': drop_incomplete_subjects, - 'drop_incomplete_timepoint': drop_incomplete_timepoint + 'drop_incomplete_timepoints': drop_incomplete_timepoints }, output_descriptions={ 'peds_dists': peds_dists @@ -337,7 +337,7 @@ 'subject_column': T_subject, 'filter_missing_references': Bool, 'drop_incomplete_subjects': Bool, - 'drop_incomplete_timepoint': List[Str], + 'drop_incomplete_timepoints': List[Str], 'num_iterations': Int % Range(99, None)}, outputs=[('per_subject_stats', StatsTable[Pairwise]), ('global_stats', StatsTable[Pairwise])], @@ -348,7 +348,7 @@ 'subject_column': subject_column, 'filter_missing_references': filter_missing_references, 'drop_incomplete_subjects': drop_incomplete_subjects, - 'drop_incomplete_timepoint': drop_incomplete_timepoint, + 'drop_incomplete_timepoints': drop_incomplete_timepoints, 'num_iterations': 'The number of iterations to run the Monte Carlo' ' simulation on' }, diff --git a/q2_fmt/tests/test_engraftment.py b/q2_fmt/tests/test_engraftment.py index 8e94332..619e776 100644 --- a/q2_fmt/tests/test_engraftment.py +++ b/q2_fmt/tests/test_engraftment.py @@ -782,9 +782,9 @@ def test_incomplete_timepoints_with_flag(self): 'donor1', 'donor2'], 'Ref': ['donor1', 'donor1', 'donor1', 'donor2', np.nan, np.nan], - 'subject': ['sub1', 'sub1', 'sub1', 'sub2', np.nan, + 'subject': ['sub1', 'sub1', 'sub2', 'sub2', np.nan, np.nan], - 'group': [1, 2, 3, 2, np.nan, + 'group': [1, 2, 2, 3, np.nan, np.nan]}).set_index('id') metadata = Metadata(metadata_df) table_df = pd.DataFrame({ @@ -797,16 +797,16 @@ def test_incomplete_timepoints_with_flag(self): time_column="group", reference_column="Ref", subject_column="subject", - drop_incomplete_subjects=True) + drop_incomplete_timepoints=[1, 3]) exp_peds_df = pd.DataFrame({ - 'id': ['sample1', 'sample2', 'sample3'], - 'measure': [0.666667, 0.333333, 1], - 'transfered_donor_features': [2, 1, 3], - 'total_donor_features': [3, 3, 3], - 'donor': ["donor1", "donor1", "donor1"], - 'subject': ["sub1", "sub1", "sub1"], - 'group': [1.0, 2.0, 3.0] + 'id': ['sample2', 'sample3'], + 'measure': [0.333333, 1], + 'transfered_donor_features': [1, 3], + 'total_donor_features': [3, 3], + 'donor': ["donor1", "donor1"], + 'subject': ["sub1", "sub2"], + 'group': [2.0, 2.0] }) pd.testing.assert_frame_equal(sample_peds_df, exp_peds_df)