From c4de0a9a4fecbf1ad11872bb14c18d24d1b1851e Mon Sep 17 00:00:00 2001 From: Gregory Way Date: Fri, 21 Jul 2023 14:58:01 -0500 Subject: [PATCH] Make sample argument uniform for normalize and feature_select (#311) * update function docstrings * update samples to consistent function * update samples arg * update samples argument * update documentation * update samples argument * fix type in docstring * Apply suggestions from code review Co-authored-by: Dave Bunten * inplace query --------- Co-authored-by: Dave Bunten --- pycytominer/cyto_utils/features.py | 9 ++++--- .../operations/correlation_threshold.py | 9 ++++--- pycytominer/operations/get_na_columns.py | 9 ++++--- pycytominer/operations/noise_removal.py | 9 ++++--- pycytominer/operations/variance_threshold.py | 9 ++++--- .../test_feature_drop_outlier.py | 10 +++++-- pycytominer/tests/test_feature_select.py | 26 ++++++++++++++++++ .../test_correlation_threshold.py | 10 ++++--- .../test_operations/test_get_na_columns.py | 12 ++++++--- .../test_variance_threshold.py | 27 +++++++++++++++++++ 10 files changed, 105 insertions(+), 25 deletions(-) diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py index 8cfc43cd..11932b49 100644 --- a/pycytominer/cyto_utils/features.py +++ b/pycytominer/cyto_utils/features.py @@ -152,8 +152,11 @@ def drop_outlier_features( DataFrame that includes metadata and observation features. features : list of str or str, default "infer" Features present in the population dataframe. If "infer", then assume Cell Painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_" - samples : list of str or str, default "all" - Samples to perform the operation on + samples : str, default "all" + Query string selecting the samples to perform the operation on. It is passed to + pd.DataFrame.query(), so structure it as a valid query expression. An example is + "Metadata_treatment == 'control'" (include all quotes). + If "all", use all samples to calculate. 
outlier_cutoff : int or float, default 500 see https://github.com/cytomining/pycytominer/issues/237 for details. Threshold to remove features if absolute values is greater @@ -166,7 +169,7 @@ def drop_outlier_features( # Subset dataframe if samples != "all": - population_df = population_df.loc[samples, :] + population_df.query(samples, inplace=True) if features == "infer": features = infer_cp_features(population_df) diff --git a/pycytominer/operations/correlation_threshold.py b/pycytominer/operations/correlation_threshold.py index 57bbbc78..d3620f50 100644 --- a/pycytominer/operations/correlation_threshold.py +++ b/pycytominer/operations/correlation_threshold.py @@ -25,8 +25,11 @@ def correlation_threshold( List of features present in the population dataframe [default: "infer"] if "infer", then assume cell painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_". - samples : list or str, default "all" - List of samples to perform operation on. If "all", use all samples to calculate. + samples : str, default "all" + Query string selecting the samples to perform the operation on. It is passed to + pd.DataFrame.query(), so structure it as a valid query expression. An example is + "Metadata_treatment == 'control'" (include all quotes). + If "all", use all samples to calculate. 
threshold - float, default 0.9 Must be between (0, 1) to exclude features method - str, default "pearson" @@ -45,7 +48,7 @@ def correlation_threshold( # Subset dataframe and calculate correlation matrix across subset features if samples != "all": - population_df = population_df.loc[samples, :] + population_df.query(samples, inplace=True) if features == "infer": features = infer_cp_features(population_df) diff --git a/pycytominer/operations/get_na_columns.py b/pycytominer/operations/get_na_columns.py index 637277ce..ad9a4078 100644 --- a/pycytominer/operations/get_na_columns.py +++ b/pycytominer/operations/get_na_columns.py @@ -18,8 +18,11 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05): List of features present in the population dataframe [default: "infer"] if "infer", then assume cell painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_". - samples : list or str, default "all" - List of samples to perform operation on. If "all", use all samples to calculate. + samples : str, default "all" + Query string selecting the samples to perform the operation on. It is passed to + pd.DataFrame.query(), so structure it as a valid query expression. An example is + "Metadata_treatment == 'control'" (include all quotes). + If "all", use all samples to calculate. 
cutoff : float Exclude features that have a certain proportion of missingness @@ -30,7 +33,7 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05): """ if samples != "all": - population_df = population_df.loc[samples, :] + population_df.query(samples, inplace=True) if features == "infer": features = infer_cp_features(population_df) diff --git a/pycytominer/operations/noise_removal.py b/pycytominer/operations/noise_removal.py index 57ef4f06..917f463f 100644 --- a/pycytominer/operations/noise_removal.py +++ b/pycytominer/operations/noise_removal.py @@ -27,8 +27,11 @@ def noise_removal( List of features present in the population dataframe [default: "infer"] if "infer", then assume cell painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_". - samples : list or str, default "all" - List of samples to perform operation on. If "all", use all samples to calculate. + samples : str, default "all" + Query string selecting the samples to perform the operation on. It is passed to + pd.DataFrame.query(), so structure it as a valid query expression. An example is + "Metadata_treatment == 'control'" (include all quotes). + If "all", use all samples to calculate. noise_removal_stdev_cutoff : float Maximum mean stdev value for a feature to be kept, with features grouped according to the perturbations in noise_removal_perturbation_groups. 
@@ -41,7 +44,7 @@ def noise_removal( """ # Subset dataframe if samples != "all": - population_df = population_df.loc[samples, :] + population_df.query(samples, inplace=True) if features == "infer": features = infer_cp_features(population_df) diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py index 0fb93873..c569ab6c 100644 --- a/pycytominer/operations/variance_threshold.py +++ b/pycytominer/operations/variance_threshold.py @@ -21,8 +21,11 @@ def variance_threshold( List of features present in the population dataframe [default: "infer"] if "infer", then assume cell painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_". - samples : list or str, default "all" - List of samples to perform operation on. If "all", use all samples to calculate. + samples : str, default "all" + Query string selecting the samples to perform the operation on. It is passed to + pd.DataFrame.query(), so structure it as a valid query expression. An example is + "Metadata_treatment == 'control'" (include all quotes). + If "all", use all samples to calculate. freq_cut : float, default 0.05 Ratio (2nd most common feature val / most common). Must range between 0 and 1. Remove features lower than freq_cut. 
A low freq_cut will remove features @@ -45,7 +48,7 @@ def variance_threshold( # Subset dataframe if samples != "all": - population_df = population_df.loc[samples, :] + population_df.query(samples, inplace=True) if features == "infer": features = infer_cp_features(population_df) diff --git a/pycytominer/tests/test_cyto_utils/test_feature_drop_outlier.py b/pycytominer/tests/test_cyto_utils/test_feature_drop_outlier.py index 303b18b0..22d34932 100644 --- a/pycytominer/tests/test_cyto_utils/test_feature_drop_outlier.py +++ b/pycytominer/tests/test_cyto_utils/test_feature_drop_outlier.py @@ -20,6 +20,8 @@ "control", "control", ], + "Metadata_test_drop_me": ["no", "no", "no", "no", "yes", "no", "yes", "yes"], + "Metadata_test_drop_me_2": ["no", "no", "no", "no", "yes", "yes", "yes", "yes"], "Cells_x": [1, 2, -8, 2, 5, 5, 5, -1], "Cytoplasm_y": [3, -1, 7, 4, 5, -9, 6, 1], "Nuclei_z": [-1, 8, 2, 5, -6, 20, 2, -2], @@ -41,11 +43,15 @@ def test_outlier_15_cutoff(): def test_outlier_samples_15(): - result = drop_outlier_features(data_df, samples=[0, 1, 2, 3, 5], outlier_cutoff=15) + result = drop_outlier_features( + data_df, samples="Metadata_test_drop_me == 'no'", outlier_cutoff=15 + ) expected_result = ["Cells_zz", "Nuclei_z"] assert sorted(result) == sorted(expected_result) - result = drop_outlier_features(data_df, samples=[0, 1, 2, 3], outlier_cutoff=15) + result = drop_outlier_features( + data_df, samples="Metadata_test_drop_me_2 == 'no'", outlier_cutoff=15 + ) expected_result = ["Cells_zz"] assert result == expected_result diff --git a/pycytominer/tests/test_feature_select.py b/pycytominer/tests/test_feature_select.py index 39a9e35e..66072ab2 100644 --- a/pycytominer/tests/test_feature_select.py +++ b/pycytominer/tests/test_feature_select.py @@ -87,6 +87,7 @@ def test_feature_select_noise_removal(): result1 = feature_select( profiles=data_df, features=data_df.columns.tolist(), + samples="all", operation="noise_removal", noise_removal_perturb_groups=data_df_groups, 
noise_removal_stdev_cutoff=2.5, @@ -105,18 +106,22 @@ def test_feature_select_noise_removal(): noise_removal_perturb_groups=data_df_groups, noise_removal_stdev_cutoff=3.5, ) + expected_result1 = data_df[["x", "y"]] expected_result2 = data_df[[]] expected_result3 = data_df[["x", "y", "z", "zz"]] + pd.testing.assert_frame_equal(result1, expected_result1) pd.testing.assert_frame_equal(result2, expected_result2) pd.testing.assert_frame_equal(result3, expected_result3) # Test on data_unique_test_df, which has 100 rows data_unique_test_df_groups = [] + # Create a 100 element list containing 10 replicates of 10 perturbations for elem in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]: data_unique_test_df_groups.append([elem] * 10) + # Unstack so it's just a single list data_unique_test_df_groups = [ item for sublist in data_unique_test_df_groups for item in sublist @@ -136,8 +141,10 @@ def test_feature_select_noise_removal(): noise_removal_perturb_groups=data_unique_test_df_groups, noise_removal_stdev_cutoff=500, ) + expected_result4 = data_unique_test_df[["a", "b"]] expected_result5 = data_unique_test_df[["a", "b", "c", "d"]] + pd.testing.assert_frame_equal(result4, expected_result4) pd.testing.assert_frame_equal(result5, expected_result5) @@ -158,13 +165,16 @@ def test_feature_select_noise_removal(): noise_removal_perturb_groups="perturb_group", noise_removal_stdev_cutoff=500, ) + expected_result4b = data_unique_test_df2[["a", "b", "perturb_group"]] expected_result5b = data_unique_test_df2[["a", "b", "c", "d", "perturb_group"]] + pd.testing.assert_frame_equal(result4b, expected_result4b) pd.testing.assert_frame_equal(result5b, expected_result5b) # Test assertion errors for the user inputting the perturbation groupings bad_perturb_list = ["a", "a", "b", "b", "a", "a", "b"] + with pytest.raises( AssertionError ): # When the inputted perturb list doesn't match the length of the data @@ -198,6 +208,22 @@ def test_feature_select_noise_removal(): 
noise_removal_stdev_cutoff=2.5, ) + with pytest.raises( + AssertionError + ): # When the perturbation group doesn't match b/c samples argument used + # Add metadata_sample column + data_sample_id_df = data_df.assign( + Metadata_sample=[f"sample_{x}" for x in range(0, data_df.shape[0])] + ) + feature_select( + profiles=data_sample_id_df, + features=data_df.columns.tolist(), + samples="Metadata_sample != 'sample_1'", + operation="noise_removal", + noise_removal_perturb_groups=data_df_groups, + noise_removal_stdev_cutoff=2.5, + ) + def test_feature_select_get_na_columns(): """ diff --git a/pycytominer/tests/test_operations/test_correlation_threshold.py b/pycytominer/tests/test_operations/test_correlation_threshold.py index 8499c87e..1e5cf8e7 100644 --- a/pycytominer/tests/test_operations/test_correlation_threshold.py +++ b/pycytominer/tests/test_operations/test_correlation_threshold.py @@ -12,7 +12,6 @@ } ).reset_index(drop=True) - data_uncorrelated_df = pd.DataFrame( { "x": [2, 2, 2, 5, 2, 1], @@ -62,14 +61,17 @@ def test_correlation_threshold_uncorrelated(): def test_correlation_threshold_samples(): + # Add metadata_sample column + data_sample_id_df = data_df.assign( + Metadata_sample=[f"sample_{x}" for x in range(0, data_df.shape[0])] + ) correlation_threshold_result = correlation_threshold( - population_df=data_df, + population_df=data_sample_id_df, features=data_df.columns.tolist(), - samples=[0, 1, 3, 4, 5], + samples="Metadata_sample != 'sample_2'", threshold=0.9, method="pearson", ) - expected_result = ["y"] assert correlation_threshold_result == expected_result diff --git a/pycytominer/tests/test_operations/test_get_na_columns.py b/pycytominer/tests/test_operations/test_get_na_columns.py index 56b7a832..eb0725f7 100644 --- a/pycytominer/tests/test_operations/test_get_na_columns.py +++ b/pycytominer/tests/test_operations/test_get_na_columns.py @@ -45,17 +45,21 @@ def test_get_na_columns_sample(): """ Testing get_na_columns pycyominer function with samples 
option """ + data_sample_id_df = data_df.assign( + Metadata_sample=[f"sample_{x}" for x in range(0, data_df.shape[0])] + ) get_na_columns_result = get_na_columns( - population_df=data_df, - samples=[1, 2, 3, 4, 5], + population_df=data_sample_id_df, + samples="Metadata_sample != 'sample_0'", features=["x", "y", "zz"], cutoff=0.4, ) + assert len(get_na_columns_result) == 0 get_na_columns_result = get_na_columns( - population_df=data_df, - samples=[1, 2, 3, 4, 5], + population_df=data_sample_id_df, + samples="Metadata_sample != 'sample_0'", features=["x", "y", "zz"], cutoff=0.1, ) diff --git a/pycytominer/tests/test_operations/test_variance_threshold.py b/pycytominer/tests/test_operations/test_variance_threshold.py index de0d9df0..9d697a43 100644 --- a/pycytominer/tests/test_operations/test_variance_threshold.py +++ b/pycytominer/tests/test_operations/test_variance_threshold.py @@ -121,3 +121,30 @@ def test_variance_threshold_featureinfer(): expected_result = ["Cells_a"] assert excluded_features == expected_result + + +def test_variance_threshold_samples(): + unique_cut = 0.01 + excluded_features = variance_threshold( + population_df=data_unique_test_df, + features=data_unique_test_df.columns.tolist(), + samples="all", + unique_cut=unique_cut, + ) + expected_result = ["a"] + + assert sorted(excluded_features) == sorted(expected_result) + + # Add metadata_sample column + data_sample_id_df = data_df.assign( + Metadata_sample=[f"sample_{x}" for x in range(0, data_df.shape[0])] + ) + + excluded_features = variance_threshold( + population_df=data_sample_id_df, + features=data_sample_id_df.columns.tolist(), + samples="Metadata_sample != 'sample_5'", + unique_cut=unique_cut, + ) + expected_result = ["a", "b"] + assert sorted(excluded_features) == sorted(expected_result)