
Commit dbde5f4
Merge branch 'master' into feature/setup-pyproject-toml
kenibrewer committed Aug 7, 2023
2 parents bc1c819 + c4de0a9
Showing 10 changed files with 105 additions and 25 deletions.
9 changes: 6 additions & 3 deletions pycytominer/cyto_utils/features.py
@@ -152,8 +152,11 @@ def drop_outlier_features(
         DataFrame that includes metadata and observation features.
     features : list of str or str, default "infer"
         Features present in the population dataframe. If "infer", then assume Cell Painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_"
-    samples : list of str or str, default "all"
-        Samples to perform the operation on
+    samples : str, default "all"
+        Query string to select the samples to perform the operation on; it is
+        passed to pd.DataFrame.query(), so structure it accordingly. An example
+        is "Metadata_treatment == 'control'" (include all quotes).
+        If "all", use all samples to calculate.
     outlier_cutoff : int or float, default 500
         see https://github.com/cytomining/pycytominer/issues/237 for details.
         Threshold to remove features if the absolute value is greater
@@ -166,7 +169,7 @@ def drop_outlier_features(

     # Subset dataframe
     if samples != "all":
-        population_df = population_df.loc[samples, :]
+        population_df.query(samples, inplace=True)

     if features == "infer":
         features = infer_cp_features(population_df)
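
To make the new interface concrete, here is a minimal usage sketch of drop_outlier_features with a query-string samples argument. The toy DataFrame is hypothetical; the import path simply mirrors the file path shown above.

import pandas as pd
from pycytominer.cyto_utils.features import drop_outlier_features

# Hypothetical toy profiles: metadata plus Cell Painting-style feature columns
toy_df = pd.DataFrame(
    {
        "Metadata_treatment": ["control", "control", "drug", "drug"],
        "Cells_x": [900, 2, 1, 1],
        "Nuclei_y": [3, 4, 2, 5],
    }
)

# Only rows matching the query are screened for outliers, so Cells_x
# (|900| > 500 within the control subset) is flagged for removal
dropped = drop_outlier_features(
    toy_df, samples="Metadata_treatment == 'control'", outlier_cutoff=500
)
print(dropped)  # ['Cells_x']
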
9 changes: 6 additions & 3 deletions pycytominer/operations/correlation_threshold.py
@@ -25,8 +25,11 @@ def correlation_threshold(
         List of features present in the population dataframe [default: "infer"]
         if "infer", then assume cell painting features are those that start with
         "Cells_", "Nuclei_", or "Cytoplasm_".
-    samples : list or str, default "all"
-        List of samples to perform operation on. If "all", use all samples to calculate.
+    samples : str, default "all"
+        Query string to select the samples to perform the operation on; it is
+        passed to pd.DataFrame.query(), so structure it accordingly. An example
+        is "Metadata_treatment == 'control'" (include all quotes).
+        If "all", use all samples to calculate.
     threshold - float, default 0.9
         Must be between (0, 1) to exclude features
     method - str, default "pearson"
@@ -45,7 +48,7 @@

     # Subset dataframe and calculate correlation matrix across subset features
     if samples != "all":
-        population_df = population_df.loc[samples, :]
+        population_df.query(samples, inplace=True)

     if features == "infer":
         features = infer_cp_features(population_df)
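
The same substitution recurs in every file in this commit: positional .loc indexing with a list of samples is replaced by an expression passed to pd.DataFrame.query(). A pandas-only sketch of the before/after semantics on toy data; note that query(..., inplace=True), as used above, filters the DataFrame object itself.

import pandas as pd

df = pd.DataFrame(
    {"Metadata_sample": ["sample_0", "sample_1", "sample_2"], "x": [1, 2, 3]}
)

# Old style: subset rows by index labels
subset_old = df.loc[[0, 2], :]

# New style: subset rows by a query expression
subset_new = df.query("Metadata_sample != 'sample_1'")

# Both keep rows 0 and 2
assert subset_old.equals(subset_new)

# As used in this commit: with inplace=True the DataFrame is filtered in place
df.query("Metadata_sample != 'sample_1'", inplace=True)
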
9 changes: 6 additions & 3 deletions pycytominer/operations/get_na_columns.py
@@ -18,8 +18,11 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
         List of features present in the population dataframe [default: "infer"]
         if "infer", then assume cell painting features are those that start with
         "Cells_", "Nuclei_", or "Cytoplasm_".
-    samples : list or str, default "all"
-        List of samples to perform operation on. If "all", use all samples to calculate.
+    samples : str, default "all"
+        Query string to select the samples to perform the operation on; it is
+        passed to pd.DataFrame.query(), so structure it accordingly. An example
+        is "Metadata_treatment == 'control'" (include all quotes).
+        If "all", use all samples to calculate.
     cutoff : float
         Exclude features that have a certain proportion of missingness
@@ -30,7 +33,7 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
     """

     if samples != "all":
-        population_df = population_df.loc[samples, :]
+        population_df.query(samples, inplace=True)

     if features == "infer":
         features = infer_cp_features(population_df)
9 changes: 6 additions & 3 deletions pycytominer/operations/noise_removal.py
@@ -27,8 +27,11 @@ def noise_removal(
         List of features present in the population dataframe [default: "infer"]
         if "infer", then assume cell painting features are those that start with
         "Cells_", "Nuclei_", or "Cytoplasm_".
-    samples : list or str, default "all"
-        List of samples to perform operation on. If "all", use all samples to calculate.
+    samples : str, default "all"
+        Query string to select the samples to perform the operation on; it is
+        passed to pd.DataFrame.query(), so structure it accordingly. An example
+        is "Metadata_treatment == 'control'" (include all quotes).
+        If "all", use all samples to calculate.
     noise_removal_stdev_cutoff : float
         Maximum mean stdev value for a feature to be kept, with features grouped
         according to the perturbations in noise_removal_perturbation_groups.
@@ -41,7 +44,7 @@ def noise_removal(
     """
     # Subset dataframe
     if samples != "all":
-        population_df = population_df.loc[samples, :]
+        population_df.query(samples, inplace=True)

     if features == "infer":
         features = infer_cp_features(population_df)
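
For orientation, a sketch of reaching noise_removal through feature_select, mirroring the tests further down (hypothetical toy data; the cutoff keeps features whose mean within-group standard deviation is at or below it). When a samples query subsets the rows, a list-form noise_removal_perturb_groups must still match the resulting row count, which is what the new AssertionError test in tests/test_feature_select.py exercises.

import pandas as pd
from pycytominer.feature_select import feature_select

# Hypothetical toy profiles: two perturbation groups of two samples each
df = pd.DataFrame({"x": [1, 2, 8, 2], "y": [3, 3, 5, 5]})

result = feature_select(
    profiles=df,
    features=["x", "y"],
    samples="all",
    operation="noise_removal",
    noise_removal_perturb_groups=["a", "a", "b", "b"],
    noise_removal_stdev_cutoff=0.5,
)
# "x" varies strongly within its groups and should be dropped;
# "y" is constant within each group and should be kept
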
9 changes: 6 additions & 3 deletions pycytominer/operations/variance_threshold.py
@@ -21,8 +21,11 @@ def variance_threshold(
         List of features present in the population dataframe [default: "infer"]
         if "infer", then assume cell painting features are those that start with
         "Cells_", "Nuclei_", or "Cytoplasm_".
-    samples : list or str, default "all"
-        List of samples to perform operation on. If "all", use all samples to calculate.
+    samples : str, default "all"
+        Query string to select the samples to perform the operation on; it is
+        passed to pd.DataFrame.query(), so structure it accordingly. An example
+        is "Metadata_treatment == 'control'" (include all quotes).
+        If "all", use all samples to calculate.
     freq_cut : float, default 0.05
         Ratio (2nd most common feature val / most common). Must range between 0 and 1.
         Remove features lower than freq_cut. A low freq_cut will remove features
@@ -45,7 +48,7 @@

     # Subset dataframe
     if samples != "all":
-        population_df = population_df.loc[samples, :]
+        population_df.query(samples, inplace=True)

     if features == "infer":
         features = infer_cp_features(population_df)
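
The freq_cut rule documented above can be made concrete with a small sketch. The toy series is hypothetical, and the ratio computation mirrors the documented definition rather than the library's internal implementation.

import pandas as pd

# A near-constant feature: 19 zeros and a single one
feature = pd.Series([0] * 19 + [1])

counts = feature.value_counts()
freq_ratio = counts.iloc[1] / counts.iloc[0]  # 2nd most common / most common
print(round(freq_ratio, 3))  # 0.053

# With the default freq_cut=0.05 the ratio is not below the cutoff, so the
# feature survives; raising freq_cut to 0.1 would mark it for removal
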
10 changes: 8 additions & 2 deletions tests/test_cyto_utils/test_feature_drop_outlier.py
@@ -20,6 +20,8 @@
             "control",
             "control",
         ],
+        "Metadata_test_drop_me": ["no", "no", "no", "no", "yes", "no", "yes", "yes"],
+        "Metadata_test_drop_me_2": ["no", "no", "no", "no", "yes", "yes", "yes", "yes"],
         "Cells_x": [1, 2, -8, 2, 5, 5, 5, -1],
         "Cytoplasm_y": [3, -1, 7, 4, 5, -9, 6, 1],
         "Nuclei_z": [-1, 8, 2, 5, -6, 20, 2, -2],
@@ -41,11 +43,15 @@ def test_outlier_samples_15():


 def test_outlier_samples_15():
-    result = drop_outlier_features(data_df, samples=[0, 1, 2, 3, 5], outlier_cutoff=15)
+    result = drop_outlier_features(
+        data_df, samples="Metadata_test_drop_me == 'no'", outlier_cutoff=15
+    )
     expected_result = ["Cells_zz", "Nuclei_z"]
     assert sorted(result) == sorted(expected_result)

-    result = drop_outlier_features(data_df, samples=[0, 1, 2, 3], outlier_cutoff=15)
+    result = drop_outlier_features(
+        data_df, samples="Metadata_test_drop_me_2 == 'no'", outlier_cutoff=15
+    )
     expected_result = ["Cells_zz"]
     assert result == expected_result

26 changes: 26 additions & 0 deletions tests/test_feature_select.py
@@ -87,6 +87,7 @@ def test_feature_select_noise_removal():
     result1 = feature_select(
         profiles=data_df,
         features=data_df.columns.tolist(),
+        samples="all",
         operation="noise_removal",
         noise_removal_perturb_groups=data_df_groups,
         noise_removal_stdev_cutoff=2.5,
@@ -105,18 +106,22 @@ def test_feature_select_noise_removal():
         noise_removal_perturb_groups=data_df_groups,
         noise_removal_stdev_cutoff=3.5,
     )
+
     expected_result1 = data_df[["x", "y"]]
     expected_result2 = data_df[[]]
     expected_result3 = data_df[["x", "y", "z", "zz"]]
+
     pd.testing.assert_frame_equal(result1, expected_result1)
     pd.testing.assert_frame_equal(result2, expected_result2)
     pd.testing.assert_frame_equal(result3, expected_result3)

     # Test on data_unique_test_df, which has 100 rows
     data_unique_test_df_groups = []
+
     # Create a 100 element list containing 10 replicates of 10 perturbations
     for elem in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]:
         data_unique_test_df_groups.append([elem] * 10)
+
     # Unstack so it's just a single list
     data_unique_test_df_groups = [
         item for sublist in data_unique_test_df_groups for item in sublist
@@ -136,8 +141,10 @@ def test_feature_select_noise_removal():
         noise_removal_perturb_groups=data_unique_test_df_groups,
         noise_removal_stdev_cutoff=500,
     )
+
     expected_result4 = data_unique_test_df[["a", "b"]]
     expected_result5 = data_unique_test_df[["a", "b", "c", "d"]]
+
     pd.testing.assert_frame_equal(result4, expected_result4)
     pd.testing.assert_frame_equal(result5, expected_result5)

@@ -158,13 +165,16 @@ def test_feature_select_noise_removal():
         noise_removal_perturb_groups="perturb_group",
         noise_removal_stdev_cutoff=500,
     )
+
     expected_result4b = data_unique_test_df2[["a", "b", "perturb_group"]]
     expected_result5b = data_unique_test_df2[["a", "b", "c", "d", "perturb_group"]]
+
     pd.testing.assert_frame_equal(result4b, expected_result4b)
     pd.testing.assert_frame_equal(result5b, expected_result5b)

     # Test assertion errors for the user inputting the perturbation groupings
     bad_perturb_list = ["a", "a", "b", "b", "a", "a", "b"]
+
     with pytest.raises(
         AssertionError
     ):  # When the inputted perturb list doesn't match the length of the data
@@ -198,6 +208,22 @@ def test_feature_select_noise_removal():
             noise_removal_stdev_cutoff=2.5,
         )

+    with pytest.raises(
+        AssertionError
+    ):  # When the perturbation group doesn't match b/c samples argument used
+        # Add metadata_sample column
+        data_sample_id_df = data_df.assign(
+            Metadata_sample=[f"sample_{x}" for x in range(0, data_df.shape[0])]
+        )
+        feature_select(
+            profiles=data_sample_id_df,
+            features=data_df.columns.tolist(),
+            samples="Metadata_sample != 'sample_1'",
+            operation="noise_removal",
+            noise_removal_perturb_groups=data_df_groups,
+            noise_removal_stdev_cutoff=2.5,
+        )
+


 def test_feature_select_get_na_columns():
     """
10 changes: 6 additions & 4 deletions tests/test_operations/test_correlation_threshold.py
@@ -12,7 +12,6 @@
     }
 ).reset_index(drop=True)

-
 data_uncorrelated_df = pd.DataFrame(
     {
         "x": [2, 2, 2, 5, 2, 1],
@@ -62,14 +61,17 @@ def test_correlation_threshold_uncorrelated():


 def test_correlation_threshold_samples():
+    # Add metadata_sample column
+    data_sample_id_df = data_df.assign(
+        Metadata_sample=[f"sample_{x}" for x in range(0, data_df.shape[0])]
+    )
     correlation_threshold_result = correlation_threshold(
-        population_df=data_df,
+        population_df=data_sample_id_df,
         features=data_df.columns.tolist(),
-        samples=[0, 1, 3, 4, 5],
+        samples="Metadata_sample != 'sample_2'",
         threshold=0.9,
         method="pearson",
     )
-
     expected_result = ["y"]

     assert correlation_threshold_result == expected_result
12 changes: 8 additions & 4 deletions tests/test_operations/test_get_na_columns.py
@@ -45,17 +45,21 @@ def test_get_na_columns_sample():
     """
     Testing the get_na_columns pycytominer function with the samples option
     """
+    data_sample_id_df = data_df.assign(
+        Metadata_sample=[f"sample_{x}" for x in range(0, data_df.shape[0])]
+    )
     get_na_columns_result = get_na_columns(
-        population_df=data_df,
-        samples=[1, 2, 3, 4, 5],
+        population_df=data_sample_id_df,
+        samples="Metadata_sample != 'sample_0'",
         features=["x", "y", "zz"],
         cutoff=0.4,
     )
+
     assert len(get_na_columns_result) == 0

     get_na_columns_result = get_na_columns(
-        population_df=data_df,
-        samples=[1, 2, 3, 4, 5],
+        population_df=data_sample_id_df,
+        samples="Metadata_sample != 'sample_0'",
         features=["x", "y", "zz"],
         cutoff=0.1,
     )
27 changes: 27 additions & 0 deletions tests/test_operations/test_variance_threshold.py
@@ -121,3 +121,30 @@ def test_variance_threshold_featureinfer():
     expected_result = ["Cells_a"]

     assert excluded_features == expected_result
+
+
+def test_variance_threshold_samples():
+    unique_cut = 0.01
+    excluded_features = variance_threshold(
+        population_df=data_unique_test_df,
+        features=data_unique_test_df.columns.tolist(),
+        samples="all",
+        unique_cut=unique_cut,
+    )
+    expected_result = ["a"]
+
+    assert sorted(excluded_features) == sorted(expected_result)
+
+    # Add metadata_sample column
+    data_sample_id_df = data_df.assign(
+        Metadata_sample=[f"sample_{x}" for x in range(0, data_df.shape[0])]
+    )
+
+    excluded_features = variance_threshold(
+        population_df=data_sample_id_df,
+        features=data_sample_id_df.columns.tolist(),
+        samples="Metadata_sample != 'sample_5'",
+        unique_cut=unique_cut,
+    )
+    expected_result = ["a", "b"]
+    assert sorted(excluded_features) == sorted(expected_result)
