openpipelines-bio · DriesSchaumont · Jan 8, 2024 · Jan 9, 2024 · Jan 9, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,8 @@
 
 ## NEW FUNCTIONALITY
 
+* `filter_with_hvg`: add `output_obs_num_highly_variable_genes` argument in order to store the number of highly variable genes that were discovered.
+
 * `cluster/leiden`: Allow calculating multiple resolutions in parallel (PR #645).
 
 * `rna_multisample` workflow: added `--modality` argument (PR #607).

diff --git a/src/filter/filter_with_hvg/config.vsh.yaml b/src/filter/filter_with_hvg/config.vsh.yaml
@@ -112,6 +112,13 @@ functionality:
         For all flavors, genes are first sorted by how many batches they are a HVG. For dispersion-based flavors 
         ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across
         batches) rank based on within-batch normalized variance.
+
+    - name: "--output_obs_num_highly_variable_genes"
+      type: string
+      description: |
+        When specified, store the number of genes that were found. The value from this argument is used as the 
+        column name.
+
   resources:
     - type: python_script
       path: script.py

diff --git a/src/filter/filter_with_hvg/script.py b/src/filter/filter_with_hvg/script.py
@@ -23,6 +23,10 @@
   'layer': 'log_transformed'
 }
 
+meta = {
+    'resources_dir': "."
+}
+
 mu_in = mu.read_h5mu('resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu')
 rna_in = mu_in.mod["rna"]
 assert "filter_with_hvg" not in rna_in.var.columns
@@ -142,6 +146,9 @@ def setup_logger():
     # drop mean_bin as mudata/anndata doesn't support tuples
     data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1)
 
+if par['output_obs_num_highly_variable_genes']:
+    data.obs[par['output_obs_num_highly_variable_genes']] = out["highly_variable"].sum()
+
 if par["do_subset"]:
     keep_feats = np.ravel(data.var[par["var_name_filter"]])
     mdata.mod[mod] = data[:,keep_feats]

diff --git a/src/filter/filter_with_hvg/test.py b/src/filter/filter_with_hvg/test.py
@@ -152,6 +152,22 @@ def test_filter_with_hvg_cell_ranger_unfiltered_data_change_error_message(run_co
                      r"returned by scanpy \(see above\) could be the "
                      r"result from trying to use this component on unfiltered data\.",
                     err.value.stdout.decode('utf-8'))
+
+def test_add_number_of_highly_variable_genes_selected(run_component, lognormed_test_data_path):
+    run_component([
+        "--flavor", "seurat",
+        "--input", lognormed_test_data_path,
+        "--output", "output.h5mu",
+        "--layer", "log_transformed",
+        "--output_obs_num_highly_variable_genes", "foo",
+        "--output_compression", "gzip"])
+    assert os.path.exists("output.h5mu")
+    data = mu.read_h5mu("output.h5mu")
+    assert "filter_with_hvg" in data.mod["rna"].var.columns
+    assert "foo" in data.mod["rna"].obs.columns
+    result_col = data.mod["rna"].obs["foo"]
+    assert (result_col <= len(data.mod["rna"].var_names)).all()
+    assert (result_col > 0).any() 
 
 
 if __name__ == '__main__':