Merge remote-tracking branch 'origin/main' into feature/no-ref/add-scPRINT

* origin/main:
  fix repository name
  Migrate from core to openproblems (#11)
  fix broken urls
  fix wf
  Pre-filter batches in hvg overlap metric (#9)
  Adjust resources (#10)
  Add scGPT (#8)
  Add UCE method (#7)
  Add Geneformer (#6)
  fix submodule
  Update common submodule
lazappi committed Nov 21, 2024
2 parents c9e8a74 + 6dc0f1d commit 5ece7e2
Showing 22 changed files with 685 additions and 47 deletions.
5 changes: 2 additions & 3 deletions _viash.yaml
@@ -86,8 +86,7 @@ config_mods: |
   .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
 repositories:
-  - name: core
+  - name: openproblems
     type: github
-    repo: openproblems-bio/core
+    repo: openproblems-bio/openproblems
     tag: build/main
-    path: viash/core
2 changes: 1 addition & 1 deletion common (submodule pointer update)
1 change: 1 addition & 0 deletions scripts/run_benchmark/run_full_local.sh
@@ -26,6 +26,7 @@ input_states: resources/datasets/**/state.yaml
 rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
+settings: '{"methods_exclude": ["uce"]}'
 HERE

 # run the benchmark
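The added settings entry passes extra options to the workflow as JSON, here excluding the UCE method from the run; the test script below gains the same line. As a minimal sketch of the intended effect (the method list and the filtering step are assumptions for illustration, the real filtering happens inside the Nextflow workflow):

import json

# Sketch only: drop methods listed in "methods_exclude" before running the
# benchmark. The method names here are illustrative.
settings = json.loads('{"methods_exclude": ["uce"]}')
methods = ["batchelor_fastmnn", "batchelor_mnn_correct", "geneformer", "scgpt", "uce"]
methods_to_run = [m for m in methods if m not in settings["methods_exclude"]]
print(methods_to_run)
# ['batchelor_fastmnn', 'batchelor_mnn_correct', 'geneformer', 'scgpt']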
1 change: 1 addition & 0 deletions scripts/run_benchmark/run_test_local.sh
@@ -21,6 +21,7 @@ input_states: resources_test/task_batch_integration/**/state.yaml
 rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
+settings: '{"methods_exclude": ["uce"]}'
 HERE

 nextflow run . \
2 changes: 1 addition & 1 deletion scripts/run_benchmark/run_test_seqeracloud.sh
@@ -13,7 +13,7 @@ cat > /tmp/params.yaml << 'HERE'
 input_states: s3://openproblems-data/resources_test/task_batch_integration/**/state.yaml
 rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
 output_state: "state.yaml"
-publish_dir: s3://openproblems-nextflow/temp/task_batch_integration/
+publish_dir: s3://openproblems-work/temp/task_batch_integration/
 HERE

 tw launch https://github.com/openproblems-bio/task_batch_integration.git \
2 changes: 1 addition & 1 deletion src/methods/batchelor_fastmnn/config.vsh.yaml
@@ -17,7 +17,7 @@ references:
   # Nat Biotechnol 36, 421–427 (2018). https://doi.org/10.1038/nbt.4091
   doi: 10.1038/nbt.4091
 links:
-  repository: https://code.bioconductor.org/browse/batchelor/
+  repository: https://github.com/LTLA/batchelor
   documentation: https://bioconductor.org/packages/batchelor/
 info:
   method_types: [embedding]
2 changes: 1 addition & 1 deletion src/methods/batchelor_mnn_correct/config.vsh.yaml
@@ -11,7 +11,7 @@ references:
   # Nat Biotechnol 36, 421–427 (2018). https://doi.org/10.1038/nbt.4091
   doi: 10.1038/nbt.4091
 links:
-  repository: https://code.bioconductor.org/browse/batchelor/
+  repository: https://github.com/LTLA/batchelor
   documentation: https://bioconductor.org/packages/batchelor/
 info:
   method_types: [feature]
58 changes: 58 additions & 0 deletions src/methods/geneformer/config.vsh.yaml
@@ -0,0 +1,58 @@
__merge__: /src/api/base_method.yaml

name: geneformer
label: Geneformer
summary: Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes
description: |
  Geneformer is a foundation transformer model pretrained on a large-scale
  corpus of single cell transcriptomes to enable context-aware predictions in
  network biology. For this task, Geneformer is used to create a batch-corrected
  cell embedding.
references:
  doi:
    - 10.1038/s41586-023-06139-9
    - 10.1101/2024.08.16.608180
links:
  documentation: https://geneformer.readthedocs.io/en/latest/index.html
  repository: https://huggingface.co/ctheodoris/Geneformer

info:
  preferred_normalization: counts
  method_types: [embedding]
  variants:
    geneformer_12L_95M_i4096:
      model: "gf-12L-95M-i4096"
    geneformer_6L_30M_i2048:
      model: "gf-6L-30M-i2048"
    geneformer_12L_30M_i2048:
      model: "gf-12L-30M-i2048"
    geneformer_20L_95M_i4096:
      model: "gf-20L-95M-i4096"

arguments:
  - name: "--model"
    type: "string"
    description: String representing the Geneformer model to use
    choices: ["gf-6L-30M-i2048", "gf-12L-30M-i2048", "gf-12L-95M-i4096", "gf-20L-95M-i4096"]
    default: "gf-12L-95M-i4096"

resources:
  - type: python_script
    path: script.py
  - path: /src/utils/read_anndata_partial.py

engines:
  - type: docker
    image: openproblems/base_pytorch_nvidia:1.0.0
    setup:
      - type: python
        pip:
          - pyarrow<15.0.0a0,>=14.0.1
          - huggingface_hub
          - git+https://huggingface.co/ctheodoris/Geneformer.git

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu, gpu]
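The model names follow the convention gf-<layers>-<pretraining corpus>-i<input size>, which script.py below parses to select the matching dictionary files. A standalone sketch of that parse, mirroring the script's logic:

def parse_model_name(model: str) -> dict:
    """Split a Geneformer model name such as 'gf-12L-95M-i4096' into its parts."""
    _, layers, dataset, input_size = model.split("-")
    return {
        "layers": layers,                   # e.g. '12L', twelve transformer layers
        "dataset": dataset,                 # e.g. '95M', the pretraining corpus size
        "input_size": int(input_size[1:]),  # strip the leading 'i', e.g. 4096
    }

print(parse_model_name("gf-6L-30M-i2048"))
# {'layers': '6L', 'dataset': '30M', 'input_size': 2048}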
154 changes: 154 additions & 0 deletions src/methods/geneformer/script.py
@@ -0,0 +1,154 @@
import os
import sys
from tempfile import TemporaryDirectory

import anndata as ad
import numpy as np
import pandas as pd
from geneformer import EmbExtractor, TranscriptomeTokenizer
from huggingface_hub import hf_hub_download

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
    "output": "output.h5ad",
    "model": "gf-12L-95M-i4096",
}
meta = {"name": "geneformer"}
## VIASH END

n_processors = os.cpu_count()

print(">>> Reading input...", flush=True)
sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata

adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns")

if adata.uns["dataset_organism"] != "homo_sapiens":
    raise ValueError(
        f"Geneformer can only be used with human data "
        f"(dataset_organism == '{adata.uns['dataset_organism']}')"
    )

is_ensembl = all(var_name.startswith("ENSG") for var_name in adata.var_names)
if not is_ensembl:
    raise ValueError("Geneformer requires adata.var_names to contain Ensembl gene IDs")

print(f">>> Getting settings for model '{par['model']}'...", flush=True)
model_split = par["model"].split("-")
model_details = {
    "layers": model_split[1],
    "dataset": model_split[2],
    "input_size": int(model_split[3][1:]),
}
print(model_details, flush=True)

print(">>> Getting model dictionary files...", flush=True)
if model_details["dataset"] == "95M":
    dictionaries_subfolder = "geneformer"
elif model_details["dataset"] == "30M":
    dictionaries_subfolder = "geneformer/gene_dictionaries_30m"
else:
    raise ValueError(f"Invalid model dataset: {model_details['dataset']}")
print(f"Dictionaries subfolder: '{dictionaries_subfolder}'", flush=True)

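# Download the four pickled dictionaries used for tokenization: the Ensembl ID
# mapping, the per-gene non-zero median expression values, the gene name to ID
# mapping and the token vocabulary. Each model generation has its own set.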
dictionary_files = {
    "ensembl_mapping": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=dictionaries_subfolder,
        filename=f"ensembl_mapping_dict_gc{model_details['dataset']}.pkl",
    ),
    "gene_median": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=dictionaries_subfolder,
        filename=f"gene_median_dictionary_gc{model_details['dataset']}.pkl",
    ),
    "gene_name_id": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=dictionaries_subfolder,
        filename=f"gene_name_id_dict_gc{model_details['dataset']}.pkl",
    ),
    "token": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=dictionaries_subfolder,
        filename=f"token_dictionary_gc{model_details['dataset']}.pkl",
    ),
}

print(">>> Creating working directory...", flush=True)
work_dir = TemporaryDirectory()
input_dir = os.path.join(work_dir.name, "input")
os.makedirs(input_dir)
tokenized_dir = os.path.join(work_dir.name, "tokenized")
os.makedirs(tokenized_dir)
embedding_dir = os.path.join(work_dir.name, "embedding")
os.makedirs(embedding_dir)
print(f"Working directory: '{work_dir.name}'", flush=True)

print(">>> Preparing data...", flush=True)
adata.var["ensembl_id"] = adata.var_names
adata.obs["n_counts"] = np.ravel(adata.X.sum(axis=1))
adata.write_h5ad(os.path.join(input_dir, "input.h5ad"))
print(adata)

print(">>> Tokenizing data...", flush=True)
special_token = model_details["dataset"] == "95M"
print(f"Input size: {model_details['input_size']}, Special token: {special_token}")
tokenizer = TranscriptomeTokenizer(
nproc=n_processors,
model_input_size=model_details["input_size"],
special_token=special_token,
gene_median_file=dictionary_files["gene_median"],
token_dictionary_file=dictionary_files["token"],
gene_mapping_file=dictionary_files["ensembl_mapping"],
)
tokenizer.tokenize_data(input_dir, tokenized_dir, "tokenized", file_format="h5ad")

print(f">>> Getting model files for model '{par['model']}'...", flush=True)
model_files = {
    "model": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=par["model"],
        filename="model.safetensors",
    ),
    "config": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=par["model"],
        filename="config.json",
    ),
}
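# Both files come from the same repo subfolder, so hf_hub_download caches them
# in one local snapshot folder; its path serves as the model directory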
model_dir = os.path.dirname(model_files["model"])

print(">>> Extracting embeddings...", flush=True)
embedder = EmbExtractor(
    emb_mode="cell", max_ncells=None, token_dictionary_file=dictionary_files["token"]
)
embedder.extract_embs(
    model_dir,
    os.path.join(tokenized_dir, "tokenized.dataset"),
    embedding_dir,
    "embedding",
)
embedding = pd.read_csv(os.path.join(embedding_dir, "embedding.csv")).to_numpy()

print(">>> Storing outputs...", flush=True)
output = ad.AnnData(
    obs=adata.obs[[]],
    var=adata.var[[]],
    obsm={
        "X_emb": embedding,
    },
    uns={
        "dataset_id": adata.uns["dataset_id"],
        "normalization_id": adata.uns["normalization_id"],
        "method_id": meta["name"],
    },
)
print(output)

print(">>> Writing output AnnData to file...", flush=True)
output.write_h5ad(par["output"], compression="gzip")
print(">>> Done!")
59 changes: 59 additions & 0 deletions src/methods/scgpt/config.vsh.yaml
@@ -0,0 +1,59 @@
__merge__: ../../api/comp_method.yaml

name: scgpt
label: scGPT
summary: "A foundation model for single-cell biology"
description: |
  scGPT is a foundation model for single-cell biology based on a generative
  pre-trained transformer and trained on a repository of over 33 million cells.
  Here, we use zero-shot output from a pre-trained model to get an integrated
  embedding for the batch integration task.
references:
  doi:
    - 10.1038/s41592-024-02201-0
links:
  documentation: https://scgpt.readthedocs.io/en/latest/
  repository: https://github.com/bowang-lab/scGPT

info:
  method_types: [embedding]
  preferred_normalization: counts
  variants:
    scgpt_default:
    scgpt_cp:
      model: "scGPT_CP"

arguments:
  - name: --model
    type: string
    description: String giving the scGPT model to use
    choices: ["scGPT_human", "scGPT_CP"]
    default: "scGPT_human"
  - name: --n_hvg
    type: integer
    default: 3000
    description: Number of highly variable genes to use.

resources:
  - type: python_script
    path: script.py
  - path: /src/utils/read_anndata_partial.py

engines:
  - type: docker
    image: openproblems/base_pytorch_nvidia:1.0.0
    # TODO: Try to find working installation of flash attention (flash-attn<1.0.5)
    setup:
      - type: python
        pypi:
          - gdown
          - scgpt # Install from PyPI to get dependencies
      - type: docker
        # Force re-installing from GitHub to get bug fixes
        run: pip install --upgrade --no-deps --force-reinstall git+https://github.com/bowang-lab/scGPT.git

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu, gpu]
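For context, zero-shot embedding with scGPT is typically done through scgpt.tasks.embed_data. The sketch below is an assumption-laden illustration: the argument names, the output slot and the checkpoint path may differ between scGPT releases, and the checkpoint must be downloaded beforehand (for example with gdown, as in the Docker setup above):

import anndata as ad
import scgpt

# Sketch only: the signature and output slot of scgpt.tasks.embed_data are
# assumed and may differ between scGPT versions.
adata = ad.read_h5ad("dataset.h5ad")
embedded = scgpt.tasks.embed_data(
    adata,
    model_dir="scGPT_human",  # assumed path to a downloaded checkpoint
    gene_col="feature_name",  # assumed adata.var column holding gene symbols
    batch_size=64,
)
adata.obsm["X_emb"] = embedded.obsm["X_scGPT"]  # assumed output slot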