Merge remote-tracking branch 'origin/main' into feature/no-ref/add-scPRINT

* origin/main:
  fix repository name
  Migrate from core to openproblems (#11)
  fix broken urls
  fix wf
  Pre-filter batches in hvg overlap metric (#9)
  Adjust resources (#10)
  Add scGPT (#8)
  Add UCE method (#7)
  Add Geneformer (#6)
  fix submodule
  Update common submodule
lazappi committed Nov 21, 2024
2 parents c9e8a74 + 6dc0f1d commit 5ece7e2
Showing 22 changed files with 685 additions and 47 deletions.
5 changes: 2 additions & 3 deletions _viash.yaml
@@ -86,8 +86,7 @@ config_mods: |
   .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
 repositories:
-  - name: core
+  - name: openproblems
     type: github
-    repo: openproblems-bio/core
+    repo: openproblems-bio/openproblems
     tag: build/main
-    path: viash/core
2 changes: 1 addition & 1 deletion common (submodule pointer update)
1 change: 1 addition & 0 deletions scripts/run_benchmark/run_full_local.sh
@@ -26,6 +26,7 @@ input_states: resources/datasets/**/state.yaml
 rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
+settings: '{"methods_exclude": ["uce"]}'
 HERE

 # run the benchmark
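The added settings entry passes extra options to the workflow as JSON, here excluding the UCE method from the run; the test script below gains the same line. As a minimal sketch of the intended effect (the method list and the filtering step are assumptions for illustration, the real filtering happens inside the Nextflow workflow):

import json

# Sketch only: drop methods listed in "methods_exclude" before running the
# benchmark. The method names here are illustrative.
settings = json.loads('{"methods_exclude": ["uce"]}')
methods = ["batchelor_fastmnn", "batchelor_mnn_correct", "geneformer", "scgpt", "uce"]
methods_to_run = [m for m in methods if m not in settings["methods_exclude"]]
print(methods_to_run)
# ['batchelor_fastmnn', 'batchelor_mnn_correct', 'geneformer', 'scgpt']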
1 change: 1 addition & 0 deletions scripts/run_benchmark/run_test_local.sh
@@ -21,6 +21,7 @@ input_states: resources_test/task_batch_integration/**/state.yaml
 rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
+settings: '{"methods_exclude": ["uce"]}'
 HERE

 nextflow run . \
2 changes: 1 addition & 1 deletion scripts/run_benchmark/run_test_seqeracloud.sh
@@ -13,7 +13,7 @@ cat > /tmp/params.yaml << 'HERE'
 input_states: s3://openproblems-data/resources_test/task_batch_integration/**/state.yaml
 rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
 output_state: "state.yaml"
-publish_dir: s3://openproblems-nextflow/temp/task_batch_integration/
+publish_dir: s3://openproblems-work/temp/task_batch_integration/
 HERE

 tw launch https://github.com/openproblems-bio/task_batch_integration.git \
2 changes: 1 addition & 1 deletion src/methods/batchelor_fastmnn/config.vsh.yaml
@@ -17,7 +17,7 @@ references:
   # Nat Biotechnol 36, 421–427 (2018). https://doi.org/10.1038/nbt.4091
   doi: 10.1038/nbt.4091
 links:
-  repository: https://code.bioconductor.org/browse/batchelor/
+  repository: https://github.com/LTLA/batchelor
   documentation: https://bioconductor.org/packages/batchelor/
 info:
   method_types: [embedding]
2 changes: 1 addition & 1 deletion src/methods/batchelor_mnn_correct/config.vsh.yaml
@@ -11,7 +11,7 @@ references:
   # Nat Biotechnol 36, 421–427 (2018). https://doi.org/10.1038/nbt.4091
   doi: 10.1038/nbt.4091
 links:
-  repository: https://code.bioconductor.org/browse/batchelor/
+  repository: https://github.com/LTLA/batchelor
   documentation: https://bioconductor.org/packages/batchelor/
 info:
   method_types: [feature]
58 changes: 58 additions & 0 deletions src/methods/geneformer/config.vsh.yaml
@@ -0,0 +1,58 @@
__merge__: /src/api/base_method.yaml

name: geneformer
label: Geneformer
summary: Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes
description: |
  Geneformer is a foundation transformer model pretrained on a large-scale
  corpus of single cell transcriptomes to enable context-aware predictions in
  network biology. For this task, Geneformer is used to create a batch-corrected
  cell embedding.
references:
  doi:
    - 10.1038/s41586-023-06139-9
    - 10.1101/2024.08.16.608180
links:
  documentation: https://geneformer.readthedocs.io/en/latest/index.html
  repository: https://huggingface.co/ctheodoris/Geneformer

info:
  preferred_normalization: counts
  method_types: [embedding]
  variants:
    geneformer_12L_95M_i4096:
      model: "gf-12L-95M-i4096"
    geneformer_6L_30M_i2048:
      model: "gf-6L-30M-i2048"
    geneformer_12L_30M_i2048:
      model: "gf-12L-30M-i2048"
    geneformer_20L_95M_i4096:
      model: "gf-20L-95M-i4096"

arguments:
  - name: "--model"
    type: "string"
    description: String representing the Geneformer model to use
    choices: ["gf-6L-30M-i2048", "gf-12L-30M-i2048", "gf-12L-95M-i4096", "gf-20L-95M-i4096"]
    default: "gf-12L-95M-i4096"

resources:
  - type: python_script
    path: script.py
  - path: /src/utils/read_anndata_partial.py

engines:
  - type: docker
    image: openproblems/base_pytorch_nvidia:1.0.0
    setup:
      - type: python
        pip:
          - pyarrow<15.0.0a0,>=14.0.1
          - huggingface_hub
          - git+https://huggingface.co/ctheodoris/Geneformer.git

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu, gpu]
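The model names follow the convention gf-<layers>-<pretraining corpus>-i<input size>, which script.py below parses to select the matching dictionary files. A standalone sketch of that parse, mirroring the script's logic:

def parse_model_name(model: str) -> dict:
    """Split a Geneformer model name such as 'gf-12L-95M-i4096' into its parts."""
    _, layers, dataset, input_size = model.split("-")
    return {
        "layers": layers,                   # e.g. '12L', twelve transformer layers
        "dataset": dataset,                 # e.g. '95M', the pretraining corpus size
        "input_size": int(input_size[1:]),  # strip the leading 'i', e.g. 4096
    }

print(parse_model_name("gf-6L-30M-i2048"))
# {'layers': '6L', 'dataset': '30M', 'input_size': 2048}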
154 changes: 154 additions & 0 deletions src/methods/geneformer/script.py
@@ -0,0 +1,154 @@
import os
import sys
from tempfile import TemporaryDirectory

import anndata as ad
import numpy as np
import pandas as pd
from geneformer import EmbExtractor, TranscriptomeTokenizer
from huggingface_hub import hf_hub_download

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
    "output": "output.h5ad",
    "model": "gf-12L-95M-i4096",
}
meta = {"name": "geneformer"}
## VIASH END

n_processors = os.cpu_count()

print(">>> Reading input...", flush=True)
sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata

adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns")

if adata.uns["dataset_organism"] != "homo_sapiens":
    raise ValueError(
        f"Geneformer can only be used with human data "
        f"(dataset_organism == '{adata.uns['dataset_organism']}')"
    )

is_ensembl = all(var_name.startswith("ENSG") for var_name in adata.var_names)
if not is_ensembl:
    raise ValueError("Geneformer requires adata.var_names to contain Ensembl gene IDs")

print(f">>> Getting settings for model '{par['model']}'...", flush=True)
model_split = par["model"].split("-")
model_details = {
    "layers": model_split[1],
    "dataset": model_split[2],
    "input_size": int(model_split[3][1:]),
}
print(model_details, flush=True)

print(">>> Getting model dictionary files...", flush=True)
if model_details["dataset"] == "95M":
    dictionaries_subfolder = "geneformer"
elif model_details["dataset"] == "30M":
    dictionaries_subfolder = "geneformer/gene_dictionaries_30m"
else:
    raise ValueError(f"Invalid model dataset: {model_details['dataset']}")
print(f"Dictionaries subfolder: '{dictionaries_subfolder}'", flush=True)

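# Download the four pickled dictionaries used for tokenization: the Ensembl ID
# mapping, the per-gene non-zero median expression values, the gene name to ID
# mapping and the token vocabulary. Each model generation has its own set.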
dictionary_files = {
    "ensembl_mapping": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=dictionaries_subfolder,
        filename=f"ensembl_mapping_dict_gc{model_details['dataset']}.pkl",
    ),
    "gene_median": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=dictionaries_subfolder,
        filename=f"gene_median_dictionary_gc{model_details['dataset']}.pkl",
    ),
    "gene_name_id": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=dictionaries_subfolder,
        filename=f"gene_name_id_dict_gc{model_details['dataset']}.pkl",
    ),
    "token": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=dictionaries_subfolder,
        filename=f"token_dictionary_gc{model_details['dataset']}.pkl",
    ),
}

print(">>> Creating working directory...", flush=True)
work_dir = TemporaryDirectory()
input_dir = os.path.join(work_dir.name, "input")
os.makedirs(input_dir)
tokenized_dir = os.path.join(work_dir.name, "tokenized")
os.makedirs(tokenized_dir)
embedding_dir = os.path.join(work_dir.name, "embedding")
os.makedirs(embedding_dir)
print(f"Working directory: '{work_dir.name}'", flush=True)

print(">>> Preparing data...", flush=True)
adata.var["ensembl_id"] = adata.var_names
adata.obs["n_counts"] = np.ravel(adata.X.sum(axis=1))
adata.write_h5ad(os.path.join(input_dir, "input.h5ad"))
print(adata)

print(">>> Tokenizing data...", flush=True)
special_token = model_details["dataset"] == "95M"
print(f"Input size: {model_details['input_size']}, Special token: {special_token}")
tokenizer = TranscriptomeTokenizer(
nproc=n_processors,
model_input_size=model_details["input_size"],
special_token=special_token,
gene_median_file=dictionary_files["gene_median"],
token_dictionary_file=dictionary_files["token"],
gene_mapping_file=dictionary_files["ensembl_mapping"],
)
tokenizer.tokenize_data(input_dir, tokenized_dir, "tokenized", file_format="h5ad")

print(f">>> Getting model files for model '{par['model']}'...", flush=True)
model_files = {
    "model": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=par["model"],
        filename="model.safetensors",
    ),
    "config": hf_hub_download(
        repo_id="ctheodoris/Geneformer",
        subfolder=par["model"],
        filename="config.json",
    ),
}
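# Both files come from the same repo subfolder, so hf_hub_download caches them
# in one local snapshot folder; its path serves as the model directory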
model_dir = os.path.dirname(model_files["model"])

print(">>> Extracting embeddings...", flush=True)
embedder = EmbExtractor(
    emb_mode="cell", max_ncells=None, token_dictionary_file=dictionary_files["token"]
)
embedder.extract_embs(
    model_dir,
    os.path.join(tokenized_dir, "tokenized.dataset"),
    embedding_dir,
    "embedding",
)
embedding = pd.read_csv(os.path.join(embedding_dir, "embedding.csv")).to_numpy()

print(">>> Storing outputs...", flush=True)
output = ad.AnnData(
    obs=adata.obs[[]],
    var=adata.var[[]],
    obsm={
        "X_emb": embedding,
    },
    uns={
        "dataset_id": adata.uns["dataset_id"],
        "normalization_id": adata.uns["normalization_id"],
        "method_id": meta["name"],
    },
)
print(output)

print(">>> Writing output AnnData to file...", flush=True)
output.write_h5ad(par["output"], compression="gzip")
print(">>> Done!")
59 changes: 59 additions & 0 deletions src/methods/scgpt/config.vsh.yaml
@@ -0,0 +1,59 @@
__merge__: ../../api/comp_method.yaml

name: scgpt
label: scGPT
summary: "A foundation model for single-cell biology"
description: |
  scGPT is a foundation model for single-cell biology based on a generative
  pre-trained transformer and trained on a repository of over 33 million cells.
  Here, we use zero-shot output from a pre-trained model to get an integrated
  embedding for the batch integration task.
references:
  doi:
    - 10.1038/s41592-024-02201-0
links:
  documentation: https://scgpt.readthedocs.io/en/latest/
  repository: https://github.com/bowang-lab/scGPT

info:
  method_types: [embedding]
  preferred_normalization: counts
  variants:
    scgpt_default:
    scgpt_cp:
      model: "scGPT_CP"

arguments:
  - name: --model
    type: string
    description: String giving the scGPT model to use
    choices: ["scGPT_human", "scGPT_CP"]
    default: "scGPT_human"
  - name: --n_hvg
    type: integer
    default: 3000
    description: Number of highly variable genes to use.

resources:
  - type: python_script
    path: script.py
  - path: /src/utils/read_anndata_partial.py

engines:
  - type: docker
    image: openproblems/base_pytorch_nvidia:1.0.0
    # TODO: Try to find working installation of flash attention (flash-attn<1.0.5)
    setup:
      - type: python
        pypi:
          - gdown
          - scgpt # Install from PyPI to get dependencies
      - type: docker
        # Force re-installing from GitHub to get bug fixes
        run: pip install --upgrade --no-deps --force-reinstall git+https://github.com/bowang-lab/scGPT.git

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu, gpu]
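For context, zero-shot embedding with scGPT is typically done through scgpt.tasks.embed_data. The sketch below is an assumption-laden illustration: the argument names, the output slot and the checkpoint path may differ between scGPT releases, and the checkpoint must be downloaded beforehand (for example with gdown, as in the Docker setup above):

import anndata as ad
import scgpt

# Sketch only: the signature and output slot of scgpt.tasks.embed_data are
# assumed and may differ between scGPT versions.
adata = ad.read_h5ad("dataset.h5ad")
embedded = scgpt.tasks.embed_data(
    adata,
    model_dir="scGPT_human",  # assumed path to a downloaded checkpoint
    gene_col="feature_name",  # assumed adata.var column holding gene symbols
    batch_size=64,
)
adata.obsm["X_emb"] = embedded.obsm["X_scGPT"]  # assumed output slot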