From e7a9781f1e77339a879199abf5977d777cf27ece Mon Sep 17 00:00:00 2001 From: Jalil Nourisa Date: Tue, 17 Sep 2024 19:53:18 +0200 Subject: [PATCH] scenic+ dockerfile needs update --- runs.ipynb | 36 +++++++++++++++++ scripts/sbatch/batch_scglue.sh | 6 ++- src/methods/multi_omics/scenicplus/script.py | 6 ++- src/methods/multi_omics/scglue/main.py | 22 +++++++--- src/methods/single_omics/grnboost2/script.py | 42 ++++++-------------- 5 files changed, 73 insertions(+), 39 deletions(-) diff --git a/runs.ipynb b/runs.ipynb index 44b7ed96f..c698055f8 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -3799,6 +3799,42 @@ "!bash scripts/run_grn_evaluation.sh {tag} {reg_type}" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import scanpy as sc \n", + "import anndata as ad \n", + "multiomics_rna = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n", + "multiomics_rna.layers['counts'] = multiomics_rna.X.copy()\n", + "sc.pp.normalize_total(multiomics_rna)\n", + "sc.pp.log1p(multiomics_rna)\n", + "sc.pp.scale(multiomics_rna)\n", + "multiomics_rna.layers['lognorm'] = multiomics_rna.X.copy()\n", + "multiomics_rna.X = multiomics_rna.layers['counts']\n", + "del multiomics_rna.layers['counts']\n", + "multiomics_rna.write('resources/grn-benchmark/multiomics_rna.h5ad')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "multiomics_rna = ad.read_h5ad('resources/grn-benchmark/multiomics_rna_0.h5ad')\n", + "multiomics_rna.layers['counts'] = multiomics_rna.X.copy()\n", + "sc.pp.normalize_total(multiomics_rna)\n", + "sc.pp.log1p(multiomics_rna)\n", + "sc.pp.scale(multiomics_rna)\n", + "multiomics_rna.layers['lognorm'] = multiomics_rna.X.copy()\n", + "multiomics_rna.X = multiomics_rna.layers['counts']\n", + "del multiomics_rna.layers['counts']\n", + "multiomics_rna.write('resources/grn-benchmark/multiomics_rna_0.h5ad')" + ] + }, { "cell_type": "code", "execution_count": 4, diff --git a/scripts/sbatch/batch_scglue.sh b/scripts/sbatch/batch_scglue.sh index 147ad7728..35ad825b5 100644 --- a/scripts/sbatch/batch_scglue.sh +++ b/scripts/sbatch/batch_scglue.sh @@ -5,7 +5,9 @@ #SBATCH --error=logs/%j.err #SBATCH --mail-type=END #SBATCH --mail-user=jalil.nourisa@gmail.com -#SBATCH --cpus-per-task=20 #SBATCH --mem=64G +#SBATCH --partition=gpu +#SBATCH --cpus-per-task=20 +#SBATCH --gres=gpu:1 -singularity exec ../../images/scglue python src/methods/multi_omics/scglue/script.py +singularity exec --nv ../../images/scglue python src/methods/multi_omics/scglue/script.py diff --git a/src/methods/multi_omics/scenicplus/script.py b/src/methods/multi_omics/scenicplus/script.py index a1c5cf12d..d98f057e2 100644 --- a/src/methods/multi_omics/scenicplus/script.py +++ b/src/methods/multi_omics/scenicplus/script.py @@ -82,12 +82,13 @@ def process_atac(par): print("---Run pre-process started ---", flush=True) # Create one individual ATAC-seq file per donor + adata_atac = anndata.read_h5ad(par['multiomics_atac']) fragments_dict = {} for donor_id in unique_donor_ids: filepath = os.path.join(atac_dir, f'{donor_id}.tsv') if not os.path.exists(filepath): print(f'Create tsv file {filepath}') - adata_atac = anndata.read_h5ad(par['multiomics_atac']) + adata_atac.obs.cell_type = [s.replace(' ', '_') for s in adata_atac.obs.cell_type] adata_atac.obs.donor_id = [s.replace(' ', '_') for s in adata_atac.obs.donor_id] adata_atac_one_donor = adata_atac[adata_atac.obs.donor_id == donor_id] @@ -97,7 +98,7 @@ def process_atac(par): counts = coo_matrix.data row_names = adata_atac_one_donor.obs.index[rows] coord_names = adata_atac_one_donor.var.index[cols] - del adata_atac, adata_atac_one_donor + del adata_atac_one_donor gc.collect() d = { 'chromosome': np.asarray([s.split(':')[0] for s in coord_names]), @@ -147,6 +148,7 @@ def process_atac(par): names=['Chromosome', 'End'] ) chromsizes.insert(1, 'Start', 0) + print(f'Start pseudobulking') os.makedirs(os.path.join(par['temp_dir'], 'consensus_peak_calling'), exist_ok=True) diff --git a/src/methods/multi_omics/scglue/main.py b/src/methods/multi_omics/scglue/main.py index 9d2798c77..bf0991ec7 100644 --- a/src/methods/multi_omics/scglue/main.py +++ b/src/methods/multi_omics/scglue/main.py @@ -154,7 +154,7 @@ def run_grn(par): rna[:, np.union1d(genes, tfs)].write_loom(f"{par['temp_dir']}/rna.loom") np.savetxt(f"{par['temp_dir']}/tfs.txt", tfs, fmt="%s") - # Construct the command + #Construct the command command = ['pyscenic', 'grn', f"{par['temp_dir']}/rna.loom", f"{par['temp_dir']}/tfs.txt", '-o', f"{par['temp_dir']}/draft_grn.csv", '--seed', '0', '--num_workers', f"{par['num_workers']}", @@ -171,7 +171,15 @@ def run_grn(par): print("Command executed successfully") else: print("Command failed with return code", result.returncode) +def create_prior(par): + atac = ad.read_h5ad(f"{par['temp_dir']}/atac-emb.h5ad") + rna = ad.read_h5ad(f"{par['temp_dir']}/rna-emb.h5ad") + motif_bed = scglue.genomics.read_bed(par['motif_file']) + fs = pd.Index(motif_bed["name"]).intersection(rna.var_names) + atac.var["name"] = atac.var_names + + peaks = atac.var.index print("Generate TF cis-regulatory ranking bridged by ATAC peaks", flush=True) peak_bed = scglue.genomics.Bed(atac.var.loc[peaks]) @@ -193,6 +201,7 @@ def run_grn(par): ) ### Prepare data for pruning + print("Prepare data for pruning ") gene2tf_rank_glue.columns = gene2tf_rank_glue.columns + "_glue" gene2tf_rank_supp.columns = gene2tf_rank_supp.columns + "_supp" @@ -298,11 +307,12 @@ def main(par): rna = ad.read_h5ad(par['multiomics_rna']) atac = ad.read_h5ad(par['multiomics_atac']) - print('Preprocess data', flush=True) - preprocess(rna, atac, par) - print('Train a model', flush=True) - training(par) - run_grn(par) + # print('Preprocess data', flush=True) + # preprocess(rna, atac, par) + # print('Train a model', flush=True) + # training(par) + # run_grn(par) + create_prior(par) prune_grn(par) print('Curate predictions', flush=True) pruned_grn = pd.read_csv( diff --git a/src/methods/single_omics/grnboost2/script.py b/src/methods/single_omics/grnboost2/script.py index ec9b98a7d..8aeaa5e75 100644 --- a/src/methods/single_omics/grnboost2/script.py +++ b/src/methods/single_omics/grnboost2/script.py @@ -7,8 +7,6 @@ from distributed import Client, LocalCluster from tqdm import tqdm import subprocess -import scanpy as sc - @@ -28,10 +26,7 @@ def process_links(net, par): return net # Load scRNA-seq data adata_rna = anndata.read_h5ad(par['multiomics_rna']) -print('Noramlize data') -sc.pp.normalize_total(adata_rna) -sc.pp.log1p(adata_rna) -sc.pp.scale(adata_rna) +adata_rna.X = adata_rna.layers['lognorm'] #TODO: fix this groups = adata_rna.obs.cell_type gene_names = adata_rna.var.gene_ids.index.to_numpy() X = adata_rna.X @@ -40,22 +35,21 @@ def process_links(net, par): tfs = np.loadtxt(par["tf_all"], dtype=str) tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)] +# GRN inference +client = Client(processes=False) +def infer_grn(X, par): + print("Infer grn", flush=True) + + network = grnboost2(X, client_or_address=client, gene_names=gene_names, tf_names=tf_names) + network.rename(columns={'TF': 'source', 'target': 'target', 'importance': 'weight'}, inplace=True) + network.reset_index(drop=True, inplace=True) + network = process_links(network, par) + + return network if par['cell_type_specific']: - # GRN inference - client = Client(processes=False) - - def infer_grn(X, par): - print("Infer grn", flush=True) - - network = grnboost2(X, client_or_address=client, gene_names=gene_names, tf_names=tf_names) - network.rename(columns={'TF': 'source', 'target': 'target', 'importance': 'weight'}, inplace=True) - network.reset_index(drop=True, inplace=True) - network = process_links(network, par) - - return network i = 0 for group in tqdm(np.unique(groups), desc="Processing groups"): X_sub = X[groups == group, :] @@ -67,17 +61,7 @@ def infer_grn(X, par): grn = pd.concat([grn, net], axis=0).reset_index(drop=True) i += 1 else: - local_cluster = LocalCluster(n_workers=10, - threads_per_worker=1) - - custom_client = Client(local_cluster) - network = grnboost2(X, client_or_address=custom_client, gene_names=gene_names, tf_names=tf_names) - network.rename(columns={'TF': 'source', 'target': 'target', 'importance': 'weight'}, inplace=True) - network.reset_index(drop=True, inplace=True) - grn = process_links(network, par) - custom_client.close() - local_cluster.close() - + grn = infer_grn(X, par) # Save inferred GRN grn.to_csv(par['prediction'], sep=',')