add finemapping rule

ht-diva · Mar 2, 2024 · 4e27681 · 4e27681
1 parent 203bf6d
commit 4e27681
Show file tree

Hide file tree

Showing 19 changed files with 1,268 additions and 14 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,6 +1,6 @@
 FROM condaforge/mambaforge:latest
 LABEL io.github.snakemake.containerized="true"
-LABEL io.github.snakemake.conda_env_hash="f7297458609bceb0462c5a2467a4d166cc341f021f89686883de965a01db8e21"
+LABEL io.github.snakemake.conda_env_hash="28c328bc05583c80dbef9948c2ab8c01c4c5f9f5b7d2411314f06e0b3472cfd0"
 
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt update && apt install -y build-essential libz-dev && rm -rf /var/lib/apt/lists/*
@@ -54,6 +54,58 @@ COPY workflow/envs/create_inflation_factors_table.yaml /conda-envs/a160f42d06f9d
 RUN mkdir -p /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9
 COPY workflow/envs/ldsc.yaml /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9/environment.yaml
 
+# Conda environment:
+#   source: workflow/envs/plink-pandas.yml
+#   prefix: /conda-envs/a9b8ccc53333def92898879edc6df0ca
+#   name: plink-pandas
+#   dependencies:
+#     - bioconda::plink=1.90*
+#     - bioconda::tabix=1.11
+#     - python=3.11.*
+#     - numpy
+#     - scipy
+#     - pandas
+#     - pip
+RUN mkdir -p /conda-envs/a9b8ccc53333def92898879edc6df0ca
+COPY workflow/envs/plink-pandas.yml /conda-envs/a9b8ccc53333def92898879edc6df0ca/environment.yaml
+
+# Conda environment:
+#   source: workflow/envs/plink2.yml
+#   prefix: /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05
+#   name: plink2
+#   channels:
+#     - defaults
+#   dependencies:
+#     - bioconda::plink2==2.00a5
+RUN mkdir -p /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05
+COPY workflow/envs/plink2.yml /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05/environment.yaml
+
+# Conda environment:
+#   source: workflow/envs/susier.yml
+#   prefix: /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc
+#   name: susier
+#   dependencies:
+#     - r-base==4.3.0
+#     - r-susier==0.12.35
+#     - r-tidyverse==2.0.0
+#     - r-dplyr==1.1.2
+#     - r-stringr==1.5.0
+#     - r-purrr==1.0.1
+#     - r-glue==1.6.2
+#     - r-ggplot2==3.4.2
+#     - r-ggpubr==0.6.0
+#     - r-data.table==1.14.8
+#     - r-rcpp==1.0.*
+#     - r-rcppeigen==0.3.3.*
+#     - r-markdown==1.10
+#     - r-rlog==0.1.0
+#     - numpy==1.24.3
+#     - scipy==1.9.3
+#     - pandas==1.5.3
+#     - pip
+RUN mkdir -p /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc
+COPY workflow/envs/susier.yml /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc/environment.yaml
+
 # Conda environment:
 #   source: workflow/scripts/gwaspipe/environment.yml
 #   prefix: /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e
@@ -83,5 +135,8 @@ COPY workflow/scripts/gwaspipe/environment.yml /conda-envs/ab67c3cfb8e1a5ad9d9cb
 RUN mamba env create --prefix /conda-envs/6e056d31662ab0bd2fd3fba49416042f --file /conda-envs/6e056d31662ab0bd2fd3fba49416042f/environment.yaml && \
     mamba env create --prefix /conda-envs/a160f42d06f9d24b41c5cbece52b682d --file /conda-envs/a160f42d06f9d24b41c5cbece52b682d/environment.yaml && \
     mamba env create --prefix /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9 --file /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9/environment.yaml && \
+    mamba env create --prefix /conda-envs/a9b8ccc53333def92898879edc6df0ca --file /conda-envs/a9b8ccc53333def92898879edc6df0ca/environment.yaml && \
+    mamba env create --prefix /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05 --file /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05/environment.yaml && \
+    mamba env create --prefix /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc --file /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc/environment.yaml && \
     mamba env create --prefix /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e --file /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e/environment.yaml && \
     mamba clean --all -y
diff --git a/Makefile b/Makefile
@@ -28,5 +28,5 @@ rerun:
 unlock:
 	snakemake --unlock
 
-dockerfile:
+dockerfile_:
 	snakemake --containerize --snakefile workflow/Snakefile > Dockerfile
diff --git a/config/config.yaml b/config/config.yaml
@@ -15,6 +15,54 @@ workspace_path: "/path/to/the/workspace"
 sumstat:
   pvalcol: "LOG10P"
   pthr: 1.7e-11
+  annotate: False
+
+
+# Genotype position definition
+genodata:
+  json: "data/genetic_data.json"
+  name: "INTERVAL"
+
+
+# Phenotype file used in the GWAS
+# -------------------------------
+# tab separated.
+# The first two columns should be FID and IID.
+pheno_file: "data/INTERVAL_NonImp_residuals_final.txt"
+run_list: "data/pheno_to_run.csv"
+sample_file: "data/samplelist.csv"
+
+# Clumping
+# --------
+# NB: logp1 and logp2 will only work with plink2
+clumping:
+  # logp2 equals -log10(p2); the commented-out logp1 below equals -log10(p1).
+  # logp1: 10.769551078621726
+  logp1: 7.3
+  logp2: 1.3010299956639813
+  # NOTE(review): r2 and kb look like the LD threshold and the window size
+  # in kilobases used for clumping -- confirm against the clumping rule.
+  r2: 0.1
+  kb: 10000
+  p1: 1.7e-11
+  p2: 0.05
+  # Written as a plain integer: YAML 1.1 loaders such as PyYAML read `1e6`
+  # (no decimal point, unsigned exponent) as the string "1e6", not a number.
+  totsize: 1000000
+
+
+# SusieR parameters
+# -----------------
+susieR:
+  # The following parameter will enable the use of correlation matrix based
+  # on LD as specified in https://stephenslab.github.io/susieR/articles/finemapping_summary_statistics.html
+  # If set to False (default), it will use the genotypes coded with additive model
+  # together with the phenotype to evaluate the RSS model.
+  use_ld: False
+  # When using this pipeline on CHRIS samples, the IDs
+  # have leading zeros and a total length of 10 characters.
+  # `scripts/finemapping.R` therefore zero-pads the IDs
+  # so that they match the ones in the genotypes.
+  # Set this value to `False` to disable the zero-padding to 10 characters.
+  chris_id: False
+  min_abs_corr: 0.1
+  iter: 1000
+
 
 params:
   harmonize_sumstats:

diff --git a/environment.yml b/environment.yml
@@ -10,4 +10,3 @@ dependencies:
   - snakemake-storage-plugin-s3
   - mamba
   - pip
-  - apptainer
diff --git a/slurm/config.yaml b/slurm/config.yaml
@@ -39,3 +39,17 @@ set-resources:
         mem_mb: 14336 + attempt * 2048
     save_min_pvalue:
         mem_mb: 2048 + attempt * 2048
+    # Memory requests for the rules added with finemapping.smk:
+    # a fixed base plus 2048 MB multiplied by the attempt number.
+    sumstat_2_plink:
+        mem_mb: 12288 + attempt * 2048
+    cut_pheno:
+        mem_mb: 12288 + attempt * 2048
+    clumping:
+        mem_mb: 16384 + attempt * 2048
+    enlarge_and_merge:
+        mem_mb: 14336 + attempt * 2048
+    run_susieR:
+        mem_mb: 14336 + attempt * 2048
+    collect_by_pheno:
+        mem_mb: 12288 + attempt * 2048
+    collect_all:
+        mem_mb: 12288 + attempt * 2048
diff --git a/submit.sbatch b/submit.sbatch
@@ -11,7 +11,6 @@
 source ~/.bashrc
 module -s load singularity/3.8.5
 
-
 # set some singularity directories depending on frontend/computing node/vm
 case $(hostname) in
   hnode*)
@@ -36,5 +35,5 @@ if [ ! -d "pqtl_pipeline" ]; then
   git clone --recurse-submodules https://github.com/ht-diva/pqtl_pipeline.git
 fi
 cd pqtl_pipeline
-conda activate snakemake
+conda activate /group/diangelantonio/software/envs/snakemake
 make run
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -14,9 +14,9 @@ include: "rules/common.smk"
 include: "rules/metal.smk"
 include: "rules/ldsc.smk"
 include: "rules/tiledb.smk"
+include: "rules/finemapping.smk"
 
 
-# include: "rules/finemapping.smk"
 # include: "rules/move.smk"
 
 

diff --git a/workflow/envs/plink-pandas.yml b/workflow/envs/plink-pandas.yml
@@ -0,0 +1,9 @@
+name: plink-pandas
+dependencies:
+  - bioconda::plink=1.90*
+  - bioconda::tabix=1.11
+  - python=3.11.*
+  - numpy
+  - scipy
+  - pandas
+  - pip
diff --git a/workflow/envs/plink2.yml b/workflow/envs/plink2.yml
@@ -0,0 +1,5 @@
+name: plink2
+channels:
+  - defaults
+dependencies:
+  - bioconda::plink2==2.00a5
diff --git a/workflow/envs/susier.yml b/workflow/envs/susier.yml
@@ -0,0 +1,20 @@
+name: susier
+dependencies:
+  - r-base==4.3.0
+  - r-susier==0.12.35
+  - r-tidyverse==2.0.0
+  - r-dplyr==1.1.2
+  - r-stringr==1.5.0
+  - r-purrr==1.0.1
+  - r-glue==1.6.2
+  - r-ggplot2==3.4.2
+  - r-ggpubr==0.6.0
+  - r-data.table==1.14.8
+  - r-rcpp==1.0.*
+  - r-rcppeigen==0.3.3.*
+  - r-markdown==1.10
+  - r-rlog==0.1.0
+  - numpy==1.24.3
+  - scipy==1.9.3
+  - pandas==1.5.3
+  - pip
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -1,14 +1,27 @@
 from pathlib import Path
 import pandas as pd
+import json
 
+# Store config variables for easy access
+pvalcol = config["sumstat"]["pvalcol"]
+pthr = config["sumstat"]["pthr"]
+run_list = pd.read_csv(config["run_list"], header=0, sep="\t")
 
+# Load genetic data information
+genotype = config["genodata"]["name"]
+with open(config["genodata"]["json"], "r") as f:
+    gd = json.load(f)
+gt = gd[genotype]
+
+# Define input for the rules
 data = []
 with open(config["sumstats_path"], "r") as fp:
     lines = fp.readlines()
 
 for line in lines:
     p = Path(line.strip())
-    data.append((p.stem, str(p)))
+    seqid = ".".join(p.stem.split(".")[:3])
+    data.append((seqid, str(p)))
 
 analytes = (
     pd.DataFrame.from_records(data, columns=["seqid", "sumstat_path"])
@@ -17,6 +30,16 @@ analytes = (
 )
 
 
+def get_pfile_from_chrom(wildcards):
+    pfiles = gt["plinkfiles"]
+    nfiles = gt["nfiles"]
+    if nfiles == 1:
+        ff = pfiles
+    else:
+        ff = [p for p in pfiles if p.find(f"dedup_{wildcards.chrom}_") > 0]
+    return ff[0]
+
+
 def get_sumstats(wildcards):
     return analytes.loc[wildcards.seqid, "sumstat_path"]
 
@@ -74,4 +97,7 @@ def get_final_output():
             )
         )
 
+    if config.get("run").get("finemapping"):
+        final_output.append(ws_path("all_phenos_summary.cs"))
+
     return final_output
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,4 +10,3 @@ dependencies: @@
       - snakemake-storage-plugin-s3
       - mamba
       - pip
-      - apptainer