Skip to content

Commit

Permalink
add finemapping rule
Browse files Browse the repository at this point in the history
  • Loading branch information
gmauro committed Mar 2, 2024
1 parent 203bf6d commit 4e27681
Show file tree
Hide file tree
Showing 19 changed files with 1,268 additions and 14 deletions.
57 changes: 56 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM condaforge/mambaforge:latest
LABEL io.github.snakemake.containerized="true"
LABEL io.github.snakemake.conda_env_hash="f7297458609bceb0462c5a2467a4d166cc341f021f89686883de965a01db8e21"
LABEL io.github.snakemake.conda_env_hash="28c328bc05583c80dbef9948c2ab8c01c4c5f9f5b7d2411314f06e0b3472cfd0"

# Build-time only: keeps apt non-interactive during the build without
# persisting DEBIAN_FRONTEND into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
# Use apt-get (the stable, script-oriented CLI) and skip recommended packages;
# the apt package lists are removed in the same layer that created them.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        libz-dev \
    && rm -rf /var/lib/apt/lists/*
Expand Down Expand Up @@ -54,6 +54,58 @@ COPY workflow/envs/create_inflation_factors_table.yaml /conda-envs/a160f42d06f9d
RUN mkdir -p /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9
COPY workflow/envs/ldsc.yaml /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9/environment.yaml

# Conda environment:
# source: workflow/envs/plink-pandas.yml
# prefix: /conda-envs/a9b8ccc53333def92898879edc6df0ca
# name: plink-pandas
# dependencies:
# - bioconda::plink=1.90*
# - bioconda::tabix=1.11
# - python=3.11.*
# - numpy
# - scipy
# - pandas
# - pip
RUN mkdir -p /conda-envs/a9b8ccc53333def92898879edc6df0ca
COPY workflow/envs/plink-pandas.yml /conda-envs/a9b8ccc53333def92898879edc6df0ca/environment.yaml

# Conda environment:
# source: workflow/envs/plink2.yml
# prefix: /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05
# name: plink2
# channels:
# - defaults
# dependencies:
# - bioconda::plink2==2.00a5
RUN mkdir -p /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05
COPY workflow/envs/plink2.yml /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05/environment.yaml

# Conda environment:
# source: workflow/envs/susier.yml
# prefix: /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc
# name: susier
# dependencies:
# - r-base==4.3.0
# - r-susier==0.12.35
# - r-tidyverse==2.0.0
# - r-dplyr==1.1.2
# - r-stringr==1.5.0
# - r-purrr==1.0.1
# - r-glue==1.6.2
# - r-ggplot2==3.4.2
# - r-ggpubr==0.6.0
# - r-data.table==1.14.8
# - r-rcpp==1.0.*
# - r-rcppeigen==0.3.3.*
# - r-markdown==1.10
# - r-rlog==0.1.0
# - numpy==1.24.3
# - scipy==1.9.3
# - pandas==1.5.3
# - pip
RUN mkdir -p /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc
COPY workflow/envs/susier.yml /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc/environment.yaml

# Conda environment:
# source: workflow/scripts/gwaspipe/environment.yml
# prefix: /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e
Expand Down Expand Up @@ -83,5 +135,8 @@ COPY workflow/scripts/gwaspipe/environment.yml /conda-envs/ab67c3cfb8e1a5ad9d9cb
# Build every conda environment at its hashed prefix from the environment.yaml
# stored under that same prefix, then purge mamba's caches within this layer.
RUN mamba env create --prefix /conda-envs/6e056d31662ab0bd2fd3fba49416042f --file /conda-envs/6e056d31662ab0bd2fd3fba49416042f/environment.yaml && \
mamba env create --prefix /conda-envs/a160f42d06f9d24b41c5cbece52b682d --file /conda-envs/a160f42d06f9d24b41c5cbece52b682d/environment.yaml && \
mamba env create --prefix /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9 --file /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9/environment.yaml && \
mamba env create --prefix /conda-envs/a9b8ccc53333def92898879edc6df0ca --file /conda-envs/a9b8ccc53333def92898879edc6df0ca/environment.yaml && \
mamba env create --prefix /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05 --file /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05/environment.yaml && \
mamba env create --prefix /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc --file /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc/environment.yaml && \
mamba env create --prefix /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e --file /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e/environment.yaml && \
mamba clean --all -y
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ rerun:
unlock:
snakemake --unlock

dockerfile:
dockerfile_:
snakemake --containerize --snakefile workflow/Snakefile > Dockerfile
48 changes: 48 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,54 @@ workspace_path: "/path/to/the/workspace"
sumstat:
pvalcol: "LOG10P"
pthr: 1.7e-11
annotate: False


# Genotype position definition
genodata:
json: "data/genetic_data.json"
name: "INTERVAL"


# Phenotype file used in the GWAS
# -------------------------------
# tab separated.
# First two columns should be FID and IID
pheno_file: "data/INTERVAL_NonImp_residuals_final.txt"
run_list: "data/pheno_to_run.csv"
sample_file: "data/samplelist.csv"

# Clumping
# --------
# NB: logp1 and logp2 will only work with plink2
clumping:
# logp1: 10.769551078621726
logp1: 7.3
logp2: 1.3010299956639813
r2: 0.1
kb: 10000
p1: 1.7e-11
p2: 0.05
totsize: 1e6


# SusieR parameters
# -----------------
susieR:
# The following parameter enables the use of an LD-based correlation matrix,
# as described in https://stephenslab.github.io/susieR/articles/finemapping_summary_statistics.html
# If set to False (default), the genotypes coded with the additive model are
# used together with the phenotype to evaluate the RSS model.
use_ld: False
# When this pipeline is run on CHRIS samples, the IDs have leading zeros
# and a total length of 10 characters.
# In that case `scripts/finemapping.R` zero-pads the IDs
# so that they match the ones in the genotypes.
# Set this value to `False` to skip the zero-padding to 10 characters.
chris_id: False
min_abs_corr: 0.1
iter: 1000


params:
harmonize_sumstats:
Expand Down
1 change: 0 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,3 @@ dependencies:
- snakemake-storage-plugin-s3
- mamba
- pip
- apptainer
14 changes: 14 additions & 0 deletions slurm/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,17 @@ set-resources:
mem_mb: 14336 + attempt * 2048
save_min_pvalue:
mem_mb: 2048 + attempt * 2048
sumstat_2_plink:
mem_mb: 12288 + attempt * 2048
cut_pheno:
mem_mb: 12288 + attempt * 2048
clumping:
mem_mb: 16384 + attempt * 2048
enlarge_and_merge:
mem_mb: 14336 + attempt * 2048
run_susieR:
mem_mb: 14336 + attempt * 2048
collect_by_pheno:
mem_mb: 12288 + attempt * 2048
collect_all:
mem_mb: 12288 + attempt * 2048
3 changes: 1 addition & 2 deletions submit.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
source ~/.bashrc
module -s load singularity/3.8.5


# set some singularity directories depending on frontend/computing node/vm
case $(hostname) in
hnode*)
Expand All @@ -36,5 +35,5 @@ if [ ! -d "pqtl_pipeline" ]; then
git clone --recurse-submodules https://github.com/ht-diva/pqtl_pipeline.git
fi
cd pqtl_pipeline
conda activate snakemake
conda activate /group/diangelantonio/software/envs/snakemake
make run
2 changes: 1 addition & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ include: "rules/common.smk"
include: "rules/metal.smk"
include: "rules/ldsc.smk"
include: "rules/tiledb.smk"
include: "rules/finemapping.smk"


# include: "rules/finemapping.smk"
# include: "rules/move.smk"


Expand Down
9 changes: 9 additions & 0 deletions workflow/envs/plink-pandas.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
name: plink-pandas
dependencies:
- bioconda::plink=1.90*
- bioconda::tabix=1.11
- python=3.11.*
- numpy
- scipy
- pandas
- pip
5 changes: 5 additions & 0 deletions workflow/envs/plink2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
name: plink2
channels:
- defaults
dependencies:
- bioconda::plink2==2.00a5
20 changes: 20 additions & 0 deletions workflow/envs/susier.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
name: susier
dependencies:
- r-base==4.3.0
- r-susier==0.12.35
- r-tidyverse==2.0.0
- r-dplyr==1.1.2
- r-stringr==1.5.0
- r-purrr==1.0.1
- r-glue==1.6.2
- r-ggplot2==3.4.2
- r-ggpubr==0.6.0
- r-data.table==1.14.8
- r-rcpp==1.0.*
- r-rcppeigen==0.3.3.*
- r-markdown==1.10
- r-rlog==0.1.0
- numpy==1.24.3
- scipy==1.9.3
- pandas==1.5.3
- pip
28 changes: 27 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,14 +1,27 @@
from pathlib import Path
import pandas as pd
import json

# Store config variables for easy access
pvalcol = config["sumstat"]["pvalcol"]
pthr = config["sumstat"]["pthr"]
run_list = pd.read_csv(config["run_list"], header=0, sep="\t")

# Load genetic data information
genotype = config["genodata"]["name"]
with open(config["genodata"]["json"], "r") as f:
gd = json.load(f)
gt = gd[genotype]

# Define input for the rules
data = []
with open(config["sumstats_path"], "r") as fp:
lines = fp.readlines()

for line in lines:
p = Path(line.strip())
data.append((p.stem, str(p)))
seqid = ".".join(p.stem.split(".")[:3])
data.append((seqid, str(p)))

analytes = (
pd.DataFrame.from_records(data, columns=["seqid", "sumstat_path"])
Expand All @@ -17,6 +30,16 @@ analytes = (
)


def get_pfile_from_chrom(wildcards):
    """Return the PLINK file to use for the chromosome in ``wildcards.chrom``.

    Reads the ``plinkfiles`` and ``nfiles`` entries of the module-level ``gt``
    mapping. When ``nfiles`` is 1, ``plinkfiles[0]`` is returned; otherwise the
    first entry of ``plinkfiles`` containing ``dedup_<chrom>_`` is returned.

    Raises:
        IndexError: if no entry matches the requested chromosome.
    """
    pfiles = gt["plinkfiles"]
    if gt["nfiles"] == 1:
        return pfiles[0]

    tag = f"dedup_{wildcards.chrom}_"
    # Use `in` instead of `str.find(...) > 0`: find() returns 0 when the tag is
    # at the very start of the path, which the old comparison wrongly rejected.
    matches = [p for p in pfiles if tag in p]
    if not matches:
        # Same exception type as the former bare `ff[0]`, but with context.
        raise IndexError(f"no plink file matching '{tag}' found in 'plinkfiles'")
    return matches[0]


def get_sumstats(wildcards):
    """Look up the ``sumstat_path`` value stored in ``analytes`` for ``wildcards.seqid``."""
    seqid = wildcards.seqid
    return analytes.loc[seqid, "sumstat_path"]

Expand Down Expand Up @@ -74,4 +97,7 @@ def get_final_output():
)
)

if config.get("run").get("finemapping"):
final_output.append(ws_path("all_phenos_summary.cs"))

return final_output
Loading

0 comments on commit 4e27681

Please sign in to comment.