diff --git a/Dockerfile b/Dockerfile
index 299d068..94a291d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM condaforge/mambaforge:latest
 LABEL io.github.snakemake.containerized="true"
-LABEL io.github.snakemake.conda_env_hash="f7297458609bceb0462c5a2467a4d166cc341f021f89686883de965a01db8e21"
+LABEL io.github.snakemake.conda_env_hash="28c328bc05583c80dbef9948c2ab8c01c4c5f9f5b7d2411314f06e0b3472cfd0"
 
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt update && apt install -y build-essential libz-dev && rm -rf /var/lib/apt/lists/*
@@ -54,6 +54,58 @@ COPY workflow/envs/create_inflation_factors_table.yaml /conda-envs/a160f42d06f9d24b41c5cbece52b682d/environment.yaml
 RUN mkdir -p /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9
 COPY workflow/envs/ldsc.yaml /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9/environment.yaml
 
+# Conda environment:
+#   source: workflow/envs/plink-pandas.yml
+#   prefix: /conda-envs/a9b8ccc53333def92898879edc6df0ca
+#   name: plink-pandas
+#   dependencies:
+#     - bioconda::plink=1.90*
+#     - bioconda::tabix=1.11
+#     - python=3.11.*
+#     - numpy
+#     - scipy
+#     - pandas
+#     - pip
+RUN mkdir -p /conda-envs/a9b8ccc53333def92898879edc6df0ca
+COPY workflow/envs/plink-pandas.yml /conda-envs/a9b8ccc53333def92898879edc6df0ca/environment.yaml
+
+# Conda environment:
+#   source: workflow/envs/plink2.yml
+#   prefix: /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05
+#   name: plink2
+#   channels:
+#     - defaults
+#   dependencies:
+#     - bioconda::plink2==2.00a5
+RUN mkdir -p /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05
+COPY workflow/envs/plink2.yml /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05/environment.yaml
+
+# Conda environment:
+#   source: workflow/envs/susier.yml
+#   prefix: /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc
+#   name: susier
+#   dependencies:
+#     - r-base==4.3.0
+#     - r-susier==0.12.35
+#     - r-tidyverse==2.0.0
+#     - r-dplyr==1.1.2
+#     - r-stringr==1.5.0
+#     - r-purrr==1.0.1
+#     - r-glue==1.6.2
+#     - r-ggplot2==3.4.2
+#     - r-ggpubr==0.6.0
+#     - r-data.table==1.14.8
+#     - r-rcpp==1.0.*
+#     - r-rcppeigen==0.3.3.*
+#     - r-markdown==1.10
+#     - r-rlog==0.1.0
+#     - numpy==1.24.3
+#     - scipy==1.9.3
+#     - pandas==1.5.3
+#     - pip
+RUN mkdir -p /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc
+COPY workflow/envs/susier.yml /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc/environment.yaml
+
 # Conda environment:
 #   source: workflow/scripts/gwaspipe/environment.yml
 #   prefix: /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e
@@ -83,5 +135,8 @@ COPY workflow/scripts/gwaspipe/environment.yml /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e/environment.yaml
 RUN mamba env create --prefix /conda-envs/6e056d31662ab0bd2fd3fba49416042f --file /conda-envs/6e056d31662ab0bd2fd3fba49416042f/environment.yaml && \
     mamba env create --prefix /conda-envs/a160f42d06f9d24b41c5cbece52b682d --file /conda-envs/a160f42d06f9d24b41c5cbece52b682d/environment.yaml && \
     mamba env create --prefix /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9 --file /conda-envs/a2826e2ef1005ed23bdbb3539321f7e9/environment.yaml && \
+    mamba env create --prefix /conda-envs/a9b8ccc53333def92898879edc6df0ca --file /conda-envs/a9b8ccc53333def92898879edc6df0ca/environment.yaml && \
+    mamba env create --prefix /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05 --file /conda-envs/fd0f8d28d055274e6ce4cf006d07ec05/environment.yaml && \
+    mamba env create --prefix /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc --file /conda-envs/68f405a25f9c3bf3528a1b7b39978bcc/environment.yaml && \
     mamba env create --prefix /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e --file /conda-envs/ab67c3cfb8e1a5ad9d9cb7824966853e/environment.yaml && \
     mamba clean --all -y
diff --git a/Makefile b/Makefile
index a305005..c66d079 100644
--- a/Makefile
+++ b/Makefile
@@ -28,5 +28,5 @@ rerun:
 unlock:
 	snakemake --unlock
 
-dockerfile:
+dockerfile_:
 	snakemake --containerize --snakefile workflow/Snakefile > Dockerfile
diff --git a/config/config.yaml b/config/config.yaml
index 0ab5744..ab8c666 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -15,6 +15,54 @@ workspace_path: "/path/to/the/workspace"
 sumstat:
   pvalcol: "LOG10P"
   pthr: 1.7e-11
+  annotate: False
+
+
+# Genotype position definition
+genodata:
+  json: "data/genetic_data.json"
+  name: "INTERVAL"
+
+
+# Phenotype file used in the GWAS
+# -------------------------------
+# Tab separated.
+# The first two columns must be FID and IID.
+pheno_file: "data/INTERVAL_NonImp_residuals_final.txt"
+run_list: "data/pheno_to_run.csv"
+sample_file: "data/samplelist.csv"
+
+# Clumping
+# --------
+# NB: logp1 and logp2 will only work with plink2
+clumping:
+  # logp1: 10.769551078621726
+  logp1: 7.3
+  logp2: 1.3010299956639813
+  r2: 0.1
+  kb: 10000
+  p1: 1.7e-11
+  p2: 0.05
+  totsize: 1e6
+
+
+# SusieR parameters
+# -----------------
+susieR:
+  # The following parameter enables the use of a correlation matrix based
+  # on LD, as described in
+  # https://stephenslab.github.io/susieR/articles/finemapping_summary_statistics.html
+  # If set to False (default), the genotypes coded with the additive model
+  # are used together with the phenotype to fit the model.
+  use_ld: False
+  # When running this pipeline on CHRIS samples, the IDs have leading zeros
+  # and a total length of 10 characters. scripts/finemapping.R therefore
+  # zero-pads the phenotype IDs to match the ones in the genotypes.
+  # Set this value to False to disable the zero padding.
+  chris_id: False
+  min_abs_corr: 0.1
+  iter: 1000
+
 params:
   harmonize_sumstats:
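The clumping thresholds above are given on the -log10 scale because the
plink2 flags used in the clumping rule below (--clump-log10-p1 and
--clump-log10-p2) read them that way. A quick sanity check of the values in
this config, in plain Python:

    import math

    print(-math.log10(1.7e-11))  # 10.769551078621726 -> the commented-out logp1
    print(-math.log10(5e-8))     # ~7.30 -> logp1, the genome-wide threshold
    print(-math.log10(0.05))     # 1.3010299956639813 -> logp2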
diff --git a/environment.yml b/environment.yml
index 83f0185..1205b1b 100644
--- a/environment.yml
+++ b/environment.yml
@@ -10,4 +10,3 @@ dependencies:
   - snakemake-storage-plugin-s3
   - mamba
   - pip
-  - apptainer
diff --git a/slurm/config.yaml b/slurm/config.yaml
index f408086..1bdedd6 100644
--- a/slurm/config.yaml
+++ b/slurm/config.yaml
@@ -39,3 +39,17 @@ set-resources:
     mem_mb: 14336 + attempt * 2048
   save_min_pvalue:
     mem_mb: 2048 + attempt * 2048
+  sumstat_2_plink:
+    mem_mb: 12288 + attempt * 2048
+  cut_pheno:
+    mem_mb: 12288 + attempt * 2048
+  clumping:
+    mem_mb: 16384 + attempt * 2048
+  enlarge_and_merge:
+    mem_mb: 14336 + attempt * 2048
+  run_susieR:
+    mem_mb: 14336 + attempt * 2048
+  collect_by_pheno:
+    mem_mb: 12288 + attempt * 2048
+  collect_all:
+    mem_mb: 12288 + attempt * 2048
diff --git a/submit.sbatch b/submit.sbatch
index 96eccda..7d4e0d9 100644
--- a/submit.sbatch
+++ b/submit.sbatch
@@ -11,7 +11,6 @@
 source ~/.bashrc
 
 module -s load singularity/3.8.5
-
 # set some singularity directories depending on frontend/computing node/vm
 case $(hostname) in
   hnode*)
@@ -36,5 +35,5 @@ if [ ! -d "pqtl_pipeline" ]; then
   git clone --recurse-submodules https://github.com/ht-diva/pqtl_pipeline.git
 fi
 cd pqtl_pipeline
-conda activate snakemake
+conda activate /group/diangelantonio/software/envs/snakemake
 make run
diff --git a/workflow/Snakefile b/workflow/Snakefile
index b165be1..01e97e9 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -14,9 +14,9 @@ include: "rules/common.smk"
 include: "rules/metal.smk"
 include: "rules/ldsc.smk"
 include: "rules/tiledb.smk"
+include: "rules/finemapping.smk"
 
-# include: "rules/finemapping.smk"
 # include: "rules/move.smk"
diff --git a/workflow/envs/plink-pandas.yml b/workflow/envs/plink-pandas.yml
new file mode 100644
index 0000000..8a46920
--- /dev/null
+++ b/workflow/envs/plink-pandas.yml
@@ -0,0 +1,9 @@
+name: plink-pandas
+dependencies:
+  - bioconda::plink=1.90*
+  - bioconda::tabix=1.11
+  - python=3.11.*
+  - numpy
+  - scipy
+  - pandas
+  - pip
diff --git a/workflow/envs/plink2.yml b/workflow/envs/plink2.yml
new file mode 100644
index 0000000..1d128cb
--- /dev/null
+++ b/workflow/envs/plink2.yml
@@ -0,0 +1,5 @@
+name: plink2
+channels:
+  - defaults
+dependencies:
+  - bioconda::plink2==2.00a5
diff --git a/workflow/envs/susier.yml b/workflow/envs/susier.yml
new file mode 100644
index 0000000..220b889
--- /dev/null
+++ b/workflow/envs/susier.yml
@@ -0,0 +1,20 @@
+name: susier
+dependencies:
+  - r-base==4.3.0
+  - r-susier==0.12.35
+  - r-tidyverse==2.0.0
+  - r-dplyr==1.1.2
+  - r-stringr==1.5.0
+  - r-purrr==1.0.1
+  - r-glue==1.6.2
+  - r-ggplot2==3.4.2
+  - r-ggpubr==0.6.0
+  - r-data.table==1.14.8
+  - r-rcpp==1.0.*
+  - r-rcppeigen==0.3.3.*
+  - r-markdown==1.10
+  - r-rlog==0.1.0
+  - numpy==1.24.3
+  - scipy==1.9.3
+  - pandas==1.5.3
+  - pip
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 37fd6d0..8e55f01 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,14 +1,27 @@
 from pathlib import Path
 
 import pandas as pd
+import json
 
+# Store config variables for easy access
+pvalcol = config["sumstat"]["pvalcol"]
+pthr = config["sumstat"]["pthr"]
+run_list = pd.read_csv(config["run_list"], header=0, sep="\t")
+
+# Load genetic data information
+genotype = config["genodata"]["name"]
+with open(config["genodata"]["json"], "r") as f:
+    gd = json.load(f)
+gt = gd[genotype]
+
+# Define input for the rules
 data = []
 with open(config["sumstats_path"], "r") as fp:
     lines = fp.readlines()
     for line in lines:
         p = Path(line.strip())
-        data.append((p.stem, str(p)))
+        seqid = ".".join(p.stem.split(".")[:3])
+        data.append((seqid, str(p)))
 
 analytes = (
     pd.DataFrame.from_records(data, columns=["seqid", "sumstat_path"])
@@ -17,6 +30,16 @@ analytes = (
 )
 
 
+def get_pfile_from_chrom(wildcards):
+    pfiles = gt["plinkfiles"]
+    nfiles = gt["nfiles"]
+    if nfiles == 1:
+        ff = pfiles
+    else:
+        ff = [p for p in pfiles if p.find(f"dedup_{wildcards.chrom}_") > 0]
+    return ff[0]
+
+
 def get_sumstats(wildcards):
     return analytes.loc[wildcards.seqid, "sumstat_path"]
 
@@ -74,4 +97,7 @@ def get_final_output():
             )
         )
 
+    if config.get("run").get("finemapping"):
+        final_output.append(ws_path("all_phenos_summary.cs"))
+
     return final_output
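get_pfile_from_chrom expects the JSON file referenced by genodata.json to map
each genotype panel name to its plink file list. The layout below is an
illustrative sketch: the paths and the "dedup_{chrom}_" naming are
assumptions inferred from the lookup code, not taken from the repository:

    import json

    # Hypothetical data/genetic_data.json content
    gd = json.loads('''
    {
      "INTERVAL": {
        "nfiles": 22,
        "plinkfiles": [
          "data/geno/impute_dedup_1_interval",
          "data/geno/impute_dedup_2_interval"
        ]
      }
    }
    ''')
    gt = gd["INTERVAL"]

    # Same selection logic as get_pfile_from_chrom for chrom == "1"
    ff = [p for p in gt["plinkfiles"] if p.find("dedup_1_") > 0]
    print(ff[0])  # data/geno/impute_dedup_1_interval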
diff --git a/workflow/rules/finemapping.smk b/workflow/rules/finemapping.smk
index fdf6f0e..701cca6 100644
--- a/workflow/rules/finemapping.smk
+++ b/workflow/rules/finemapping.smk
@@ -1,10 +1,6 @@
-# Credits to @filosi
+# Credits to @filosi, adapted from
 # https://github.com/EuracBiomedicalResearch/finemap_pipeline
 
-# Store config variables for ease access
-pvalcol = config["sumstat"]["pvalcol"]
-pthr = config["sumstat"]["pthr"]
-
 # Divide sumstat by chromosome
 rule sumstat_2_plink:
@@ -15,7 +11,7 @@ rule sumstat_2_plink:
     output:
         temp(ws_path("fm/{seqid}/tmp/chr{chrom}_sumstat.csv")),
     resources:
-        runtime=lambda wc, attempt: attempt * 100,
+        runtime=lambda wc, attempt: attempt * 60,
     params:
         pval_thr=pthr,
         pval_col=pvalcol,
@@ -31,8 +27,137 @@ rule cut_pheno:
     input:
         phenofile=config["pheno_file"],
     output:
-        temp(ws_path("{pheno}/tmp/phenotype.csv")),
+        temp(ws_path("fm/{seqid}/tmp/phenotype.csv")),
+    resources:
+        runtime=lambda wc, attempt: attempt * 60,
     conda:
         "../envs/plink-pandas.yml"
     script:
         "../scripts/separate_pheno.py"
+
+
+rule clumping:
+    message:
+        "Run clumping"
+    input:
+        smstat=ws_path("fm/{seqid}/tmp/chr{chrom}_sumstat.csv"),
+    output:
+        ws_path("fm/{seqid}/clumping/chr{chrom}.clumps"),
+        ws_path("fm/{seqid}/clumping/chr{chrom}.log"),
+    params:
+        infile=get_pfile_from_chrom,
+        ofile=lambda wildcards, output: output[0].replace(".clumps", ""),
+        clump_logp1=config["clumping"]["logp1"],
+        clump_logp2=config["clumping"]["logp2"],
+        clump_r2=config["clumping"]["r2"],
+        clump_kb=config["clumping"]["kb"],
+        sampfile=config["sample_file"],
+    resources:
+        runtime=lambda wc, attempt: attempt * 100,
+    conda:
+        "../envs/plink2.yml"
+    shell:
+        """
+        if [ -s {input} ]
+        then
+            plink2 --pfile {params.infile} --clump {input.smstat} \
+            --clump-log10 'input-only' --clump-field {pvalcol} \
+            --clump-log10-p1 {params.clump_logp1} --clump-log10-p2 {params.clump_logp2} \
+            --clump-r2 {params.clump_r2} --clump-kb {params.clump_kb} \
+            --clump-snp-field ID --out {params.ofile} \
+            --memory {resources.mem_mb} \
+            --keep {params.sampfile}
+        else
+            touch {output[0]}
+            touch {output[1]}
+        fi
+        """
+
+
+rule enlarge_and_merge:
+    message:
+        "Merge overlapping clumps after enlarging to at least 1Mb"
+    input:
+        rules.clumping.output[0],
+    output:
+        ws_path("fm/{seqid}/ld/chr{chrom}.ld"),
+    params:
+        plinkfile=get_pfile_from_chrom,
+        totsize=config["clumping"]["totsize"],
+    conda:
+        "../envs/plink-pandas.yml"
+    resources:
+        runtime=lambda wc, attempt: attempt * 100,
+    script:
+        "../scripts/run_susieR.py"
+
+
+rule run_susieR:
+    input:
+        ldfile=rules.enlarge_and_merge.output,
+        smstat=ws_path("fm/{seqid}/tmp/chr{chrom}_sumstat.csv"),
+        phenofile=rules.cut_pheno.output,
+    output:
+        cs_smstat=ws_path("fm/{seqid}/cs/cs_chr{chrom}.cssmstat"),
+        cs_report=ws_path("fm/{seqid}/cs/cs_chr{chrom}.csreport"),
+        cs_rds=ws_path("fm/{seqid}/cs/cs_chr{chrom}_fit.rds"),
+    params:
+        use_ld=config["susieR"]["use_ld"],
+        chris_id=config["susieR"]["chris_id"],
+        iter=config["susieR"]["iter"],
+        min_abs_corr=config["susieR"]["min_abs_corr"],
+    resources:
+        runtime=lambda wc, attempt: attempt * 100,
+    conda:
+        "../envs/susier.yml"
+    script:
+        "../scripts/finemapping.R"
+
+
+rule collect_by_pheno:
+    input:
+        expand(
+            ws_path("fm/{seqid}/cs/cs_chr{chrom}.cssmstat"),
+            chrom=lookup(query="pheno == '{seqid}'", within=run_list, cols="chrom"),
+            seqid=analytes.seqid,
+        ),
+    output:
+        ws_path("fm/{seqid}/summary.cs"),
+    resources:
+        runtime=lambda wc, attempt: attempt * 60,
+    params:
+        bypheno=False,
+    script:
+        "../scripts/aggregate_by_pheno.py"
+
+
+# rule annotate_by_pheno:
+#     input:
+#         rules.collect_by_pheno.output
+#     output:
+#         ws_path("fm/{seqid}/summary_annot.cs")
+#     params:
+#         tophitsdir = config["sumstat"]["tophits_dir"]
+#     resources:
+#         mem_mb = 8000
+#     script:
+#         "../scripts/annotate_cs.py"
+
+
+rule collect_all:
+    input:
+        branch(
+            config["sumstat"]["annotate"] == True,
+            then=expand(ws_path("fm/{seqid}/summary_annot.cs"), seqid=analytes.seqid),
+            otherwise=expand(ws_path("fm/{seqid}/summary.cs"), seqid=analytes.seqid),
+        ),
+    output:
+        ws_path("all_phenos_summary.cs"),
+    resources:
+        runtime=lambda wc, attempt: attempt * 60,
+    params:
+        bypheno=True,
+    conda:
+        "../envs/plink-pandas.yml"
+    script:
+        "../scripts/aggregate_by_pheno.py"
diff --git a/workflow/scripts/aggregate_by_pheno.py b/workflow/scripts/aggregate_by_pheno.py
new file mode 100644
index 0000000..e6f84da
--- /dev/null
+++ b/workflow/scripts/aggregate_by_pheno.py
@@ -0,0 +1,91 @@
+import pandas as pd
+import os
+from pathlib import Path
+
+
+def main(summary_files, outfile, bypheno=True):
+    # Open output file
+    fo = open(outfile, "w")
+
+    # Track whether this is the first write to the output file
+    firstw = True
+    wmode = "w"
+
+    # Cycle over input files
+    for i, ff in enumerate(summary_files):
+        # Extract phenotype from path
+        absp = Path(ff)
+        pheno = absp.parts[-2]
+        # Handle empty files via the dedicated pandas exception
+        try:
+            # Read input file
+            print(f"Computing phenotype: {pheno}")
+            try:
+                sdf = pd.read_csv(ff, sep="\t")
+            except pd.errors.EmptyDataError:
+                sdf = pd.DataFrame()
+
+            # Handle an empty dataframe
+            if sdf.shape[0] > 0:
+                if bypheno:
+                    tmpdf = summarize_df(sdf)
+                    tmpdf["pheno"] = pheno
+                    tmpdf["sumfile"] = ff
+                else:
+                    tmpdf = sdf
+
+                # Write results to output file
+                tmpdf.to_csv(fo, sep="\t", index=False, header=firstw,
+                             mode=wmode)
+
+                # After the first write, disable the header and append
+                firstw = False
+                wmode = "a"
+        except pd.errors.EmptyDataError:
+            continue
+
+    # Close file
+    fo.close()
+
+
+def summarize_df(df, cols=[]):
+    """Build the summary report for the credible sets produced by the
+    fine-mapping R script.
+
+    It extracts the variant posterior probabilities and keeps the best
+    hit for each credible set.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        contains the summary statistics filtered to the credible sets
+        computed with the susieR algorithm
+    cols : list
+        a list of column names (a subset of the columns in the original
+        sumstat) to report in the output
+
+    Returns
+    -------
+    dfres : pandas.DataFrame
+        a summary dataframe, one line for each credible set
+    """
+    gg = df.groupby(["CHROM", "cs"])
+    idx = gg.agg({"variable_prob": "idxmax", "N": "size"})
+    tmp = df.loc[idx["variable_prob"]]
+    idx = idx.set_index("variable_prob")
+    tmp["N_VARIANTS"] = idx["N"]
+    if cols:
+        try:
+            dfres = tmp[cols]
+        except KeyError:
+            print("Some columns are not in the dataframe")
+            dfres = tmp
+    else:
+        dfres = tmp
+    return dfres
+
+
+if __name__ == "__main__":
+
+    main(snakemake.input, snakemake.output[0], snakemake.params.bypheno)
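A toy run of summarize_df, assuming the columns the script relies on (CHROM,
cs, variable_prob, N): it keeps the row with the highest posterior
probability per (CHROM, cs) pair and records the credible set size:

    import pandas as pd
    # assumes summarize_df from workflow/scripts/aggregate_by_pheno.py is in scope

    df = pd.DataFrame({
        "CHROM": [4, 4, 4],
        "cs": [1, 1, 2],
        "ID": ["4:100", "4:200", "4:300"],
        "variable_prob": [0.2, 0.7, 0.9],
        "N": [3200, 3200, 3200],
    })
    out = summarize_df(df)
    # -> one row per credible set: 4:200 (cs 1, N_VARIANTS=2)
    #    and 4:300 (cs 2, N_VARIANTS=1)
    print(out[["CHROM", "cs", "ID", "variable_prob", "N_VARIANTS"]])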
diff --git a/workflow/scripts/annotate_cs.py b/workflow/scripts/annotate_cs.py
new file mode 100644
index 0000000..7e43e6c
--- /dev/null
+++ b/workflow/scripts/annotate_cs.py
@@ -0,0 +1,32 @@
+import pandas as pd
+import os
+
+
+def main(pheno, tophits_dir, cs_file):
+
+    try:
+        cs = pd.read_csv(cs_file, header=0, delimiter="\t")
+    except pd.errors.EmptyDataError:
+        return pd.DataFrame()
+
+    tpfile = os.path.join(tophits_dir, f"{pheno}.regenie.filtered.gz")
+    tpdf = pd.read_csv(tpfile, header=0, delimiter="\t",
+                       usecols=["ID", "GENE_NAME"])
+    cs.set_index("ID", inplace=True)
+    tpdf.set_index("ID", inplace=True)
+
+    cs_annot = cs.merge(tpdf, how="left", on="ID")
+
+    return cs_annot
+
+
+if __name__ == "__main__":
+
+    cs_annot = main(pheno=snakemake.wildcards.seqid,
+                    tophits_dir=snakemake.params.tophitsdir,
+                    cs_file=snakemake.input[0])
+    if cs_annot.shape[0] > 0:
+        cs_annot.to_csv(snakemake.output[0], index_label="ID", sep="\t")
+    else:
+        f = open(snakemake.output[0], "w")
+        f.close()
diff --git a/workflow/scripts/create_sample_table.py b/workflow/scripts/create_sample_table.py
new file mode 100644
index 0000000..dc49af5
--- /dev/null
+++ b/workflow/scripts/create_sample_table.py
@@ -0,0 +1,22 @@
+from pathlib import Path
+
+import pandas as pd
+
+data = []
+
+with open('../config/path_list.txt', 'r') as fp:
+    lines = fp.readlines()
+
+for line in lines:
+    p = Path(line.strip())
+    data.append((p.stem, str(p)))
+
+
+samples = (
+    pd.DataFrame.from_records(data, columns=['sample_name', 'sample_path'])
+    .set_index("sample_name", drop=False)
+    .sort_index()
+)
+
+
+# Writing to file
+# with open("myfile.txt", "w") as fp:
+#     fp.writelines(samples)
diff --git a/workflow/scripts/finemapping.R b/workflow/scripts/finemapping.R
new file mode 100644
index 0000000..3281035
--- /dev/null
+++ b/workflow/scripts/finemapping.R
@@ -0,0 +1,234 @@
+#!/shared/bioinf/R/bin/Rscript-4.3-BioC3.17
+
+#---- Libraries ----
+library(dplyr)
+library(stringr)
+library(glue)
+library(susieR, lib.loc = "/home/mfilosi/R/rocker-rstudio/4.3")
+library(Matrix)
+library(rlog)
+# library(Rfast, lib.loc = "/home/mfilosi/R/rocker-rstudio/4.3")
+library(data.table)
+
+#---- Functions ----
+# Not needed any more
+create_ldmat <- function(x, snplist){
+  #---- Create LD matrix for all SNPs ----
+  #---- Check if it's plink1.9 or plink2 input ----
+  nn <- names(x)
+  if (any(grepl("ID_", nn))){
+    colix <- grep("ID_", nn)
+  } else if (any(grepl("SNP_", nn))){
+    colix <- grep("SNP_", nn)
+  } else {
+    stop("Not a conformable ld file")
+  }
+
+  # Get the column holding the R value
+  rcol <- grep("^(UNPHASED_R|R)$", nn, perl=TRUE)
+
+  # snps <- unique(c(x$ID_A, x$ID_B))
+  snps <- unique(unlist(x[, colix]))
+  snps <- snps[!is.na(match(snps, snplist))]
+  nsnps <- length(snps)
+  snps <- data.frame(ID=snps) %>% mutate(i=1:n())
+
+  j1 <- j2 <- c("ID")
+  names(j1) <- nn[colix[1]]
+  names(j2) <- nn[colix[2]]
+
+  ldmat2 <- x %>%
+    left_join(snps, by=j1) %>%
+    left_join(snps, by=j2)
+
+  cormat <- matrix(0, nrow=nsnps, ncol=nsnps)
+  for (i in 1:nrow(ldmat2)){
+    ix <- ldmat2[i, "i.x"]
+    iy <- ldmat2[i, "i.y"]
+    cormat[ix, iy] <- cormat[iy, ix] <- ldmat2[i, rcol]
+  }
+  diag(cormat) <- 1
+
+  dimnames(cormat) <- list(snps$ID, snps$ID)
+  # attr(cormat, "d") <- rep(nsnps, 2)
+
+  return(cormat)
+}
+
+err_handling <- function(e){
+  rlog::log_error(glue("{e}"))
+  rlog::log_warn(glue("Creating empty output for clump: {clid}, chrom: {mychrom}"))
+  # Return an empty list and set class to "susie"
+  aa <- list()
+  aa$sets <- list()
+  class(aa) <- "susie"
+  return(aa)
+}
+
+#---- Setup ----
+# cwd <- snakemake@params["cwd"]
+ld_file <- snakemake@input[["ldfile"]]
+sumstat_file <- snakemake@input[["smstat"]]
+pheno_file <- snakemake@input[["phenofile"]]
+
+# Load parameters for data handling
+compute_ld <- snakemake@params[["use_ld"]]
+CHRIS <- snakemake@params[["chris_id"]]
+
+# Load parameters for the susieR model
+susie_min_abs_cor <- snakemake@params[["min_abs_corr"]]
+susie_iter <- snakemake@params[["iter"]]
+
+cs_smstat_file <- snakemake@output[["cs_smstat"]]
+cs_report_file <- snakemake@output[["cs_report"]]
+cs_rds_file <- snakemake@output[["cs_rds"]]
+# cs_log <- snakemake@output[["cs_log"]]  # no such output is defined in the rule
+
+# RUN ONLY LOCALLY
+# proj_path <- "/scratch/mfilosi/test_gwas_regenie/metaboGWAS"
+# ld_file <- glue(proj_path, "/ld/meta_6/chr4.ld")
+# geno_file <- glue(proj_path, "/ld/meta_6/chr4.ld_0.raw")
+# snp_file <- glue(proj_path, "/ld/meta_6/chr4.ld_0.snplist")
+# clump_file <- glue(proj_path, "/clumping/meta_6/chr4.clumps")
+# finemap_file <- glue(proj_path, "/results/meta_6.finemapped")
+# sumstat_file <- glue(proj_path, "/tmp-smstat/meta_6/chr4_sumstat.csv")
+# pheno_file <- glue(proj_path, "/tmp-smstat/meta_6/phenotype.csv")
+
+rlog::log_info(glue("Running finemapping on file: {ld_file}"))
+
+ld_file_size <- file.size(ld_file)
+if (ld_file_size > 0) {
+  rlog::log_debug("Start preprocessing for susieR")
+  geno_info <- read.table(ld_file, header = TRUE, sep = "\t")
+
+  #---- Read phenotype file ----
+  rlog::log_debug(glue("Reading phenotype file: {pheno_file}"))
+  pheno <- read.table(pheno_file, header = TRUE, sep = "\t")
+  y <- pheno["Y"]
+  sid <- pheno["IID"]
+  if (CHRIS) {
+    rlog::log_debug("Padding IDs...")
+    sid <- sid %>%
+      mutate(IID = str_pad(IID, side = "left", width = 10, pad = 0))
+  }
+
+  # Remove NA from sample list
+  nonasamp <- which(!is.na(y))
+  sid <- sid[nonasamp, ]
+  y <- y[nonasamp, ]
+
+  if (length(nonasamp) == 0){
+    rlog::log_error("No samples left for analysis...")
+    rlog::log_error("Please check the overlap between the sample IDs in the
+                    genotype files and in the phenotype file.")
+    quit(save="no", status=1)
+  } else {
+    #---- Read summary stat ----
+    smstat <- fread(sumstat_file, header = TRUE, sep = "\t")
+
+    #---- Prepare the result lists ----
+    cs_smstat_l <- list()
+    cs_report_l <- list()
+    cs_rssfit_l <- list()
+
+    for (i in 1:nrow(geno_info)){
+      #---- Get clump id and chromosome ----
+      clid <- geno_info[i, "CLUMPID"]
+      mychrom <- geno_info[i, "CHROM"]
+
+      #---- Logging ----
+      rlog::log_info(glue("Processing clump id {clid} on chrom: {mychrom}"))
+
+      #---- Read clump genotype file ----
+      geno_file <- geno_info[i, "GENOFILE"]
+      # geno_file <- file.path(proj_path, geno_file)
+      genos <- fread(geno_file)
+      snps <- names(genos)[7:ncol(genos)]
+      snps <- gsub("_.+", "", snps)
+      rlog::log_info(glue("Analyzing: {length(snps)} snps!"))
+
+      #---- Subset summary stat ----
+      smstattmp <- smstat[ID %in% snps, ]
+      ix <- match(snps, smstattmp$ID)
+      smstattmp <- smstattmp[ix, ]
+
+      #---- Match phenotype with genotypes ----
+      gsix <- match(genos$IID, sid)
+      genos_nona <- genos[which(!is.na(gsix))]
+      ytmp <- y[gsix[!is.na(gsix)]]
+      sidtmp <- sid[gsix[!is.na(gsix)]]
+      X <- as.matrix(genos_nona[, 7:ncol(genos_nona)]) * 1.0
+
+      rlog::log_debug("Compute susieR model...")
+      if (compute_ld){
+        betas <- smstattmp$BETA
+        sebetas <- smstattmp$SE
+        n <- min(smstattmp$N, na.rm=TRUE)
+
+        rlog::log_debug("Compute correlation matrix...")
+        R <- cor(X)
+        rlog::log_debug("Run susie model on LD.")
+        rss_ress <- tryCatch(
+          susie_rss(bhat = betas, shat = sebetas, n = n, R = R,
+                    max_iter = susie_iter, min_abs_corr = susie_min_abs_cor),
+          error = err_handling
+        )
+      } else {
+        rlog::log_debug("Run susie model on genotype.")
+        rss_ress <- tryCatch(
+          susie(X, y = ytmp, max_iter = susie_iter,
+                min_abs_corr = susie_min_abs_cor),
+          error = err_handling
+        )
+      }
+
+      cs <- rss_ress$sets
+      if (length(cs$cs) > 0) {
+        rlog::log_info("Found some credible sets!")
+        vars <- summary(rss_ress)
+        allsnps <- vars$vars
+        allsnps <- allsnps %>% mutate(ID = snps[variable])
+
+        smstatcs <- smstattmp %>%
+          left_join(allsnps, by = "ID") %>%
+          filter(cs > 0) %>%
+          mutate(csid = paste(CHROM, cs, sep = "_"))
+        csrep <- vars$cs %>%
+          mutate(CLUMPID = clid,
+                 CHROM = mychrom,
+                 csid = paste(CHROM, cs, sep = "_"))
+
+        cs_smstat_l[[i]] <- data.table(smstatcs)
+        cs_report_l[[i]] <- data.table(csrep)
+        cs_rssfit_l[[i]] <- rss_ress
+      } else {
+        rlog::log_info("No credible set found!")
+      }
+    }
+
+    cs_smstat <- data.table::rbindlist(cs_smstat_l)
+    cs_report <- data.table::rbindlist(cs_report_l)
+
+    #---- Saving output ----
+    rlog::log_info("Saving output...")
+    write.table(cs_report, file = cs_report_file, sep = "\t",
+                col.names = TRUE, row.names = FALSE, quote = FALSE)
+    write.table(cs_smstat, file = cs_smstat_file, sep = "\t",
+                col.names = TRUE, row.names = FALSE, quote = FALSE)
+    saveRDS(cs_rssfit_l, file = cs_rds_file)
+  }
+} else {
+  rlog::log_debug("Touch output files...")
+  cat(file = cs_report_file)
+  cat(file = cs_smstat_file)
+  cat(file = cs_rds_file)
+}
+
+rlog::log_info("---------- Done!!! ----------")
+
+# susie_plot(rss_res, y = "PIP")
+# susie_plot(zscores, y = "z")
+#
+# lambda <- estimate_s_rss(zscores, R = ldmat, n=n)
diff --git a/workflow/scripts/run_susieR.py b/workflow/scripts/run_susieR.py
new file mode 100644
index 0000000..d0ac57c
--- /dev/null
+++ b/workflow/scripts/run_susieR.py
@@ -0,0 +1,473 @@
+import os
+import csv
+import pandas as pd
+import numpy as np
+import subprocess as sb
+from tempfile import NamedTemporaryFile
+
+
+class Clump(object):
+    def __init__(self, lead="", proxies=[], begin=0, end=0, **kwargs):
+        self.lead = lead
+        self.proxies = proxies
+
+        if begin > end:
+            self.begin = end
+            self.end = begin
+            self.strand = "-"
+        else:
+            self.begin = begin
+            self.end = end
+            self.strand = "+"
+
+        self._span = self.end - self.begin
+
+        try:
+            pval = kwargs["pvalue"]
+        except KeyError:
+            pval = None
+        self.pvalue = pval
+
+    @classmethod
+    def from_clumpfileline(cls, row):
+        lead = row[2]
+        proxies = get_snps_fromclump(row)
+        try:
+            pvalue = float(row[3])
+        except ValueError:
+            pvalue = 1
+
+        return cls(lead=lead, proxies=proxies, pvalue=pvalue)
+
+    @property
+    def pvalue(self):
+        return self._pvalue
+
+    @property
+    def lpvalue(self):
+        return -np.log10(self._pvalue)
+
+    @pvalue.setter
+    def pvalue(self, new):
+        try:
+            if new is None or 0 <= new <= 1:
+                self._pvalue = new
+            else:
+                raise ValueError
+        except (TypeError, ValueError):
+            print(f"Incorrect range for pvalue {new}")
+            self._pvalue = None
+
+    @property
+    def span(self):
+        return self._span
+
+    def enlarge_clump(self, totsize=1000000):
+        if self.span == 0:
+            cllimits = self.get_span_from_clump()
+            clspan = cllimits[1] - cllimits[0]
+        else:
+            cllimits = self.get_interval()
+            clspan = self.span
+
+        if clspan < totsize:
+            newlimits = widen_span(cllimits, tot=totsize)
+            # newspan = np.array(cllimits)
+            # self.span = newspan
+        else:
+            newlimits = cllimits
+        self.set_interval(newlimits)
+
+        return self
+
+    def get_span_from_clump(self):
+        try:
+            pos = [ll.split(":")[1] for ll in self.proxies + [self.lead]
+                   if ll != "."]
+            pos = [int(p) for p in pos]
+            # print(pos)
+            # pos.append(self.begin)
+            # print(pos)
+        except IndexError:
+            # The list of proxies is empty
+            print("No proxies provided...")
+            pos = [0, 0]
+        except ValueError:
+            # The IDs are not formatted as expected (chr:pos)
+            print("Cannot retrieve position from proxies")
+            pos = [0, 0]
+
+        return [min(pos), max(pos)]
+
+    def to_list(self):
+        return [self.lead] + self.proxies
+
+    def get_interval(self):
+        return [self.begin, self.end]
+
+    def set_interval(self, new):
+        if isinstance(new, list):
+            tmp = np.array(new)
+            tmp.sort()
+            try:
+                tmp = tmp.astype("int64")
+            except ValueError as e:
+                print("Please provide a list of numerical values to set "
+                      f"the interval:\n {e}")
+
+        if isinstance(new, np.ndarray):
+            tmp = new.copy()
+            tmp.sort()
+
+        if isinstance(new, dict):
+            tmp = np.zeros(2, dtype="int64")
+            try:
+                tmp[0] = np.int64(new['begin'])
+                tmp[1] = np.int64(new['end'])
+            except KeyError:
+                print("Please provide a correctly formed dictionary with "
+                      "keys 'begin' and 'end'")
+            except ValueError:
+                print("Some values in the dictionary are not numeric")
+
+        self._span = tmp[-1] - tmp[0]
+        self.begin = tmp[0]
+        self.end = tmp[-1]
+
+    def __add__(self, other):
+        if isinstance(other, Clump):
+            px = self.proxies + other.proxies
+            ix = np.argsort(np.array([self.pvalue, other.pvalue]))
+
+            if ix[0] == 0:
+                ll = self.lead
+                px += [other.lead]
+                pval = self.pvalue
+            else:
+                ll = other.lead
+                px += [self.lead]
+                pval = other.pvalue
+
+            # Initialize new clump
+            res = Clump(lead=ll, proxies=px, pvalue=pval)
+
+            # Update interval
+            spanarra = np.array([self.get_interval(), other.get_interval()])
+            st = spanarra.min(axis=0)[0]
+            en = spanarra.max(axis=0)[-1]
+            res.set_interval([st, en])
+
+            return res
+        else:
+            raise NotImplementedError("Cannot add Clump class to "
+                                      f"type {other.__class__}")
+
+
+def get_snps_fromclump(line):
+    if line[-1] == 'NONE':
+        snps = []
+    else:
+        snps = line[-1].replace('(1)', '')
+        snps = snps.split(',')
+
+    return snps
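A hypothetical end-to-end use of the Clump class above (positions in bp are
made up): clumps are first widened to the configured minimum size, and
overlapping clumps are merged with `+`, which keeps the lead and p-value of
the stronger clump:

    # assumes Clump from workflow/scripts/run_susieR.py is in scope
    c1 = Clump(lead="1:500000", proxies=["1:490000", "1:510000"],
               begin=490000, end=510000, pvalue=1e-12)
    c2 = Clump(lead="1:520000", proxies=["1:515000"],
               begin=515000, end=530000, pvalue=1e-9)

    c1.enlarge_clump(totsize=1000000)  # widen to at least 1 Mb
    merged = c1 + c2                   # lead/p-value from the stronger clump
    print(merged.lead, merged.get_interval())  # 1:500000 [0, 1000000]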
+
+
+def read_clump_file(file, enlarge=False, merge=False, **kwargs):
+    """
+    Parameters
+    ----------
+    file : str, FileBuffer
+        read a clump file with extension `.clumps` resulting from
+        plink clumping, merged over chromosomes
+    enlarge : bool
+        if set to True, try to enlarge the clumping regions to a minimum
+        size of `totsize`. `totsize` can be specified through `kwargs`
+    merge : bool
+        if set to True, search the clump list for overlapping regions and
+        merge them if any are found
+
+    kwargs : additional arguments can be passed to the function. For now only
+        `totsize`, a numerical value specifying the minimum span of a clumping
+        region in bp (i.e. 1Mb: 1e6, 1kb: 1000). Default = 1Mb (1e6).
+
+    Returns
+    -------
+    list of Clump instances
+        a list of Clump() instances containing the clumps
+        reported by the plink software
+    """
+    mylist = []
+
+    # Empty files simply yield an empty list
+    with open(file, mode='r') as fc:
+        fcr = csv.reader(fc, delimiter='\t', skipinitialspace=True)
+        next(fcr, None)
+
+        for ll in fcr:
+            if len(ll) > 0:
+                # snps = get_snps_fromclump(ll)
+                # cc = Clump(
+                #     lead=ll[2],
+                #     proxies=snps,
+                #     pvalue=float(ll[3])
+                # )
+                cc = Clump.from_clumpfileline(ll)
+                mylist.append(cc)
+
+    # Enlarge clumps to a minimum size of `totsize`
+    # TODO: check that totsize is > 0... and other checks on totsize
+    if enlarge:
+        try:
+            totsize = kwargs['totsize']
+        except KeyError:
+            totsize = 1e6
+        # mylist = [enlarge_clump(cl, totsize=totsize) for cl in mylist]
+        mylist = [cl.enlarge_clump(totsize=totsize) for cl in mylist]
+
+    # Merge overlapping clumps
+    if merge:
+        mylist = sort_and_merge(mylist)
+
+    return mylist
+
+
+def compute_ld(snps, clid, plinkfile, dryrun=False, prefix="ld_clump",
+               **kwargs):
+    # Handle kwargs
+    try:
+        mem = kwargs["memory"]
+        mem = int(mem)
+    except KeyError:
+        mem = 8192
+
+    # plinkfile = os.path.join(genopath, f"chr{mychr}.forgwas.nofid")
+
+    # Define file names
+    ofile = f'{prefix}_{clid}'
+    genofile = ofile + ".raw"
+    snpfile = ofile + ".snplist"
+
+    # Write snp list
+    ff = open(snpfile, "w")
+    # with NamedTemporaryFile(mode='w') as ftmp:
+    for s in snps:
+        ff.write(s)
+        ff.write("\n")
+        # ftmp.write(s)
+        # ftmp.write("\n")
+    ff.close()
+
+    # DO NOT COMPUTE the LD matrix with plink: sometimes it is not positive
+    # semi-definite and can contain negative eigenvalues.
+    # Instead, extract the genotype matrix and compute the correlation in
+    # python/R.
+    # See issue #91: https://github.com/stephenslab/susieR/issues/91
+    # ldcommand = ['plink', '--bfile', plinkfile, '--keep-allele-order',
+    #              # '--ld-window-r2', '0.1',
+    #              '--r', 'square', '--extract',  # '--ld-snp-list'
+    #              snpfile, '--out', ofile, '--memory', str(mem)]
+    ldcommand = ['plink', '--bfile', plinkfile, '--keep-allele-order',
+                 '--extract', snpfile,
+                 '--recode', 'A', '--out', ofile, '--memory', str(mem)]
+    print(" ".join(ldcommand))
+
+    if dryrun:
+        print('---- This is a dry run ----')
+        print('---- Running the following command... ----')
+        print(' '.join(ldcommand))
+    else:
+        if len(snps) > 1:
+            myproc = sb.run(ldcommand)
+
+            if myproc.returncode > 0:
+                print("Oops, something went wrong with this process")
+
+    return genofile, snpfile
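compute_ld deliberately avoids asking plink for an r matrix (see the susieR
issue linked above) and instead dumps additively coded genotypes with
--recode A; the correlation matrix is computed downstream in python/R. A dry
run shows the command without executing plink (file names and SNP IDs here
are made up):

    # assumes compute_ld from workflow/scripts/run_susieR.py is in scope
    genofile, snpfile = compute_ld(["1:490000", "1:500000", "1:510000"],
                                   clid=0, plinkfile="data/geno/chr1",
                                   dryrun=True, prefix="example_clump")
    print(genofile)  # example_clump_0.raw
    print(snpfile)   # example_clump_0.snplist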
+
+
+def process_ld_file(ldfile):
+    # Read the ld file and adjust column names
+    # df = pd.read_csv(ldfile, sep="\t", header=0)
+    print(ldfile)
+    df = pd.read_csv(ldfile, sep=r"\s+", header=0)
+    cc = df.columns
+    df.columns = [c.replace("#", "") for c in cc]
+
+    return df
+
+
+def widen_span(span, tot):
+    span2 = np.array(span)
+    span2.sort()
+    spandist = np.diff(span2)[0]
+    totdiff = tot - spandist
+
+    if totdiff > 0:
+        span2[0] -= totdiff / 2.0
+        span2[1] += totdiff / 2.0
+
+    return span2
+
+
+def intersect(cl1, cl2):
+    cl1.sort()
+    cl2.sort()
+
+    # Handle the possible cases
+    if (cl1[0] > cl2[1]) or (cl1[1] < cl2[0]):
+        # cl1 and cl2 do not overlap
+        # |-------|            cl1
+        #            |-------| cl2
+        ck = False
+    else:
+        # Any other configuration overlaps: cl1 ends within cl2, cl1
+        # starts within cl2, or one interval fully contains the other.
+        # |-------|      cl1
+        #     |-------|  cl2
+        ck = True
+
+    return ck
+
+
+def main_tmp(clumpfile, gendata, outfile="", **kwargs):
+    # NOTE: legacy draft kept for reference. `read_cl_file` is not defined
+    # in this module, and the compute_ld/process_ld_file calls below do not
+    # match the current signatures.
+    # Get info on genetic data
+    # gg = GenDataList.from_json("../../metaboGWAS/genetic_data.json")
+    # gg = GenDataList.from_json(gendata)
+    # ggg = gg.gendata["HRC13K"]
+
+    genopath = {os.path.dirname(f) for f in gendata.get_plinkfiles()}
+    genopath = genopath.pop()
+
+    # Read clump file
+    cldf = read_cl_file(clumpfile)
+    clumplist = read_clump_file(clumpfile)
+    dflist = []
+
+    # Run LD computation for each clumping region
+    for clump in cldf.iterrows():
+        ldfile = compute_ld(clump, genopath=genopath,
+                            prefix=outfile, **kwargs)
+        # Get list of betas and sd for betas
+        try:
+            dflist.append(process_ld_file(ldfile, clump[0]))
+        except FileNotFoundError:
+            print(f"Cannot find file {ldfile}")
+    dfall = pd.concat(dflist, ignore_index=True)
+
+    print(f"Export csv file to {outfile}")
+    dfall.to_csv(outfile, sep="\t", index=False, header=True)
+
+
+def sort_and_merge(clumplist):
+    nclumps = len(clumplist)
+    if nclumps == 0:
+        return []
+
+    res = np.zeros(nclumps) * np.nan
+    clumplist.sort(key=lambda x: x.begin)
+    newclumplist = []
+    clustid = 0
+    res[0] = clustid
+    clnew = clumplist[0]
+
+    for i in range(1, nclumps):
+        if clumplist[i].begin < clumplist[i - 1].end:
+            # Found an overlap, so merge the clumps
+            res[i] = clustid
+            clnew += clumplist[i]
+        else:
+            # Store the merged clump in the results
+            newclumplist.append(clnew)
+
+            # Point to the new cluster
+            clnew = clumplist[i]
+
+            # Update the cluster id
+            clustid += 1
+            res[i] = clustid
+
+    # Add the last merge of the cycle
+    newclumplist.append(clnew)
+
+    return newclumplist
+
+
+def main(clumpfile, plinkfile, chr, outfile="", **kwargs):
+    try:
+        totsize = int(kwargs["totsize"])
+    except KeyError:
+        totsize = 1e6
+    except ValueError as e:
+        print("Invalid totsize value. Set it to default: 1e6", e)
+        totsize = 1e6
+
+    try:
+        # Get clumping file size
+        fs = os.path.getsize(clumpfile)
+    except OSError:
+        # If the file does not exist, set the size to 0
+        fs = 0
+
+    # Prepare the file for output
+    fout = open(outfile, "w")
+    # If file size is not 0...
+    if fs > 0:
+        clumplist = read_clump_file(clumpfile, enlarge=True, merge=True,
+                                    totsize=totsize)
+        fw = csv.writer(fout, delimiter="\t")
+
+        # Output header
+        myheader = ["CHROM", "CLUMPID", "GENOFILE", "SNPLIST"]
+        firstw = True
+        for i, cl in enumerate(clumplist):
+            snps = cl.to_list()
+            genofile, snpfile = compute_ld(snps, clid=i,
+                                           plinkfile=plinkfile,
+                                           prefix=outfile, **kwargs)
+
+            # Check that the written file exists and is not empty
+            try:
+                if os.path.getsize(genofile) > 0:
+                    # Write the header on the first write
+                    if firstw:
+                        fw.writerow(myheader)
+                        firstw = False
+                    fw.writerow([chr, i, genofile, snpfile])
+            except FileNotFoundError:
+                pass
+
+    fout.close()
+
+    return outfile
+
+
+if __name__ == "__main__":
+    clumplist = main(clumpfile=snakemake.input[0],
+                     plinkfile=snakemake.params.plinkfile,
+                     outfile=snakemake.output[0],
+                     chr=snakemake.wildcards.chrom,
+                     memory=snakemake.resources.mem_mb,
+                     totsize=snakemake.params.totsize)
+    # import argparse
+    # parser = argparse.ArgumentParser()
+
+    # parser.add_argument("--clumpfile", default="", required=True)
+    # parser.add_argument("--outfile", default="")  # , required=True)
+    # parser.add_argument("--dryrun", action='store_true')
+    # parser.add_argument("--plinkfile", required=True)
+    # parser.add_argument("--memory", default=8192)
+    # parser.add_argument("--chr", type=int)
+
+    # args = parser.parse_args()
+
+    # clumplist = main(args.clumpfile, plinkfile=args.plinkfile,
+    #                  outfile=args.outfile, chr=args.chr)
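The per-chromosome .ld file written by main() is not an LD matrix but a small
tab-separated manifest (CHROM, CLUMPID, GENOFILE, SNPLIST) that
finemapping.R later reads as geno_info. A sketch of inspecting it (the path
is hypothetical):

    import pandas as pd

    geno_info = pd.read_csv("fm/seq.1234.5/ld/chr4.ld", sep="\t")
    for _, row in geno_info.iterrows():
        # each row points to the plink --recode A dump for one merged clump
        print(row["CLUMPID"], row["GENOFILE"], row["SNPLIST"])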
diff --git a/workflow/scripts/separate_gwas.py b/workflow/scripts/separate_gwas.py
new file mode 100644
index 0000000..9202fa8
--- /dev/null
+++ b/workflow/scripts/separate_gwas.py
@@ -0,0 +1,76 @@
+import pandas as pd
+import argparse
+import os
+import subprocess as sb
+import gzip
+from tempfile import NamedTemporaryFile
+
+
+def extract_chr_from_smstat(inputfile, chrom=None):
+    # Extract the header from the sumstat
+    myhead = pd.read_csv(inputfile, sep='\t', header=None, nrows=1)
+    myhead = myhead.iloc[0, :].tolist()
+    with NamedTemporaryFile(mode="w+b") as tmpfile:
+        p1 = sb.run(["tabix", f"{inputfile}", f"{chrom}"], stdout=tmpfile)
+        # p2 = sb.run(["gzip", "-c"], stdin=p1.stdout, stdout=tmpfile)
+        try:
+            p1.check_returncode()
+            df = pd.read_csv(tmpfile.name, sep="\t", header=None)
+            df.columns = myhead
+        except sb.CalledProcessError:
+            df = pd.DataFrame()
+    return df
+
+
+def main(inputfile, outfile, pval_thr=5e-8, pvalcol="LOG10P", pheno="",
+         phenofile="", chrom=None):
+
+    # Columns to write for the sumstat
+    cols_to_write = ["CHROM", "GENPOS", "ID", pvalcol, "BETA", "SE", "N",
+                     "CHISQ", "ALLELE0", "ALLELE1"]
+
+    print(f"Read gwas: {inputfile}")
+    mygwas = extract_chr_from_smstat(inputfile, chrom=chrom)
+
+    # Nothing extracted for this chromosome: touch the output and stop
+    if mygwas.shape[0] == 0:
+        open(outfile, "w").close()
+        return
+
+    # Compute p-values from LOGP
+    print("Compute pvalues")
+    mygwas['P'] = 10 ** -mygwas[pvalcol]
+    ix = mygwas['P'] < pval_thr
+    mygwas_filt = mygwas[ix]
+
+    if mygwas_filt.shape[0] > 0:
+        # At least one variant passes the threshold: write the whole
+        # chromosome (excluding NA p-values), since clumping also needs
+        # the sub-threshold variants selected by p2
+        ix_nona = ~mygwas[pvalcol].isna()
+        mygwas.loc[ix_nona, cols_to_write].to_csv(outfile, sep='\t',
+                                                  index=False, header=True,
+                                                  na_rep="NA")
+    else:
+        # Touch file
+        f = open(outfile, "w")
+        f.close()
+
+
+if __name__ == '__main__':
+    main(inputfile=snakemake.input.sumstat,
+         outfile=snakemake.output[0],
+         pval_thr=snakemake.params.pval_thr,
+         pvalcol=snakemake.params.pval_col,
+         pheno=snakemake.wildcards.seqid,
+         chrom=snakemake.wildcards.chrom)
+
+# parser = argparse.ArgumentParser()
+# parser.add_argument('-i', '--infile', default=None, type=str)
+# parser.add_argument('-o', '--outdir', default=".")
+# parser.add_argument('--pcol', default="LOG10P")
+# parser.add_argument('--pthr', default=5e-8, type=float)
+# parser.add_argument('--pheno', default='')
+# args = parser.parse_args()
+
+# if not os.path.exists(args.outdir):
+#     os.makedirs(args.outdir)
+
+# main(inputfile=args.infile,
+#      outdir=args.outdir,
+#      pval_thr=args.pthr,
+#      pvalcol=args.pcol,
+#      pheno=args.pheno)
diff --git a/workflow/scripts/separate_pheno.py b/workflow/scripts/separate_pheno.py
new file mode 100644
index 0000000..87947ab
--- /dev/null
+++ b/workflow/scripts/separate_pheno.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+
+def main(phenofile, pheno, outfile):
+    print(outfile)
+    phenoval = pd.read_csv(phenofile, sep="\t", header=0,
+                           usecols=["FID", "IID", pheno])
+    phenoval.rename(columns={pheno: "Y"}, inplace=True)
+    phenoval.to_csv(outfile, index=False, sep="\t")
+
+
+if __name__ == "__main__":
+    main(phenofile=snakemake.input.phenofile,
+         pheno=snakemake.wildcards.seqid,
+         outfile=snakemake.output[0])
+    # import argparse
+
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument("--phenofile", required=True)
+    # parser.add_argument("--outfile", required=True)
+    # parser.add_argument("--pheno", required=True)
+
+    # # fake_args = ["--phenofile", snakemake.input.sumstat]
+    # args = parser.parse_args()
+
+    # main(phenofile=args.phenofile, pheno=args.pheno, outfile=args.outfile)
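A minimal local check of separate_pheno's main, assuming a tab-separated
phenotype file whose first two columns are FID and IID as the config
requires (file and phenotype names here are made up):

    import pandas as pd
    # assumes main from workflow/scripts/separate_pheno.py is in scope

    pd.DataFrame({
        "FID": [1, 2], "IID": [1, 2],
        "seq.1234.5": [0.1, -0.3],
    }).to_csv("pheno.tsv", sep="\t", index=False)

    main(phenofile="pheno.tsv", pheno="seq.1234.5", outfile="phenotype.csv")
    print(pd.read_csv("phenotype.csv", sep="\t"))  # columns: FID, IID, Y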