mskcc · anoronh4 · Sep 20, 2024 · Sep 23, 2024 · Sep 23, 2024 · Sep 25, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - [#118](https://github.com/mskcc/forte/pull/118) - change the way the plug-n-play starfusion reference is downloaded.
 
+- [#128](https://github.com/mskcc/forte/pull/128) - full support for GRCh38 added
+
 ### `Fixed`
 
 - [#119](https://github.com/mskcc/forte/pull/119) - change script error behavior in METAFUSION_RUN process

diff --git a/bin/final_generate_v75_gene_bed.R → bin/generate_gene_bed.R b/bin/final_generate_v75_gene_bed.R → bin/generate_gene_bed.R
@@ -3,7 +3,7 @@
 # __author__      = "Alexandria Dymun"
 # __email__       = "[email protected]"
 # __contributor__ = "Anne Marie Noronha ([email protected])"
-# __version__     = "0.0.1"
+# __version__     = "0.0.2"
 # __status__      = "Dev"
 
 
@@ -12,11 +12,12 @@ suppressPackageStartupMessages({
     library(dplyr)
     library(data.table)
     library(stringr)
+    options(scipen = 999)
 })
 
 usage <- function() {
     message("Usage:")
-    message("final_generate_v75_gene_bed.R <in.gff> <out.bed>")
+    message("generate_gene_bed.R <in.gff> <out.bed>")
 }
 
 args = commandArgs(TRUE)
@@ -26,15 +27,10 @@ if (length(args)!=2) {
     quit()
 }
 
-# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75
-# Add introns to gtf, convert to gff3
-# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\
-# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0  \\
-# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\
-# -o genes.INTRONS.gff3"
-
 gtf <- rtracklayer::import(args[1])
 gtf_df <- as.data.frame(gtf)
+# remove incomplete transcripts: drop every record whose tag contains "NF" (e.g. mRNA_start_NF, mRNA_end_NF)
+gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),]
 
 file.to_write <- args[2]
 
@@ -44,7 +40,8 @@ gtf_df <- gtf_df %>%
         chr = seqnames
     ) %>%
     select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>%
-    filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% mutate(start = start-1)
+    filter(type %in% c("exon","intron","UTR","CDS","cds","utr","five_prime_utr","three_prime_utr")) %>%
+    mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1)
 
 
 #START CLOCK
@@ -106,6 +103,8 @@ modify_transcript <- function(transcript){
             transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5"
         }
     }
+    transcript$type[transcript$type == "five_prime_utr"] <- "utr5"
+    transcript$type[transcript$type == "three_prime_utr"] <- "utr3"
     #### Any exon that remains after the CDS change is likely an untranslated region; change it below
 
     # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)

diff --git a/bin/make_gene_info_for_forte.R b/bin/make_gene_info_for_forte.R
@@ -106,7 +106,7 @@ gene_info <- rbind(gene_info,add_these_excess_gene_ids)
 gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F)
 
 gene_info$Synonyms <- ifelse(is.na(gene_info$gene_id_with_version),gene_info$gene_id,paste0(gene_info$gene_id,"|",gene_info$gene_id_with_version))
-gene_info$Symbol <- gene_info$gene_name
+gene_info$Symbol <- ifelse(is.na(gene_info$gene_name), gene_info$gene_id, gene_info$gene_name)
 
 gene_info <- gene_info[,c("Symbol","Synonyms")]
 

diff --git a/conf/igenomes.config b/conf/igenomes.config
@@ -34,11 +34,21 @@ params {
             ensembl_version = 75
         }
         'GRCh38' {
-            fasta          = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa"
-            gtf            = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf"
-            refflat        = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes.gencode/refFlat.txt.gz"
-            starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz"
-            cdna           = "https://ftp.ensembl.org/pub/release-86/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz"
+            ensembl_version      = 111
+            fasta                = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa"
+            //fasta                = "https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
+            gtf                  = "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz"
+            //forte will generate refflat from gtf
+            refflat              = null
+            starfusion_url       = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz"
+            cdna                 = "https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz"
+            metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh38/blocklist_breakpoints.hg38.bedpe.gz"
+            baits {
+                'idt_v2' {
+                    targets = "/juno/work/ccs/cmopipeline/forte/GRCh38_probes/xgen-exome-hyb-panel-v2-targets-hg38.bed"
+                    baits   = "/juno/work/ccs/cmopipeline/forte/GRCh38_probes/xgen-exome-hyb-panel-v2-probes-hg38.bed"
+                }
+            }
         }
         'smallGRCh37' {
             fasta          = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta"
@@ -48,7 +58,6 @@ params {
             cdna           = "http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz"
             metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe"
             ensembl_version = 75
-
         }
 /*
         'hg38' {

diff --git a/conf/modules.config b/conf/modules.config
@@ -27,6 +27,14 @@ process {
         ]
     }
 
+    withName: '.*:PREPARE_REFERENCES:GUNZIP.*' {
+        storeDir = { "${params.reference_base}/${params.genome}/${task.process.tokenize(':')[-1].toLowerCase()}" }
+    }
+
+    withName: 'FASTAREMOVEPREFIX' {
+        storeDir = { "${params.reference_base}/${params.genome}/fasta" }
+    }
+
     withName: 'MSKCC_FORTE:FORTE:MULTIQC' {
         publishDir = [
             path: { "${report.folder}/report" },
@@ -210,6 +218,7 @@ process {
         ]
     }
     withName: 'AGAT_SPADDINTRONS' {
+        cpus = { 4 * task.attempt }
         storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" }
         publishDir = [
             enabled: false,
@@ -475,7 +484,7 @@ process {
         ]
     }
 
-    withName: ARRIBA {
+    withName: ARRIBA_ARRIBA {
         ext.args = {
             "-s ${meta.single_end || meta.strandedness == "forward" ? "yes" : meta.strandedness == "reverse" ? "reverse" : "no" }"
         }

diff --git a/modules.json b/modules.json
@@ -21,9 +21,9 @@
                         "git_sha": "6898156da3604a6bdf26c36036053a970050fea0",
                         "installed_by": ["modules"]
                     },
-                    "arriba": {
+                    "arriba/arriba": {
                         "branch": "master",
-                        "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
                         "installed_by": ["modules"]
                     },
                     "cat/cat": {
@@ -48,12 +48,12 @@
                     },
                     "gatk4/bedtointervallist": {
                         "branch": "master",
-                        "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
                         "installed_by": ["modules"]
                     },
                     "gatk4/createsequencedictionary": {
                         "branch": "master",
-                        "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
                         "installed_by": ["modules"]
                     },
                     "gunzip": {
@@ -134,7 +134,7 @@
                     },
                     "samtools/faidx": {
                         "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
                         "installed_by": ["modules"]
                     },
                     "samtools/index": {
@@ -149,13 +149,14 @@
                     },
                     "star/align": {
                         "branch": "master",
-                        "git_sha": "57d75dbac06812c59798a48585032f6e50bb1914",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
                         "installed_by": ["modules"]
                     },
                     "star/genomegenerate": {
                         "branch": "master",
-                        "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
-                        "installed_by": ["modules"]
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/star/genomegenerate/star-genomegenerate.diff"
                     },
                     "subread/featurecounts": {
                         "branch": "master",

diff --git a/modules/local/agfusion/batch/main.nf b/modules/local/agfusion/batch/main.nf
@@ -5,8 +5,8 @@ process AGFUSION_BATCH {
     // Note: 2.7X indices incompatible with AWS iGenomes.
     conda 'bioconda::agfusion=1.252'
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'docker://cmopipeline/agfusion:0.0.6' :
-        'docker.io/cmopipeline/agfusion:0.0.6' }"
+        'docker://cmopipeline/agfusion:0.0.7' :
+        'docker.io/cmopipeline/agfusion:0.0.7' }"
 
     input:
     tuple val(meta), path(fusions)

diff --git a/modules/local/agfusion/container/Dockerfile b/modules/local/agfusion/container/Dockerfile
@@ -1,14 +1,30 @@
-FROM ubuntu:bionic-20230530
+FROM ubuntu:jammy-20240911.1
 
 LABEL maintainer="Anne Marie Noronha ([email protected])" \
-    version.image="0.0.6"
+    version.image="0.0.7"
 
 # INSTALL DEPENDENCIES
 
 ENV DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get update -y
-RUN apt-get install -y build-essential python3 python3-pip python3-matplotlib python3-pandas python3-future python3-biopython curl less vim libnss-sss git zip
+RUN apt-get install -y \
+    build-essential \
+    python3 \
+    python3-pip \
+    python3-matplotlib \
+    python3-pandas \
+    python3-future \
+    python3-biopython \
+    python3-dev \
+    default-libmysqlclient-dev \
+    pkg-config \
+    curl \
+    less \
+    vim \
+    libnss-sss \
+    git \
+    zip
 RUN pip3 install --upgrade pip
 RUN pip3 install pyensembl
 
@@ -18,9 +34,8 @@ RUN pip3 install mysqlclient
 
 # INSTALL AGFUSION & DATABASE FILES
 WORKDIR /usr/local/bin
-RUN git clone https://github.com/mskcc/AGFusion.git --branch v1.4.1-fork1 --single-branch
+RUN git clone https://github.com/mskcc/AGFusion.git --branch v1.4.[email protected] --single-branch
 WORKDIR /usr/local/bin/AGFusion
+RUN pip3 install -r requirements.txt
 RUN pip3 install .
 
-# downgrade pyensembl for compatibility
-RUN pip3 install gtfparse==1.2.1 --upgrade
diff --git a/modules/local/agfusion/download/main.nf b/modules/local/agfusion/download/main.nf
@@ -4,8 +4,8 @@ process AGFUSION_DOWNLOAD {
     // Note: 2.7X indices incompatible with AWS iGenomes.
     conda 'bioconda::agfusion=1.252'
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'docker://cmopipeline/agfusion:0.0.6' :
-        'docker.io/cmopipeline/agfusion:0.0.6' }"
+        'docker://cmopipeline/agfusion:0.0.7' :
+        'docker.io/cmopipeline/agfusion:0.0.7' }"
 
     input:
     val(ensembl_release)
@@ -25,13 +25,13 @@ process AGFUSION_DOWNLOAD {
         ['GRCh38','hg38'].contains(genome) ? 'hg38' :
         ['GRCm38','mm10'].contains(genome) ? 'mm10' : ''
     def pyensembl_species = ['GRCm38','mm10'].contains(genome) ? 'mus_musculus' : 'homo_sapiens'
-    if (ensembl_release < 93) {
+    if (ensembl_release < 112) {
         """
         export PYENSEMBL_CACHE_DIR=\$PWD/pyensembl_cache
 
         pyensembl install --species ${pyensembl_species} --release ${ensembl_release}
 
-        agfusion download -g ${agfusion_genome}
+        agfusion download -s ${pyensembl_species} -r ${ensembl_release}
 
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
@@ -44,7 +44,7 @@ process AGFUSION_DOWNLOAD {
 
         pyensembl install --species ${pyensembl_species} --release ${ensembl_release}
 
-        curl http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pfamA.txt.gz > pfamA.txt.gz
+        curl http://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam37.0/database_files/pfamA.txt.gz > pfamA.txt.gz
         gunzip pfamA.txt.gz
         agfusion build --dir . --species ${agfusion_genome} --release ${ensembl_release} --pfam pfamA.txt
         rm pfamA.txt

diff --git a/modules/local/fastaremoveprefix/environment.yml b/modules/local/fastaremoveprefix/environment.yml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::gawk=5.3.0
diff --git a/modules/local/fastaremoveprefix/main.nf b/modules/local/fastaremoveprefix/main.nf
@@ -0,0 +1,32 @@
+// Normalise FASTA contig names: drop the "chr" prefix and rename the mitochondrial contig M to MT.
+process FASTAREMOVEPREFIX {
+    tag "$fasta"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gawk:5.3.0' :
+        'biocontainers/gawk:5.3.0' }"
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    input:
+    tuple val(meta), path(fasta, name: 'input/*') // staged under input/ so the output can reuse the file name
+
+    output:
+    tuple val(meta), path("*.{fa,fasta}"), emit: fasta
+    path "versions.yml"                  , emit: versions
+
+    script:
+    // Reuse the input's base name for the output; bare ">M" headers (no description) are renamed to MT too.
+    def modified_fasta = fasta.fileName.name
+    """
+    sed -e 's/^>chr/>/' -e 's/^>M />MT /' -e 's/^>M\$/>MT/' ${fasta} > ${modified_fasta}
+
+    cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/metafusion/genebed/main.nf b/modules/local/metafusion/genebed/main.nf
@@ -9,10 +9,9 @@ process METAFUSION_GENEBED {
 
     input:
     tuple val(meta), path(gff)
-    val ensembl_version
 
     output:
-    tuple val(meta), path("*.metafusion.gene.bed"), emit: metafusion_gene_bed
+    tuple val(meta), path("${meta.id}.metafusion.gene.bed"), emit: metafusion_gene_bed
     path "versions.yml"                           , emit: versions
 
     when:
@@ -22,27 +21,29 @@ process METAFUSION_GENEBED {
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
-    final_generate_v75_gene_bed.R \\
+    generate_gene_bed.R \\
         $gff \\
-        ${ensembl_version}.metafusion.gene.bed
+        ${prefix}.metafusion.gene.bed
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         R: \$(R --version | head -n1)
-        final_generate_v75_gene_bed.R: 0.0.1
+        generate_gene_bed.R: 0.0.2
     END_VERSIONS
     """
 
     stub:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+
     """
     touch ${prefix}.metafusion.gene.bed
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         R: \$(R --version | head -n1)
-        final_generate_v75_gene_bed.R: 0.0.1
+        generate_gene_bed.R: 0.0.2
     END_VERSIONS
     """
+
 }