Merged in release (pull request #201)

Release
TheJacksonLaboratory · Aug 16, 2024 · 3f55b38 · 3f55b38
2 parents 67cd86b + ce3d5ac
commit 3f55b38
Show file tree

Hide file tree

Showing 60 changed files with 1,160 additions and 41 deletions.
diff --git a/.zenodo.json b/.zenodo.json
@@ -40,6 +40,10 @@
             "affiliation": "The Jackson Laboratory",
             "name": "Ardian Ferraj"
         },
+        {
+            "affiliation": "The Jackson Laboratory",
+            "name": "Tejas Temker"
+        },
         {
             "affiliation": "The Jackson Laboratory",
             "name": "Anuj Srivastava"

diff --git a/ReleaseNotes.md b/ReleaseNotes.md
@@ -1,5 +1,48 @@
 # RELEASE NOTES
 
+## Release 0.7.0
+
+In this release we add a new workflow for calling copy number variation (CNV) from raw Illumina IDAT genotype array files. Currently the Illumina IlluminaCytoSNP v2.1 array is supported, but support for additional arrays is possible.  
+
+We make additional minor changes as described below. 
+
+### Pipelines Added:
+
+1. CNV calling from Illumina genotype array data (--cnv_array)
+
+### Modules Added:
+
+1. modules/bcftools/bcftools_gtct2vcf.nf
+1. modules/bcftools/bcftools_query_ascat.nf
+1. modules/illumina/iaap_cli.nf
+1. modules/ascat/ascat_run.nf
+1. modules/ascat/ascat_annotation.nf
+
+### Pipeline Changes:
+
+None
+
+### Module Changes:
+
+1. Replaced the incorrect `${task.mem}` with `${task.memory}` in the Nextflow error catch statement in modules related to the SV calling workflows.   
+1. utility_modules/gzip.nf: Memory request increase  
+
+### Scripts Added:
+
+1. cnv_array/ASCAT_run.R
+1. cnv_array/annotate_ensembl_genes.pl
+1. cnv_array/seg_plot.R
+1. cnv_array/segment_raw_extend.pl
+
+### Script Changes:
+
+None
+
+### NF-Test Modules Added: 
+
+1. tests/workflows/cnv_array.nf.test
+
+
 ## Release 0.6.7
 
 In this release we make the following minor adjustments:   

diff --git a/bin/cnv_array/ASCAT_run.R b/bin/cnv_array/ASCAT_run.R
@@ -0,0 +1,107 @@
+suppressMessages(library(ASCAT))
+
+### ASCAT Run ######
+
+# Note: this script expects ASCAT is running on single sample BAF/LRR files.
+
+args=(commandArgs(TRUE))
+
+sampleID = args[1]
+BAF_file = args[2]
+LRR_file = args[3]
+gender   = args[4]
+platform = args[5]
+GC_file  = args[6]
+RT_file  = args[7]
+
+######
+
+# Expected SNP POS file: 
+# Probe Set ID    Chromosome      Physical Position
+# CN_473963       1       61736
+# CN_473964       1       61808
+
+## the above can be taken from the BAF file. The BAF file contains positions for all valid SNPs. 
+SNPpos <- read.table(BAF_file, sep = "\t", header = TRUE)[ ,1:3]
+colnames(SNPpos) <- c('Probe_Set_ID', 'Chromosome', 'Physical_Position')
+
+##
+
+if (gender == 'NA') {
+  gender = 'XY'
+}
+
+ascat.bc = ascat.loadData(Tumor_LogR_file = LRR_file, Tumor_BAF_file = BAF_file, gender = gender, genomeVersion = "hg38")
+
+ascat.bc$samples[1] <- sampleID
+colnames(ascat.bc[["Tumor_LogR"]]) <- sampleID
+colnames(ascat.bc[["Tumor_BAF"]]) <- sampleID
+
+ascat.plotRawData(ascat.bc, img.prefix = "Before_correction_")
+
+ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = GC_file, replictimingfile = RT_file)
+
+ascat.plotRawData(ascat.bc, img.prefix = "After_correction_")
+
+gg = ascat.predictGermlineGenotypes(ascat.bc, platform = platform)
+
+ascat.bc = ascat.aspcf(ascat.bc, ascat.gg = gg)
+
+ascat.plotSegmentedData(ascat.bc)
+
+ascat.output = ascat.runAscat(ascat.bc, write_segments = T)
+
+##
+
+QC = ascat.metrics(ascat.bc, ascat.output)
+
+write.table(as.data.frame(QC), file = paste0(sampleID, "_sample.QC.txt"), sep="\t", quote=F, row.names=F, col.names=T)
+
+save(ascat.bc, ascat.output, QC, file = paste0(sampleID, "_ASCAT_objects.Rdata"))
+
+##
+
+if ( length(ascat.output$failedarrays) == 0 ) {
+
+  num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments_raw))
+  for (i in 1:nrow(ascat.output$segments_raw)) {
+    L1 = which(SNPpos$Chromosome == ascat.output$segments_raw$chr[i] & SNPpos$Physical_Position == ascat.output$segments_raw$startpos[i])
+    L2 = which(SNPpos$Chromosome ==  ascat.output$segments_raw$chr[i] & SNPpos$Physical_Position == ascat.output$segments_raw$endpos[i])
+    num_probes[i] = L2[length(L2)] - L1[1] + 1
+  }
+  seg_raw = cbind(ascat.output$segments_raw,num_probes)
+
+  num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments))
+  for (i in 1:nrow(ascat.output$segments)) {
+
+    #print(i)
+    L1 = which(SNPpos$Chromosome == ascat.output$segments$chr[i] & SNPpos$Physical_Position == ascat.output$segments$startpos[i])
+    L2 = which(SNPpos$Chromosome ==  ascat.output$segments$chr[i] & SNPpos$Physical_Position == ascat.output$segments$endpos[i])
+    num_probes[i] = L2[length(L2)] - L1[1] + 1
+
+  }
+  seg = cbind(ascat.output$segments,num_probes)
+
+  seg_raw_dfs <- split(seg_raw, seg_raw$sample)
+  seg_dfs <- split(seg, seg$sample)
+
+  for (samp in names(seg_raw_dfs)){
+    write.table(seg_raw_dfs[[samp]], file = paste0(samp, ".segments_raw.txt"), sep="\t", quote=F, row.names=F)
+    write.table(seg_dfs[[samp]], file = paste0(samp, ".segments.txt"), sep="\t", quote=F, row.names=F)
+    write.table(as.data.frame(ascat.output$aberrantcellfraction)[row.names(as.data.frame(ascat.output$aberrantcellfraction)) %in% samp,], file=paste(samp,".aberrantcellfraction.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F)
+    write.table(as.data.frame(ascat.output$ploidy)[row.names(as.data.frame(ascat.output$ploidy)) %in% samp,], file=paste(samp,".ploidy.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F)
+  }
+
+} else {
+
+  write.table(as.data.frame(ascat.output$failedarrays), file="ASCAT.failedarrays.txt", sep="\t", quote=F, row.names=F, col.names=F)
+
+}
+
+if ( !is.null(ascat.output$nonaberrantarrays) ) {
+
+  write.table(as.data.frame(ascat.output$nonaberrantarrays), file="ASCAT.nonaberrantarrays.txt", sep="\t", quote=F, row.names=F, col.names=F)
+
+}
+
+sessionInfo()
diff --git a/bin/cnv_array/annotate_ensembl_genes.pl b/bin/cnv_array/annotate_ensembl_genes.pl
@@ -0,0 +1,132 @@
+#!/usr/bin/perl -w
+use POSIX;
+use File::Basename;
+
+# This script annotates ensembl genes with copy number and breakpoints
+# perl ensemblegenes_cnv_break.pl *.segments_raw.extend.txt mart_export_gene_chr1-Y.hg19ensembl75-85.08232016.txt
+
+if ($#ARGV != 1) {
+	print "This scripts requires: <file_cn> <file_gene> \n";
+	exit(-1);
+}
+
+$file_cn = $ARGV[0];		
+$file_gene = $ARGV[1];
+
+$file_output = basename($file_cn,".txt").".ensgene_cnvbreak.txt";
+open(OUTFILE, ">$file_output");
+
+open(GENEFILE, "$file_gene") or die "can't open $file_gene: $!";
+$gene = <GENEFILE>;
+chomp($gene);
+
+open(CNFILE, "$file_cn") or die "can't open $file_cn: $!";
+@data = <CNFILE>;
+close(CNFILE);
+chomp(@data);
+
+#print OUTFILE "$tmp\tstartext\tendext\tstartext_desc\tendext_desc\tCN_raw\tLOH\tparm_fraction\tqarm_fraction\tploidy\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\n";
+print OUTFILE "$gene\tnum_cnv_seg\tseg_desc\tploidy\tnMajor\tnMinor\tnAraw\tnBraw\tCN_raw\tLOH\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\tnMajor_max\tnMinor_max\tnAraw_max\tnBraw_max\tCN_raw_max\tLOH_max\tcopydiff_2_max\tcopydiff_ploidy_max\tlogratio_2_max\tlogratio_ploidy_max\n";
+
+while ($gene = <GENEFILE>) {
+
+    chomp($gene);
+    @line = split(/\t/, $gene);
+    $chr = $line[2];
+    $start = $line[3];
+    $end = $line[4];
+
+    #$cnraw1=999;
+    $numseg=0;
+    $region="";
+    %segline = ();
+    @n = ();
+
+    for ($j=1; $j<=$#data; $j++) {
+        @segment = split(/\t/, $data[$j]);
+
+        $chr_cn = $segment[1];
+        $pos1 = $segment[2];
+        $pos2 = $segment[3];
+        $pos1ext = $segment[9];
+        $pos2ext = $segment[10];
+        $left = $segment[11];
+        $right = $segment[12];
+        $cnraw = $segment[13];
+
+        if (($chr_cn eq $chr) && ($start <= $pos2ext) && ($end >= $pos1ext)) { #overlap
+            #$numseg++;
+            push(@n, $cnraw);
+            $segline{$cnraw} = [ @segment ];
+
+            #check if overlap with regions with no call
+            if (($start <= $pos1) && ($end >= $pos1ext)) {
+                $region = $region.$left.";";
+            }
+            if (($start <= $pos2ext) && ($end >= $pos2)) {
+                $region = $region.$right.";";
+            }
+
+            #if ($cnraw < $cnraw1) {
+            #    $cnraw1 = $cnraw;
+            #    $count = $j;
+            #}
+        }
+    }
+
+    if ($region eq "") {
+        $region = "NA";
+    }
+
+    if ($#n >= 0) {
+
+        $numseg = $#n +1;
+        @sortn = sort{ $a <=> $b } @n;
+
+        $nA = $segline{$sortn[0]}[4];
+        $nB = $segline{$sortn[0]}[5];
+        $rawA = $segline{$sortn[0]}[6];
+        $rawB = $segline{$sortn[0]}[7];
+        $cnraw = $segline{$sortn[0]}[13];
+        $loh = $segline{$sortn[0]}[14];
+        $ploidy= $segline{$sortn[0]}[17];
+        $copydiff1 = $segline{$sortn[0]}[18];
+        $copydiff2 = $segline{$sortn[0]}[19];
+        $logratio1 = $segline{$sortn[0]}[20];
+        $logratio2 = $segline{$sortn[0]}[21];
+
+        $outline = "$gene\t$numseg\t$region\t$ploidy\t$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2\t";
+
+        if ($numseg > 1 ) {
+            $nA = $segline{$sortn[$#sortn]}[4];
+            $nB = $segline{$sortn[$#sortn]}[5];
+            $rawA = $segline{$sortn[$#sortn]}[6];
+            $rawB = $segline{$sortn[$#sortn]}[7];
+            $cnraw = $segline{$sortn[$#sortn]}[13];
+            $loh = $segline{$sortn[$#sortn]}[14];
+            $copydiff1 = $segline{$sortn[$#sortn]}[18];
+            $copydiff2 = $segline{$sortn[$#sortn]}[19];
+            $logratio1 = $segline{$sortn[$#sortn]}[20];
+            $logratio2 = $segline{$sortn[$#sortn]}[21];
+        }
+        else {
+            $nA = "NA";
+            $nB = "NA";
+            $rawA = "NA";
+            $rawB = "NA";
+            $cnraw = "NA";
+            $loh = "NA";
+            $copydiff1 = "NA";
+            $copydiff2 = "NA";
+            $logratio1 = "NA";
+            $logratio2 = "NA";
+
+        }
+
+        $outline = $outline."$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2";
+        print OUTFILE "$outline\n";
+    }
+}
+
+close (GENEFILE);
+close (OUTFILE);