diff --git a/.gitignore b/.gitignore index 6bfdf18e..a295e6f6 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ design.csv sv_input.csv test.csv test2.csv +.nf-test/ +.nf-test* \ No newline at end of file diff --git a/bin/cnv_array/ASCAT_run.R b/bin/cnv_array/ASCAT_run.R new file mode 100644 index 00000000..6911201d --- /dev/null +++ b/bin/cnv_array/ASCAT_run.R @@ -0,0 +1,107 @@ +suppressMessages(library(ASCAT)) + +### ASCAT Run ###### + +# Note: this script expects ASCAT is running on single sample BAF/LRR files. + +args=(commandArgs(TRUE)) + +sampleID = args[1] +BAF_file = args[2] +LRR_file = args[3] +gender = args[4] +platform = args[5] +GC_file = args[6] +RT_file = args[7] + +###### + +# Expected SNP POS file: +# Probe Set ID Chromosome Physical Position +# CN_473963 1 61736 +# CN_473964 1 61808 + +## the above can be taken from the BAF file. The BAF file contains positions for all valid SNPs. +SNPpos <- read.table(BAF_file, sep = "\t", header = TRUE)[ ,1:3] +colnames(SNPpos) <- c('Probe_Set_ID', 'Chromosome', 'Physical_Position') + +## + +if (gender == 'NA') { + gender = 'XY' +} + +ascat.bc = ascat.loadData(Tumor_LogR_file = LRR_file, Tumor_BAF_file = BAF_file, gender = gender, genomeVersion = "hg38") + +ascat.bc$samples[1] <- sampleID +colnames(ascat.bc[["Tumor_LogR"]]) <- sampleID +colnames(ascat.bc[["Tumor_BAF"]]) <- sampleID + +ascat.plotRawData(ascat.bc, img.prefix = "Before_correction_") + +ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = GC_file, replictimingfile = RT_file) + +ascat.plotRawData(ascat.bc, img.prefix = "After_correction_") + +gg = ascat.predictGermlineGenotypes(ascat.bc, platform = platform) + +ascat.bc = ascat.aspcf(ascat.bc, ascat.gg = gg) + +ascat.plotSegmentedData(ascat.bc) + +ascat.output = ascat.runAscat(ascat.bc, write_segments = T) + +## + +QC = ascat.metrics(ascat.bc, ascat.output) + +write.table(as.data.frame(QC), file = paste0(sampleID, "_sample.QC.txt"), sep="\t", quote=F, row.names=F, col.names=T) + +save(ascat.bc, ascat.output, QC, file = paste0(sampleID, "_ASCAT_objects.Rdata")) + +## + +if ( length(ascat.output$failedarrays) == 0 ) { + + num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments_raw)) + for (i in 1:nrow(ascat.output$segments_raw)) { + L1 = which(SNPpos$Chromosome == ascat.output$segments_raw$chr[i] & SNPpos$Physical_Position == ascat.output$segments_raw$startpos[i]) + L2 = which(SNPpos$Chromosome == ascat.output$segments_raw$chr[i] & SNPpos$Physical_Position == ascat.output$segments_raw$endpos[i]) + num_probes[i] = L2[length(L2)] - L1[1] + 1 + } + seg_raw = cbind(ascat.output$segments_raw,num_probes) + + num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments)) + for (i in 1:nrow(ascat.output$segments)) { + + #print(i) + L1 = which(SNPpos$Chromosome == ascat.output$segments$chr[i] & SNPpos$Physical_Position == ascat.output$segments$startpos[i]) + L2 = which(SNPpos$Chromosome == ascat.output$segments$chr[i] & SNPpos$Physical_Position == ascat.output$segments$endpos[i]) + num_probes[i] = L2[length(L2)] - L1[1] + 1 + + } + seg = cbind(ascat.output$segments,num_probes) + + seg_raw_dfs <- split(seg_raw, seg_raw$sample) + seg_dfs <- split(seg, seg$sample) + + for (samp in names(seg_raw_dfs)){ + write.table(seg_raw_dfs[[samp]], file = paste0(samp, ".segments_raw.txt"), sep="\t", quote=F, row.names=F) + write.table(seg_dfs[[samp]], file = paste0(samp, ".segments.txt"), sep="\t", quote=F, row.names=F) + write.table(as.data.frame(ascat.output$aberrantcellfraction)[row.names(as.data.frame(ascat.output$aberrantcellfraction)) %in% samp,], file=paste(samp,".aberrantcellfraction.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F) + write.table(as.data.frame(ascat.output$ploidy)[row.names(as.data.frame(ascat.output$ploidy)) %in% samp,], file=paste(samp,".ploidy.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F) + } + +} else { + + write.table(as.data.frame(ascat.output$failedarrays), file="ASCAT.failedarrays.txt", sep="\t", quote=F, row.names=F, col.names=F) + +} + +if ( !is.null(ascat.output$nonaberrantarrays) ) { + + write.table(as.data.frame(ascat.output$nonaberrantarrays), file="ASCAT.nonaberrantarrays.txt", sep="\t", quote=F, row.names=F, col.names=F) + +} + +sessionInfo() \ No newline at end of file diff --git a/bin/cnv_array/annotate_ensembl_genes.pl b/bin/cnv_array/annotate_ensembl_genes.pl new file mode 100644 index 00000000..7b96f7b6 --- /dev/null +++ b/bin/cnv_array/annotate_ensembl_genes.pl @@ -0,0 +1,132 @@ +#!/usr/bin/perl -w +use POSIX; +use File::Basename; + +# This script annotates ensembl genes with copy number and breakpoints +# perl ensemblegenes_cnv_break.pl *.segments_raw.extend.txt mart_export_gene_chr1-Y.hg19ensembl75-85.08232016.txt + +if ($#ARGV != 1) { + print "This scripts requires: \n"; + exit(-1); +} + +$file_cn = $ARGV[0]; +$file_gene = $ARGV[1]; + +$file_output = basename($file_cn,".txt").".ensgene_cnvbreak.txt"; +open(OUTFILE, ">$file_output"); + +open(GENEFILE, "$file_gene") or die "can't open $file_gene: $!"; +$gene = ; +chomp($gene); + +open(CNFILE, "$file_cn") or die "can't open $file_cn: $!"; +@data = ; +close(CNFILE); +chomp(@data); + +#print OUTFILE "$tmp\tstartext\tendext\tstartext_desc\tendext_desc\tCN_raw\tLOH\tparm_fraction\tqarm_fraction\tploidy\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\n"; +print OUTFILE "$gene\tnum_cnv_seg\tseg_desc\tploidy\tnMajor\tnMinor\tnAraw\tnBraw\tCN_raw\tLOH\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\tnMajor_max\tnMinor_max\tnAraw_max\tnBraw_max\tCN_raw_max\tLOH_max\tcopydiff_2_max\tcopydiff_ploidy_max\tlogratio_2_max\tlogratio_ploidy_max\n"; + +while ($gene = ) { + + chomp($gene); + @line = split(/\t/, $gene); + $chr = $line[2]; + $start = $line[3]; + $end = $line[4]; + + #$cnraw1=999; + $numseg=0; + $region=""; + %segline = (); + @n = (); + + for ($j=1; $j<=$#data; $j++) { + @segment = split(/\t/, $data[$j]); + + $chr_cn = $segment[1]; + $pos1 = $segment[2]; + $pos2 = $segment[3]; + $pos1ext = $segment[9]; + $pos2ext = $segment[10]; + $left = $segment[11]; + $right = $segment[12]; + $cnraw = $segment[13]; + + if (($chr_cn eq $chr) && ($start <= $pos2ext) && ($end >= $pos1ext)) { #overlap + #$numseg++; + push(@n, $cnraw); + $segline{$cnraw} = [ @segment ]; + + #check if overlap with regions with no call + if (($start <= $pos1) && ($end >= $pos1ext)) { + $region = $region.$left.";"; + } + if (($start <= $pos2ext) && ($end >= $pos2)) { + $region = $region.$right.";"; + } + + #if ($cnraw < $cnraw1) { + # $cnraw1 = $cnraw; + # $count = $j; + #} + } + } + + if ($region eq "") { + $region = "NA"; + } + + if ($#n >= 0) { + + $numseg = $#n +1; + @sortn = sort{ $a <=> $b } @n; + + $nA = $segline{$sortn[0]}[4]; + $nB = $segline{$sortn[0]}[5]; + $rawA = $segline{$sortn[0]}[6]; + $rawB = $segline{$sortn[0]}[7]; + $cnraw = $segline{$sortn[0]}[13]; + $loh = $segline{$sortn[0]}[14]; + $ploidy= $segline{$sortn[0]}[17]; + $copydiff1 = $segline{$sortn[0]}[18]; + $copydiff2 = $segline{$sortn[0]}[19]; + $logratio1 = $segline{$sortn[0]}[20]; + $logratio2 = $segline{$sortn[0]}[21]; + + $outline = "$gene\t$numseg\t$region\t$ploidy\t$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2\t"; + + if ($numseg > 1 ) { + $nA = $segline{$sortn[$#sortn]}[4]; + $nB = $segline{$sortn[$#sortn]}[5]; + $rawA = $segline{$sortn[$#sortn]}[6]; + $rawB = $segline{$sortn[$#sortn]}[7]; + $cnraw = $segline{$sortn[$#sortn]}[13]; + $loh = $segline{$sortn[$#sortn]}[14]; + $copydiff1 = $segline{$sortn[$#sortn]}[18]; + $copydiff2 = $segline{$sortn[$#sortn]}[19]; + $logratio1 = $segline{$sortn[$#sortn]}[20]; + $logratio2 = $segline{$sortn[$#sortn]}[21]; + } + else { + $nA = "NA"; + $nB = "NA"; + $rawA = "NA"; + $rawB = "NA"; + $cnraw = "NA"; + $loh = "NA"; + $copydiff1 = "NA"; + $copydiff2 = "NA"; + $logratio1 = "NA"; + $logratio2 = "NA"; + + } + + $outline = $outline."$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2"; + print OUTFILE "$outline\n"; + } +} + +close (GENEFILE); +close (OUTFILE); diff --git a/bin/cnv_array/seg_plot.R b/bin/cnv_array/seg_plot.R new file mode 100644 index 00000000..f2bb5a25 --- /dev/null +++ b/bin/cnv_array/seg_plot.R @@ -0,0 +1,117 @@ +# updated to have Graces Sept '19 tweaks + +options(scipen = 999) + +args=(commandArgs(TRUE)) + +# filename is name of the *segments_raw.extend.txt file +filename <- args[1] + +sampleID <- args[2] + +# outdir is the dir where png result will be written ... use "./" for current dir +outdir <- args[3] + + +CNS <-read.table(filename,header=T,sep="\t") + +gender <- 'female' +sex <- 'female' + +if (sex == "female") { + CNS=CNS[CNS$chr!="Y",] +} + +#title of plot +ploidy=round(CNS$ploidy[1], digits=2) +plottitle=paste( gsub("_"," ", sampleID), " ploidy=",ploidy,sep="") + +chromo <- unique(CNS$chr) +chromo +xx=0 +y = c() +start=CNS$startpos +end=CNS$endpos + +for (x in chromo) { + start[CNS$chr == x]=start[CNS$chr == x]+xx + end[CNS$chr == x]=end[CNS$chr == x]+xx + tmp = CNS$endpos[CNS$chr == x] + xx=tail(tmp,1)+xx + y <- c(y, xx) +} + +png(paste(outdir,sampleID,"_segmentsgenomeplot.copydiffploidy.png",sep=""), width=1300,height=600) + +par(mar = c(5, 5, 8, 4)) + +val=CNS$copydiff_ploidy + +# in the following stmt, family="serif" changes font to times-roman; cex.main=1.8 scales up the title font size +# formerly also used: ylim=c(-7,20), +plot( c(start,end), c(val,val), col="white", main=plottitle, xlab="Chromosome", ylab="Delta from Ploidy", + ylim=c(-8, max( c(val,val) ) )) + +for (i in 1:length(start)) { + if (CNS$LOH[i]==1) { + polygon(c(start[i],end[i],end[i],start[i]),c(min(-7),min(-7),max(-6),max(-6)),col="lightsteelblue",border="lightsteelblue",lwd=2) + } + +} + +segments(start,val,end,val,col="tomato",lwd=5) +abline(v=y,col="grey") +posy=c(-8) +i=1 +l=0 +for (x in chromo) { + posx=(l+y[i])/2 + text(posx,posy,x,cex=1.2,srt=45) + l=y[i] + i=i+1 +} +abline(h=0,col="black",lty=2,lwd=1.5) + +# formerly also used: inset=c(0,-0.1), +legend("topright", inset=-0.1, c("Difference from sample ploidy ", "LOH"), xpd=TRUE, horiz=T, + bty="n", lty=c(1,1), lwd=6, col=c("tomato", "lightsteelblue"), cex=1.5 ) + +dev.off() + + +png(paste(outdir,sampleID,"_segmentsgenomeplot.CNraw_loh.png",sep=""), width=1300,height=600) + +par(mar = c(5, 5, 8, 4)) + +val=CNS$CN_raw +d=max(val)/100 +plot(c(start,end),c(val,val),col="white",ylab="CN, CN Major, CN Minor",main=plottitle,ylim=c(-d,max(val)+4*d),xaxt='n',xlab="chromosomes") + +for (i in 1:length(start)) { + if (CNS$LOH[i]==1) { + polygon(c(start[i],end[i],end[i],start[i]),c(min(val),min(val),max(val),max(val)),col="lightsteelblue",border="lightsteelblue",lwd=2) + } + +} + +val=CNS$nBraw-d +segments(start,val,end,val,col="blue",lwd=4) +val=CNS$nAraw +segments(start,val,end,val,col="red",lwd=4) +val=CNS$CN_raw+d +segments(start,val,end,val,col="purple",lwd=4) +abline(v=y,col="grey") +posy=max(val)+2*d +i=1 +l=0 +for (x in chromo) { + posx=(l+y[i])/2 + text(posx,posy,x,cex=1,srt=45) + l=y[i] + i=i+1 +} +abline(h=CNS$ploidy[1],col="black",lty=2,lwd=1.5) + +legend("topright", c("CN Total", "CN Major", "CN Minor", "LOH"),xpd=TRUE,horiz=T, inset=c(0,-0.1), bty = "n", lty=c(1,1,1,1), lwd=6, col = c("purple", "red", "blue", "lightsteelblue"), cex = 1) +dev.off() + diff --git a/bin/cnv_array/segment_raw_extend.pl b/bin/cnv_array/segment_raw_extend.pl new file mode 100644 index 00000000..04cd5706 --- /dev/null +++ b/bin/cnv_array/segment_raw_extend.pl @@ -0,0 +1,288 @@ +#!/usr/bin/perl -w +use POSIX; +use File::Basename; + +# This script adds to segment file the arm fraction, LOH and CN diff and log ratio relative to 2 and ploidy +# The segments are extended + +# perl segment_raw_annotate.pl *segments_raw.txt *ploidy.txt hg38_chromosome_arm.txt [male, female, unknown] + +if ($#ARGV != 3) { + print "This scripts requires: \n"; + exit(-1); +} + +$file_cn = $ARGV[0]; +$file_ploidy = $ARGV[1]; +$file_arm = $ARGV[2]; +$gender = $ARGV[3]; + +$file_output = basename($file_cn,".txt").".extend.txt"; + +$ploidy = `cat $file_ploidy`; +chomp($ploidy); + +# $gender = `cat $file_gender`; +# chomp($gender); + +if (($gender eq "female") || ($gender eq "unknown")) { + $cn_factor = 1; +} +elsif ($gender eq "male") { + $cn_factor= 0.5; +} + +$tmp = `cat $file_arm | awk 'NR>1'`; +@arm = split(/\n/,$tmp); +chomp(@arm); + +open(CN, "$file_cn") or die "can't open $file_cn: $!"; +$tmp = ; +chomp($tmp); + +open(OUTFILE, ">$file_output"); +print OUTFILE "$tmp\tstartext\tendext\tstartext_desc\tendext_desc\tCN_raw\tLOH\tparm_fraction\tqarm_fraction\tploidy\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\n"; + +open(TMPFILE, ">tmp.txt"); + +#merge segments +$tmp = ; +chomp($tmp); +@line = split(/\t/,$tmp); +print "@line\n"; +$sample = $line[0]; +$chromo = $line[1]; +$n1 = $line[4]; +$n2 = $line[5]; +$cn1 = $line[6]; +$cn2 = $line[7]; +$start = $line[2]; +$end = $line[3]; +$num = $line[8]; + +print "$num\n"; + +while ($tmp = ) { + chomp($tmp); + @line = split(/\t/,$tmp); + + if (($chromo eq $line[1]) && ($cn1 == $line[6]) && ($cn2 == $line[7])) { + $end = $line[3]; + $num = $num + $line[8]; + } + else { + print TMPFILE "$sample\t$chromo\t$start\t$end\t$n1\t$n2\t$cn1\t$cn2\t$num\n"; + $sample = $line[0]; + $chromo = $line[1]; + $n1 = $line[4]; + $n2 = $line[5]; + $cn1 = $line[6]; + $cn2 = $line[7]; + $start = $line[2]; + $end = $line[3]; + $num = $line[8]; + } +} +#lastline +print TMPFILE "$sample\t$chromo\t$start\t$end\t$n1\t$n2\t$cn1\t$cn2\t$num\n"; + +close (CN); +close (TMPFILE); + +open(CN, "tmp.txt") or die "can't open tmp.txt: $!"; +@seg = ; +chomp(@seg); +close (CN); +$n = 0; + +for ($j=0; $j<$#seg; $j++) { + + @array1 = split(/\t/,$seg[$j]); + @array2 = split(/\t/,$seg[$j+1]); + #$x1 = $array1[2]; + $x2 = $array1[3]; + $y1 = $array2[2]; + #$y2 = $array2[3]; + + if ($array1[1] ne $n) { #first line for chr + + $n = $array1[1]; + $left = 0; + $left1 = "telomere"; + + for ($i=1; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + $a = $line[1]; + $b = $line[2]; + } + } + + if ($array2[1] ne $n) { #last line for chr + $right = $b; + $right1 = "telomere"; + } + elsif (($x2 < $a) && ($y1 > $a)) { + $right = $a; + $right1 = "centromere"; + } + else { + $right = floor(($x2 + $y1)/2); + $right1 = "no_probe"; + } + } + else { + + $left = $right + 1; + $left1 = $right1; + + if ($array2[1] ne $n) { #last line for chr + + $right = $b; + $right1 = "telomere"; + } + elsif (($x2 < $a) && ($y1 > $a)) { + $right = $a; + $right1 = "centromere"; + } + else { + $right = floor(($x2 + $y1)/2); + $right1 = "no_probe"; + } + } + + $copy = $array1[6] + $array1[7]; + if ($array1[6] >= 0.5 && $array1[7] <= 0.1) { + $loh=1; + } + else { + $loh=0; + } + + for ($i=0; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + if (($right>=$line[1]) && ($left<=$line[2])) { + @tmp = ($left,$right,$line[1],$line[2]); + @sorttmp = sort{ $a <=> $b } @tmp; + $overlap1=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]); + } + else { + $overlap1=0; + } + } + } + + for ($i=1; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + if (($right>=$line[1]) && ($left<=$line[2])) { + @tmp = ($left,$right,$line[1],$line[2]); + @sorttmp = sort{ $a <=> $b } @tmp; + $overlap2=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]); + } + else { + $overlap2=0; + } + } + } + + if (($n eq "X") || ($n eq "Y")) { + $diff1=$copy - ($cn_factor * 2); + $diff2=$copy- ($cn_factor * $ploidy); + $logratio1 = log(($copy+0.01)/($cn_factor * 2))/log(2); + $logratio2 = log(($copy+0.01)/($cn_factor * $ploidy))/log(2); + } + else { + $diff1=$copy-2; + $diff2=$copy-$ploidy; + $logratio1 = log(($copy+0.01)/2)/log(2); + $logratio2 = log(($copy+0.01)/$ploidy)/log(2); + } + + print OUTFILE "$seg[$j]\t$left\t$right\t$left1\t$right1\t$copy\t$loh\t$overlap1\t$overlap2\t$ploidy\t$diff1\t$diff2\t$logratio1\t$logratio2\n"; +} + +@array1 = split(/\t/,$seg[$#seg]); + +if ($array1[1] ne $n) { #first line for chr + + $n = $array1[1]; + $left = 0; + $left1 = "telomere"; + + for ($i=1; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + $a = $line[1]; + $b = $line[2]; + } + } + + $right = $b; + $right1 = "telomere"; + +} +else { + + $left = $right + 1; + $left1 = $right1; + + $right = $b; + $right1 = "telomere"; + +} + +$copy = $array1[6] + $array1[7]; +if ($array1[6] >= 0.5 && $array1[7] <= 0.1) { + $loh=1; +} +else { + $loh=0; +} + +for ($i=0; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + if (($right>=$line[1]) && ($left<=$line[2])) { + @tmp = ($left,$right,$line[1],$line[2]); + @sorttmp = sort{ $a <=> $b } @tmp; + $overlap1=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]); + } + else { + $overlap1=0; + } + } +} + +for ($i=1; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + if (($right>=$line[1]) && ($left<=$line[2])) { + @tmp = ($left,$right,$line[1],$line[2]); + @sorttmp = sort{ $a <=> $b } @tmp; + $overlap2=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]); + } + else { + $overlap2=0; + } + } +} + +if (($n eq "X") || ($n eq "Y")) { + $diff1=$copy - ($cn_factor * 2); + $diff2=$copy- ($cn_factor * $ploidy); + $logratio1 = log(($copy+0.01)/($cn_factor * 2))/log(2); + $logratio2 = log(($copy+0.01)/($cn_factor * $ploidy))/log(2); +} +else { + $diff1=$copy-2; + $diff2=$copy-$ploidy; + $logratio1 = log(($copy+0.01)/2)/log(2); + $logratio2 = log(($copy+0.01)/$ploidy)/log(2); +} + +print OUTFILE "$seg[$j]\t$left\t$right\t$left1\t$right1\t$copy\t$loh\t$overlap1\t$overlap2\t$ploidy\t$diff1\t$diff2\t$logratio1\t$logratio2\n"; + +close(CN); +close (OUTFILE); diff --git a/bin/help/cnv_array.nf b/bin/help/cnv_array.nf new file mode 100644 index 00000000..3f27cb8f --- /dev/null +++ b/bin/help/cnv_array.nf @@ -0,0 +1,18 @@ +def help() { + println ''' +Parameter | Default | Description + +--pubdir | / | The directory that the saved outputs will be stored. +--organize_by | sample | How to organize the output folder structure. Options: sample or analysis. +-w | / | The directory that all intermediary files and nextflow processes utilize. This directory can become quite large. This should be a location on /fastscratch or other directory with ample storage. +--gtc_csv | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' | Genotype Call (GTC) manifest for IDAT conversion. Provided by Illumina. +--bpm_file | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' | Manifest file describing the SNP or probe content on a BeadChip. Provided by Illumina. +--egt_file | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' | Cluster file describing the cluster positions for the Illumina genotyping array. Provided by Illumina. +--ref_fa | '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' | The reference fasta file. Reference FASTA build should match Illumina provided files. +--snp_platform | 'IlluminaCytoSNP' | SNP platform supported by ASCAT. See full supported list here: https://github.com/VanLoo-lab/ascat?tab=readme-ov-file#supported-arrays-without-matched-germline +--gc_file | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' | ASCAT’s GC correction file, generated from scripts at https://github.com/VanLoo-lab/ascat/tree/master/LogRcorrection +--rt_file | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg38.txt' | ASCAT’s replication timing file, generated from scripts at https://github.com/VanLoo-lab/ascat/tree/master/LogRcorrection +--chrArm | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/GRCh38_chromosome_arm.txt' | Chromosome arm locations, used in CNV segment annotation. +--cnvGeneFile | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/biomaRt_GRCh38_ensemblv102_CNVgeneAnnotations_primaryChroms.txt' +''' +} diff --git a/bin/log/cnv_array.nf b/bin/log/cnv_array.nf new file mode 100644 index 00000000..e62d0137 --- /dev/null +++ b/bin/log/cnv_array.nf @@ -0,0 +1,45 @@ +def param_log() { + // Check required parameters + if (!params.bpm_file) { + error "'--bpm_file': is not provided, it is a required parameter." + } + + if (!params.egt_file) { + error "'--egt_file': is not provided, it is a required parameter." + } + + if (!params.gtc_csv) { + error "'--gtc_csv': is not provided, it is a required parameter." + } + + if (!params.ref_fa) { + error "'--ref_fa': is not provided, it is a required parameter." + } + + + // Log parameter information + log.info """ + CNV_ARRAY PARAMETER LOG + + --comment: ${params.comment ?: 'N/A'} + + Results Published to: ${params.pubdir ?: 'N/A'} + ______________________________________________________ + --bpm_file ${params.bpm_file} + --egt_file ${params.egt_file} + --gtc_csv ${params.gtc_csv} + --ref_fa ${params.ref_fa} + --snp_platform ${params.snp_platform} + --gc_file ${params.gc_file} + --rt_file ${params.rt_file} + --chrArm ${params.chrArm} + --cnvGeneFile ${params.cnvGeneFile} + -w ${workDir} + -c ${params.config} + + Project Directory: ${projectDir} + Command line call: + ${workflow.commandLine} + ______________________________________________________ + """ +} diff --git a/bin/shared/extract_cnv_array_csv.nf b/bin/shared/extract_cnv_array_csv.nf new file mode 100644 index 00000000..68dfb6a5 --- /dev/null +++ b/bin/shared/extract_cnv_array_csv.nf @@ -0,0 +1,100 @@ +// Function to extract information (meta data + file(s)) from csv file(s) +// https://github.com/nf-core/sarek/blob/master/workflows/sarek.nf#L1084 + +ANSI_RED = "\u001B[31m"; +ANSI_RESET = "\u001B[0m"; + +def extract_csv(csv_file) { + // check that the sample sheet is not 1 line or less, because it'll skip all subsequent checks if so. + file(csv_file).withReader('UTF-8') { reader -> + def line, numberOfLinesInSampleSheet = 0; + while ((line = reader.readLine()) != null) {numberOfLinesInSampleSheet++} + if (numberOfLinesInSampleSheet < 2) { + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "Samplesheet had less than two lines. The sample sheet must be a csv file with a header, so at least two lines." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } + } + + Channel.from(csv_file).splitCsv(header: true) + .map{ row -> + if (!(row.sampleID) || !(row.idat_red) || !(row.idat_green)) { + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "Missing field in csv file header. The csv file must have fields: 'sampleID', 'idat_red', 'idat_green'." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } + [row.sampleID.toString(), row] + }.groupTuple() + .map{ meta, rows -> + size = rows.size() + [rows, size] + }.transpose() + .map{ row, numLanes -> //from here do the usual thing for csv parsing + + if (row.idat_red.substring(row.idat_red.lastIndexOf(System.getProperty("file.separator")) + 1).count("_") > 2){ + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "The file: " + row.idat_red + " containes more than 2 underscores in the name." + ANSI_RESET) + System.err.println(ANSI_RED + "IDAT files must have only 2 underscores (i.e., xxx_xxx_Grn.idat and xxx_xxx_Red.idat)." + ANSI_RESET) + System.err.println(ANSI_RED + "GEO (and others) rename files to have more than 2 (i.e., GSMxxx_xxx_xxx_Red.idat). File names must be adjusted prior to running." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } + if (row.idat_green.substring(row.idat_green.lastIndexOf(System.getProperty("file.separator")) + 1).count("_") > 2){ + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "The file: " + row.idat_green + " containes more than 2 underscores in the name." + ANSI_RESET) + System.err.println(ANSI_RED + "IDAT files must have only 2 underscores (i.e., xxx_xxx_Grn.idat and xxx_xxx_Red.idat)." + ANSI_RESET) + System.err.println(ANSI_RED + "GEO (and others) rename files to have more than 2 (i.e., GSMxxx_xxx_xxx_Red.idat). File names must be adjusted prior to running." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } + + // Metadata to identify samplesheet + def meta = [:] + + if (row.sampleID) meta.sampleID = row.sampleID.toString() + + if (row.gender != "XY" && row.gender != "XX" && row.gender != ""){ + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "Gender must be 'XX', 'XY' or empty. " + row.gender + " was provided, and isn't valid." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } + + if (row.gender == "") { + meta.gender = 'NA' + } else { + meta.gender = row.gender.toString() + } + + // join meta to idat, and check file existence. + try { + file(row.idat_red, checkIfExists: true) + } + catch (Exception e) { + System.err.println(ANSI_RED + "---------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "The file: " + row.idat_red + " does not exist. Use absolute paths, and check for correctness." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "---------------------------------------------" + ANSI_RESET) + System.exit(1) + } + try { + file(row.idat_green, checkIfExists: true) + } + catch (Exception e) { + System.err.println(ANSI_RED + "---------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "The file: " + row.idat_green + " does not exist. Use absolute paths, and check for correctness." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "---------------------------------------------" + ANSI_RESET) + System.exit(1) + } + + return [meta.sampleID, meta, row.idat_red, row.idat_green] + + } +} diff --git a/config/cnv_array.config b/config/cnv_array.config new file mode 100644 index 00000000..18eebf60 --- /dev/null +++ b/config/cnv_array.config @@ -0,0 +1,18 @@ +manifest { + name = "cnv_array" + description = 'Pipeline for processing Copy Number Variation from Illumina Genotype Array.' + author = 'Tejas Temker, Michael Lloyd, Copyright Jackson Laboratory 2024' + version = "0.1.0" +} + +params { + gtc_csv = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' + bpm_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' + egt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' + ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + snp_platform = 'IlluminaCytoSNP' + gc_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' + rt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg38.txt' + chrArm = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/GRCh38_chromosome_arm.txt' + cnvGeneFile = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/biomaRt_GRCh38_ensemblv102_CNVgeneAnnotations_primaryChroms.txt' +} diff --git a/main.nf b/main.nf index 31549e60..73a303af 100644 --- a/main.nf +++ b/main.nf @@ -48,6 +48,9 @@ else if (params.workflow == "gbrs"){ else if (params.workflow == "amplicon"){ include {AMPLICON} from './workflows/amplicon' } +else if (params.workflow == "cnv_array"){ + include {CNV_ARRAY} from './workflows/cnv_array' +} else { // if workflow name is not supported: exit 1, "ERROR: No valid pipeline called. '--workflow ${params.workflow}' is not a valid workflow name." @@ -100,4 +103,7 @@ workflow{ if (params.workflow == "amplicon"){ AMPLICON() } + if (params.workflow == "cnv_array"){ + CNV_ARRAY() + } } diff --git a/modules/ascat/ascat_annotation.nf b/modules/ascat/ascat_annotation.nf new file mode 100644 index 00000000..3eda34fe --- /dev/null +++ b/modules/ascat/ascat_annotation.nf @@ -0,0 +1,29 @@ +process ASCAT_ANNOTATION { + + tag "$sampleID" + + cpus = 1 + memory = 24.GB + time = '01:30:00' + errorStrategy = { (task.exitStatus == 140) ? { log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish' }.call() : 'finish' } + + container 'quay.io/jaxcompsci/ascat:v3.1.3' + + publishDir "${params.pubdir}/${params.organize_by == 'sample' ? sampleID : 'ascat_annotation'}", mode: 'copy' + + input: + tuple val(sampleID), val(meta), path(segments_raw), path(ploidy) + + output: + tuple val(sampleID), val(meta), path("*.segments_raw.extend.txt"), emit: seg_extended + tuple val(sampleID), val(meta), path("*.ensgene_cnvbreak.txt"), emit: ensembl_annot + tuple val(sampleID), val(meta), path("*.png"), emit: png + + script: + gender = meta.gender == 'XX' ? 'female' : 'male' + """ + perl ${projectDir}/bin/cnv_array/segment_raw_extend.pl ${segments_raw} ${ploidy} ${params.chrArm} ${gender} + perl ${projectDir}/bin/cnv_array/annotate_ensembl_genes.pl ${sampleID}.segments_raw.extend.txt ${params.cnvGeneFile} + R CMD BATCH --slave "--args ${sampleID}.segments_raw.extend.txt ${sampleID} ./ " ${projectDir}/bin/cnv_array/seg_plot.R + """ +} diff --git a/modules/ascat/ascat_run.nf b/modules/ascat/ascat_run.nf new file mode 100644 index 00000000..4725ade8 --- /dev/null +++ b/modules/ascat/ascat_run.nf @@ -0,0 +1,33 @@ +process ASCAT { + tag "$sampleID" + + cpus 1 + memory 24.GB + time '01:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/ascat:v3.1.3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'ascat' }", mode: 'copy' + + input: + tuple val(sampleID), val(meta), path(BAF), path(LRR) + + output: + tuple val(sampleID), val(meta), path("*.txt"), emit: all_txt + tuple val(sampleID), val(meta), path("*.png"), emit: all_png + tuple val(sampleID), val(meta), path("*.Rdata"), emit: ascat_rdata + tuple val(sampleID), val(meta), path("*segments_raw.txt"), path("*.ploidy.txt"), emit: seg_ploidy + + script: + """ + Rscript ${projectDir}/bin/cnv_array/ASCAT_run.R \ + ${sampleID} \ + ${BAF} \ + ${LRR} \ + ${meta.gender} \ + ${params.snp_platform} \ + ${params.gc_file} \ + ${params.rt_file} + """ +} diff --git a/modules/bcftools/bcftools_gtct2vcf.nf b/modules/bcftools/bcftools_gtct2vcf.nf new file mode 100644 index 00000000..8db2b948 --- /dev/null +++ b/modules/bcftools/bcftools_gtct2vcf.nf @@ -0,0 +1,34 @@ +process BCFTOOLS_GTC2VCF { + + tag "$sampleID" + + cpus = 1 + memory 24.GB + time '01:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'bcftools' }", pattern: "*.{vcf,tsv}", mode: 'copy' + + input: + tuple val(sampleID), val(meta), path(gtc) + + output: + tuple val(sampleID), val(meta), path('*_convert.bcf'), path('*_convert.bcf.csi'), path('*_convert.vcf'), path('*_convert_info.tsv'), emit: gtc2vcf + + script: + """ + bcftools +gtc2vcf --no-version -Ou \ + --bpm ${params.bpm_file} \ + --csv ${params.gtc_csv} \ + --egt ${params.egt_file} \ + --gtcs ./ \ + --fasta-ref ${params.ref_fa} \ + --extra ${sampleID}_convert_info.tsv | \ + bcftools sort -Ou -T ./bcftools. | \ + bcftools norm --no-version -Ob -c x -f ${params.ref_fa} | \ + tee ${sampleID}_convert.bcf | \ + bcftools index --force --output ${sampleID}_convert.bcf.csi + bcftools convert -O v -o ${sampleID}_convert.vcf ${sampleID}_convert.bcf + """ +} diff --git a/modules/bcftools/bcftools_query_ascat.nf b/modules/bcftools/bcftools_query_ascat.nf new file mode 100644 index 00000000..8d448242 --- /dev/null +++ b/modules/bcftools/bcftools_query_ascat.nf @@ -0,0 +1,28 @@ +process BCFTOOLS_QUERY_ASCAT { + + tag "$sampleID" + + cpus 1 + memory 8.GB + time '01:00:00' + errorStrategy 'finish' + + container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' + publishDir "${params.pubdir}/${params.organize_by == 'sample' ? sampleID : 'bcftools'}", mode: 'copy' + + input: + tuple val(sampleID), val(meta), path(bcf), path(csi), path(vcf), path(tsv) + + output: + tuple val(sampleID), val(meta), path('*_convert.BAF'), path('*_convert.LRR'), emit: baf_lrr + + script: + """ + (bcftools query -l ${sampleID}_convert.bcf | awk 'BEGIN{printf("\\tCHROM\\tPOS");} {printf("\\t%s",\$1);} END{printf("\\n");}' && bcftools query -f '%ID\\t%CHROM\\t%POS[\\t%BAF]\\n' ${sampleID}_convert.bcf) > ${sampleID}_convert.BAF + + (bcftools query -l ${sampleID}_convert.bcf | awk 'BEGIN{printf("\\tCHROM\\tPOS");} {printf("\\t%s",\$1);} END{printf("\\n");}' && bcftools query -f '%ID\\t%CHROM\\t%POS[\\t%LRR]\\n' ${sampleID}_convert.bcf) > ${sampleID}_convert.LRR + + sed -i s/chr// ${sampleID}_convert.BAF + sed -i s/chr// ${sampleID}_convert.LRR + """ +} diff --git a/modules/illumina/iaap_cli.nf b/modules/illumina/iaap_cli.nf new file mode 100644 index 00000000..c882e964 --- /dev/null +++ b/modules/illumina/iaap_cli.nf @@ -0,0 +1,33 @@ +process IAAP_CLI { + + tag "$sampleID" + + cpus = 1 + memory 24.GB + time '01:30:00' + + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'iaap_cli' }", pattern: "*.gtc", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'iaap_cli' }", pattern: "*.log", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), val(meta), path(red_idat), path(green_idat) + + output: + tuple val(sampleID), val(meta), path("*.gtc"), emit: gtc + tuple val(sampleID), val(meta), emit: ascat2r + path "iaap_cli.log", emit: iaap_cli_log + + script: + """ + /usr/local/bin/iaap-cli/iaap-cli gencall \ + ${params.bpm_file} \ + ${params.egt_file} \ + ./ \ + --idat-folder ./ \ + --output-gtc >> iaap_cli.log 2>&1 + """ +} diff --git a/nextflow.config b/nextflow.config index 3fdcb69e..5bac0e2a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,29 +1,27 @@ -/*___________________________________________________ - - Nextflow DSL2 Main Config - - Authors: Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson, Sai Lek, Harshpreet Chandok, Peter Fields - Copyright of Jackson Laboratories 2022 - -_____________________________________________________*/ - params { - // Select workflow - workflow = 'rnaseq' + // set workflow + pipeline = 'Not_Specified' + workflow = params.pipeline + + // define reference_cache directory + reference_cache='/projects/omics_share' - // select config from config folder to use + // select config from config folder to use based on workflow config = "config/${params.workflow}.config" // set publish directory for data to save (easier to follow) - pubdir = "../${workflow}" + pubdir = "/flashscratch/${USER}" + + profile = 'sumner2' // organize output: // by sample folders (with many analysis in one sample folder) or by // analysis folder (with many samples in one folder per analysis) - organize_by = 'sample' // analysis keep_intermediate = false // true - + fastq2 = true // default is PE for workflows + tmpdir = "/flashscratch/${USER}" // generic param + // get help help = null @@ -31,29 +29,30 @@ params { comment = '' } -// specific config for the pipeline - try { includeConfig params.config } catch (Exception e) { - System.err.println("ERROR: Could not load ${params.config} check that you are using a valid pipeline name") + System.err.println("ERROR: Could not load ${params.config} check that you are using a valid workflow name") + System.exit(1) } // work directory is important as it will be large, plan accordingly -workDir = "/fastscratch/${USER}/${params.workflow}" +workDir = "/flashscratch/${USER}/${params.workflow}" manifest { name = "The Jackson Laboratory Computational Sciences Nextflow based analysis pipelines" homePage = "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" mainScript = "main.nf" - nextflowVersion = "!>=20.10.0" - version = "0.4.1" + nextflowVersion = "!>=22.04.3" + version = "PIVOT" + author = 'Michael Lloyd, Brian Sanderson, Barry Guglielmo, Sai Lek, Peter Fields, Harshpreet Chandok, Carolyn Paisie, Gabriel Rech, Ardian Ferraj, Tejas Temker, Anuj Srivastava. Copyright Jackson Laboratory 2024' } + profiles { sumner { includeConfig "config/profiles/sumner.config" } - sumner2 { includeConfig "config/profiles/sumner2.config" } + sumner2 { includeConfig "config/profiles/sumner2.config" } elion { includeConfig "config/profiles/elion.config" } } diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 00000000..9d571fdc --- /dev/null +++ b/nf-test.config @@ -0,0 +1,13 @@ +config { + + testsDir "tests" + workDir ".nf-test" + configFile "tests/nextflow.config" + profile "sumner2" + stage { + symlink "subworkflows/" + symlink "modules/" + symlink "test/" + symlink "workflows/" + } +} diff --git a/test/cnv_array/example_sample_input.csv b/test/cnv_array/example_sample_input.csv new file mode 100644 index 00000000..2df939c1 --- /dev/null +++ b/test/cnv_array/example_sample_input.csv @@ -0,0 +1,4 @@ +sampleID,gender,idat_red,idat_green +Test_Sample_XY,XY,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Red.idat,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Grn.idat +Test_Sample_XX,XX,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Red.idat,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Grn.idat +Test_Sample_NA,,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Red.idat,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Grn.idat diff --git a/test/cnv_array/fail_example_input.csv b/test/cnv_array/fail_example_input.csv new file mode 100644 index 00000000..af206bfd --- /dev/null +++ b/test/cnv_array/fail_example_input.csv @@ -0,0 +1,2 @@ +sampleID,gender,idat_red,idat_green +Test_Sample_XY,XY,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018_R01C01_Red.idat,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018_R01C01_Grn.idat \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 00000000..c99eca59 --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,5 @@ +/* +======================================================================================== + Nextflow config file for running tests +======================================================================================== +*/ \ No newline at end of file diff --git a/tests/workflows/cnv_array.nf.test b/tests/workflows/cnv_array.nf.test new file mode 100644 index 00000000..136e05c5 --- /dev/null +++ b/tests/workflows/cnv_array.nf.test @@ -0,0 +1,139 @@ +nextflow_workflow { + + name "Test Workflow CNV_ARRAY" + script "workflows/cnv_array.nf" + workflow "CNV_ARRAY" + + test("Full Workflow") { + tag "primary" + when { + params { + csv_input = "${baseDir}/test/cnv_array/example_sample_input.csv" + pipeline = 'cnv_array' + } + } + + then { + assert workflow.success + } + } + + test("Full Workflow -- GEO filename failure") { + tag "primary" + when { + params { + csv_input = "${baseDir}/test/cnv_array/fail_example_input.csv" + pipeline = 'cnv_array' + } + } + + then { + assert workflow.failed + } + } + // test("IAAP_CLI Process") { + // tag "IAAP_CLI" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // pipeline = 'cnv_array' + // } + // } + + // then { + // assert workflow.success + // // assert GTC format files + // assert file("tests/results/iaap_cli_output.csv").exists() + // } + // } + + // test("BCFTOOLS_GTC2VCF Process") { + // tag "BCFTOOLS_GTC2VCF" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // } + // } + + // then { + // assert workflow.success + // // assert the tsv, bcf, csi and vcf files + // assert file("tests/results/bcftools_gtct2vcf_output.vcf").exists() + // } + // } + + // test("BCFTOOLS_QUERY_ASCAT Process") { + // tag "BCFTOOLS_QUERY_ASCAT" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // } + // } + + // then { + // assert workflow.success + // // assert the files BAF and LRR + // assert file("tests/results/bcftools_query_ascat_output.csv").exists() + // } + // } + + // test("ASCAT Process") { + // tag "ASCAT" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // } + // } + + // then { + // assert workflow.success + // // Assert the files + // assert file("tests/results/ascat_output.csv").exists() + // } + // } + + // test("ASCAT_ANNOTATION Process") { + // tag "ASCAT_ANNOTATION" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // } + // } + + // then { + // assert workflow.success + // // assert the files .txt file and ploly file + // assert file("tests/results/ascat_annotation_output.csv").exists() + // } + // } +} diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf new file mode 100644 index 00000000..449ae25d --- /dev/null +++ b/workflows/cnv_array.nf @@ -0,0 +1,44 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// Import modules +include {help} from "${projectDir}/bin/help/cnv_array.nf" +include {param_log} from "${projectDir}/bin/log/cnv_array.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_cnv_array_csv.nf" +include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli.nf" +include {BCFTOOLS_GTC2VCF} from "${projectDir}/modules/bcftools/bcftools_gtct2vcf.nf" +include {BCFTOOLS_QUERY_ASCAT} from "${projectDir}/modules/bcftools/bcftools_query_ascat.nf" +include {ASCAT} from "${projectDir}/modules/ascat/ascat_run.nf" +include {ASCAT_ANNOTATION} from "${projectDir}/modules/ascat/ascat_annotation.nf" + + +// Help if needed +if (params.help) { + help() + exit 0 +} + +// Log parameter info +param_log() +// Parameter validation +if (!params.bpm_file || !params.egt_file) { + exit 1, "All parameters (bpm_file, egt_file) are required." +} + +if (params.csv_input) { + ch_input = extract_csv(file(params.csv_input, checkIfExists: true)) +} else { + exit 1, "Workflow requires a CSV manifest. See `--help` for information." +} + +GC_file = file(params.gc_file, checkIfExists: true) +RT_file = file(params.rt_file, checkIfExists: true) + +// Main workflow +workflow CNV_ARRAY { + IAAP_CLI(ch_input) + BCFTOOLS_GTC2VCF(IAAP_CLI.out.gtc) + BCFTOOLS_QUERY_ASCAT(BCFTOOLS_GTC2VCF.out.gtc2vcf) + ASCAT(BCFTOOLS_QUERY_ASCAT.out.baf_lrr) + ASCAT_ANNOTATION(ASCAT.out.seg_ploidy) +}