Skip to content

Commit

Permalink
Merged in release (pull request #201)
Browse files Browse the repository at this point in the history
Release
  • Loading branch information
MikeWLloyd committed Aug 16, 2024
2 parents 67cd86b + ce3d5ac commit 3f55b38
Show file tree
Hide file tree
Showing 60 changed files with 1,160 additions and 41 deletions.
4 changes: 4 additions & 0 deletions .zenodo.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@
"affiliation": "The Jackson Laboratory",
"name": "Ardian Ferraj"
},
{
"affiliation": "The Jackson Laboratory",
"name": "Tejas Temker"
},
{
"affiliation": "The Jackson Laboratory",
"name": "Anuj Srivastava"
Expand Down
43 changes: 43 additions & 0 deletions ReleaseNotes.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,48 @@
# RELEASE NOTES

## Release 0.7.0

In this release we add a new workflow for calling copy number variation (CNV) from raw Illumina IDAT genotype array files. Currently the Illumina IlluminaCytoSNP v2.1 array is supported, but support for additional arrays is possible.

We make additional minor changes as described below.

### Pipelines Added:

1. CNV calling from Illumina genotype array data (--cnv_array)

### Modules Added:

1. modules/bcftools/bcftools_gtct2vcf.nf
1. modules/bcftools/bcftools_query_ascat.nf
1. modules/illumina/iaap_cli.nf
1. modules/ascat/ascat_run.nf
1. modules/ascat/ascat_annotation.nf

### Pipeline Changes:

None

### Module Changes:

1. Replaced the incorrect `${task.mem}` with `${task.memory}` in the Nextflow error catch statement in modules related to the SV calling workflows.
1. utility_modules/gzip.nf: Memory request increase

### Scripts Added:

1. cnv_array/ASCAT_run.R
1. cnv_array/annotate_ensembl_genes.pl
1. cnv_array/seg_plot.R
1. cnv_array/segment_raw_extend.pl

### Script Changes:

None

### NF-Test Modules Added:

1. tests/workflows/cnv_array.nf.test


## Release 0.6.7

In this release we make the following minor adjustments:
Expand Down
107 changes: 107 additions & 0 deletions bin/cnv_array/ASCAT_run.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
suppressMessages(library(ASCAT))

### ASCAT Run ######

# Note: this script expects ASCAT is running on single sample BAF/LRR files.

args=(commandArgs(TRUE))

sampleID = args[1]
BAF_file = args[2]
LRR_file = args[3]
gender = args[4]
platform = args[5]
GC_file = args[6]
RT_file = args[7]

######

# Expected SNP POS file:
# Probe Set ID Chromosome Physical Position
# CN_473963 1 61736
# CN_473964 1 61808

## the above can be taken from the BAF file. The BAF file contains positions for all valid SNPs.
SNPpos <- read.table(BAF_file, sep = "\t", header = TRUE)[ ,1:3]
colnames(SNPpos) <- c('Probe_Set_ID', 'Chromosome', 'Physical_Position')

##

if (gender == 'NA') {
gender = 'XY'
}

ascat.bc = ascat.loadData(Tumor_LogR_file = LRR_file, Tumor_BAF_file = BAF_file, gender = gender, genomeVersion = "hg38")

ascat.bc$samples[1] <- sampleID
colnames(ascat.bc[["Tumor_LogR"]]) <- sampleID
colnames(ascat.bc[["Tumor_BAF"]]) <- sampleID

ascat.plotRawData(ascat.bc, img.prefix = "Before_correction_")

ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = GC_file, replictimingfile = RT_file)

ascat.plotRawData(ascat.bc, img.prefix = "After_correction_")

gg = ascat.predictGermlineGenotypes(ascat.bc, platform = platform)

ascat.bc = ascat.aspcf(ascat.bc, ascat.gg = gg)

ascat.plotSegmentedData(ascat.bc)

ascat.output = ascat.runAscat(ascat.bc, write_segments = T)

##

QC = ascat.metrics(ascat.bc, ascat.output)

write.table(as.data.frame(QC), file = paste0(sampleID, "_sample.QC.txt"), sep="\t", quote=F, row.names=F, col.names=T)

save(ascat.bc, ascat.output, QC, file = paste0(sampleID, "_ASCAT_objects.Rdata"))

##

if ( length(ascat.output$failedarrays) == 0 ) {

num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments_raw))
for (i in 1:nrow(ascat.output$segments_raw)) {
L1 = which(SNPpos$Chromosome == ascat.output$segments_raw$chr[i] & SNPpos$Physical_Position == ascat.output$segments_raw$startpos[i])
L2 = which(SNPpos$Chromosome == ascat.output$segments_raw$chr[i] & SNPpos$Physical_Position == ascat.output$segments_raw$endpos[i])
num_probes[i] = L2[length(L2)] - L1[1] + 1
}
seg_raw = cbind(ascat.output$segments_raw,num_probes)

num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments))
for (i in 1:nrow(ascat.output$segments)) {

#print(i)
L1 = which(SNPpos$Chromosome == ascat.output$segments$chr[i] & SNPpos$Physical_Position == ascat.output$segments$startpos[i])
L2 = which(SNPpos$Chromosome == ascat.output$segments$chr[i] & SNPpos$Physical_Position == ascat.output$segments$endpos[i])
num_probes[i] = L2[length(L2)] - L1[1] + 1

}
seg = cbind(ascat.output$segments,num_probes)

seg_raw_dfs <- split(seg_raw, seg_raw$sample)
seg_dfs <- split(seg, seg$sample)

for (samp in names(seg_raw_dfs)){
write.table(seg_raw_dfs[[samp]], file = paste0(samp, ".segments_raw.txt"), sep="\t", quote=F, row.names=F)
write.table(seg_dfs[[samp]], file = paste0(samp, ".segments.txt"), sep="\t", quote=F, row.names=F)
write.table(as.data.frame(ascat.output$aberrantcellfraction)[row.names(as.data.frame(ascat.output$aberrantcellfraction)) %in% samp,], file=paste(samp,".aberrantcellfraction.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F)
write.table(as.data.frame(ascat.output$ploidy)[row.names(as.data.frame(ascat.output$ploidy)) %in% samp,], file=paste(samp,".ploidy.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F)
}

} else {

write.table(as.data.frame(ascat.output$failedarrays), file="ASCAT.failedarrays.txt", sep="\t", quote=F, row.names=F, col.names=F)

}

if ( !is.null(ascat.output$nonaberrantarrays) ) {

write.table(as.data.frame(ascat.output$nonaberrantarrays), file="ASCAT.nonaberrantarrays.txt", sep="\t", quote=F, row.names=F, col.names=F)

}

sessionInfo()
132 changes: 132 additions & 0 deletions bin/cnv_array/annotate_ensembl_genes.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#!/usr/bin/perl -w
use POSIX;
use File::Basename;

# This script annotates ensembl genes with copy number and breakpoints
# perl ensemblegenes_cnv_break.pl *.segments_raw.extend.txt mart_export_gene_chr1-Y.hg19ensembl75-85.08232016.txt

if ($#ARGV != 1) {
print "This scripts requires: <file_cn> <file_gene> \n";
exit(-1);
}

$file_cn = $ARGV[0];
$file_gene = $ARGV[1];

$file_output = basename($file_cn,".txt").".ensgene_cnvbreak.txt";
open(OUTFILE, ">$file_output");

open(GENEFILE, "$file_gene") or die "can't open $file_gene: $!";
$gene = <GENEFILE>;
chomp($gene);

open(CNFILE, "$file_cn") or die "can't open $file_cn: $!";
@data = <CNFILE>;
close(CNFILE);
chomp(@data);

#print OUTFILE "$tmp\tstartext\tendext\tstartext_desc\tendext_desc\tCN_raw\tLOH\tparm_fraction\tqarm_fraction\tploidy\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\n";
print OUTFILE "$gene\tnum_cnv_seg\tseg_desc\tploidy\tnMajor\tnMinor\tnAraw\tnBraw\tCN_raw\tLOH\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\tnMajor_max\tnMinor_max\tnAraw_max\tnBraw_max\tCN_raw_max\tLOH_max\tcopydiff_2_max\tcopydiff_ploidy_max\tlogratio_2_max\tlogratio_ploidy_max\n";

while ($gene = <GENEFILE>) {

chomp($gene);
@line = split(/\t/, $gene);
$chr = $line[2];
$start = $line[3];
$end = $line[4];

#$cnraw1=999;
$numseg=0;
$region="";
%segline = ();
@n = ();

for ($j=1; $j<=$#data; $j++) {
@segment = split(/\t/, $data[$j]);

$chr_cn = $segment[1];
$pos1 = $segment[2];
$pos2 = $segment[3];
$pos1ext = $segment[9];
$pos2ext = $segment[10];
$left = $segment[11];
$right = $segment[12];
$cnraw = $segment[13];

if (($chr_cn eq $chr) && ($start <= $pos2ext) && ($end >= $pos1ext)) { #overlap
#$numseg++;
push(@n, $cnraw);
$segline{$cnraw} = [ @segment ];

#check if overlap with regions with no call
if (($start <= $pos1) && ($end >= $pos1ext)) {
$region = $region.$left.";";
}
if (($start <= $pos2ext) && ($end >= $pos2)) {
$region = $region.$right.";";
}

#if ($cnraw < $cnraw1) {
# $cnraw1 = $cnraw;
# $count = $j;
#}
}
}

if ($region eq "") {
$region = "NA";
}

if ($#n >= 0) {

$numseg = $#n +1;
@sortn = sort{ $a <=> $b } @n;

$nA = $segline{$sortn[0]}[4];
$nB = $segline{$sortn[0]}[5];
$rawA = $segline{$sortn[0]}[6];
$rawB = $segline{$sortn[0]}[7];
$cnraw = $segline{$sortn[0]}[13];
$loh = $segline{$sortn[0]}[14];
$ploidy= $segline{$sortn[0]}[17];
$copydiff1 = $segline{$sortn[0]}[18];
$copydiff2 = $segline{$sortn[0]}[19];
$logratio1 = $segline{$sortn[0]}[20];
$logratio2 = $segline{$sortn[0]}[21];

$outline = "$gene\t$numseg\t$region\t$ploidy\t$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2\t";

if ($numseg > 1 ) {
$nA = $segline{$sortn[$#sortn]}[4];
$nB = $segline{$sortn[$#sortn]}[5];
$rawA = $segline{$sortn[$#sortn]}[6];
$rawB = $segline{$sortn[$#sortn]}[7];
$cnraw = $segline{$sortn[$#sortn]}[13];
$loh = $segline{$sortn[$#sortn]}[14];
$copydiff1 = $segline{$sortn[$#sortn]}[18];
$copydiff2 = $segline{$sortn[$#sortn]}[19];
$logratio1 = $segline{$sortn[$#sortn]}[20];
$logratio2 = $segline{$sortn[$#sortn]}[21];
}
else {
$nA = "NA";
$nB = "NA";
$rawA = "NA";
$rawB = "NA";
$cnraw = "NA";
$loh = "NA";
$copydiff1 = "NA";
$copydiff2 = "NA";
$logratio1 = "NA";
$logratio2 = "NA";

}

$outline = $outline."$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2";
print OUTFILE "$outline\n";
}
}

close (GENEFILE);
close (OUTFILE);
Loading

0 comments on commit 3f55b38

Please sign in to comment.