diff --git a/README.md b/README.md index 6b86511..d2aecd4 100644 --- a/README.md +++ b/README.md @@ -138,11 +138,6 @@ piece) AGATCGGAAGAGCG) currently only used for adaptor ligase see -al and when * `-s` the posible distance of the start. This is the distance count from the start of the read to the first basepair of the barcode or enzyme (standard 0, maximum 20) -* `-cc` Checks the complete read for the enzyme (if false, stops at the -first possible enzyme cutsite) (use values true or false, standard is true). If -used, the sequence after the enzyme site is compared to the adaptors, if the -first basepairs of the sequence are compaired to the first basepairs of the -adaptor * `-kc` Keep the enzyme cut-site remains (standard true) (example: enzyme ApeKI and restriction site G^CWGC: "ApeKI \tab CAGC,CTGC") * `-ea` Add enzymes from the given file (keeps the standard enzymes, and @@ -151,10 +146,7 @@ cutsites are comma separeted)) (only use once, not use -er) (example: enzyme ApeKI and restriction site G^CWGC: "ApeKI \tab CAGC,CTGC") * `-er` Replace enzymes from the given file (do not keep the standard enzymes) (enzyme file: no header, enzyme name tab cutsites (multiple cutsites -are comma separeted)) (only use once, not use -ea) -* `-al` check for adaptor ligase: no (for no check) or a positive integer -(starts at 0), for the number of mismatches (only checks 10 basepairs of -the adaptor), standard 1 +are comma separeted)) (only use once, not use -ea) * `-scb` Use self correcting barcodes (barcodes created by the barcodeGenerator) (standard false) * `-malg` the used algorithm to find mismatches and indels, possible @@ -259,4 +251,9 @@ v1.1.2 the read is considered as unvalid (previous was first sample) v1.1.3 -* On request added the enzyme AvaII \ No newline at end of file +* On request added the enzyme AvaII + +v1.1.4 +* Update adaptor ligase finding algorithm +* Removed unneeded, confusing parameters -cc and -al +* Removed unused code \ No newline at end of file diff --git a/releases/GBSX_v1.1.4/GBSX_digest_v1.0.pl b/releases/GBSX_v1.1.4/GBSX_digest_v1.0.pl new file mode 100644 index 0000000..3499548 --- /dev/null +++ b/releases/GBSX_v1.1.4/GBSX_digest_v1.0.pl @@ -0,0 +1,380 @@ +#!/usr/bin/perl + +#################################################################################################################### +# This is GBSX v1.0. A toolkit for experimental design and demultiplexing genotyping by sequencing experiments. # +# # +# Copyright (C) 2014 KU Leuven # +# # +# This file is part of GBSX. # +# # +# GBSX is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# GBSX is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with GBSX. If not, see . # +#################################################################################################################### + +######################################################################################################################################################################## +use Getopt::Std; +use Bio::SeqIO; +use Data::Dumper; +use Bio::PrimarySeq; +use Bio::DB::Fasta; +use Bio::Restriction::EnzymeCollection; +use Bio::Restriction::Analysis; +use Bio::Restriction::Enzyme; + + + +######################################################################################################################################################################## +my $program_information = "GBSX v1.0"; +my $license = "GBSX v1.0 Copyright (C) 2014 KU Leuven\nThis program comes with ABSOLUTELY NO WARRANTY; for details type `perl GBSX_digest_v1.0.pl -w'.\nThis is free software, and you are welcome to redistribute it under certain conditions; type `perl GBSX_digest_v1.0.pl -c' for details.\n"; + +$Getopt::Std::STANDARD_HELP_VERSION = 1; +my $help_message="\n$license\n####################################################################################\nusage: perl GBSX_digest_v1.0.pl -d \'digest sequence\' -l \'read length\' -f \'file of reference fasta file location(s)\'\n\noptional parameters: \n\t-e \'enzyme name to use\' (default: Enzyme)\n\t-g \'genome name to use in bed file name\' (default: genome)\n\t-n \'minimum size fragments to include\' (default: 100)\n\t-m \'maximum size fragments to use\' (default: 1000)\n\n\t-E \'second enzyme name to use\' (default: Enzyme2)\n\t-D \'digest sequence for a second enzyme\' (default: not declared)\n\t-R \'digest sequence for a third enzyme\' (default: not declared)\n"; + +######################################################################################################################################################################## +######################################################################################################################################################################## +#input parameters +sub HELP_MESSAGE {die "$help_message\n";} +sub VERSION_MESSAGE {print "$program_information\n";} + +my %opts = (); +getopts('e:d:g:l:f:m:n:E:D:R:cwh'); + +#die with help message if -h +if ($opt_h){ die "$help_message\n"; } +#die with warrenty message if -w +if ($opt_w){ die "GBSX is distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\nGNU General Public License for more details.\n"; } +#die with conditions message if -c +if ($opt_c){ die "GBSX is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.\n"; } + + +#enzyme name (optional) +my $enzyme_name = $opt_e || "Enzyme"; + +#digest sequence: example "RCATG^Y"; +my $digest_seq = $opt_d || "not_defined"; if ($digest_seq ne "not_defined"){$digest_seq=uc($digest_seq);} + +#genome name to use for bedfile (optional) +my $bed_genome_name = $opt_g || "genome"; + +#define read length: assumes PE, only report back in bed first and last number of bps corresponding to this +my $read_length = $opt_l || "not_defined"; + +#file of reference fasta locations +my $fasta_file_locations = $opt_f || "not_defined"; + +#max/min fragment sizes (optional, otherwise use defaults) +my $max_size = $opt_m || 1000; +my $min_size = $opt_n || 100; + +#enzyme2 name (optional, otherwise if second digest just default to Enzyme2) +my $enzyme2_name = $opt_E || "Enzyme2"; + #make sure starts Upper case + $enzyme2_name = ucfirst($enzyme2_name); + #double check this is not the same as Enzyme1 + if ($enzyme2_name eq $enzyme_name){die "Please use different enzyme names for enzyme 1 and enzyme 2\n"; } + +#digest sequence for a second enzyme: example "RCATG^Y"; (optional, otherwise no second digest) +my $digest2_seq = $opt_D || "not_defined"; if ($digest2_seq ne "not_defined"){$digest2_seq=uc($digest2_seq);} + #double check this is not the same as Enzyme1 + if ($digest2_seq eq $digest_seq){die "Please use different enzyme cut sequences for enzyme 1 and enzyme 2\n"; } + +#digest sequence for a third enzyme: example "RCATG^Y"; (optional, otherwise no third digest) +my $digest3_seq = $opt_R || "not_defined"; + if ($digest3_seq ne "not_defined"){$digest3_seq=uc($digest3_seq); + #double check this is not the same as Enzyme1 or Enzyme3 + if ( ($digest3_seq eq $digest2_seq) || ($digest3_seq eq $digest_seq) ){die "Please use different enzyme cut sequences for enzyme 1, enzyme 2, and enzyme 3\n"; } + #we call this Enzyme3, so make sure nothing else uses this name + if ( ($enzyme2_name eq "Enzyme3") || ($enzyme_name eq "Enzyme3") ){ die "Enzyme3 is a reseved name, please use a different enzyme name\n"; } + } + +#die if required parameters are missing +if ( ($digest_seq eq "not_defined") || ($read_length eq "not_defined") || ($fasta_file_locations eq "not_defined") ){ + my $missing=" "; + if ($digest_seq eq "not_defined"){$missing = $missing."-d (digest sequence) ";} + if ($read_length eq "not_defined"){$missing = $missing."-l (read length) ";} + if ($fasta_file_locations eq "not_defined"){$missing = $missing."-f (reference fasta locations) ";} + die "missing parameter(s):$missing\n"; +} + +######################################################################################################################################################################## +######################################################################################################################################################################## +#first basic steps to get ready for the analysis (mostly defining variables) + +#using input information, define outfiles: +my $outfile; my $main_bed; +if ($digest2_seq ne "not_defined"){ + $outfile = $bed_genome_name.".".$enzyme_name.".".$enzyme2_name.".".$read_length."nt.digest_results"; + $main_bed = $bed_genome_name.".".$enzyme_name.".".$enzyme2_name.".".$read_length."nt.digest.bed"; +}else{ + $outfile = $bed_genome_name.".".$enzyme_name.".".$read_length."nt.digest_results"; + $main_bed = $bed_genome_name.".".$enzyme_name.".".$read_length."nt.digest.bed"; +} + +#get reference fasta files +my @ref_chr; +open (ref_files,$fasta_file_locations)||die "could not open file $fasta_file_locations\n"; +while(){ + chop; + $_=~s/\n//;$_=~s/\r//; + if ($_ ne ""){ + push(@ref_chr,$_); + #current script only works with one sequence per file, add a check for this + my $grep_results = `grep \"^\>\" $_`; + my @grep_array = split(/\n/,$grep_results); my $number_sequences_in_file = @grep_array; + if ($number_sequences_in_file > 1){die "please use only one sequence per fasta file, $_ had $number_sequences_in_file sequences\n";} + } +} +close(ref_files)||die "could not close file $fasta_file_locations\n"; + +#define enzyme to use +my $rest_enzyme = new Bio::Restriction::Enzyme(-enzyme => $enzyme_name, -seq => $digest_seq); + +#optional: define a second enzyme +my $rest_enzyme2; if ($digest2_seq ne "not_defined"){ $rest_enzyme2 = new Bio::Restriction::Enzyme(-enzyme => $enzyme2_name, -seq => $digest2_seq); } + +#optional: define a third enzyme +my $rest_enzyme3; if ($digest3_seq ne "not_defined"){ $rest_enzyme3 = new Bio::Restriction::Enzyme(-enzyme => "Enzyme3", -seq => $digest3_seq); } + +#define ouput count variables +my $total_cuts=0; my $total_fragments=0; my $total_fragments_between_min_and_max = 0; +my $bin_size=100;#hard coded for now +my $temp_size=$min_size; + until (($temp_size+$bin_size)>$max_size){ + my $temp_size_plus_bin = $temp_size+$bin_size; + my $variable_name = "total_fragments_between_".$temp_size."_and_".$temp_size_plus_bin; + $$variable_name=0; + $temp_size=$temp_size+$bin_size; + } +my $total_fragments_less_min = 0; +my %fragments_in_range_per_chr; my %fragments_in_range_per_chr_enzyme1_end; my %fragments_in_range_per_chr_enzyme2_end; my %fragments_in_range_per_chr_enzymeboth_end; + +#define ouput count variables, optional for a second enzyme +my $total_cuts_enzyme2=0; my $fragments_both_ends_enzyme1 = 0; my $fragments_both_ends_enzyme2 = 0; my $fragments_end_from_each_enzyme = 0; + +#define ouput count variables, optional for a third enzyme +my $fragments_in_range_contain_third_enzyme = 0; + +#define ouput count variables, distribution of distances +my $one_kb = 0; my $ten_kb = 0; my $hundred_kb=0; my $one_Mb = 0; my $ten_Mb = 0; my $more_ten_kb=0; + +######################################################################################################################################################################## +######################################################################################################################################################################## +#per chromosome, perform digest and add to totals + +#open bed file to print to +open(outbed,">$main_bed")||die "could not open out bedfile $main_bed\n"; + +foreach my $chr (@ref_chr){ + + #just get chr name and remove directory structure + my @chr_name_parts =split(/\//,$chr);my $number_parts=@chr_name_parts; + my $chr_name=@chr_name_parts[$number_parts-1]; + $chr_name=~s/.fa$//i;$chr_name=~s/.fasta$//i; + $fragments_in_range_per_chr{$chr_name}=0; + $fragments_in_range_per_chr_enzyme1_end{$chr_name}=0; + $fragments_in_range_per_chr_enzyme2_end{$chr_name}=0; + $fragments_in_range_per_chr_enzymeboth_end{$chr_name}=0; + + #get fasta file for this chr + my $seqio_object1 = Bio::SeqIO->new(-file => $chr, -format => "fasta"); + my $seq = $seqio_object1->next_seq; + my $fastadb = Bio::DB::Fasta->new($chr); + + #digest + my $rest_analysis=Bio::Restriction::Analysis->new(-seq=>$seq, -enzymes=>$rest_enzyme); + + #just get cut numbers first + my $chr_cuts=$rest_analysis->cuts_by_enzyme($enzyme_name); + $total_cuts=$total_cuts+$chr_cuts; + + #optional digest and cut numbers for a second enzyme + my $rest_analysis2;my $chr_cuts2; + if ($digest2_seq ne "not_defined"){ + $rest_analysis2=Bio::Restriction::Analysis->new(-seq=>$seq, -enzymes=>$rest_enzyme2); + $chr_cuts2=$rest_analysis2->cuts_by_enzyme($enzyme2_name); + $total_cuts_enzyme2=$total_cuts_enzyme2+$chr_cuts2; + } + + #get locations of cuts + my @locations1 = $rest_analysis->positions($enzyme_name); + #cut locations if second enzyme + my %hash_locations1; my %hash_locations2;#use position as keys + my @locations2;my @all_locations; + if ($digest2_seq ne "not_defined"){ + @locations2=$rest_analysis2->positions($enzyme2_name); + @all_locations = (@locations1,@locations2); + @all_locations=sort {$a <=> $b} @all_locations; + + foreach my $position1 (@locations1){ $hash_locations1{$position1}=1; } + foreach my $position2 (@locations2){ $hash_locations2{$position2}=1; } + }else{ + @all_locations=@locations1; + } + + #go through all cut locations and check fragment sizes + my @chr_with_directories = split(/\//,$chr); $chr_name=pop @chr_with_directories; $chr_name=~s/.fa//; + my $previous_location=0; my $previous_fragment_in_range_end=0; + + #to analyze every fragment, add the end of the chromosome as a position + my $chr_end_position = $seq->length; + push(@all_locations,$chr_end_position); + + foreach my $location (@all_locations){ + $total_fragments++; + if ($location<$previous_location){die "error in order, $location < $previous_location";} + my $fragment_length = $location-$previous_location; + if (($fragment_length<=$max_size)&&($fragment_length>=$min_size)){ + $total_fragments_between_min_and_max++; + $fragments_in_range_per_chr{$chr_name}=$fragments_in_range_per_chr{$chr_name}+1; + + #for distribution of distance between sequenced fragments + if ($previous_fragment_in_range_end!=0){ + my $distance_this_fragment_to_previous=$previous_location-$previous_fragment_in_range_end; + if ( $distance_this_fragment_to_previous<=1000 ){ + $one_kb++; + }elsif( ($distance_this_fragment_to_previous>1000) && ($distance_this_fragment_to_previous<=10000)){ + $ten_kb++; + }elsif( ($distance_this_fragment_to_previous>10000) && ($distance_this_fragment_to_previous<=100000)){ + $hundred_kb++; + }elsif( ($distance_this_fragment_to_previous>100000) && ($distance_this_fragment_to_previous<=1000000)){ + $one_Mb++; + }elsif( ($distance_this_fragment_to_previous>1000000) && ($distance_this_fragment_to_previous<=10000000)){ + $ten_Mb++; + }elsif( ($distance_this_fragment_to_previous>10000000) ){ + $more_ten_kb++; + }else{ + die "error in distances\n"; + } + } + $previous_fragment_in_range_end=$location; + + + #if second enzyme, are ends from one or both enzymes + if ($digest2_seq ne "not_defined"){ + if ( (exists($hash_locations1{$previous_location})) && (exists($hash_locations1{$location})) ){ + $fragments_both_ends_enzyme1++; #both ends from enzyme1 + $fragments_in_range_per_chr_enzyme1_end{$chr_name}=$fragments_in_range_per_chr_enzyme1_end{$chr_name}+1; + }elsif ( (exists($hash_locations2{$previous_location})) && (exists($hash_locations2{$location})) ){ + $fragments_both_ends_enzyme2++; #both ends from enzyme2 + $fragments_in_range_per_chr_enzyme2_end{$chr_name}=$fragments_in_range_per_chr_enzyme2_end{$chr_name}+1; + }elsif ( (exists($hash_locations1{$previous_location})) && (exists($hash_locations2{$location})) ){ + $fragments_end_from_each_enzyme++; #one end from each enzyme + $fragments_in_range_per_chr_enzymeboth_end{$chr_name}=$fragments_in_range_per_chr_enzymeboth_end{$chr_name}+1; + }elsif ( (exists($hash_locations2{$previous_location})) && (exists($hash_locations1{$location})) ){ + $fragments_end_from_each_enzyme++; #one end from each enzyme + $fragments_in_range_per_chr_enzymeboth_end{$chr_name}=$fragments_in_range_per_chr_enzymeboth_end{$chr_name}+1; + } + } + + #counts on bin sizes + my $temp_size=$min_size; + until (($temp_size+$bin_size)>$max_size){ + my $temp_size_plus_bin = $temp_size+$bin_size; + if ( ($fragment_length>=$temp_size)&&($fragment_length<$temp_size_plus_bin) ){ + my $variable_name = "total_fragments_between_".$temp_size."_and_".$temp_size_plus_bin; + $$variable_name=$$variable_name+1; + } + $temp_size=$temp_size+$bin_size; + } + + #print to bed the potentially sequenced bases + if ($fragment_length <= (2*$read_length)){ + print outbed "$chr_name\t$previous_location\t$location\n"; + }else{ + #print first read + $previous_location_plus=$previous_location+$read_length; + print outbed "$chr_name\t$previous_location\t$previous_location_plus\n"; + #print second read + $location_minus=$location-$read_length; + print outbed "$chr_name\t$location_minus\t$location\n"; + } + + #if third enzyme requested, check if there is a digest site in this fragment + if ($digest3_seq ne "not_defined"){ + my $fragment_seq = $fastadb->seq($chr_name, ($previous_location+1) => ($location)); + my $frag_seq_obj = Bio::Seq->new( -seq => $fragment_seq ); + my $rest_analysis3=Bio::Restriction::Analysis->new(-seq=>$frag_seq_obj, -enzymes=>$rest_enzyme3); + my $frag_cuts3=$rest_analysis3->cuts_by_enzyme("Enzyme3"); + if ($frag_cuts3>0){ $fragments_in_range_contain_third_enzyme++ } + } + + }elsif ($fragment_length<$min_size){ + $total_fragments_less_min++; + } + $previous_location=$location; + } + +} + +#close bedfile +close(outbed)||die "could not open out bedfile $main_bed\n"; + +######################################################################################################################################################################## +######################################################################################################################################################################## +#print to output file general digest info +open(out,">$outfile")||die "could not open outfile $outfile\n"; + +#print input parameters +print out "input parameters:\n\tenzyme name used: $enzyme_name\n\tenzyme cut sequence: $digest_seq\n"; +if ($digest2_seq ne "not_defined"){ print out "\tsecond enzyme name used: $enzyme2_name\n\tsecond enzyme cut sequence: $digest2_seq\n"; } +if ($digest3_seq ne "not_defined"){ print out "\tthird enzyme cut sequence: $digest3_seq\n"; } +print out "\tfor bedfile, genome name used: $bed_genome_name\n\tread length provided: $read_length\nminimum fragment size: $min_size\nmaximum fragment size: $max_size\n\nreference sequence(s) analyzed:\n"; +foreach my $ref_to_print_to_output (@ref_chr){ print out "\t$ref_to_print_to_output\n"; } +print out "\n"; + +#print number of cuts +print out "total cuts for enzyme ($enzyme_name): $total_cuts\n"; +if ($digest2_seq ne "not_defined"){ print out "total cuts for second enzyme ($enzyme2_name): $total_cuts_enzyme2\n"; } + +#print number of fragments +print out "\nlooked at $total_fragments fragments in total\n"; +print out "$total_fragments_between_min_and_max fragments <= $max_size nt and >= $min_size nt\n"; +if ($digest3_seq ne "not_defined"){ print out "\t-containing one or more third digest site ($digest3_seq): $fragments_in_range_contain_third_enzyme\n"; } +if ($digest2_seq ne "not_defined"){ + print out "\t-with both ends from $enzyme_name: $fragments_both_ends_enzyme1\n"; + print out "\t-with both ends from $enzyme2_name: $fragments_both_ends_enzyme2\n"; + print out "\t-with an end from each enzyme: $fragments_end_from_each_enzyme\n"; + print out "\tfragments in range per reference sequence: (both ends from $enzyme_name / both ends from $enzyme2_name / an end from each enzyme)\n"; +}else{ + print out "\tfragments in range per reference sequence:\n"; +} + +#print fragment chromosome distribution +my @chr_names_for_count_print = keys %fragments_in_range_per_chr; +foreach my $chr_name_for_count_print (@chr_names_for_count_print){ + if ($digest2_seq ne "not_defined"){ + print out "\t\t$chr_name_for_count_print had $fragments_in_range_per_chr{$chr_name_for_count_print} fragments ($fragments_in_range_per_chr_enzyme1_end{$chr_name_for_count_print}/$fragments_in_range_per_chr_enzyme2_end{$chr_name_for_count_print}/$fragments_in_range_per_chr_enzymeboth_end{$chr_name_for_count_print})\n"; + }else{ + print out "\t\t$chr_name_for_count_print had $fragments_in_range_per_chr{$chr_name_for_count_print} fragments\n"; + } +} + +#print fragment distance distribution +print out "\tdistances between fragments in range:\n"; +print out "\t\t<=1kb $one_kb fragment pairs\n\t\t1kb-10kb $ten_kb fragment pairs\n\t\t10kb-100kb $hundred_kb fragment pairs\n\t\t100kb-1Mb $one_Mb fragment pairs\n"; +print out "\t\t1Mb-10Mb $ten_Mb fragment pairs\n\t\t>10Mb $more_ten_kb fragment pairs\n"; + + +#print fragment size distribution +print out "\tfragments less than $min_size nt: $total_fragments_less_min \n\n"; +my $temp_size=$min_size; +until (($temp_size+$bin_size)>$max_size){ + my $temp_size_plus_bin = $temp_size+$bin_size; + my $variable_name = "total_fragments_between_".$temp_size."_and_".$temp_size_plus_bin; + $temp_size_plus_bin--; + print out "\tfragments $temp_size-$temp_size_plus_bin nt: $$variable_name\n"; + $temp_size=$temp_size+$bin_size; +} +print out "\n"; + +close(out)||die "could not close outfile $outfile\n"; diff --git a/releases/latest/GBSX_v1.1.3.jar b/releases/GBSX_v1.1.4/GBSX_v1.1.4.jar similarity index 83% rename from releases/latest/GBSX_v1.1.3.jar rename to releases/GBSX_v1.1.4/GBSX_v1.1.4.jar index da9212b..24319e0 100644 Binary files a/releases/latest/GBSX_v1.1.3.jar and b/releases/GBSX_v1.1.4/GBSX_v1.1.4.jar differ diff --git a/releases/latest/GBSX_v1.1.4.jar b/releases/latest/GBSX_v1.1.4.jar new file mode 100644 index 0000000..24319e0 Binary files /dev/null and b/releases/latest/GBSX_v1.1.4.jar differ diff --git a/src/be/uzleuven/gc/logistics/GBSX/GBSX.java b/src/be/uzleuven/gc/logistics/GBSX/GBSX.java index 282d032..8d91c5f 100644 --- a/src/be/uzleuven/gc/logistics/GBSX/GBSX.java +++ b/src/be/uzleuven/gc/logistics/GBSX/GBSX.java @@ -35,7 +35,7 @@ public class GBSX { public static final boolean DEBUG = false; - public final static String VERSION = "GBSX v1.1.3"; + public final static String VERSION = "GBSX v1.1.4"; private final static String LICENCE = "GPLv3"; /** diff --git a/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/GBSdemultiplex.java b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/GBSdemultiplex.java index 06a29bd..446282f 100644 --- a/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/GBSdemultiplex.java +++ b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/GBSdemultiplex.java @@ -36,7 +36,7 @@ public class GBSdemultiplex { public static final boolean DEBUG = false; - public final static String VERSION = "GBSX v1.2"; + public final static String VERSION = "GBSX v1.3"; public final static String LICENCE = "GPLv3"; /* diff --git a/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/application/FastqDemultiplex.java b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/application/FastqDemultiplex.java index ed3e944..18fcf5c 100644 --- a/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/application/FastqDemultiplex.java +++ b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/application/FastqDemultiplex.java @@ -38,6 +38,8 @@ import be.uzleuven.gc.logistics.GBSX.utils.fastq.model.FastqParts; import be.uzleuven.gc.logistics.GBSX.demultiplexer.model.DemultiplexArguments; import be.uzleuven.gc.logistics.GBSX.demultiplexer.model.DemultiplexParameters; +import be.uzleuven.gc.logistics.GBSX.demultiplexer.model.ProcessedFragment; +import be.uzleuven.gc.logistics.GBSX.demultiplexer.model.SampleBarcodeCombination; import be.uzleuven.gc.logistics.GBSX.utils.enzyme.model.EnzymeComparator; import be.uzleuven.gc.logistics.GBSX.utils.enzyme.model.EnzymeEnum; import be.uzleuven.gc.logistics.GBSX.utils.fastq.infrastructure.FastqPairBufferedReader; @@ -78,6 +80,7 @@ public final class FastqDemultiplex { * this makes the bases for the demultiplexing. The given array contains all the needed arguments. *
If not, there will be thrown exceptions like IllegalArgumentException or RunTimeException. This will end the program and the JVM on a correct way * @param args String[] | all the arguments needed for the excecution. + * @throws be.uzleuven.gc.logistics.GBSX.utils.exceptions.StopExcecutionException */ public FastqDemultiplex(String[] args) throws StopExcecutionException{ this.parameters = new DemultiplexParameters(); @@ -322,7 +325,7 @@ private void processPairEndFastqFiles(){ //read the next fastq line try { //try to parse the fastq - ProcessedFragment newReads = this.parseFastqRead_new(fastq1, fastq2); + ProcessedFragment newReads = this.parseFastqRead(fastq1, fastq2); //if a read is empty: correct it if (newReads.getRead1().getSequence().equals("")){ newReads = new ProcessedFragment(newReads.getSample(), new FastqRead(newReads.getRead1().getDescription(), "N", "#"), newReads.getRead2(), newReads.getMismatch()); @@ -498,319 +501,7 @@ public void writeToLog(String log){ Logger.getLogger(FastqDemultiplex.class.getName()).log(Level.SEVERE, null, ex); } } - - /** - * this method gets two reads from a pair-end fastq file. - *
Read1 is from the first read, Read2 is from the reverse read - *
The reads are made of the 4 lines for every read: the information line, the sequence line, the + line and the quality line. - *
The first read is searched for the barcode + enzyme site (looked to all known samples) - *
if no barcode + enzyme (perfect match) is found, a InvalidReadException is thrown - *
else these are trimed from the read (both from the sequence as the quality) - *
then the read is searched for a cutsite of the same enzyme, if any that piece + the rest is removed (both from the sequence as the quality) - *
The second read is searched for the enzyme site - *
if no enzyme site (perfect match) is found, a InvalidReadException is thrown - *
else these are trimed from the read (both sequence as quality) - *
then the read is searched for a secund cutsite (this will be the complement of the enzyme site + barcode) - *
if found these are removed + rest after (both from the sequence as the quality) - *
a new array of Strings is created with 0 as the modified read of read1, and 1 as the modified read of read2 - * @param read1 Map | the 4 lines for the first read: the information line, the sequence line, the + line and the quality line (as Fastq from BioJava) - * @param read2 Map | the 4 lines for the second read: the information line, the sequence line, the + line and the quality line (as Fastq from BioJava) - * @return ProcessedRead | All information about the processed reads - * @throws InvalidReadException if the first or second read doesn't has the right index (barcode + enzyme site for read1, enzyme site for read2) - * @see FastqDemultiplex#findBestBarcode(java.lang.String) - * @see FastqDemultiplex#findRead1EnzymeLocation(java.lang.String, be.uzleuven.gc.logistics.gbsDemultiplex.model.Sample) - * @see ProcessedFragment - */ - private ProcessedFragment parseFastqRead(FastqRead read1, FastqRead read2) throws InvalidReadException{ - //search for the optimal barcode - //SampleBarcodeCombination sampleBarcodeCombination = this.findBestBarcode(read1.getSequence()); - SampleBarcodeCombination sampleBarcodeCombination = this.findGBSBarcode(read1.getSequence(), this.parameters.getStartDistance()); - if (sampleBarcodeCombination == null){ - //no optimal barcode found in the first read - throw new InvalidReadException(read1, read2, InvalidReadEnum.READ1); - } - Sample sample = sampleBarcodeCombination.getSample(); - String barcodeEnzyme = sampleBarcodeCombination.getSample().getBarcode(); - String enzymeCutsite = sampleBarcodeCombination.getEnzymeCutsite(); - int barcodeEnzymeLength = sampleBarcodeCombination.getLengthFoundBarcode(); - if (! this.parameters.keepCutSites()){ - //the cutsite mustn't be kept - barcodeEnzyme += sampleBarcodeCombination.getEnzymeCutsite(); - barcodeEnzymeLength += sampleBarcodeCombination.getLengthFoundEnzyme(); - } - //remove the barcode and the enzyme site - int read1BarcodeLocation = sampleBarcodeCombination.getLocation(); - String read1modifiedSequence = read1.getSequence().substring(read1BarcodeLocation + barcodeEnzymeLength, read1.getSequence().length() - (this.longestBarcodeLength - sample.getBarcode().length())); - String read1modifiedQuality = read1.getQuality().substring(read1BarcodeLocation + barcodeEnzymeLength, read1.getSequence().length() - (this.longestBarcodeLength - sample.getBarcode().length())); - - if (this.parameters.keepCutSites()){ - //add the cutsite to the barcodeEnzyme when not already added (to have the correct complement) - barcodeEnzyme += sampleBarcodeCombination.getEnzymeCutsite(); - } - //find the next enzyme site (if there is any) - int read1EndLocation = -1; - int[] read1EndLocationLength = {-1, 0}; -// EnzymeComparator enzymeComparator = new EnzymeComparator(); -// if (enzymeComparator.compare(sample.getEnzyme(), this.parameters.getNeutralEnzyme()) != 0){ - read1EndLocationLength = this.findRead1EnzymeLocation(read1modifiedSequence, sample); - read1EndLocation = read1EndLocationLength[0]; -// } - String read1optimalSequence = read1modifiedSequence; - String read1optimalQuality = read1modifiedQuality; - if (read1EndLocation != -1){ - //compliment barcode found - if (this.parameters.keepCutSites() && ! this.parameters.isRadData()){ - //cutsites must be kept - read1EndLocation += read1EndLocationLength[1]; - } - read1optimalSequence = read1modifiedSequence.substring(0, read1EndLocation); - read1optimalQuality = read1modifiedQuality.substring(0, read1EndLocation); - } - - - //parsing of read2 - //find the first enzyme location - - //just cut the first basepairs because these will be the enzyme site - int read2firstEnzymeLocation = 0; - String read2modifiedSequence = read2.getSequence().substring(read2firstEnzymeLocation); - String read2modifiedQuality = read2.getQuality().substring(read2firstEnzymeLocation); - if (this.parameters.isRadData()){ - //RAD data -// if (this.findingDistanceAlgorithm.isEquivalent(read2modifiedSequence.substring(0, sample.getBarcode().length()), sample.getBarcode(), this.parameters.getAllowedMismatchesBarcode(sample))){ -// //possible barcode found -// read2modifiedSequence = read2modifiedSequence.substring(sample.getBarcode().length()); -// read2modifiedQuality = read2modifiedQuality.substring(sample.getBarcode().length()); -// if (! this.parameters.keepCutSites()){ -// //remove the enzyme site -// String foundEnzyme = ""; -// for (String enzymeSite : sample.getEnzyme().getInitialCutSiteRemnant()){ -// if (this.findingDistanceAlgorithm.isEquivalent(read2modifiedSequence.substring(0, enzymeSite.length()), enzymeSite, this.parameters.getAllowedMismatchesEnzyme())){ -// foundEnzyme = enzymeSite; -// } -// } -// read2modifiedSequence = read2modifiedSequence.substring(foundEnzyme.length()); -// read2modifiedQuality = read2modifiedQuality.substring(foundEnzyme.length()); -// } -// } - }else{ - //GBS data - if (! this.parameters.keepCutSites()){ - //remove the enzyme site - String foundEnzyme = ""; - for (String enzymeSite : sample.getEnzyme().getInitialCutSiteRemnant()){ - if (this.findingDistanceAlgorithm.isEquivalent(read2modifiedSequence.substring(0, enzymeSite.length()), enzymeSite, this.parameters.getAllowedMismatchesEnzyme())){ - foundEnzyme = enzymeSite; - } - } - read2modifiedSequence = read2modifiedSequence.substring(foundEnzyme.length()); - read2modifiedQuality = read2modifiedQuality.substring(foundEnzyme.length()); - } - } - - //find the next enzyme site (if there is any) - //String complementBarcodeEnzyme = BasePair.getComplementSequence(barcodeEnzyme); - int[] read2secondEnzymeLocationLength = this.findRead2EnzymeLocation(read2modifiedSequence, sample, enzymeCutsite); - String read2optimalSequence = read2modifiedSequence; - String read2optimalQuality = read2modifiedQuality; - if (read2secondEnzymeLocationLength[0] != -1){ - int read2secondEnzymeLocation = read2secondEnzymeLocationLength[0]; - //enzyme site found - if (this.parameters.keepCutSites()){ - //keep the enzyme sites - read2secondEnzymeLocation += read2secondEnzymeLocationLength[1]; - } - read2optimalSequence = read2modifiedSequence.substring(0, read2secondEnzymeLocation); - read2optimalQuality = read2modifiedQuality.substring(0, read2secondEnzymeLocation); - } - - - //size selection - boolean trimedR1 = false; - boolean trimedR2 = false; - int lengthR1 = read1optimalSequence.length(); - int lengthR2 = read2optimalSequence.length(); - String sequenceError = "TRIM\t" + "trimok" + "\t" + read1.getDescription() + "\t" + read2.getDescription() + "\tor1:" + lengthR1 + "\tor2:" + lengthR2 + "\tread1:" + read1optimalSequence.length() + "\tread2:" + read2optimalSequence.length(); - - - if (read1optimalSequence.length() != read1.getSequence().length() - this.longestBarcodeLength){ - trimedR1 = true; - } - if (read2optimalSequence.length() != read2.getSequence().length()){ - trimedR2 = true; - } - - if (! trimedR1 && ! trimedR2){ - //both original => ok - this.correctionLog.addCorrecterTrimOk(sample); - }else if (! trimedR1 && read2optimalSequence.length() >= read1optimalSequence.length()){ - //R1 is original, R2 is trimed, but same length or longer => ok - this.correctionLog.addCorrecterTrimOk(sample); - }else if (read1optimalSequence.length() == read2optimalSequence.length()){ - //R1 and R2 are same length => OK - this.correctionLog.addCorrecterTrimOk(sample); - }else{ - int compareLength = this.longestBarcodeLength; - if (this.parameters.keepCutSites()){ - compareLength += sample.getPossibleEnzymeCutSiteLength(); - } - //R1 and R2 have different sizes, find lowest and check - if (read1optimalSequence.length() < read2optimalSequence.length()){ - //R1 is shortest - if (read1optimalSequence.length() > compareLength){ - String read1end = read1optimalSequence.substring(read1optimalSequence.length() - compareLength, read1optimalSequence.length()); - String expectedStartR2 = BasePair.getComplementSequence(read1end); - if (this.parameters.keepCutSites()){ - if (this.findingDistanceAlgorithm.isEquivalent(read2optimalSequence.substring(0, sample.getPossibleEnzymeCutSiteLength()), expectedStartR2.substring(0, sample.getPossibleEnzymeCutSiteLength()), this.parameters.getAllowedMismatchesEnzyme()) - && this.findingDistanceAlgorithm.isEquivalent(read2optimalSequence.substring(sample.getPossibleEnzymeCutSiteLength(), compareLength), expectedStartR2.substring(sample.getPossibleEnzymeCutSiteLength()), this.parameters.getAllowedMismatchesBarcode(sample))){ - //is equivalent:(read2(0-cutsite), complement read1(0-cutsite) with allowed mismatches enzyme) - //and is equivalent:(read2(cutsite-comparelength), complement read1(cutsite-comparelength) with allowed mismatches barcode) - //is same => trim R2 - read2optimalSequence = read2optimalSequence.substring(0, read1optimalSequence.length()); - read2optimalQuality = read2optimalQuality.substring(0, read1optimalSequence.length()); - trimedR2 = true; - //sequenceError = "CORRECTED R2\n"; - this.correctionLog.addCorrecterR2Corrected(sample); - }else{ - if (read1optimalSequence.length() + sample.getBarcode().length() + this.parameters.getAdaptorCompareSize() >= read1.getSequence().length()){ - //read1 is only checked on cutsite, not on adaptor, not corrected so wrong - int minus = this.longestBarcodeLength - sample.getBarcode().length(); - read1optimalSequence = read1.getSequence().substring(sample.getBarcode().length(), read1.getSequence().length() - minus); - read1optimalQuality = read1.getQuality().substring(sample.getBarcode().length(), read1.getSequence().length() - minus); - trimedR1 = false; - this.correctionLog.addCorrecterR1Corrected(sample); - }else{ - this.correctionLog.addCorrecterR2NotCorrected(sample); - //sequenceError = "NOTCORRECT R2\n"; - } - } - }else{ - //not keep cutsites - if (this.findingDistanceAlgorithm.isEquivalent(read2optimalSequence.substring(0, compareLength), expectedStartR2, this.parameters.getAllowedMismatchesBarcode(sample))){ - //is same => trim R2 - read2optimalSequence = read2optimalSequence.substring(0, read1optimalSequence.length()); - read2optimalQuality = read2optimalQuality.substring(0, read1optimalSequence.length()); - trimedR2 = true; - this.correctionLog.addCorrecterR2Corrected(sample); - //sequenceError = "CORRECTED R2\n"; - }else{ - if (read1optimalSequence.length() + sample.getBarcode().length() + this.parameters.getAdaptorCompareSize() + sample.getPossibleEnzymeCutSiteLength() + sample.getPossibleEnzymeCutSiteLength() >= read1.getSequence().length()){ - //read1 is only checked on cutsite, not on adaptor, not corrected so wrong - int minus = this.longestBarcodeLength - sample.getBarcode().length(); - read1optimalSequence = read1.getSequence().substring(sample.getBarcode().length() + sample.getPossibleEnzymeCutSiteLength(), read1.getSequence().length() - minus); - read1optimalQuality = read1.getQuality().substring(sample.getBarcode().length() + sample.getPossibleEnzymeCutSiteLength(), read1.getSequence().length() - minus); - trimedR1 = false; - this.correctionLog.addCorrecterR1Corrected(sample); - }else{ - this.correctionLog.addCorrecterR2NotCorrected(sample); - //sequenceError = "NOTCORRECT R2\n"; - } - } - } - } - }else if (read1optimalSequence.length() > read2optimalSequence.length()){ - //R2 is shortest - if (read2optimalSequence.length() > compareLength){ - String read2end = read2optimalSequence.substring(read2optimalSequence.length() - compareLength, read2optimalSequence.length()); - String expectedStartR1 = BasePair.getComplementSequence(read2end); - if (this.parameters.keepCutSites()){ - if (this.findingDistanceAlgorithm.isEquivalent(read1optimalSequence.substring(0, sample.getPossibleEnzymeCutSiteLength()), expectedStartR1.substring(0, sample.getPossibleEnzymeCutSiteLength()), this.parameters.getAllowedMismatchesEnzyme()) - && this.findingDistanceAlgorithm.isEquivalent(read1optimalSequence.substring(sample.getPossibleEnzymeCutSiteLength(), compareLength), expectedStartR1.substring(sample.getPossibleEnzymeCutSiteLength()), this.parameters.getAllowedMismatchesBarcode(sample))){ - read1optimalSequence = read1optimalSequence.substring(0, read2optimalSequence.length()); - read1optimalQuality = read1optimalQuality.substring(0, read2optimalSequence.length()); - trimedR1 = true; - this.correctionLog.addCorrecterR1Corrected(sample); - //sequenceError = "CORRECTED R1\n"; - }else{ - this.correctionLog.addCorrecterR1NotCorrected(sample); - //sequenceError = "NOTCORRECT R1\n"; - } - }else{ - //not keep cutsites - if (this.findingDistanceAlgorithm.isEquivalent(read1optimalSequence.substring(0, compareLength), expectedStartR1, this.parameters.getAllowedMismatchesBarcode(sample))){ - read1optimalSequence = read1optimalSequence.substring(0, read2optimalSequence.length()); - read1optimalQuality = read1optimalQuality.substring(0, read2optimalSequence.length()); - trimedR1 = true; - this.correctionLog.addCorrecterR1Corrected(sample); - //sequenceError = "CORRECTED R1\n"; - }else{ - this.correctionLog.addCorrecterR1NotCorrected(sample); - //sequenceError = "NOTCORRECT R1\n"; - } - } - } - }else{ - //same size (may not occure here) - this.correctionLog.addCorrecterTrimOk(sample); - } - } - - - - - if (trimedR1 && ! trimedR2){ - //R1 was trimed, but R2 not - int mismatch = -1; - if (read1optimalSequence.length() < (read1.getSequence().length() - this.longestBarcodeLength - sample.getBarcode().length() + 1)){ - mismatch = (MismatchIndelDistance.calculateEquivalentDistance(read2optimalSequence.substring(read1optimalSequence.length() + 1, (read1optimalSequence.length() + 1 + sample.getBarcode().length())), sample.getComplementBarcode(), 1)[0]); - } - read2optimalSequence = read2optimalSequence.substring(0, read1optimalSequence.length()); - read2optimalQuality = read2optimalQuality.substring(0, read1optimalSequence.length()); - sequenceError = "TRIM\t" + "tR1nR2" + "\t" + read1.getDescription() + "\t" + read2.getDescription() + "\tor1:" + lengthR1 + "\tor2:" + lengthR2 + "\tread1:" + read1optimalSequence.length() + "\tread2:" + read2optimalSequence.length() + "\t" + (lengthR1 - lengthR2) + "\t" + mismatch; - this.correctionLog.addTrimTrimR1NotR2Fail(sample); - }else if (! trimedR1 && trimedR2){ - //R2 was trimed, not R1, so check sizes to diside to trim - if (read1optimalSequence.length() > read2optimalSequence.length()){ - //read 1 is longer, so trim read 1 - int mismatch = -1; - if (read2optimalSequence.length() < read1.getSequence().length() - this.parameters.getAdaptorCompareSize() - this.longestBarcodeLength){ - mismatch = (MismatchIndelDistance.calculateEquivalentDistance(read1optimalSequence.substring(read2optimalSequence.length(), (read2optimalSequence.length() + this.parameters.getAdaptorCompareSize())), this.parameters.getCommonAdaptor(), 1)[0]);; - } - read1optimalSequence = read1optimalSequence.substring(0, read2optimalSequence.length()); - read1optimalQuality = read1optimalQuality.substring(0, read2optimalSequence.length()); - sequenceError = "TRIM\t" + "nR1tR2not" + "\t" + read1.getDescription() + "\t" + read2.getDescription() + "\tor1:" + lengthR1 + "\tor2:" + lengthR2 + "\tread1:" + read1optimalSequence.length() + "\tread2:" + read2optimalSequence.length() + "\t" + (lengthR1 - lengthR2) + "\t" + mismatch; - this.correctionLog.addTrimNotR1TrimR2butFail(sample); - }else if (read1optimalSequence.length() < read2optimalSequence.length()){ - //read 2 is longer, so is ok - sequenceError = "TRIM\t" + "nR1tR2okl" + "\t" + read1.getDescription() + "\t" + read2.getDescription() + "\tor1:" + lengthR1 + "\tor2:" + lengthR2 + "\tread1:" + read1optimalSequence.length() + "\tread2:" + read2optimalSequence.length(); - this.correctionLog.addTrimNotR1TrimR2butOk(sample); - }else{ - //same length so ok - sequenceError = "TRIM\t" + "nR1tR2oks" + "\t" + read1.getDescription() + "\t" + read2.getDescription() + "\tor1:" + lengthR1 + "\tor2:" + lengthR2 + "\tread1:" + read1optimalSequence.length() + "\tread2:" + read2optimalSequence.length(); - this.correctionLog.addTrimNotR1TrimR2butOk(sample); - } - }else if (trimedR1 && trimedR2){ - //both are trimed so both have to be the same size - if (read1optimalSequence.length() > read2optimalSequence.length()){ - //read 1 is longer, so trim read 1 - read1optimalSequence = read1optimalSequence.substring(0, read2optimalSequence.length()); - read1optimalQuality = read1optimalQuality.substring(0, read2optimalSequence.length()); - sequenceError = "TRIM\t" + "tR1tR2notR1" + "\t" + read1.getDescription() + "\t" + read2.getDescription() + "\tor1:" + lengthR1 + "\tor2:" + lengthR2 + "\tread1:" + read1optimalSequence.length() + "\tread2:" + read2optimalSequence.length() + "\t" + (lengthR1 - lengthR2); - this.correctionLog.addTrimTrimR1TrimR2longR1(sample); - }else if (read1optimalSequence.length() < read2optimalSequence.length()){ - //read 2 is longer, so trim read 2 - read2optimalSequence = read2optimalSequence.substring(0, read1optimalSequence.length()); - read2optimalQuality = read2optimalQuality.substring(0, read1optimalSequence.length()); - sequenceError = "TRIM\t" + "tR1tR2notR2" + "\t" + read1.getDescription() + "\t" + read2.getDescription() + "\tor1:" + lengthR1 + "\tor2:" + lengthR2 + "\tread1:" + read1optimalSequence.length() + "\tread2:" + read2optimalSequence.length() + "\t" + (lengthR1 - lengthR2); - this.correctionLog.addTrimTrimR1TrimR2longR2(sample); - }else{ - //both reads have same length so ok - sequenceError = "TRIM\t" + "tR1tR2ok" + "\t" + read1.getDescription() + "\t" + read2.getDescription() + "\tor1:" + lengthR1 + "\tor2:" + lengthR2 + "\tread1:" + read1optimalSequence.length() + "\tread2:" + read2optimalSequence.length(); - this.correctionLog.addTrimTrimR1TrimR2ok(sample); - } - }else if (! trimedR1 && ! trimedR2){ - //perfect sequence, not trimmed - this.correctionLog.addTrimNotR1NotR2(sample); - } - - - //save results - FastqRead resultRead1 = new FastqRead(read1.getDescription(), read1optimalSequence, read1optimalQuality); - FastqRead resultRead2 = new FastqRead(read2.getDescription(), read2optimalSequence, read2optimalQuality); - return new ProcessedFragment(sample, resultRead1, resultRead2, sampleBarcodeCombination.getMismatches(), sequenceError); - } - /** * this method gets two reads from a pair-end fastq file. *
Read1 is from the first read, Read2 is from the reverse read @@ -833,7 +524,7 @@ private ProcessedFragment parseFastqRead(FastqRead read1, FastqRead read2) throw * @see FastqDemultiplex#findRead1EnzymeLocation(java.lang.String, be.uzleuven.gc.logistics.gbsDemultiplex.model.Sample) * @see ProcessedFragment */ - private ProcessedFragment parseFastqRead_new(FastqRead read1, FastqRead read2) throws InvalidReadException{ + private ProcessedFragment parseFastqRead(FastqRead read1, FastqRead read2) throws InvalidReadException{ //search for the optimal barcode SampleBarcodeCombination[] comb = this.findGBSBarcode2(read1.getSequence(), this.parameters.getStartDistance(), read2.getSequence()); if (comb == null){ @@ -867,11 +558,8 @@ private ProcessedFragment parseFastqRead_new(FastqRead read1, FastqRead read2) t //find the next enzyme site (if there is any) int read1EndLocation = -1; int[] read1EndLocationLength = {-1, 0}; -// EnzymeComparator enzymeComparator = new EnzymeComparator(); -// if (enzymeComparator.compare(sample.getEnzyme(), this.parameters.getNeutralEnzyme()) != 0){ - read1EndLocationLength = this.findRead1EnzymeLocation(read1modifiedSequence, sample); - read1EndLocation = read1EndLocationLength[0]; -// } + read1EndLocationLength = this.findRead1EnzymeLocation(read1modifiedSequence, sample); + read1EndLocation = read1EndLocationLength[0]; String read1optimalSequence = read1modifiedSequence; String read1optimalQuality = read1modifiedQuality; if (read1EndLocation != -1){ @@ -897,22 +585,6 @@ private ProcessedFragment parseFastqRead_new(FastqRead read1, FastqRead read2) t String read2modifiedQuality = read2.getQuality().substring(read2firstEnzymeLocation); if (this.parameters.isRadData()){ //RAD data -// if (this.findingDistanceAlgorithm.isEquivalent(read2modifiedSequence.substring(0, sample.getBarcode().length()), sample.getBarcode(), this.parameters.getAllowedMismatchesBarcode(sample))){ -// //possible barcode found -// read2modifiedSequence = read2modifiedSequence.substring(sample.getBarcode().length()); -// read2modifiedQuality = read2modifiedQuality.substring(sample.getBarcode().length()); -// if (! this.parameters.keepCutSites()){ -// //remove the enzyme site -// String foundEnzyme = ""; -// for (String enzymeSite : sample.getEnzyme().getInitialCutSiteRemnant()){ -// if (this.findingDistanceAlgorithm.isEquivalent(read2modifiedSequence.substring(0, enzymeSite.length()), enzymeSite, this.parameters.getAllowedMismatchesEnzyme())){ -// foundEnzyme = enzymeSite; -// } -// } -// read2modifiedSequence = read2modifiedSequence.substring(foundEnzyme.length()); -// read2modifiedQuality = read2modifiedQuality.substring(foundEnzyme.length()); -// } -// } }else{ //GBS data if (! this.parameters.keepCutSites()){ @@ -1198,34 +870,7 @@ private ProcessedFragment parseFastqRead(FastqRead read1) throws InvalidReadExce return new ProcessedFragment(sample, fastqRead, sampleBarcodeCombination.getMismatches()); } - - - /** - * search for a enzyme cutsite or complement in the given sequence (cutsite found in given sample) - *
returns -1 if no cutsite is found - *
returns a location (int) if a cutsite is found - * @param sequence String | the sequence where is searched in - * @param sample Sample | the sample from which compliment cutsites is used - * @return int | -1 if no cutsite is found, else the location of the cutsite - */ - private int findFirstEnzymeLocation(String sequence, Sample sample){ - //init location - int bestLocation = -1; - //try every cutsite of the known enzyme - for (String enzymeCutSite : sample.getEnzyme().getInitialCutSiteRemnant()){ - //try to find the location of the enzyme (-1 is returned if the barcode isn't found) - int location = sequence.indexOf(enzymeCutSite); - if (location != -1){ - //location found - if (bestLocation == -1 || location < bestLocation){ - //location is better then previous one - bestLocation = location; - } - } - } - return bestLocation; - } - + /** * Goes over the sequence and search the if the piece of the sequence is equivalent to a possible enzyme cutsite (possible enzyme mismatches) *
if the found piece is equivalent and the complete check option is false, the location is returned @@ -1475,8 +1120,10 @@ private SampleBarcodeCombination findGBSBarcode(String sequence, int startDistan //check on adaptor ligase if (this.parameters.getAdaptorLigaseMismatches() != -1){ String adaptor = this.parameters.getCommonAdaptor(); - if (this.findingDistanceAlgorithm.calculateEquivalentDistance(sequence.substring(distance + barcodeLocationLength[1] + cutsiteLocationLength[1]), adaptor, this.parameters.getAdaptorLigaseMismatches())[0] != -1){ - return null; + for (int l = cutsiteLocationLength[1]/2; l <= cutsiteLocationLength[1]; l++){ + if (this.findingDistanceAlgorithm.calculateEquivalentDistance(sequence.substring(distance + barcodeLocationLength[1] + l), adaptor, this.parameters.getAdaptorLigaseMismatches())[0] != -1){ + return null; + } } } foundSampleSet.add(new SampleBarcodeCombination(sample, enzymeCutSite, distance, barcodeLocationLength[0], barcodeLocationLength[1], cutsiteLocationLength[1])); @@ -1540,9 +1187,11 @@ private SampleBarcodeCombination[] findGBSBarcode2(String sequence, int startDis //check on adaptor ligase if (this.parameters.getAdaptorLigaseMismatches() != -1){ String adaptor = this.parameters.getCommonAdaptor(); - if (this.findingDistanceAlgorithm.calculateEquivalentDistance(sequence.substring(distance + barcodeLocationLength[1] + cutsiteLocationLength[1]), adaptor, this.parameters.getAdaptorLigaseMismatches())[0] != -1){ - return null; - //adaptor ligase + for (int l = cutsiteLocationLength[1]/2; l <= cutsiteLocationLength[1]; l++){ + if (this.findingDistanceAlgorithm.calculateEquivalentDistance(sequence.substring(distance + barcodeLocationLength[1] + cutsiteLocationLength[1]), adaptor, this.parameters.getAdaptorLigaseMismatches())[0] != -1){ + return null; + //adaptor ligase + } } } exactEnzymeCutSite = enzymeCutSite; @@ -1600,8 +1249,10 @@ private SampleBarcodeCombination[] findGBSBarcode2(String sequence, int startDis //check on adaptor ligase if (this.parameters.getAdaptorLigaseMismatches() != -1){ String adaptor = this.parameters.getCommonAdaptor(); - if (this.findingDistanceAlgorithm.calculateEquivalentDistance(sequence2.substring(distance + barcodeLocationLength[1] + cutsiteLocationLength[1]), adaptor, this.parameters.getAdaptorLigaseMismatches())[0] != -1){ - return null; + for (int l = cutsiteLocationLength[1]/2; l <= cutsiteLocationLength[1]; l++){ + if (this.findingDistanceAlgorithm.calculateEquivalentDistance(sequence2.substring(distance + barcodeLocationLength[1] + l), adaptor, this.parameters.getAdaptorLigaseMismatches())[0] != -1){ + return null; + } } } exactEnzymeCutSite = enzymeCutSite; @@ -1631,183 +1282,6 @@ private SampleBarcodeCombination[] findGBSBarcode2(String sequence, int startDis return null; } - /** - * a combination of a sample, a barcode and enzyme, the mismatches (between sequence and barcode/enzyme) and the start location of the barcode in the sequence - */ - private class SampleBarcodeCombination{ - - private Sample sample; - private String enzymeCutSite; - private int location; - private int mismatches; - private int lengthFoundBarcode; - private int lengthFoundEnzyme; - /** - * - * @param sample Sample | the sample of this combination - * @param enzymeCutSite String | the used enzyme cutsite - * @param location int | the start location of the barcodeEnzyme - * @param mismatches int | the amount of mismatches - * @param length int | the length of the combination of the sample and barcode - */ - public SampleBarcodeCombination(Sample sample, String enzymeCutSite, int location, int mismatches, int lengthFoundBarcode, int lengthFoundEnzyme){ - this.sample = sample; - this.enzymeCutSite = enzymeCutSite; - this.location = location; - this.mismatches = mismatches; - this.lengthFoundBarcode = lengthFoundBarcode; - this.lengthFoundEnzyme = lengthFoundEnzyme; - } - - /** - * - * @return Sample | the sample of this combination - */ - public Sample getSample(){ - return this.sample; - } - - /** - * - * @return String | the enzymeCutsite - */ - public String getEnzymeCutsite(){ - return this.enzymeCutSite; - } - - /** - * - * @return int | the location of the barcode + enzyme in a sequence - */ - public int getLocation(){ - return this.location; - } - - /** - * - * @return int | the amount of mismatches between the barcode + enzyme and the sequence - */ - public int getMismatches(){ - return this.mismatches; - } - - /** - * - * @return int | the length of the found enzyme - */ - public int getLengthFoundEnzyme(){ - return this.lengthFoundEnzyme; - } - - /** - * - * @return int | the lenght of the found barcode - */ - public int getLengthFoundBarcode(){ - return this.lengthFoundBarcode; - } - - } - - /** - * all information of the processed fragment: - *
the sample of the read, - *
read1 (as HashMap of FastqParts and String) - *
read2 (only pair-end) (as HashMap of FastqParts and String) - *
mismatch occured in finding the barcode/enzyme - */ - private class ProcessedFragment{ - - private Sample sample; - private FastqRead read1; - private FastqRead read2; - private int mismatch; - private String sequenceComment = ""; - - /** - * create a new ProcessedFragment (pair-end) - * @param sample Sample | the sample of the fragment - * @param read1 FastqRead | the first read of the fragment - * @param read2 FastqRead | the second read of the fragment (only pair-end) - * @param mismatch int | number of mismatches in the barcode/enzyme - * @param sequenceComment String | the comment on the cut of the sequence - */ - public ProcessedFragment(Sample sample, FastqRead read1, FastqRead read2, int mismatch, String sequenceComment){ - this.sample = sample; - this.read1 = read1; - this.read2 = read2; - this.mismatch = mismatch; - this.sequenceComment = sequenceComment; - } - - /** - * create a new ProcessedFragment (pair-end) - * @param sample Sample | the sample of the fragment - * @param read1 FastqRead | the first read of the fragment - * @param read2 FastqRead | the second read of the fragment (only pair-end) - * @param mismatch int | number of mismatches in the barcode/enzyme - */ - public ProcessedFragment(Sample sample, FastqRead read1, FastqRead read2, int mismatch){ - this.sample = sample; - this.read1 = read1; - this.read2 = read2; - this.mismatch = mismatch; - } - - /** - * create a new ProcessedFragment (single read) - * @param sample Sample | the sample of the fragment - * @param read1 FastqRead | the only read of the fragment - * @param mismatch int | number of mismatches in the barcode/enzyme - */ - public ProcessedFragment(Sample sample, FastqRead read1, int mismatch){ - this.sample = sample; - this.read1 = read1; - this.read2 = null; - this.mismatch = mismatch; - } - - /** - * - * @return Sample | the sample of this fragment - */ - public Sample getSample(){ - return this.sample; - } - - /** - * - * @return HashMap of FastqParts and String | the first read - */ - public FastqRead getRead1(){ - return this.read1; - } - - /** - * - * @return HashMap of FastqParts and String | the second read (only by pair-end) - */ - public FastqRead getRead2(){ - return this.read2; - } - - /** - * - * @return int | number of mismatches in barcode/enzyme - */ - public int getMismatch(){ - return this.mismatch; - } - - /** - * - * @return String | the comment on the cut of the reads - */ - public String getComment(){ - return this.sequenceComment; - } - - } } diff --git a/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/DemultiplexParameters.java b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/DemultiplexParameters.java index 25f3558..5a81df2 100644 --- a/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/DemultiplexParameters.java +++ b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/DemultiplexParameters.java @@ -563,11 +563,11 @@ public String getParametersHelp(){ toHelp += "\t -n \t keep sequences where N occurs as nucleotide (standard true)" + "\n"; toHelp += "\t -ca \t the common adaptor used in the sequencing (standard (only first piece) AGATCGGAAGAGCG) currently only used for adaptor ligase see -al and when -rad is true) (minimum length is 10)" + "\n"; toHelp += "\t -s \t the posible distance of the start. This is the distance count from the start of the read to the first basepair of the barcode or enzyme (standard 0, maximum 20)" + "\n"; - toHelp += "\t -cc \t Checks the complete read for the enzyme (if false, stops at the first possible enzyme cutsite) (use values true or false, standard is true) if used, the sequence after the enzyme site is compared to the adaptors, if the first basepairs of the sequence are compaired to the first basepairs of the adaptor" + "\n"; + //toHelp += "\t -cc \t Checks the complete read for the enzyme (if false, stops at the first possible enzyme cutsite) (use values true or false, standard is true) if used, the sequence after the enzyme site is compared to the adaptors, if the first basepairs of the sequence are compaired to the first basepairs of the adaptor" + "\n"; toHelp += "\t -kc \t Keep the enzyme cut-site remains (standard true)" + "\n"; toHelp += "\t -ea \t Add enzymes from the given file (keeps the standard enzymes, and add the new) (enzyme file: no header, enzyme name tab cutsites (multiple cutsites are comma separeted)) (only use once, not use -er)" + "\n"; toHelp += "\t -er \t Replace enzymes from the given file (don't keep the standard enzymes) (enzyme file: no header, enzyme name tab cutsites (multiple cutsites are comma separeted)) (only use once, not use -ea)" + "\n"; - toHelp += "\t -al \t check for adaptor ligase: no (for no check) or a positive integer (starts at 0), for the number of mismatches (only checks 10 basepairs of the adaptor), standard 1" + "\n"; + //toHelp += "\t -al \t check for adaptor ligase: no (for no check) or a positive integer (starts at 0), for the number of mismatches (only checks 10 basepairs of the adaptor), standard 1" + "\n"; toHelp += "\t -scb \t Use self correcting barcodes (barcodes created by the barcodeGenerator) (standard false)" + "\n"; toHelp += "\t -malg \t the used algorithm to find mismatches and indels, possible algorithms (see README): " + "\n"; for (FindingsAlgorithms algorithm : FindingsAlgorithms.values()){ diff --git a/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/ProcessedFragment.java b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/ProcessedFragment.java new file mode 100644 index 0000000..7d470ba --- /dev/null +++ b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/ProcessedFragment.java @@ -0,0 +1,114 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package be.uzleuven.gc.logistics.GBSX.demultiplexer.model; + +import be.uzleuven.gc.logistics.GBSX.utils.fastq.model.FastqRead; +import be.uzleuven.gc.logistics.GBSX.utils.sampleBarcodeEnzyme.model.Sample; + +/** + * + * @author koen + */ +public class ProcessedFragment { + + /** + * all information of the processed fragment: + *
the sample of the read, + *
read1 (as HashMap of FastqParts and String) + *
read2 (only pair-end) (as HashMap of FastqParts and String) + *
mismatch occured in finding the barcode/enzyme + */ + + private Sample sample; + private FastqRead read1; + private FastqRead read2; + private int mismatch; + private String sequenceComment = ""; + + /** + * create a new ProcessedFragment (pair-end) + * @param sample Sample | the sample of the fragment + * @param read1 FastqRead | the first read of the fragment + * @param read2 FastqRead | the second read of the fragment (only pair-end) + * @param mismatch int | number of mismatches in the barcode/enzyme + * @param sequenceComment String | the comment on the cut of the sequence + */ + public ProcessedFragment(Sample sample, FastqRead read1, FastqRead read2, int mismatch, String sequenceComment){ + this.sample = sample; + this.read1 = read1; + this.read2 = read2; + this.mismatch = mismatch; + this.sequenceComment = sequenceComment; + } + + /** + * create a new ProcessedFragment (pair-end) + * @param sample Sample | the sample of the fragment + * @param read1 FastqRead | the first read of the fragment + * @param read2 FastqRead | the second read of the fragment (only pair-end) + * @param mismatch int | number of mismatches in the barcode/enzyme + */ + public ProcessedFragment(Sample sample, FastqRead read1, FastqRead read2, int mismatch){ + this.sample = sample; + this.read1 = read1; + this.read2 = read2; + this.mismatch = mismatch; + } + + /** + * create a new ProcessedFragment (single read) + * @param sample Sample | the sample of the fragment + * @param read1 FastqRead | the only read of the fragment + * @param mismatch int | number of mismatches in the barcode/enzyme + */ + public ProcessedFragment(Sample sample, FastqRead read1, int mismatch){ + this.sample = sample; + this.read1 = read1; + this.read2 = null; + this.mismatch = mismatch; + } + + /** + * + * @return Sample | the sample of this fragment + */ + public Sample getSample(){ + return this.sample; + } + + /** + * + * @return HashMap of FastqParts and String | the first read + */ + public FastqRead getRead1(){ + return this.read1; + } + + /** + * + * @return HashMap of FastqParts and String | the second read (only by pair-end) + */ + public FastqRead getRead2(){ + return this.read2; + } + + /** + * + * @return int | number of mismatches in barcode/enzyme + */ + public int getMismatch(){ + return this.mismatch; + } + + /** + * + * @return String | the comment on the cut of the reads + */ + public String getComment(){ + return this.sequenceComment; + } + +} \ No newline at end of file diff --git a/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/SampleBarcodeCombination.java b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/SampleBarcodeCombination.java new file mode 100644 index 0000000..a5cb171 --- /dev/null +++ b/src/be/uzleuven/gc/logistics/GBSX/demultiplexer/model/SampleBarcodeCombination.java @@ -0,0 +1,91 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package be.uzleuven.gc.logistics.GBSX.demultiplexer.model; + +import be.uzleuven.gc.logistics.GBSX.utils.sampleBarcodeEnzyme.model.Sample; +/** + * + * @author koen + */ +public class SampleBarcodeCombination { + + /** + * a combination of a sample, a barcode and enzyme, the mismatches (between sequence and barcode/enzyme) and the start location of the barcode in the sequence + */ + + private final Sample sample; + private final String enzymeCutSite; + private final int location; + private final int mismatches; + private final int lengthFoundBarcode; + private final int lengthFoundEnzyme; + /** + * + * @param sample Sample | the sample of this combination + * @param enzymeCutSite String | the used enzyme cutsite + * @param location int | the start location of the barcodeEnzyme + * @param mismatches int | the amount of mismatches + * @param lengthFoundBarcode int | length of the found barcode + * @param lengthFoundEnzyme int | length of the found enzyme + */ + public SampleBarcodeCombination(Sample sample, String enzymeCutSite, int location, int mismatches, int lengthFoundBarcode, int lengthFoundEnzyme){ + this.sample = sample; + this.enzymeCutSite = enzymeCutSite; + this.location = location; + this.mismatches = mismatches; + this.lengthFoundBarcode = lengthFoundBarcode; + this.lengthFoundEnzyme = lengthFoundEnzyme; + } + + /** + * + * @return Sample | the sample of this combination + */ + public Sample getSample(){ + return this.sample; + } + + /** + * + * @return String | the enzymeCutsite + */ + public String getEnzymeCutsite(){ + return this.enzymeCutSite; + } + + /** + * + * @return int | the location of the barcode + enzyme in a sequence + */ + public int getLocation(){ + return this.location; + } + + /** + * + * @return int | the amount of mismatches between the barcode + enzyme and the sequence + */ + public int getMismatches(){ + return this.mismatches; + } + + /** + * + * @return int | the length of the found enzyme + */ + public int getLengthFoundEnzyme(){ + return this.lengthFoundEnzyme; + } + + /** + * + * @return int | the lenght of the found barcode + */ + public int getLengthFoundBarcode(){ + return this.lengthFoundBarcode; + } + +}