diff --git a/README.md b/README.md index fd05771..6b86511 100644 --- a/README.md +++ b/README.md @@ -256,4 +256,7 @@ v1.1.1 v1.1.2 * Updated barcode recognition for single read modus in the demultiplexer: when a read can be assigned to multiple samples, -the read is considered as unvalid (previous was first sample) \ No newline at end of file +the read is considered as unvalid (previous was first sample) + +v1.1.3 +* On request added the enzyme AvaII \ No newline at end of file diff --git a/releases/GBSX_v1.1.3/GBSX_digest_v1.0.pl b/releases/GBSX_v1.1.3/GBSX_digest_v1.0.pl new file mode 100644 index 0000000..3499548 --- /dev/null +++ b/releases/GBSX_v1.1.3/GBSX_digest_v1.0.pl @@ -0,0 +1,380 @@ +#!/usr/bin/perl + +#################################################################################################################### +# This is GBSX v1.0. A toolkit for experimental design and demultiplexing genotyping by sequencing experiments. # +# # +# Copyright (C) 2014 KU Leuven # +# # +# This file is part of GBSX. # +# # +# GBSX is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# GBSX is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with GBSX. If not, see . # +#################################################################################################################### + +######################################################################################################################################################################## +use Getopt::Std; +use Bio::SeqIO; +use Data::Dumper; +use Bio::PrimarySeq; +use Bio::DB::Fasta; +use Bio::Restriction::EnzymeCollection; +use Bio::Restriction::Analysis; +use Bio::Restriction::Enzyme; + + + +######################################################################################################################################################################## +my $program_information = "GBSX v1.0"; +my $license = "GBSX v1.0 Copyright (C) 2014 KU Leuven\nThis program comes with ABSOLUTELY NO WARRANTY; for details type `perl GBSX_digest_v1.0.pl -w'.\nThis is free software, and you are welcome to redistribute it under certain conditions; type `perl GBSX_digest_v1.0.pl -c' for details.\n"; + +$Getopt::Std::STANDARD_HELP_VERSION = 1; +my $help_message="\n$license\n####################################################################################\nusage: perl GBSX_digest_v1.0.pl -d \'digest sequence\' -l \'read length\' -f \'file of reference fasta file location(s)\'\n\noptional parameters: \n\t-e \'enzyme name to use\' (default: Enzyme)\n\t-g \'genome name to use in bed file name\' (default: genome)\n\t-n \'minimum size fragments to include\' (default: 100)\n\t-m \'maximum size fragments to use\' (default: 1000)\n\n\t-E \'second enzyme name to use\' (default: Enzyme2)\n\t-D \'digest sequence for a second enzyme\' (default: not declared)\n\t-R \'digest sequence for a third enzyme\' (default: not declared)\n"; + +######################################################################################################################################################################## +######################################################################################################################################################################## +#input parameters +sub HELP_MESSAGE {die "$help_message\n";} +sub VERSION_MESSAGE {print "$program_information\n";} + +my %opts = (); +getopts('e:d:g:l:f:m:n:E:D:R:cwh'); + +#die with help message if -h +if ($opt_h){ die "$help_message\n"; } +#die with warrenty message if -w +if ($opt_w){ die "GBSX is distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\nGNU General Public License for more details.\n"; } +#die with conditions message if -c +if ($opt_c){ die "GBSX is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.\n"; } + + +#enzyme name (optional) +my $enzyme_name = $opt_e || "Enzyme"; + +#digest sequence: example "RCATG^Y"; +my $digest_seq = $opt_d || "not_defined"; if ($digest_seq ne "not_defined"){$digest_seq=uc($digest_seq);} + +#genome name to use for bedfile (optional) +my $bed_genome_name = $opt_g || "genome"; + +#define read length: assumes PE, only report back in bed first and last number of bps corresponding to this +my $read_length = $opt_l || "not_defined"; + +#file of reference fasta locations +my $fasta_file_locations = $opt_f || "not_defined"; + +#max/min fragment sizes (optional, otherwise use defaults) +my $max_size = $opt_m || 1000; +my $min_size = $opt_n || 100; + +#enzyme2 name (optional, otherwise if second digest just default to Enzyme2) +my $enzyme2_name = $opt_E || "Enzyme2"; + #make sure starts Upper case + $enzyme2_name = ucfirst($enzyme2_name); + #double check this is not the same as Enzyme1 + if ($enzyme2_name eq $enzyme_name){die "Please use different enzyme names for enzyme 1 and enzyme 2\n"; } + +#digest sequence for a second enzyme: example "RCATG^Y"; (optional, otherwise no second digest) +my $digest2_seq = $opt_D || "not_defined"; if ($digest2_seq ne "not_defined"){$digest2_seq=uc($digest2_seq);} + #double check this is not the same as Enzyme1 + if ($digest2_seq eq $digest_seq){die "Please use different enzyme cut sequences for enzyme 1 and enzyme 2\n"; } + +#digest sequence for a third enzyme: example "RCATG^Y"; (optional, otherwise no third digest) +my $digest3_seq = $opt_R || "not_defined"; + if ($digest3_seq ne "not_defined"){$digest3_seq=uc($digest3_seq); + #double check this is not the same as Enzyme1 or Enzyme3 + if ( ($digest3_seq eq $digest2_seq) || ($digest3_seq eq $digest_seq) ){die "Please use different enzyme cut sequences for enzyme 1, enzyme 2, and enzyme 3\n"; } + #we call this Enzyme3, so make sure nothing else uses this name + if ( ($enzyme2_name eq "Enzyme3") || ($enzyme_name eq "Enzyme3") ){ die "Enzyme3 is a reseved name, please use a different enzyme name\n"; } + } + +#die if required parameters are missing +if ( ($digest_seq eq "not_defined") || ($read_length eq "not_defined") || ($fasta_file_locations eq "not_defined") ){ + my $missing=" "; + if ($digest_seq eq "not_defined"){$missing = $missing."-d (digest sequence) ";} + if ($read_length eq "not_defined"){$missing = $missing."-l (read length) ";} + if ($fasta_file_locations eq "not_defined"){$missing = $missing."-f (reference fasta locations) ";} + die "missing parameter(s):$missing\n"; +} + +######################################################################################################################################################################## +######################################################################################################################################################################## +#first basic steps to get ready for the analysis (mostly defining variables) + +#using input information, define outfiles: +my $outfile; my $main_bed; +if ($digest2_seq ne "not_defined"){ + $outfile = $bed_genome_name.".".$enzyme_name.".".$enzyme2_name.".".$read_length."nt.digest_results"; + $main_bed = $bed_genome_name.".".$enzyme_name.".".$enzyme2_name.".".$read_length."nt.digest.bed"; +}else{ + $outfile = $bed_genome_name.".".$enzyme_name.".".$read_length."nt.digest_results"; + $main_bed = $bed_genome_name.".".$enzyme_name.".".$read_length."nt.digest.bed"; +} + +#get reference fasta files +my @ref_chr; +open (ref_files,$fasta_file_locations)||die "could not open file $fasta_file_locations\n"; +while(){ + chop; + $_=~s/\n//;$_=~s/\r//; + if ($_ ne ""){ + push(@ref_chr,$_); + #current script only works with one sequence per file, add a check for this + my $grep_results = `grep \"^\>\" $_`; + my @grep_array = split(/\n/,$grep_results); my $number_sequences_in_file = @grep_array; + if ($number_sequences_in_file > 1){die "please use only one sequence per fasta file, $_ had $number_sequences_in_file sequences\n";} + } +} +close(ref_files)||die "could not close file $fasta_file_locations\n"; + +#define enzyme to use +my $rest_enzyme = new Bio::Restriction::Enzyme(-enzyme => $enzyme_name, -seq => $digest_seq); + +#optional: define a second enzyme +my $rest_enzyme2; if ($digest2_seq ne "not_defined"){ $rest_enzyme2 = new Bio::Restriction::Enzyme(-enzyme => $enzyme2_name, -seq => $digest2_seq); } + +#optional: define a third enzyme +my $rest_enzyme3; if ($digest3_seq ne "not_defined"){ $rest_enzyme3 = new Bio::Restriction::Enzyme(-enzyme => "Enzyme3", -seq => $digest3_seq); } + +#define ouput count variables +my $total_cuts=0; my $total_fragments=0; my $total_fragments_between_min_and_max = 0; +my $bin_size=100;#hard coded for now +my $temp_size=$min_size; + until (($temp_size+$bin_size)>$max_size){ + my $temp_size_plus_bin = $temp_size+$bin_size; + my $variable_name = "total_fragments_between_".$temp_size."_and_".$temp_size_plus_bin; + $$variable_name=0; + $temp_size=$temp_size+$bin_size; + } +my $total_fragments_less_min = 0; +my %fragments_in_range_per_chr; my %fragments_in_range_per_chr_enzyme1_end; my %fragments_in_range_per_chr_enzyme2_end; my %fragments_in_range_per_chr_enzymeboth_end; + +#define ouput count variables, optional for a second enzyme +my $total_cuts_enzyme2=0; my $fragments_both_ends_enzyme1 = 0; my $fragments_both_ends_enzyme2 = 0; my $fragments_end_from_each_enzyme = 0; + +#define ouput count variables, optional for a third enzyme +my $fragments_in_range_contain_third_enzyme = 0; + +#define ouput count variables, distribution of distances +my $one_kb = 0; my $ten_kb = 0; my $hundred_kb=0; my $one_Mb = 0; my $ten_Mb = 0; my $more_ten_kb=0; + +######################################################################################################################################################################## +######################################################################################################################################################################## +#per chromosome, perform digest and add to totals + +#open bed file to print to +open(outbed,">$main_bed")||die "could not open out bedfile $main_bed\n"; + +foreach my $chr (@ref_chr){ + + #just get chr name and remove directory structure + my @chr_name_parts =split(/\//,$chr);my $number_parts=@chr_name_parts; + my $chr_name=@chr_name_parts[$number_parts-1]; + $chr_name=~s/.fa$//i;$chr_name=~s/.fasta$//i; + $fragments_in_range_per_chr{$chr_name}=0; + $fragments_in_range_per_chr_enzyme1_end{$chr_name}=0; + $fragments_in_range_per_chr_enzyme2_end{$chr_name}=0; + $fragments_in_range_per_chr_enzymeboth_end{$chr_name}=0; + + #get fasta file for this chr + my $seqio_object1 = Bio::SeqIO->new(-file => $chr, -format => "fasta"); + my $seq = $seqio_object1->next_seq; + my $fastadb = Bio::DB::Fasta->new($chr); + + #digest + my $rest_analysis=Bio::Restriction::Analysis->new(-seq=>$seq, -enzymes=>$rest_enzyme); + + #just get cut numbers first + my $chr_cuts=$rest_analysis->cuts_by_enzyme($enzyme_name); + $total_cuts=$total_cuts+$chr_cuts; + + #optional digest and cut numbers for a second enzyme + my $rest_analysis2;my $chr_cuts2; + if ($digest2_seq ne "not_defined"){ + $rest_analysis2=Bio::Restriction::Analysis->new(-seq=>$seq, -enzymes=>$rest_enzyme2); + $chr_cuts2=$rest_analysis2->cuts_by_enzyme($enzyme2_name); + $total_cuts_enzyme2=$total_cuts_enzyme2+$chr_cuts2; + } + + #get locations of cuts + my @locations1 = $rest_analysis->positions($enzyme_name); + #cut locations if second enzyme + my %hash_locations1; my %hash_locations2;#use position as keys + my @locations2;my @all_locations; + if ($digest2_seq ne "not_defined"){ + @locations2=$rest_analysis2->positions($enzyme2_name); + @all_locations = (@locations1,@locations2); + @all_locations=sort {$a <=> $b} @all_locations; + + foreach my $position1 (@locations1){ $hash_locations1{$position1}=1; } + foreach my $position2 (@locations2){ $hash_locations2{$position2}=1; } + }else{ + @all_locations=@locations1; + } + + #go through all cut locations and check fragment sizes + my @chr_with_directories = split(/\//,$chr); $chr_name=pop @chr_with_directories; $chr_name=~s/.fa//; + my $previous_location=0; my $previous_fragment_in_range_end=0; + + #to analyze every fragment, add the end of the chromosome as a position + my $chr_end_position = $seq->length; + push(@all_locations,$chr_end_position); + + foreach my $location (@all_locations){ + $total_fragments++; + if ($location<$previous_location){die "error in order, $location < $previous_location";} + my $fragment_length = $location-$previous_location; + if (($fragment_length<=$max_size)&&($fragment_length>=$min_size)){ + $total_fragments_between_min_and_max++; + $fragments_in_range_per_chr{$chr_name}=$fragments_in_range_per_chr{$chr_name}+1; + + #for distribution of distance between sequenced fragments + if ($previous_fragment_in_range_end!=0){ + my $distance_this_fragment_to_previous=$previous_location-$previous_fragment_in_range_end; + if ( $distance_this_fragment_to_previous<=1000 ){ + $one_kb++; + }elsif( ($distance_this_fragment_to_previous>1000) && ($distance_this_fragment_to_previous<=10000)){ + $ten_kb++; + }elsif( ($distance_this_fragment_to_previous>10000) && ($distance_this_fragment_to_previous<=100000)){ + $hundred_kb++; + }elsif( ($distance_this_fragment_to_previous>100000) && ($distance_this_fragment_to_previous<=1000000)){ + $one_Mb++; + }elsif( ($distance_this_fragment_to_previous>1000000) && ($distance_this_fragment_to_previous<=10000000)){ + $ten_Mb++; + }elsif( ($distance_this_fragment_to_previous>10000000) ){ + $more_ten_kb++; + }else{ + die "error in distances\n"; + } + } + $previous_fragment_in_range_end=$location; + + + #if second enzyme, are ends from one or both enzymes + if ($digest2_seq ne "not_defined"){ + if ( (exists($hash_locations1{$previous_location})) && (exists($hash_locations1{$location})) ){ + $fragments_both_ends_enzyme1++; #both ends from enzyme1 + $fragments_in_range_per_chr_enzyme1_end{$chr_name}=$fragments_in_range_per_chr_enzyme1_end{$chr_name}+1; + }elsif ( (exists($hash_locations2{$previous_location})) && (exists($hash_locations2{$location})) ){ + $fragments_both_ends_enzyme2++; #both ends from enzyme2 + $fragments_in_range_per_chr_enzyme2_end{$chr_name}=$fragments_in_range_per_chr_enzyme2_end{$chr_name}+1; + }elsif ( (exists($hash_locations1{$previous_location})) && (exists($hash_locations2{$location})) ){ + $fragments_end_from_each_enzyme++; #one end from each enzyme + $fragments_in_range_per_chr_enzymeboth_end{$chr_name}=$fragments_in_range_per_chr_enzymeboth_end{$chr_name}+1; + }elsif ( (exists($hash_locations2{$previous_location})) && (exists($hash_locations1{$location})) ){ + $fragments_end_from_each_enzyme++; #one end from each enzyme + $fragments_in_range_per_chr_enzymeboth_end{$chr_name}=$fragments_in_range_per_chr_enzymeboth_end{$chr_name}+1; + } + } + + #counts on bin sizes + my $temp_size=$min_size; + until (($temp_size+$bin_size)>$max_size){ + my $temp_size_plus_bin = $temp_size+$bin_size; + if ( ($fragment_length>=$temp_size)&&($fragment_length<$temp_size_plus_bin) ){ + my $variable_name = "total_fragments_between_".$temp_size."_and_".$temp_size_plus_bin; + $$variable_name=$$variable_name+1; + } + $temp_size=$temp_size+$bin_size; + } + + #print to bed the potentially sequenced bases + if ($fragment_length <= (2*$read_length)){ + print outbed "$chr_name\t$previous_location\t$location\n"; + }else{ + #print first read + $previous_location_plus=$previous_location+$read_length; + print outbed "$chr_name\t$previous_location\t$previous_location_plus\n"; + #print second read + $location_minus=$location-$read_length; + print outbed "$chr_name\t$location_minus\t$location\n"; + } + + #if third enzyme requested, check if there is a digest site in this fragment + if ($digest3_seq ne "not_defined"){ + my $fragment_seq = $fastadb->seq($chr_name, ($previous_location+1) => ($location)); + my $frag_seq_obj = Bio::Seq->new( -seq => $fragment_seq ); + my $rest_analysis3=Bio::Restriction::Analysis->new(-seq=>$frag_seq_obj, -enzymes=>$rest_enzyme3); + my $frag_cuts3=$rest_analysis3->cuts_by_enzyme("Enzyme3"); + if ($frag_cuts3>0){ $fragments_in_range_contain_third_enzyme++ } + } + + }elsif ($fragment_length<$min_size){ + $total_fragments_less_min++; + } + $previous_location=$location; + } + +} + +#close bedfile +close(outbed)||die "could not open out bedfile $main_bed\n"; + +######################################################################################################################################################################## +######################################################################################################################################################################## +#print to output file general digest info +open(out,">$outfile")||die "could not open outfile $outfile\n"; + +#print input parameters +print out "input parameters:\n\tenzyme name used: $enzyme_name\n\tenzyme cut sequence: $digest_seq\n"; +if ($digest2_seq ne "not_defined"){ print out "\tsecond enzyme name used: $enzyme2_name\n\tsecond enzyme cut sequence: $digest2_seq\n"; } +if ($digest3_seq ne "not_defined"){ print out "\tthird enzyme cut sequence: $digest3_seq\n"; } +print out "\tfor bedfile, genome name used: $bed_genome_name\n\tread length provided: $read_length\nminimum fragment size: $min_size\nmaximum fragment size: $max_size\n\nreference sequence(s) analyzed:\n"; +foreach my $ref_to_print_to_output (@ref_chr){ print out "\t$ref_to_print_to_output\n"; } +print out "\n"; + +#print number of cuts +print out "total cuts for enzyme ($enzyme_name): $total_cuts\n"; +if ($digest2_seq ne "not_defined"){ print out "total cuts for second enzyme ($enzyme2_name): $total_cuts_enzyme2\n"; } + +#print number of fragments +print out "\nlooked at $total_fragments fragments in total\n"; +print out "$total_fragments_between_min_and_max fragments <= $max_size nt and >= $min_size nt\n"; +if ($digest3_seq ne "not_defined"){ print out "\t-containing one or more third digest site ($digest3_seq): $fragments_in_range_contain_third_enzyme\n"; } +if ($digest2_seq ne "not_defined"){ + print out "\t-with both ends from $enzyme_name: $fragments_both_ends_enzyme1\n"; + print out "\t-with both ends from $enzyme2_name: $fragments_both_ends_enzyme2\n"; + print out "\t-with an end from each enzyme: $fragments_end_from_each_enzyme\n"; + print out "\tfragments in range per reference sequence: (both ends from $enzyme_name / both ends from $enzyme2_name / an end from each enzyme)\n"; +}else{ + print out "\tfragments in range per reference sequence:\n"; +} + +#print fragment chromosome distribution +my @chr_names_for_count_print = keys %fragments_in_range_per_chr; +foreach my $chr_name_for_count_print (@chr_names_for_count_print){ + if ($digest2_seq ne "not_defined"){ + print out "\t\t$chr_name_for_count_print had $fragments_in_range_per_chr{$chr_name_for_count_print} fragments ($fragments_in_range_per_chr_enzyme1_end{$chr_name_for_count_print}/$fragments_in_range_per_chr_enzyme2_end{$chr_name_for_count_print}/$fragments_in_range_per_chr_enzymeboth_end{$chr_name_for_count_print})\n"; + }else{ + print out "\t\t$chr_name_for_count_print had $fragments_in_range_per_chr{$chr_name_for_count_print} fragments\n"; + } +} + +#print fragment distance distribution +print out "\tdistances between fragments in range:\n"; +print out "\t\t<=1kb $one_kb fragment pairs\n\t\t1kb-10kb $ten_kb fragment pairs\n\t\t10kb-100kb $hundred_kb fragment pairs\n\t\t100kb-1Mb $one_Mb fragment pairs\n"; +print out "\t\t1Mb-10Mb $ten_Mb fragment pairs\n\t\t>10Mb $more_ten_kb fragment pairs\n"; + + +#print fragment size distribution +print out "\tfragments less than $min_size nt: $total_fragments_less_min \n\n"; +my $temp_size=$min_size; +until (($temp_size+$bin_size)>$max_size){ + my $temp_size_plus_bin = $temp_size+$bin_size; + my $variable_name = "total_fragments_between_".$temp_size."_and_".$temp_size_plus_bin; + $temp_size_plus_bin--; + print out "\tfragments $temp_size-$temp_size_plus_bin nt: $$variable_name\n"; + $temp_size=$temp_size+$bin_size; +} +print out "\n"; + +close(out)||die "could not close outfile $outfile\n"; diff --git a/releases/latest/GBSX_v1.1.2.jar b/releases/GBSX_v1.1.3/GBSX_v1.1.3.jar similarity index 94% rename from releases/latest/GBSX_v1.1.2.jar rename to releases/GBSX_v1.1.3/GBSX_v1.1.3.jar index 8832a1f..da9212b 100644 Binary files a/releases/latest/GBSX_v1.1.2.jar and b/releases/GBSX_v1.1.3/GBSX_v1.1.3.jar differ diff --git a/releases/latest/GBSX_v1.1.3.jar b/releases/latest/GBSX_v1.1.3.jar new file mode 100644 index 0000000..da9212b Binary files /dev/null and b/releases/latest/GBSX_v1.1.3.jar differ diff --git a/src/be/uzleuven/gc/logistics/GBSX/GBSX.java b/src/be/uzleuven/gc/logistics/GBSX/GBSX.java index ab32097..282d032 100644 --- a/src/be/uzleuven/gc/logistics/GBSX/GBSX.java +++ b/src/be/uzleuven/gc/logistics/GBSX/GBSX.java @@ -35,7 +35,7 @@ public class GBSX { public static final boolean DEBUG = false; - public final static String VERSION = "GBSX v1.1.2"; + public final static String VERSION = "GBSX v1.1.3"; private final static String LICENCE = "GPLv3"; /** diff --git a/src/be/uzleuven/gc/logistics/GBSX/utils/enzyme/model/EnzymeEnum.java b/src/be/uzleuven/gc/logistics/GBSX/utils/enzyme/model/EnzymeEnum.java index 3e65b39..809177f 100644 --- a/src/be/uzleuven/gc/logistics/GBSX/utils/enzyme/model/EnzymeEnum.java +++ b/src/be/uzleuven/gc/logistics/GBSX/utils/enzyme/model/EnzymeEnum.java @@ -79,6 +79,8 @@ public enum EnzymeEnum implements Enzyme{ NSPI ("NspI", new String[]{"CATGT", "CATGC"}), + AVAII ("AvaII", new String[]{"GACC", "GTCC"}), + NAN ("NA", new String[]{""}); private final String name;