From 6ea8b9195cddd05a28db1e7805c862e2aa17e293 Mon Sep 17 00:00:00 2001 From: sol Date: Sun, 27 Apr 2014 22:34:18 -0400 Subject: [PATCH] migrated referenced code from other projects to this repository --- IsoSCM/.classpath | 2 - .../src/executable/{Main.java => IsoSCM.java} | 10 +- .../processing/ClusterExpressedSegments.java | 580 +----------------- .../src/processing/FindSpliceJunctions.java | 30 +- IsoSCM/src/processing/SlidingWindow.java | 2 - .../EnumerateTranscriptModels.java | 93 +-- 6 files changed, 32 insertions(+), 685 deletions(-) rename IsoSCM/src/executable/{Main.java => IsoSCM.java} (99%) diff --git a/IsoSCM/.classpath b/IsoSCM/.classpath index b1e3ed0..a489509 100644 --- a/IsoSCM/.classpath +++ b/IsoSCM/.classpath @@ -2,9 +2,7 @@ - - diff --git a/IsoSCM/src/executable/Main.java b/IsoSCM/src/executable/IsoSCM.java similarity index 99% rename from IsoSCM/src/executable/Main.java rename to IsoSCM/src/executable/IsoSCM.java index 7f5d07f..58cc6aa 100644 --- a/IsoSCM/src/executable/Main.java +++ b/IsoSCM/src/executable/IsoSCM.java @@ -1,7 +1,6 @@ package executable; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import multisample.JointSegmentation; @@ -15,9 +14,9 @@ import processing.SlidingWindow; import scm.IdentifyChangePoints; import splicegraph.ExonSpliceGraph; -import tools.Strandedness; import tools.BEDTools.BEDWriter; import tools.GTFTools.GTFWriter; +import tools.Strandedness; import util.IO; import util.Util; @@ -28,12 +27,7 @@ import com.beust.jcommander.converters.FileConverter; import com.beust.jcommander.converters.IntegerConverter; -public class Main { - enum FileType {} - - public static File getFile(FileType type, File dir, String base){ - return dir; - } +public class IsoSCM { public static void main(String[] args) throws IOException { class AssembleCommand{ diff --git a/IsoSCM/src/processing/ClusterExpressedSegments.java b/IsoSCM/src/processing/ClusterExpressedSegments.java index 963f896..445748e 100644 --- a/IsoSCM/src/processing/ClusterExpressedSegments.java +++ b/IsoSCM/src/processing/ClusterExpressedSegments.java @@ -9,46 +9,38 @@ import java.util.List; import java.util.Map; -import org.apache.commons.math3.distribution.NormalDistribution; - -import cern.colt.list.DoubleArrayList; -import cern.jet.random.engine.RandomEngine; -import cern.jet.stat.quantile.DoubleQuantileFinder; -import cern.jet.stat.quantile.QuantileFinderFactory; -import filter.ComposableFilter; -import filter.ConsumingReadFilter; -import filter.Counter; -import filter.MultiMappingfilter; -import filter.SAMRecordStrandednessFilter; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecordIterator; -import splicegraph.ExonSpliceGraph.ConnectedComponentResult; + +import org.apache.commons.math3.distribution.NormalDistribution; + import tools.AnnotatedRegion; import tools.BAMTools; import tools.BEDTools.BEDWriter; import tools.GTFTools.GTFWriter; import tools.GenomicIntervalSet; import tools.GenomicIntervalTree; -import tools.GenomicPositionTree; import tools.IntervalTools; -import tools.StrandedSegmentClusterer; import tools.ParseBed.BEDIterator; import tools.ParseGTF.TranscriptIterator; -import tools.PositionClustering.StrandedClusterer; import tools.StrandedGenomicIntervalSet; import tools.StrandedGenomicIntervalTree; -import tools.StrandedGenomicPositionTree; +import tools.StrandedSegmentClusterer; import tools.Strandedness; -import util.ChartUtils; -import util.IO; import util.Util; -import util.Stats.MomentTracker; -import util.Util.ExtremeObjectTracker; import util.Util.ExtremeTracker; -import util.Util.MapList; +import cern.colt.list.DoubleArrayList; +import cern.jet.random.engine.RandomEngine; +import cern.jet.stat.quantile.DoubleQuantileFinder; +import cern.jet.stat.quantile.QuantileFinderFactory; +import filter.ComposableFilter; +import filter.ConsumingReadFilter; +import filter.Counter; +import filter.MultiMappingfilter; +import filter.SAMRecordStrandednessFilter; public class ClusterExpressedSegments { @@ -103,127 +95,6 @@ public static int nConsumingReads(SAMFileReader sfr, Strandedness strandedness, return n; } - public static GenomicIntervalTree> intervals(String gtf, GTFWriter gw) throws FileNotFoundException { - StrandedGenomicIntervalTree> git = new StrandedGenomicIntervalTree>(); - GenomicIntervalTree> git2 = new GenomicIntervalTree>(); - - TranscriptIterator ti = new TranscriptIterator(gtf); - - for(AnnotatedRegion r : ti){ - if(!git.contains(r.chr, r.start, r.end,r.strand)){ - git.add(r.chr, r.start, r.end,r.strand, r.attributes); - } - } - - // remove all stranded changepoints that share one boundary - // with an unstranded segment - // and - // remove all stranded changepoints that are shorter than - // another segment sharing the same boundary - List irrelevant = new LinkedList(); - for(AnnotatedRegion r : git){ - if(r.strand=='.'){ - for(char c : Util.list('+','-')){ - int r5p = r.get5Prime(c); - for(AnnotatedRegion n : git.overlappingRegions(r.chr, r5p, r5p, c)){ - if(r5p == n.get5Prime()){ - irrelevant.add(n); - } - } - } - } - else{ - int r5p = r.get5Prime(); - for(AnnotatedRegion n : git.overlappingRegions(r.chr, r5p, r5p, r.strand)){ - if(r5p == n.get5Prime() && r.size < n.size){ - irrelevant.add(r); - } - } - } - } - - for(AnnotatedRegion r : irrelevant){ - git.remove(r.chr, r.start, r.end, r.strand); - } - - // find those that are on both strands, - // delete them, those that remain are those that are only on one strand. - // for each of these, assign them to the closest up/down-stream segment boundary. - - StrandedGenomicPositionTree unmatchedSegmentBoundaries = new StrandedGenomicPositionTree(); - - for(AnnotatedRegion r : git){ - if(r.strand=='.'){ - unmatchedSegmentBoundaries.add(r.chr, r.start, '+'); - unmatchedSegmentBoundaries.add(r.chr, r.end, '-'); - } - else{ - unmatchedSegmentBoundaries.add(r.chr, r.get5Prime(), r.strand); - } - } - - GenomicIntervalTree> matchedSegmentBoundaries = new GenomicIntervalTree>(); - for(AnnotatedRegion r : git){ - if(r.strand!='.'){ - AnnotatedRegion closest = unmatchedSegmentBoundaries.getClosestUpstream(r.chr, r.get3Prime(), r.isNegativeStrand()?'+':'-'); - // System.out.println(r); - // System.out.printf("\t%s\n", closest); - if(closest!=null){ - - String chr = r.chr; - int start = Math.min(r.start,closest.start); - int end = Math.max(r.end,closest.end); - - boolean nested = false; - boolean matched = false; - for(AnnotatedRegion matchedSegment : git.overlappingRegions(chr, start, end, r.isNegativeStrand()?'+':'-')){ - if(matchedSegment.get5Prime()==closest.start){ - matched |= true; - // System.out.println(matchedSegment); - // System.out.println(matchedSegment.toAttributeString()); - closest.addAttribute("expression", matchedSegment.getAttribute("expression")); - int inter_5p_overlap = Math.abs(r.get5Prime() - matchedSegment.get5Prime()); - if(inter_5p_overlap < matchedSegment.size){ - System.out.printf("nested %s in %s\n", r, matchedSegment); - nested = true; - } - - } - } - if(matched && !(nested || matchedSegmentBoundaries.contains(chr, start, end))){ - // System.out.printf("(%s,%s)\t%s:%d-%d\t%s\t%s\n",r,closest,r.chr,Math.min(r.start,closest.start), Math.max(r.end,closest.end),r.toAttributeString(),closest.toAttributeString()); - Map attributes = new HashMap(); - attributes.put("consolidated", true); - if(r.isNegativeStrand()){ - attributes.put("start_expression", closest.getAttribute("expression")); - attributes.put("end_expression", r.getAttribute("expression")); - } - else{ - attributes.put("start_expression", r.getAttribute("expression")); - attributes.put("end_expression", closest.getAttribute("expression")); - } - matchedSegmentBoundaries.add(r.chr, Math.min(r.start,closest.start), Math.max(r.end,closest.end), attributes); - } - } - } - } - // System.exit(0); - - for(AnnotatedRegion r : matchedSegmentBoundaries){ - // System.out.println(r); - gw.write("exon", r.chr, r.start, r.end, '.', r.toGTFAttributeString()); - } - - // BEDWriter bw = new BEDWriter("test3.bed"); - // for(AnnotatedRegion r : git2){ - // bw.write(r.chr, r.start, r.end, '.'); - // } - // bw.close(); - - - return git2; - } - static class ConsolidatedIntervalResult{ GenomicIntervalTree> segments; GenomicIntervalTree> changePoints; @@ -238,83 +109,7 @@ public ConsolidatedIntervalResult(GenomicIntervalTree> segme } } - public static ConsolidatedIntervalResult expressedSegmentIntervals(String gtf) throws FileNotFoundException { - StrandedGenomicIntervalTree> git = new StrandedGenomicIntervalTree>(); - StrandedGenomicIntervalTree> unique_git = new StrandedGenomicIntervalTree>(); - GenomicPositionTree gpt = new GenomicPositionTree(); - GenomicIntervalTree> max_git = new GenomicIntervalTree>(); - GenomicIntervalTree> increase_git = new GenomicIntervalTree>(); - GenomicIntervalTree> decrease_git = new GenomicIntervalTree>(); - GenomicIntervalTree> expressedSegmentIntervals = new GenomicIntervalTree>(); - - TranscriptIterator ti = new TranscriptIterator(gtf); - - for(AnnotatedRegion r : ti){ - if(!unique_git.contains(r.chr, r.start, r.end,r.strand)){ - unique_git.add(r.chr, r.start, r.end,r.strand); - gpt.add(r.chr, r.start); - } - git.add(r.chr, r.start, r.end,r.strand, r.attributes); - } - - for(AnnotatedRegion r : unique_git){ - MapList mles = new MapList(); - - // System.out.println(r); - for(AnnotatedRegion g : git.overlappingRegions(r.chr, r.start, r.end, r.strand)){ - for(String s : Util.list("before_mle","after_mle")){ - mles.put(s, Double.parseDouble((String) g.getAttribute(s))); - } - } - - Map attributes = new HashMap(); - for(String s : Util.list("before_mle","after_mle")){ - attributes.put(s, Collections.max(mles.get(s))); - } - - boolean isIncrease = (Double) attributes.get("after_mle") > (Double) attributes.get("before_mle"); - - - if(!max_git.contains(r.chr, r.start, r.start)){ - max_git.add(r.chr,r.start,r.end,attributes); - if(isIncrease) - increase_git.add(r.chr,r.start,r.end,attributes); - else - decrease_git.add(r.chr,r.start,r.end,attributes); - // gw.write("exon", r.chr, r.start, r.end, '.', AnnotatedRegion.GTFAttributeString(attributes)); - } - } - - // System.out.println(); - - for(AnnotatedRegion r : max_git){ - // System.out.println(r); - - AnnotatedRegion upstream = gpt.getClosestUpstream(r.chr, r.start-1); - // AnnotatedRegion downstream = gpt.getClosestDownstream(r.chr, r.end+1); - - if(upstream!=null){ - Map attributes = new HashMap(); - attributes.put("mle", Math.max((Double) r.getAttribute("before_mle"), (Double) max_git.overlappingRegions(upstream.chr, upstream.start, upstream.end, upstream.strand).iterator().next().getAttribute("after_mle"))); - // gw.write("exon", r.chr, upstream.start, r.start, '.', AnnotatedRegion.GTFAttributeString(attributes)); - expressedSegmentIntervals.add(r.chr, upstream.start, r.start, attributes); - } - - // if(downstream!=null){ - // Map attributes = new HashMap(); - // attributes.put("mle", Math.max((Double) r.getAttribute("after_mle"), (Double) max_git.overlappingRegions(downstream.chr, downstream.start, downstream.end, downstream.strand).iterator().next().getAttribute("before_mle"))); - // gw.write("exon", r.chr, r.start, downstream.start, '.', AnnotatedRegion.GTFAttributeString(attributes)); - // - // } - - - // System.out.printf("%s\t%s\t%s\n", gpt.getClosestUpstream(r.chr, r.start-1),r,gpt.getClosestDownstream(r.chr, r.end+1)); - } - - ConsolidatedIntervalResult result = new ConsolidatedIntervalResult(expressedSegmentIntervals, max_git, increase_git, decrease_git); - - return result; - } + public static void enumerateExons(String chr, int start, int end, char strand, List splices, BEDWriter bw) throws FileNotFoundException{ @@ -1145,352 +940,5 @@ public static void scaffoldSpannableRegions(File segment_bed, File matepair_bed, } } - public static void main(String[] args) throws FileNotFoundException { - if(true){ - SAMFileReader sfr = new SAMFileReader(new File("/mnt/LaiLab/sol/GSE51572/test/g.bam")); - File segment_bed = new File("/mnt/LaiLab/sol/GSE51572/test/isoscm/tmp/gu.seg.bed"); -// countJunctionSupportingReads(sfr, new File("/mnt/LaiLab/sol/GSE51572/test/isoscm/tmp/g.sj.bed"), new GTFWriter("/mnt/LaiLab/sol/GSE51572/test/g1.counts.gtf")); - - int i = estimateMateInsertSize(sfr, Strandedness.unstranded, segment_bed, .99); - System.out.println(i); - } - if(false) - { - SAMFileReader sfr = new SAMFileReader(new File("/mnt/LaiLab/sol/GSE41637/mapped/indexed/SRR594413.bam")); - BEDWriter matepair_bw = new BEDWriter("/mnt/LaiLab/sol/GSE41637/mapped/test_pairs.bed"); - identifySpannableRegions(sfr, Strandedness.reverse_forward, matepair_bw, 160); - matepair_bw.close(); - } - if(false) - { - trimInferredExons(new File("/home/sol/lailab/sol/sangercenter/de_dup/isoscm/gtf/tmp/hip.exon.gtf"), new GTFWriter("/home/sol/lailab/sol/sangercenter/de_dup/isoscm/gtf/tmp/hip.exon.trimmed.gtf")); - } - if(false) - { - BEDWriter bw = new BEDWriter("/mnt/LaiLab/sol/sangercenter/de_dup/isoscm/gtf/tmp/hip.inferred.bed"); - inferSegmentStrand(new File("/mnt/LaiLab/sol/sangercenter/de_dup/isoscm/gtf/tmp/hip.sj.bed"), new File("/home/sol/workspace/IsoSCM/unstranded.bed"),bw); - bw.close(); - } - if(false) - { - File segment_bed = new File("/mnt/LaiLab/sol/GSE41637/nb/gtf/tmp/SRR594393.seg.bed"); - File repeat_bed = new File("/home/sol/data/repeats/mm10/mm10.RepeatMasker.bed"); - BEDWriter merged_segments = new BEDWriter("/mnt/LaiLab/sol/GSE41637/nb/test.merge.repeat.bed"); - mergeSegments(segment_bed, repeat_bed, merged_segments, 100); - merged_segments.close(); - } - if(false) - { - File segment_bed = new File("/home/sol/workspace/IsoSCM/tests/gtf/tmp/SRR594393.25.seg.bed"); - BEDWriter merged_segments = new BEDWriter("/home/sol/workspace/IsoSCM/tests/gtf/tmp/SRR594393.25.seg.merged.bed"); - // mergeSegments(segment_bed, merged_segments, 100); - merged_segments.close(); - } - if(false) - { - File splice_junction_bed = new File("/home/sol/workspace/IsoSCM/tests/gtf/tmp/SRR594393.25.sj.filtered.bed"); - File spliced_exon_gtf = new File("/home/sol/workspace/IsoSCM/tests/gtf/tmp/SRR594393.25.exon.gtf"); - File intronic_exon_gtf = new File("/home/sol/workspace/IsoSCM/tests/gtf/tmp/test.intronic.gtf"); - SAMFileReader bam = new SAMFileReader(new File("/mnt/LaiLab/sol/GSE41637/mapped/indexed/SRR594393.bam")); - Strandedness strandedness = Strandedness.reverse_forward; - int w = 5; - double d = .1; - - GTFWriter intronic_exon_writer = new GTFWriter("/home/sol/workspace/IsoSCM/tests/gtf/tmp/test.intronic.filtered.gtf"); - filterIntronicExons(bam, strandedness, spliced_exon_gtf, intronic_exon_gtf, splice_junction_bed, w, d, intronic_exon_writer); - intronic_exon_writer.close(); - } - if(false) - { - File segment_bed = new File("/home/sol/workspace/IsoSCM/tests/gtf/tmp/SRR594393.25.seg.bed"); - File splice_junction_bed = new File("/home/sol/workspace/IsoSCM/tests/gtf/tmp/SRR594393.25.sj.bed"); - File spliced_exon_gtf = new File("/home/sol/workspace/IsoSCM/tests/gtf/tmp/SRR594393.25.exon.gtf"); - SAMFileReader bam = new SAMFileReader(new File("/mnt/LaiLab/sol/GSE41637/mapped/indexed/SRR594393.bam")); - Strandedness strandedness = Strandedness.reverse_forward; - int minExtensionLength = 5; - double minExtensionFraction = .1; - GTFWriter intronic_exon_writer = new GTFWriter("/home/sol/workspace/IsoSCM/tests/gtf/tmp/test.intronic.gtf"); - identifyIntronicExons(segment_bed, splice_junction_bed, spliced_exon_gtf, bam, strandedness, intronic_exon_writer, minExtensionLength, minExtensionFraction); - intronic_exon_writer.close(); - } - System.exit(0); - - // GTFWriter gw = new GTFWriter("test5.consolidated.gtf"); - // GTFWriter gw = new GTFWriter("test6.consolidated.gtf"); - // intervals("test5.gtf",gw); - SAMFileReader sfr = new SAMFileReader(new File("/home/sol/data/sangercenter/hippocampus.bam")); - - ConsolidatedIntervalResult expressedSegmentsResult = expressedSegmentIntervals("test6.gtf"); - GenomicIntervalTree> expressedSegments = expressedSegmentsResult.segments; - GenomicIntervalSet highlyExpressedSegments = new GenomicIntervalSet(); - double expressed_threshold = 10; - for(AnnotatedRegion r : expressedSegments){ - if((Double)r.getAttribute("mle")>expressed_threshold){ - highlyExpressedSegments.add(r.chr, r.start, r.end); - } - } - - System.out.println(expressedSegments.size()); - System.out.println(highlyExpressedSegments.size()); - - - StrandedGenomicIntervalTree> sj = new StrandedGenomicIntervalTree>(); - BEDIterator splice_junctions = new BEDIterator("hippocampus.sj.bed"); - for(AnnotatedRegion splice_junction : splice_junctions){ - // System.out.println(splice_junction); - if(!sj.contains(splice_junction.chr, splice_junction.start, splice_junction.end, splice_junction.strand)) - sj.add(splice_junction); - } - - int w = 50; - int hw = (w+1)/2; - - StrandedGenomicIntervalSet clusteredSegments = new StrandedGenomicIntervalSet(); - for(AnnotatedRegion sje : sj){ - boolean overlaps = false; - for(AnnotatedRegion r : highlyExpressedSegments.overlappingRegions(sje.chr, sje.start-w-hw, sje.start+w-hw)){ - overlaps |= true; - } - for(AnnotatedRegion r : highlyExpressedSegments.overlappingRegions(sje.chr, sje.end-w-hw, sje.end+w-hw)){ - overlaps |= true; - } - if(overlaps){ - clusteredSegments.add(sje.chr, sje.start-w, sje.end+w, sje.strand); - } - } - - /** - * TODO rather than have unstranded clustered segments, make them stranded above. Then, only add highly expressed segments that - * overlap with these. - */ - - for(AnnotatedRegion r : highlyExpressedSegments){ - boolean overlaps_splice = false; - for(char strand : Util.list('+','-')){ - if(clusteredSegments.overlappingRegions(r.chr, r.start, r.end, strand).iterator().hasNext()){ - clusteredSegments.add(r.chr, r.start, r.end, strand); - overlaps_splice |= true; - } - } - if(!overlaps_splice) - clusteredSegments.add(r.chr, r.start, r.end, '.'); - - } - // chr1:23367501-23929101 - BEDWriter bw = new BEDWriter("test6.clusters.bed"); - // BEDWriter models = new BEDWriter("test6.models.bed"); - BEDWriter plus_models = new BEDWriter("test6.plus_models.bed"); - BEDWriter minus_models = new BEDWriter("test6.minus_models.bed"); - BEDWriter bw_i = new BEDWriter("increase.bed"); - BEDWriter bw_d = new BEDWriter("decrease.bed"); - GTFWriter transcribed_fragments = new GTFWriter("test6.transfrags.gtf"); - GTFWriter assembly = new GTFWriter("chr1.assembly.gtf"); - - int cluster_id=0; - for(AnnotatedRegion r : clusteredSegments){ - cluster_id++; - bw.write(r.chr, r.start, r.end, r.strand); - - // save the first and last increase and decrease, these should not be removed even if they are near a splice junction - ExtremeObjectTracker terminalIncreaseTracker = new ExtremeObjectTracker(new Util.ComparableComparator()); - ExtremeObjectTracker terminalDecreaseTracker = new ExtremeObjectTracker(new Util.ComparableComparator()); - - GenomicIntervalTree> increasePoints = new GenomicIntervalTree>(); - GenomicIntervalTree> decreasePoints = new GenomicIntervalTree>(); - - for(AnnotatedRegion changepoint : expressedSegmentsResult.increasePoints.overlappingRegions(r.chr, r.start, r.end)){ - terminalIncreaseTracker.put(changepoint, changepoint.start); - increasePoints.add(changepoint.chr, changepoint.start, changepoint.end, changepoint.attributes); - } - for(AnnotatedRegion changepoint : expressedSegmentsResult.decreasePoints.overlappingRegions(r.chr, r.start, r.end)){ - terminalDecreaseTracker.put(changepoint, changepoint.start); - decreasePoints.add(changepoint.chr, changepoint.start, changepoint.end, changepoint.attributes); - } - - AnnotatedRegion terminalIncrease = terminalIncreaseTracker.getMinObject(); - AnnotatedRegion terminalDecrease = terminalDecreaseTracker.getMaxObject(); - - List to_remove = new LinkedList(); - for(char strand : Util.list('+','-')){ - for(AnnotatedRegion sje : sj.overlappingRegions(r.chr, r.start, r.end, strand)){ - for(AnnotatedRegion changepoint : expressedSegmentsResult.changePoints.overlappingRegions(sje.chr, sje.start-w-hw, sje.start+w-hw)){ - to_remove.add(changepoint); - } - for(AnnotatedRegion changepoint : expressedSegmentsResult.changePoints.overlappingRegions(sje.chr, sje.end-w-hw, sje.end+w-hw)){ - to_remove.add(changepoint); - } - } - } - - for(AnnotatedRegion changepoint : to_remove){ - if( (terminalIncrease!=null && changepoint.start != terminalIncrease.start) && (terminalDecrease!=null && changepoint.start != terminalDecrease.start)){ - increasePoints.remove(changepoint.chr, changepoint.start, changepoint.end); - decreasePoints.remove(changepoint.chr, changepoint.start, changepoint.end); - } - } - - List increases = new LinkedList(); - List decreases = new LinkedList(); - - for(AnnotatedRegion changepoint : increasePoints.overlappingRegions(r.chr, r.start, r.end)){ - if( (Double) changepoint.getAttribute("after_mle") > expressed_threshold ){ - increases.add(changepoint); - bw_i.write(changepoint.chr, changepoint.start, changepoint.end, changepoint.strand); - } - } - for(AnnotatedRegion changepoint : decreasePoints.overlappingRegions(r.chr, r.start, r.end)){ - if( (Double) changepoint.getAttribute("before_mle") > expressed_threshold ){ - decreases.add(changepoint); - bw_d.write(changepoint.chr, changepoint.start, changepoint.end, changepoint.strand); - } - } - - // if(r.strand != '.') - // System.out.println(r); - // for(AnnotatedRegion increase : increases){ - // for(AnnotatedRegion decrease : decreases){ - // if(increase.start < decrease.end){ - // - // List splices = prepareSpliceJunctions(sfr, sj, r.chr, increase.start, decrease.end, r.strand); - // - // enumerateExons(r.chr, increase.start, decrease.end, r.strand, splices, models); - // - // // bw.write(r.chr, increase.start, decrease.end, '.'); - // } - // } - // } - - //TODO make GTF output with junctions, exons 5' and 3' labeled - - if(r.strand != '.'){ - BEDWriter models = r.isNegativeStrand()? minus_models : plus_models; - - System.out.println(r); - - StrandedGenomicIntervalTree> exons = new StrandedGenomicIntervalTree>(); - - StrandedGenomicPositionTree splice5ps = new StrandedGenomicPositionTree(); - StrandedGenomicPositionTree splice3ps = new StrandedGenomicPositionTree(); - - // find all splice junctions - // List splices = prepareSpliceJunctions(sfr, sj, r.chr, r.start, r.end, r.strand); - for(AnnotatedRegion splice : sj.overlappingRegions(r.chr, r.start, r.end, r.strand)){ - // for(AnnotatedRegion splice : splices){ - StrandedGenomicIntervalSet exonIntervals = new StrandedGenomicIntervalSet(); - exonIntervals.add(splice.chr, splice.start-1, splice.start-1, splice.strand); - exonIntervals.add(splice.chr, splice.end+1, splice.end+1, splice.strand); - writeTranscript(exonIntervals, models); - // models.write(splice.chr, splice.start, splice.end, splice.strand); - splice5ps.add(splice.chr, splice.get5Prime(), splice.strand); - splice3ps.add(splice.chr, splice.get3Prime(), splice.strand); - - Map attributes = new HashMap(); - attributes.put("gene_id", splice.toString()); - attributes.put("transcript_id", splice.toString()); - attributes.put("type", "splice_junction"); - - assembly.write("exon", splice.chr, splice.start-1, splice.start-1, splice.strand, AnnotatedRegion.GTFAttributeString(attributes)); - assembly.write("exon", splice.chr, splice.end+1, splice.end+1, splice.strand, AnnotatedRegion.GTFAttributeString(attributes)); - } - - // find all internal exons - for(AnnotatedRegion splice : sj.overlappingRegions(r.chr, r.start, r.end, r.strand)){ - AnnotatedRegion exon5p = splice3ps.getClosestUpstream(splice.chr, splice.get5Prime(), splice.strand); - if(exon5p!=null){ - int s5p = splice.get5Prime(); - int e5p = exon5p.start; - if(!exons.contains(r.chr, Math.min(s5p,e5p)+1, Math.max(s5p,e5p)-1, r.strand)){ - { - Map attributes = new HashMap(); - attributes.put("type", "internal"); - transcribed_fragments.write("exon", r.chr, Math.min(s5p,e5p)+1, Math.max(s5p,e5p)-1, r.strand, AnnotatedRegion.GTFAttributeString(attributes)); - assembly.write("exon", r.chr, Math.min(s5p,e5p)+1, Math.max(s5p,e5p)-1, r.strand, AnnotatedRegion.GTFAttributeString(attributes)); - } - - models.write(r.chr, Math.min(s5p,e5p)+1, Math.max(s5p,e5p)-1, r.strand); - exons.add(r.chr, Math.min(s5p,e5p)+1, Math.max(s5p,e5p)-1, r.strand); - } - } - } - - // find all terminal exons - List ends3p = r.isNegativeStrand()?increases:decreases; - List ends5p = !r.isNegativeStrand()?increases:decreases; - - // the 3' exon - // since there will be some error due to granularity of SCM, don't forget to add a half-width - // splice junctions also have to be adjusted by one - int i3p = r.isNegativeStrand()?-1:1; - int i5p = !r.isNegativeStrand()?-1:1; - - for(AnnotatedRegion end3p : ends3p){ - - AnnotatedRegion transcriptBoundary = splice3ps.getClosestUpstream(r.chr, end3p.start, r.strand); - if(transcriptBoundary!=null){ - int s5p = end3p.start; - int e5p = transcriptBoundary.start; - if(!exons.contains(r.chr, Math.min(s5p,e5p), Math.max(s5p,e5p), r.strand)){ - - { - Map attributes = new HashMap(); - attributes.put("type", "exon3p"); - transcribed_fragments.write("exon", r.chr, Math.min(s5p+hw,e5p)+1, Math.max(s5p+hw,e5p)-1, r.strand, AnnotatedRegion.GTFAttributeString(attributes)); - } - { - Map attributes = new HashMap(); - attributes.put("type", "3'end"); - assembly.write("exon", r.chr, s5p+hw, s5p+hw, r.strand, AnnotatedRegion.GTFAttributeString(attributes)); - } - - models.write(r.chr, Math.min(s5p+hw,e5p+i3p), Math.max(s5p+hw,e5p+i3p), r.strand); - exons.add(r.chr, Math.min(s5p,e5p), Math.max(s5p,e5p), r.strand); - } - } - } - // the 5' exon - for(AnnotatedRegion end5p : ends5p){ - - AnnotatedRegion transcriptBoundary = splice5ps.getClosestDownstream(r.chr, end5p.start, r.strand); - if(transcriptBoundary!=null){ - int s5p = end5p.start; - int e5p = transcriptBoundary.start; - if(!exons.contains(r.chr, Math.min(s5p,e5p), Math.max(s5p,e5p), r.strand)){ - { - Map attributes = new HashMap(); - attributes.put("type", "exon5p"); - transcribed_fragments.write("exon", r.chr, Math.min(s5p+hw,e5p)+1, Math.max(s5p+hw,e5p)-1, r.strand, AnnotatedRegion.GTFAttributeString(attributes)); - } - { - Map attributes = new HashMap(); - attributes.put("type", "5'end"); - assembly.write("exon", r.chr, s5p+hw, s5p+hw, r.strand, AnnotatedRegion.GTFAttributeString(attributes)); - } - models.write(r.chr, Math.min(s5p+hw,e5p+i5p), Math.max(s5p+hw,e5p+i5p), r.strand); - exons.add(r.chr, Math.min(s5p,e5p), Math.max(s5p,e5p), r.strand); - } - } - } - - // for(AnnotatedRegion increase : increases){ - // for(AnnotatedRegion decrease : decreases){ - // if(increase.start < decrease.end){ - // - // List splices = prepareSpliceJunctions(sfr, sj, r.chr, increase.start, decrease.end, r.strand); - // - // enumerateExons(r.chr, increase.start, decrease.end, r.strand, splices, models); - // - // // bw.write(r.chr, increase.start, decrease.end, '.'); - // } - // } - // } - } - } - bw_i.close(); - bw_d.close(); - bw.close(); - transcribed_fragments.close(); - // gw.close(); - } - } diff --git a/IsoSCM/src/processing/FindSpliceJunctions.java b/IsoSCM/src/processing/FindSpliceJunctions.java index 82a4303..2edbb4f 100644 --- a/IsoSCM/src/processing/FindSpliceJunctions.java +++ b/IsoSCM/src/processing/FindSpliceJunctions.java @@ -7,39 +7,33 @@ import java.util.List; import java.util.Map; -import org.apache.commons.math3.special.Beta; - -import filter.ComposableFilter; -import filter.ConsumingReadFilter; -import filter.Counter; -import filter.Echo; -import filter.MismatchFilter; -import filter.MultiMappingfilter; -import filter.SAMRecordStrandednessFilter; -import filter.Transformer; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecordIterator; -import net.sf.samtools.SAMRecord.SAMTagAndValue; + +import org.apache.commons.math3.special.Beta; + import tools.AnnotatedRegion; import tools.BAMTools; -import tools.GenomicIntervalSet; -import tools.IntervalTools; -import tools.StrandedGenomicIntervalSet; -import tools.StrandedGenomicIntervalTree; -import tools.Strandedness; import tools.BEDTools.BEDWriter; import tools.GTFTools.GTFWriter; +import tools.IntervalTools; import tools.ParseBed.BEDIterator; import tools.ParseGTF.TranscriptIterator; +import tools.StrandedGenomicIntervalSet; +import tools.StrandedGenomicIntervalTree; +import tools.Strandedness; import util.Util; -import util.Util.Criteria; import util.Util.ExtremeObjectTracker; import util.Util.MapCounter; -import util.Util.MapSet; +import filter.ComposableFilter; +import filter.ConsumingReadFilter; +import filter.Counter; +import filter.MultiMappingfilter; +import filter.SAMRecordStrandednessFilter; public class FindSpliceJunctions { diff --git a/IsoSCM/src/processing/SlidingWindow.java b/IsoSCM/src/processing/SlidingWindow.java index 0c5d0b8..5535039 100644 --- a/IsoSCM/src/processing/SlidingWindow.java +++ b/IsoSCM/src/processing/SlidingWindow.java @@ -8,8 +8,6 @@ import java.util.List; import java.util.Map; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecordIterator; diff --git a/IsoSCM/src/splicegraph/EnumerateTranscriptModels.java b/IsoSCM/src/splicegraph/EnumerateTranscriptModels.java index 27ed5c7..db3a92b 100644 --- a/IsoSCM/src/splicegraph/EnumerateTranscriptModels.java +++ b/IsoSCM/src/splicegraph/EnumerateTranscriptModels.java @@ -12,110 +12,25 @@ import java.util.Map; import java.util.Set; -import processing.FindSpliceJunctions; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecordIterator; +import processing.FindSpliceJunctions; import tools.AnnotatedRegion; +import tools.BEDTools.BEDWriter; import tools.GenomicIntervalSet; -import tools.GenomicIntervalTree; +import tools.ParseBed.BEDIterator; import tools.StrandedGenomicIntervalSet; import tools.StrandedGenomicIntervalTree; -import tools.BEDTools.BEDWriter; -import tools.ParseBed.BEDIterator; -import tools.ParseGTF.TranscriptIterator; -import tools.StrandedGenomicPositionTree; import util.Util; import util.Util.MapCounter; public class EnumerateTranscriptModels { - public static GenomicIntervalTree> intervals(String gtf) throws FileNotFoundException { - StrandedGenomicIntervalTree> git = new StrandedGenomicIntervalTree>(); - GenomicIntervalTree> git2 = new GenomicIntervalTree>(); - - TranscriptIterator ti = new TranscriptIterator(gtf); - - for(AnnotatedRegion r : ti){ - boolean isAdded=false; - for(AnnotatedRegion added : git.overlappingRegions(r.chr, r.start, r.end,r.strand)){ - - isAdded |= r.start== added.start && r.end ==added.end; - - } - - if(!isAdded){ - git.add(r.chr, r.start, r.end,r.strand); - System.out.println(r); - } - } - - for(AnnotatedRegion r : git){ - if(r.strand=='+'){ - boolean containsAntisense = false; - for(AnnotatedRegion added : git.overlappingRegions(r.chr, r.start, r.end,r.isNegativeStrand()?'+':'-')){ - containsAntisense |= r.start== added.start && r.end ==added.end; - } - if(containsAntisense) - git2.add(r.chr, r.start, r.end); - } - } - - // find those that are on both strands, - // delete them, those that remain are those that are only on one strand. - // for each of these, assign them to the closest up/down-stream segment boundary. - - StrandedGenomicPositionTree segmentBoundaries = new StrandedGenomicPositionTree(); - - for(AnnotatedRegion r : git2){ - segmentBoundaries.add(r.chr, r.start, '+'); - segmentBoundaries.add(r.chr, r.end, '-'); - for(char strand : Util.list('+','-')) - git.remove(r.chr, r.start, r.end, strand); - } - - - - for(AnnotatedRegion r : git){ - segmentBoundaries.add(r.chr, r.get5Prime(), r.strand); - - // System.out.println(r); - } - - for(AnnotatedRegion r : git){ - AnnotatedRegion closest = segmentBoundaries.getClosestUpstream(r.chr, r.get3Prime(), r.isNegativeStrand()?'+':'-'); - // System.out.println(r); - // System.out.printf("\t%s\n", closest); - if(closest!=null){ - - boolean isAdded=false; - for(AnnotatedRegion added : git2.overlappingRegions(r.chr, Math.min(r.start,closest.start), Math.max(r.end,closest.end))){ - - isAdded |= Math.min(r.start,closest.start) == added.start && Math.max(r.end,closest.end) == added.end; - - } - - if(!isAdded){ - System.out.printf("(%s,%s)\t%s:%d-%d\n",r,closest,r.chr,Math.min(r.start,closest.start), Math.max(r.end,closest.end)); - - git2.add(r.chr, Math.min(r.start,closest.start), Math.max(r.end,closest.end)); - } - } - } - - BEDWriter bw = new BEDWriter("test3.bed"); - for(AnnotatedRegion r : git2){ - bw.write(r.chr, r.start, r.end, '.'); - } - bw.close(); - - - return git2; - } - + public static void filterRegion(String chr, int start, int end, String segmentationBed) throws FileNotFoundException{ BEDIterator bi = new BEDIterator("test3.bed"); BEDWriter bw = new BEDWriter(segmentationBed);