diff --git a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Reassemble.scala b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Reassemble.scala
index 534cd5a1..23eafb2d 100644
--- a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Reassemble.scala
+++ b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Reassemble.scala
@@ -20,8 +20,7 @@ package org.bdgenomics.avocado.cli
import org.apache.spark.SparkContext
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs
-import org.bdgenomics.adam.rdd.read.{ AlignedReadRDD, AlignmentRecordRDD, MDTagging }
-import org.bdgenomics.avocado.realigner.Realigner
+import org.bdgenomics.avocado.realigner.{ ConsensusRealigner, Realigner }
import org.bdgenomics.utils.cli._
import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
@@ -58,6 +57,10 @@ class ReassembleArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs {
name = "-defer_merging",
usage = "Defers merging single file output")
var deferMerging: Boolean = false
+ @Args4jOption(required = false,
+ name = "-use_consensus_realigner",
+ usage = "If provided, uses consensus-mode realigner.")
+ var useConsensusRealigner: Boolean = false
// required by ADAMSaveAnyArgs
var sortFastqOutput: Boolean = false
@@ -79,7 +82,11 @@ class Reassemble(
val reads = sc.loadAlignments(args.inputPath)
// realign the reads
- val reassembledReads = Realigner.realign(reads, args.kmerLength)
+ val reassembledReads = if (args.useConsensusRealigner) {
+ ConsensusRealigner.realign(reads, args.kmerLength)
+ } else {
+ Realigner.realign(reads, args.kmerLength)
+ }
// save the reads
reassembledReads.save(args)
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyper.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyper.scala
index 625857eb..6aae8d7d 100644
--- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyper.scala
+++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyper.scala
@@ -222,10 +222,26 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
} else if (isInsertion(variant)) {
val insAllele = variant.getAlternateAllele.tail
val insObserved = observed.filter(_._2 == insAllele)
+ println("for %s observed %s in %s".format(insAllele, observed.mkString(","), read))
if (observed.size == 2 &&
insObserved.size == 1) {
Some((variant, insObserved.head._3.duplicate(Some(false))))
- } else if (observed.forall(_._3.isRef)) {
+ } else if (!observed.forall(_._3.isRef)) {
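+ // at least one observation is non-ref; if there is exactly one non-ref
+ // observation and its allele has the same length as the insert allele,
+ // weight its likelihoods by the fraction of bases that match the insert
+ // allele, otherwise null out this observation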
+ val nonRef = observed.filter(!_._3.isRef)
+ if (nonRef.size == 1) {
+ val (_, allele, nonRefObs) = nonRef.head
+ if (allele.length == insAllele.length) {
+ val matchingBases = allele.zip(insAllele).count(p => p._1 == p._2)
+ Some((variant, nonRefObs.scale(matchingBases,
+ insAllele.length,
+ Some(false))))
+ } else {
+ Some((variant, observed.head._3.nullOut))
+ }
+ } else {
+ Some((variant, observed.head._3.nullOut))
+ }
+ } else if (read.getEnd != variant.getEnd) {
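+ // all remaining observations support the reference allele; the read does
+ // not end at the insertion site, so swap the allele/other likelihoods to
+ // count this observation against the insert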
Some((variant, observed.head._3.invert))
} else {
Some((variant, observed.head._3.nullOut))
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoverVariants.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoverVariants.scala
index 11a46041..9b3acc4a 100644
--- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoverVariants.scala
+++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/DiscoverVariants.scala
@@ -201,7 +201,11 @@ object DiscoverVariants extends Serializable with Logging {
kv
}
case Insertion(length) => {
- val insQuals = qual.substring(idx - 1, idx + length).map(_.toInt - 33).sum / length
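+ // guard against zero-length insertions, which would divide by zero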
+ val insQuals = if (length > 0) {
+ qual.substring(idx - 1, idx + length).map(_.toInt - 33).sum / length
+ } else {
+ 0
+ }
val newVar = if (insQuals >= phredThreshold) {
Variant.newBuilder
.setContigName(contigName)
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/Observer.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/Observer.scala
index 144e7839..72e2f92d 100644
--- a/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/Observer.scala
+++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/genotyping/Observer.scala
@@ -131,9 +131,13 @@ private[genotyping] object Observer extends Serializable {
// get bases and quals
val bases = readSequence.substring(oldReadIdx, readIdx)
- val qual = readQualities.substring(oldReadIdx, readIdx)
- .map(_.toInt - 33)
- .sum / length
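+ // guard against zero-length events, which would divide by zero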
+ val qual = if (length > 0) {
+ readQualities.substring(oldReadIdx, readIdx)
+ .map(_.toInt - 33)
+ .sum / length
+ } else {
+ 0
+ }
// the key is the (site, allele, sampleId)
// insertions associate to the site to their left, hence the -1
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/models/Observation.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/models/Observation.scala
index 3cfcd653..eb1be137 100644
--- a/avocado-core/src/main/scala/org/bdgenomics/avocado/models/Observation.scala
+++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/models/Observation.scala
@@ -91,6 +91,24 @@ case class Observation(alleleForwardStrand: Int,
isRef = setRef.getOrElse(isRef))
}
+ /**
+ * Scales the log likelihoods of this observation.
+ *
+ * @param num The numerator of the scaling factor.
+ * @param denom The denominator of the scaling factor.
+ * @param setRef If defined, overrides the isRef flag on the copy.
+ * @return Returns a copy of this observation with the allele/other log
+ *   likelihoods scaled by num / denom.
+ */
+ def scale(num: Int,
+ denom: Int,
+ setRef: Option[Boolean] = None): Observation = {
+ val scaleBy = num.toDouble / denom.toDouble
+ Observation(alleleForwardStrand,
+ otherForwardStrand,
+ squareMapQ,
+ alleleLogLikelihoods.map(v => v * scaleBy),
+ otherLogLikelihoods.map(v => v * scaleBy),
+ alleleCoverage,
+ otherCoverage,
+ totalCoverage = totalCoverage,
+ isRef = setRef.getOrElse(isRef))
+ }
+
/**
* @return Returns this observation, but with allele/other swapped.
*
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/realigner/ConsensusRealigner.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/realigner/ConsensusRealigner.scala
new file mode 100644
index 00000000..0ba9d487
--- /dev/null
+++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/realigner/ConsensusRealigner.scala
@@ -0,0 +1,64 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.avocado.realigner
+
+import org.bdgenomics.adam.algorithms.consensus.ConsensusGenerator
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
+import org.bdgenomics.adam.rdd.variant.VariantRDD
+import org.bdgenomics.avocado.genotyping.DiscoverVariants
+import org.bdgenomics.formats.avro.Variant
+
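+/**
+ * Realigns reads by making a first pass with the k-mer based Realigner,
+ * discovering indels from the realigned reads, and then feeding those indels
+ * to ADAM's known-indel realigner as consensuses.
+ */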
+object ConsensusRealigner {
+
+ /**
+ * Realigns a set of reads against the reference genome.
+ *
+ * @param reads Reads to realign.
+ * @param kmerLength The length k of the k-mers.
+ * @return Returns the realigned reads.
+ */
+ def realign(reads: AlignmentRecordRDD,
+ kmerLength: Int): AlignmentRecordRDD = {
+
+ // use the realigner to make a first pass over the reads
+ val realignedReads = Realigner.realign(reads, kmerLength)
+
+ // from here, we'll discover any potential variants
+ val variants = filterIndels(DiscoverVariants(realignedReads))
+
+ // we'll pass these discovered variants to ADAM's indel realigner
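+ // note that the indel realigner runs over the original reads; the first
+ // realignment pass is only used to discover the indel consensuses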
+ reads.realignIndels(ConsensusGenerator.fromKnownIndels(variants))
+ }
+
+ /**
+ * @param v The variant to filter.
+ * @return Returns false if the variant is a SNV/MNV and should be discarded,
+ *   true if it is an indel and should be kept.
+ */
+ private[realigner] def discardSnvs(v: Variant): Boolean = {
+ v.getReferenceAllele.length != v.getAlternateAllele.length
+ }
+
+ /**
+ * @param rdd The RDD of variants to filter.
+ * @return Returns a new RDD of variants where the SNVs & MNVs have been
+ * removed and only INDEL variants remain.
+ */
+ private[realigner] def filterIndels(rdd: VariantRDD): VariantRDD = {
+ rdd.transform(r => r.filter(discardSnvs))
+ }
+}
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/PrefilterReads.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/PrefilterReads.scala
index 017d32ac..e578f135 100644
--- a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/PrefilterReads.scala
+++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/PrefilterReads.scala
@@ -18,7 +18,7 @@
package org.bdgenomics.avocado.util
import org.bdgenomics.adam.models.SequenceDictionary
-import org.bdgenomics.adam.rdd.read.{ AlignedReadRDD, AlignmentRecordRDD }
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
import org.bdgenomics.formats.avro.AlignmentRecord
private[avocado] trait PrefilterReadsArgs extends Serializable {
@@ -85,7 +85,7 @@ private[avocado] object PrefilterReads extends Serializable {
val readRdd = rdd.rdd.filter(readFn)
// construct a new aligned read rdd
- AlignedReadRDD(readRdd, sequences, rdd.recordGroups)
+ AlignmentRecordRDD(readRdd, sequences, rdd.recordGroups)
}
/**
diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyperSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyperSuite.scala
index 5a6503c6..cd63a9e6 100644
--- a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyperSuite.scala
+++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/BiallelicGenotyperSuite.scala
@@ -20,6 +20,7 @@ package org.bdgenomics.avocado.genotyping
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.avocado.AvocadoFunSuite
import org.bdgenomics.avocado.models.Observation
+import org.bdgenomics.avocado.realigner.ConsensusRealigner
import org.bdgenomics.avocado.util.{
HardFilterGenotypes,
TestHardFilterGenotypesArgs
@@ -437,14 +438,16 @@ class BiallelicGenotyperSuite extends AvocadoFunSuite {
})
}
- ignore("call hom alt C->CCCCT insertion at 1/866511") {
+ sparkTest("call hom alt C->CCCCT insertion at 1/866511") {
val readPath = resourceUrl("NA12878.chr1.866511.sam")
val reads = sc.loadAlignments(readPath.toString)
.transform(rdd => {
rdd.filter(_.getMapq > 0)
})
- val gts = BiallelicGenotyper.discoverAndCall(reads,
+ val realignedReads = ConsensusRealigner.realign(reads, 20)
+
+ val gts = BiallelicGenotyper.discoverAndCall(realignedReads,
2,
optPhredThreshold = Some(30)).transform(rdd => {
rdd.filter(gt => gt.getAlleles.forall(_ == GenotypeAllele.ALT))
diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoverVariantsSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoverVariantsSuite.scala
index e28ebecc..82dbce6c 100644
--- a/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoverVariantsSuite.scala
+++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/genotyping/DiscoverVariantsSuite.scala
@@ -22,7 +22,7 @@ import org.bdgenomics.adam.models.{
SequenceDictionary,
SequenceRecord
}
-import org.bdgenomics.adam.rdd.read.AlignedReadRDD
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
import org.bdgenomics.avocado.AvocadoFunSuite
import org.bdgenomics.formats.avro.{ AlignmentRecord, Variant }
@@ -233,7 +233,7 @@ class DiscoverVariantsSuite extends AvocadoFunSuite {
snpReadMCigar, snpReadEqCigar,
insertRead,
deleteRead))
- val readRdd = AlignedReadRDD(rdd,
+ val readRdd = AlignmentRecordRDD(rdd,
SequenceDictionary(
SequenceRecord("1", 50L),
SequenceRecord("2", 40L),
diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/ConsensusRealignerSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/ConsensusRealignerSuite.scala
new file mode 100644
index 00000000..b986577b
--- /dev/null
+++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/ConsensusRealignerSuite.scala
@@ -0,0 +1,85 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.avocado.realigner
+
+import org.bdgenomics.adam.models.SequenceDictionary
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
+import org.bdgenomics.adam.rdd.variant.VariantRDD
+import org.bdgenomics.formats.avro.Variant
+
+class ConsensusRealignerSuite extends SparkRealignerSuite {
+
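+ // the second pass through ADAM's indel realigner emits legacy match (M)
+ // CIGAR operators, so accept them in the CIGAR assertions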
+ val allowLegacyCigars = true
+
+ def realign(rdd: AlignmentRecordRDD,
+ kmerLength: Int): AlignmentRecordRDD = {
+ ConsensusRealigner.realign(rdd, kmerLength)
+ }
+
+ val snv = Variant.newBuilder
+ .setStart(0L)
+ .setReferenceAllele("A")
+ .setAlternateAllele("G")
+ .build
+ val mnv = Variant.newBuilder
+ .setStart(1L)
+ .setReferenceAllele("AC")
+ .setAlternateAllele("GT")
+ .build
+ val ins = Variant.newBuilder
+ .setStart(2L)
+ .setReferenceAllele("A")
+ .setAlternateAllele("ACC")
+ .build
+ val del = Variant.newBuilder
+ .setStart(3L)
+ .setReferenceAllele("GCT")
+ .setAlternateAllele("G")
+ .build
+
+ test("filter out a snv") {
+ assert(!ConsensusRealigner.discardSnvs(snv))
+ }
+
+ test("filter out a mnv") {
+ assert(!ConsensusRealigner.discardSnvs(mnv))
+ }
+
+ test("keep an insert") {
+ assert(ConsensusRealigner.discardSnvs(ins))
+ }
+
+ test("keep a deletion") {
+ assert(ConsensusRealigner.discardSnvs(del))
+ }
+
+ sparkTest("filter snv/mnv variants out") {
+ val vRdd = VariantRDD(sc.parallelize(Seq(snv, mnv, ins, del)),
+ SequenceDictionary.empty)
+ val filteredVariants = ConsensusRealigner.filterIndels(vRdd)
+ .rdd
+ .collect
+
+ val variantIds = filteredVariants.map(_.getStart)
+ .toSet
+
+ assert(variantIds.size === 2)
+ assert(variantIds(2L))
+ assert(variantIds(3L))
+ }
+}
diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/RealignerSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/RealignerSuite.scala
index c0875463..289dbf6d 100644
--- a/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/RealignerSuite.scala
+++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/RealignerSuite.scala
@@ -17,15 +17,8 @@
*/
package org.bdgenomics.avocado.realigner
-import org.bdgenomics.adam.models.{
- SequenceDictionary,
- SequenceRecord,
- RecordGroup,
- RecordGroupDictionary
-}
import org.bdgenomics.adam.rdd.ADAMContext._
-import org.bdgenomics.adam.rdd.read.{ AlignedReadRDD, AlignmentRecordRDD }
-import org.bdgenomics.avocado.AvocadoFunSuite
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
import org.bdgenomics.avocado.models.{
Clipped,
Deletion,
@@ -35,7 +28,14 @@ import org.bdgenomics.avocado.models.{
}
import org.bdgenomics.formats.avro.AlignmentRecord
-class RealignerSuite extends AvocadoFunSuite {
+class RealignerSuite extends SparkRealignerSuite {
+
+ val allowLegacyCigars = false
+
+ def realign(rdd: AlignmentRecordRDD,
+ kmerLength: Int): AlignmentRecordRDD = {
+ Realigner.realign(rdd, kmerLength)
+ }
test("realignment candidate code needs at least one block") {
intercept[AssertionError] {
@@ -190,124 +190,6 @@ class RealignerSuite extends AvocadoFunSuite {
}
}
- def makeAndRealignRdd(reads: Seq[AlignmentRecord],
- kmerLength: Int): Array[AlignmentRecord] = {
- val gRdd = AlignedReadRDD(sc.parallelize(reads),
- SequenceDictionary(SequenceRecord("ctg", 50L)),
- RecordGroupDictionary(Seq(RecordGroup("rg", "rg"))))
-
- // realign the genomic rdd
- val realignedRdd = Realigner.realign(gRdd, kmerLength)
-
- // collect the reads
- realignedRdd.rdd.collect()
- }
-
- sparkTest("realign a set of reads around an insert") {
- // insertion sequence:
- // ins: AATGAGACTTACATCATTAAAACCGTGTGGACACA
- // ref: AATGAGACTTACATCATTAA__CCGTGTGGACACA
- val sequence = "AATGAGACTTACATCATTAAAACCGTGTGGACACA"
- val insertStart = 20
- val readLength = insertStart + 6 + 2
-
- // generate 7 reads with a 6bp flank
- val reads = (0 until 7).map(rId => {
- val basesBeforeInsert = insertStart - rId
- val basesAfterInsert = 6 + rId
-
- AlignmentRecord.newBuilder()
- .setReadName(rId.toString)
- .setContigName("ctg")
- .setRecordGroupName("rg")
- .setReadMapped(true)
- .setSequence(sequence.drop(rId).take(readLength))
- .setStart(rId.toLong)
- .setEnd((rId + readLength - 2 + 1).toLong)
- .setCigar("%dM2I%dM".format(basesBeforeInsert, basesAfterInsert))
- .setMismatchingPositions((readLength - 2).toString)
- .build()
- })
-
- // make into a genomic rdd
- val newReads = makeAndRealignRdd(reads, 6)
-
- assert(newReads.size === 7)
- newReads.foreach(r => {
- val rId = r.getReadName.toInt
-
- // these values are different from above because original alignments were
- // not left justified
- val basesBeforeInsert = insertStart - rId - 2
- val basesAfterInsert = 8 + rId
-
- assert(r.getCigar === "%d=2I%d=".format(basesBeforeInsert, basesAfterInsert))
- assert(r.getMismatchingPositions === (readLength - 2).toString)
- })
- }
-
- sparkTest("realign a set of reads around a deletion") {
- // deletion sequence:
- // del: AGGTCTGAATGAGACTTA__TCATTAACCGTGTGGACACA
- // ref: AGGTCTGAATGAGACTTACATCATTAACCGTGTGGACACA
- val sequence = "AGGTCTGAATGAGACTTATCATTAACCGTGTGGACACA"
- val deleteStart = 18
- val readLength = deleteStart + 8
-
- // generate 10 reads with a 8bp flank
- val reads = (0 until 10).map(rId => {
- val basesBeforeDelete = deleteStart - rId
- val basesAfterDelete = 8 + rId
-
- AlignmentRecord.newBuilder()
- .setReadName(rId.toString)
- .setContigName("ctg")
- .setRecordGroupName("rg")
- .setReadMapped(true)
- .setSequence(sequence.drop(rId).take(readLength))
- .setStart(rId.toLong)
- .setEnd((rId + readLength + 2 + 1).toLong)
- .setCigar("%dM2D%dM".format(basesBeforeDelete, basesAfterDelete))
- .setMismatchingPositions("%d^CA%d".format(basesBeforeDelete, basesAfterDelete))
- .build()
- })
-
- // make into a genomic rdd
- val newReads = makeAndRealignRdd(reads, 8)
-
- assert(newReads.size === 10)
- newReads.foreach(r => {
- val rId = r.getReadName.toInt
-
- // these values are different from above because original alignments were
- // not left justified
- val basesBeforeDelete = deleteStart - rId - 1
- val basesAfterDelete = 9 + rId
-
- assert(r.getCigar === "%d=2D%d=".format(basesBeforeDelete, basesAfterDelete))
- assert(r.getMismatchingPositions === "%d^AC%d".format(basesBeforeDelete, basesAfterDelete))
- })
- }
-
- sparkTest("realigning a read with a repeat will return the original read") {
- val read = AlignmentRecord.newBuilder()
- .setReadName("A_READ")
- .setReadMapped(true)
- .setStart(10L)
- .setEnd(17L)
- .setSequence("TCAAAAAAGG")
- .setCigar("3M4I3M")
- .setMismatchingPositions("6")
- .build()
-
- // make into a genomic rdd
- val newReads = makeAndRealignRdd(Seq(read), 3)
-
- // should have one read
- assert(newReads.size === 1)
- assert(newReads.head === read)
- }
-
sparkTest("one sample read should fail due to a repeat, all others should realign") {
val readFile = resourceUrl("NA12878_reads.sam").toString
val reads = sc.loadAlignments(readFile)
diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/SparkRealignerSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/SparkRealignerSuite.scala
new file mode 100644
index 00000000..171d91cb
--- /dev/null
+++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/realigner/SparkRealignerSuite.scala
@@ -0,0 +1,182 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.avocado.realigner
+
+import org.bdgenomics.adam.models.{
+ SequenceDictionary,
+ SequenceRecord,
+ RecordGroup,
+ RecordGroupDictionary
+}
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
+import org.bdgenomics.avocado.AvocadoFunSuite
+import org.bdgenomics.formats.avro.AlignmentRecord
+
+trait SparkRealignerSuite extends AvocadoFunSuite {
+
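+ // if true, CIGAR assertions accept legacy match (M) operators in place of
+ // the sequence match (=) operators emitted by the avocado realigner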
+ val allowLegacyCigars: Boolean
+
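+ // runs the realigner under test over an RDD of reads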
+ def realign(rdd: AlignmentRecordRDD,
+ kmerLength: Int): AlignmentRecordRDD
+
+ def makeAndRealignRdd(reads: Seq[AlignmentRecord],
+ kmerLength: Int): Array[AlignmentRecord] = {
+ val gRdd = AlignmentRecordRDD(sc.parallelize(reads),
+ SequenceDictionary(SequenceRecord("ctg", 50L)),
+ RecordGroupDictionary(Seq(RecordGroup("rg", "rg"))))
+
+ // realign the genomic rdd
+ val realignedRdd = realign(gRdd, kmerLength)
+
+ // collect the reads
+ realignedRdd.rdd.collect()
+ }
+
+ sparkTest("realign a set of reads around an insert") {
+ // true insertion sequence:
+ // ins: AATGAGACTTACATCATTAAAACCGTGTGGACACA
+ // ref: AATGAGACTTACATCATT__AACCGTGTGGACACA
+ //
+ // test alignment:
+ // X: |
+ // ins: AATGAGACTTACATCATTAAAACCGTGTGGACACA
+ // ref: AATGAGACTTACATCATTAAC__CGTGTGGACACA
+ val sequence = "AATGAGACTTACATCATTAAAACCGTGTGGACACA"
+ val insertStart = 21
+ val readLength = insertStart + 6 + 2
+
+ // generate 7 reads with a 6bp flank
+ val reads = (0 until 7).map(rId => {
+ val basesBeforeInsert = insertStart - rId
+ val basesAfterInsert = 6 + rId
+
+ AlignmentRecord.newBuilder()
+ .setReadName(rId.toString)
+ .setContigName("ctg")
+ .setRecordGroupName("rg")
+ .setReadMapped(true)
+ .setSequence(sequence.drop(rId).take(readLength))
+ .setQual("*" * readLength)
+ .setStart(rId.toLong)
+ .setEnd((rId + insertStart + 6).toLong)
+ .setCigar("%dM2I%dM".format(basesBeforeInsert, basesAfterInsert))
+ .setMismatchingPositions("%dC%d".format(basesBeforeInsert - 1,
+ basesAfterInsert))
+ .setMapq(50)
+ .build()
+ })
+
+ // make into a genomic rdd
+ val newReads = makeAndRealignRdd(reads, 6)
+
+ assert(newReads.size === 7)
+ newReads.foreach(r => {
+ val rId = r.getReadName.toInt
+
+ // these values are different from above because original alignments were
+ // not left justified
+ val basesBeforeInsert = insertStart - rId - 3
+ val basesAfterInsert = 9 + rId
+
+ if (allowLegacyCigars) {
+ assert(r.getCigar === "%dM2I%dM".format(basesBeforeInsert, basesAfterInsert))
+ } else {
+ assert(r.getCigar === "%d=2I%d=".format(basesBeforeInsert, basesAfterInsert))
+ }
+ assert(r.getMismatchingPositions === (readLength - 2).toString)
+ })
+ }
+
+ sparkTest("realign a set of reads around a deletion") {
+ // true deletion sequence:
+ // del: AGGTCTGAATGAGACTT__ATCATTAACCGTGTGGACACA
+ // ref: AGGTCTGAATGAGACTTACATCATTAACCGTGTGGACACA
+ //
+ // test alignment:
+ // X: |
+ // del: AGGTCTGAATGAGACT__TATCATTAACCGTGTGGACACA
+ // ref: AGGTCTGAATGAGACTTACATCATTAACCGTGTGGACACA
+ val sequence = "AGGTCTGAATGAGACTTATCATTAACCGTGTGGACACA"
+ val deleteStart = 16
+ val readLength = deleteStart + 7
+
+ // generate 10 reads with an 8bp flank
+ val reads = (0 until 10).map(rId => {
+ val basesBeforeDelete = deleteStart - rId - 1
+ val basesAfterDelete = 8 + rId
+
+ AlignmentRecord.newBuilder()
+ .setReadName(rId.toString)
+ .setContigName("ctg")
+ .setRecordGroupName("rg")
+ .setReadMapped(true)
+ .setSequence(sequence.drop(rId).take(readLength))
+ .setQual("*" * readLength)
+ .setStart(rId.toLong)
+ .setEnd((rId + readLength + 2).toLong)
+ .setCigar("%dM2D%dM".format(basesBeforeDelete, basesAfterDelete))
+ .setMismatchingPositions("%d^TA0C%d".format(basesBeforeDelete,
+ basesAfterDelete - 1))
+ .setMapq(50)
+ .build()
+ })
+
+ // make into a genomic rdd
+ val newReads = makeAndRealignRdd(reads, 8)
+
+ assert(newReads.size === 10)
+ newReads.foreach(r => {
+ val rId = r.getReadName.toInt
+
+ // these values are different from above because original alignments were
+ // not left justified
+ val basesBeforeDelete = deleteStart - rId
+ val basesAfterDelete = 7 + rId
+
+ if (allowLegacyCigars) {
+ assert(r.getCigar === "%dM2D%dM".format(basesBeforeDelete, basesAfterDelete))
+ } else {
+ assert(r.getCigar === "%d=2D%d=".format(basesBeforeDelete, basesAfterDelete))
+ }
+ assert(r.getMismatchingPositions === "%d^AC%d".format(basesBeforeDelete, basesAfterDelete))
+ })
+ }
+
+ sparkTest("realigning a read with a repeat will return the original read") {
+ val read = AlignmentRecord.newBuilder()
+ .setContigName("ctg")
+ .setReadName("A_READ")
+ .setReadMapped(true)
+ .setStart(10L)
+ .setEnd(17L)
+ .setSequence("TCAAAAAAGG")
+ .setQual("**********")
+ .setCigar("3M4I3M")
+ .setMismatchingPositions("6")
+ .setMapq(50)
+ .build()
+
+ // make into a genomic rdd
+ val newReads = makeAndRealignRdd(Seq(read), 3)
+
+ // should have one read
+ assert(newReads.size === 1)
+ assert(newReads.head === read)
+ }
+}
diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/util/PrefilterReadsSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/PrefilterReadsSuite.scala
index 69bb006d..700a6a9b 100644
--- a/avocado-core/src/test/scala/org/bdgenomics/avocado/util/PrefilterReadsSuite.scala
+++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/PrefilterReadsSuite.scala
@@ -24,7 +24,7 @@ import org.bdgenomics.adam.models.{
SequenceRecord
}
import org.bdgenomics.adam.rdd.ADAMContext._
-import org.bdgenomics.adam.rdd.read.AlignedReadRDD
+import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
import org.bdgenomics.formats.avro.AlignmentRecord
case class TestPrefilterReadsArgs(var isNotGrc: Boolean = false,
@@ -239,7 +239,7 @@ class PrefilterReadsSuite extends AvocadoFunSuite {
def testRdd(args: PrefilterReadsArgs, numReads: Int, numContigs: Int) {
- val readRdd = AlignedReadRDD(sc.parallelize(reads), sequences, RecordGroupDictionary.empty)
+ val readRdd = AlignmentRecordRDD(sc.parallelize(reads), sequences, RecordGroupDictionary.empty)
val filteredRdd = PrefilterReads(readRdd, args)
assert(filteredRdd.rdd.count === numReads)
diff --git a/pom.xml b/pom.xml
index f7af4f26..ba734ac5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -15,7 +15,7 @@
avocado: A Variant Caller, Distributed
- <adam.version>0.21.0</adam.version>
+ <adam.version>0.21.1-SNAPSHOT</adam.version>
<avro.version>1.8.0</avro.version>
<java.version>1.8</java.version>
<scala.version>2.10.4</scala.version>