Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metric builder and sorter #826

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 9 additions & 51 deletions src/main/scala/com/fulcrumgenomics/util/Metric.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,21 @@

package com.fulcrumgenomics.util

import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
import com.fulcrumgenomics.commons.CommonsDef._
import com.fulcrumgenomics.commons.io.{Writer => CommonsWriter}
import com.fulcrumgenomics.commons.reflect.{ReflectionUtil, ReflectiveBuilder}
import com.fulcrumgenomics.commons.util.{DelimitedDataParser, LazyLogging}
import com.fulcrumgenomics.commons.util.DelimitedDataParser
import enumeratum.EnumEntry
import htsjdk.samtools.util.Iso8601Date

import java.io.{PrintWriter, StringWriter, Writer}
import java.io.Writer
import java.nio.file.Path
import java.text.{DecimalFormat, NumberFormat, SimpleDateFormat}
import java.util.Date
import scala.collection.compat._
import scala.collection.concurrent.TrieMap
import scala.reflect.runtime.{universe => ru}
import scala.util.{Failure, Success}

object Metric extends LazyLogging {
object Metric {
val Delimiter: Char = '\t'
val DelimiterAsString: String = s"$Delimiter"

Expand Down Expand Up @@ -102,53 +99,14 @@ object Metric extends LazyLogging {
/** Reads metrics from a set of lines. The first line should be the header with the field names. Each subsequent
* line should be a single metric. */
def iterator[T <: Metric](lines: Iterator[String], source: Option[String] = None)(implicit tt: ru.TypeTag[T]): Iterator[T] = {
val clazz: Class[T] = ReflectionUtil.typeTagToClass[T]

def fail(lineNumber: Int,
message: String,
throwable: Option[Throwable] = None): Unit = {
val sourceMessage = source.map("\nIn source: " + _).getOrElse("")
val fullMessage = s"On line #$lineNumber for metric '${clazz.getSimpleName}'$sourceMessage\n$message"
throwable.foreach { thr =>
val stringWriter = new StringWriter
thr.printStackTrace(new PrintWriter(stringWriter))
val banner = "#" * 80
logger.debug(banner)
logger.debug(stringWriter.toString)
logger.debug(banner)
}
throw FailureException(message=Some(fullMessage))
}

if (lines.isEmpty) fail(lineNumber=1, message="No header found")
val parser = new DelimitedDataParser(lines=lines, delimiter=Delimiter, ignoreBlankLines=false, trimFields=true)
val names = parser.headers.toIndexedSeq
val reflectiveBuilder = new ReflectiveBuilder(clazz)
val builder = new MetricBuilder[T](source=source)(tt)
if (lines.isEmpty) builder.fail(message="No header found", lineNumber=Some(1))
val parser = new DelimitedDataParser(lines=lines, delimiter=Delimiter, ignoreBlankLines=false, trimFields=true)
val names = parser.headers.toIndexedSeq

parser.zipWithIndex.map { case (row, rowIndex) =>
forloop(from = 0, until = names.length) { i =>
reflectiveBuilder.argumentLookup.forField(names(i)) match {
case Some(arg) =>
val value = {
val tmp = row[String](i)
if (tmp.isEmpty && arg.argumentType == classOf[Option[_]]) ReflectionUtil.SpecialEmptyOrNoneToken else tmp
}

val argumentValue = ReflectionUtil.constructFromString(arg.argumentType, arg.unitType, value) match {
case Success(v) => v
case Failure(thr) =>
fail(lineNumber=rowIndex+2, message=s"Could not construct value for column '${arg.name}' of type '${arg.typeDescription}' from '$value'", Some(thr))
}
arg.value = argumentValue
case None =>
fail(lineNumber=rowIndex+2, message=s"Did not have a field with name '${names(i)}'.")
}
}

// build it. NB: if arguments are missing values, then an exception will be thrown here
// Also, we don't use the default "build()" method since if a collection or option is empty, it will be treated as
// missing.
reflectiveBuilder.build(reflectiveBuilder.argumentLookup.ordered.map(arg => arg.value getOrElse unreachable(s"Arguments not set: ${arg.name}")))
val argMap = names.zipWithIndex.map { case (name, i) => name -> row[String](i) }.toMap
builder.fromArgMap(argMap=argMap, lineNumber=Some(rowIndex+2))
}
}

Expand Down
142 changes: 142 additions & 0 deletions src/main/scala/com/fulcrumgenomics/util/MetricBuilder.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* The MIT License
*
* Copyright (c) 2022 Fulcrum Genomics
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/

package com.fulcrumgenomics.util

import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
import com.fulcrumgenomics.commons.CommonsDef.{forloop, unreachable}
import com.fulcrumgenomics.commons.reflect.{ReflectionUtil, ReflectiveBuilder}
import com.fulcrumgenomics.commons.util.LazyLogging

import java.io.{PrintWriter, StringWriter}
import scala.reflect.runtime.{universe => ru}
import scala.util.{Failure, Success}

/** Class for building metrics of type [[T]].
*
* This is not thread-safe.
*
* @param source optionally, the source of reading (e.g. file)
* @tparam T the metric type
*/
class MetricBuilder[T <: Metric](source: Option[String] = None)(implicit tt: ru.TypeTag[T]) extends LazyLogging {
// The main reason why a builder is necessary is to cache some expensive reflective calls.
private val clazz: Class[T] = ReflectionUtil.typeTagToClass[T]
private val reflectiveBuilder = new ReflectiveBuilder(clazz)
private val names = Metric.names[T]

/** Builds a metric from a delimited line
*
* @param line the line with delimited values
* @param delim the delimiter of the values
* @param lineNumber optionally, the line number when building a metric from a line in a file
* @return
*/
def fromLine(line: String, delim: String = Metric.DelimiterAsString, lineNumber: Option[Int] = None): T = {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: need to have different names for each of the from* methods because I want lineNumber to default to None, and I can't have multiple functions with the same name AND each with default values.

fromValues(values = line.split(delim), lineNumber = lineNumber)
}

/** Builds a metric from values for the complete set of metric fields
*
* @param values the values in the same order as the names defined in the class
* @param lineNumber optionally, the line number when building a metric from a line in a file
* @return
*/
def fromValues(values: Iterable[String], lineNumber: Option[Int] = None): T = {
val vals = values.toIndexedSeq
if (names.length != vals.length) {
fail(message = f"Failed decoding: expected '${names.length}' fields, found '${vals.length}'.", lineNumber = lineNumber)
}
fromArgMap(argMap = names.zip(values).toMap, lineNumber = lineNumber)
}

/** Builds a metric of type [[T]]
*
* @param argMap map of field names to values. All required fields must be given. Can be in any order.
* @param lineNumber optionally, the line number when building a metric from a line in a file
* @return a new instance of type [[T]]
*/
def fromArgMap(argMap: Map[String, String], lineNumber: Option[Int] = None): T = {
reflectiveBuilder.reset() // reset the arguments to their initial values
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


val names = argMap.keys.toIndexedSeq
forloop(from = 0, until = names.length) { i =>
reflectiveBuilder.argumentLookup.forField(names(i)) match {
case Some(arg) =>
val value = {
val tmp = argMap(names(i))
if (tmp.isEmpty && arg.argumentType == classOf[Option[_]]) ReflectionUtil.SpecialEmptyOrNoneToken else tmp
}

val argumentValue = ReflectionUtil.constructFromString(arg.argumentType, arg.unitType, value) match {
case Success(v) => v
case Failure(thr) =>
fail(
message = s"Could not construct value for column '${arg.name}' of type '${arg.typeDescription}' from '$value'",
throwable = Some(thr),
lineNumber = lineNumber
)
}
arg.value = argumentValue
case None =>
fail(
message = s"Did not have a field with name '${names(i)}'.",
lineNumber = lineNumber
)
}
}

// build it. NB: if arguments are missing values, then an exception will be thrown here
// Also, we don't use the default "build()" method since if a collection or option is empty, it will be treated as
// missing.
val params = reflectiveBuilder.argumentLookup.ordered.map(arg => arg.value getOrElse unreachable(s"Arguments not set: ${arg.name}"))
reflectiveBuilder.build(params)
}

/** Logs the throwable, if given, and throws a [[FailureException]] with information about when reading metrics fails
*
* @param message the message to include in the exception thrown
* @param throwable optionally, a throwable that should be logged
* @param lineNumber optionally, the line number when building a metric from a line in a file
*/
def fail(message: String, throwable: Option[Throwable] = None, lineNumber: Option[Int] = None): Unit = {
throwable.foreach { thr =>
val stringWriter = new StringWriter
thr.printStackTrace(new PrintWriter(stringWriter))
val banner = "#" * 80
logger.debug(banner)
logger.debug(stringWriter.toString)
logger.debug(banner)
}
val sourceMessage = source.map("\nIn source: " + _).getOrElse("")
val prefix = lineNumber match {
case None => "For metric"
case Some(n) => s"On line #$n for metric"
}
val fullMessage = s"$prefix '${clazz.getSimpleName}'$sourceMessage\n$message"

throw FailureException(message = Some(fullMessage))
}
}
70 changes: 70 additions & 0 deletions src/main/scala/com/fulcrumgenomics/util/MetricSorter.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* The MIT License
*
* Copyright (c) 2022 Fulcrum Genomics
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/

package com.fulcrumgenomics.util

import com.fulcrumgenomics.commons.CommonsDef.DirPath

import scala.reflect.runtime.{universe => ru}

/** Disk-backed metrics sorter
*
* @param maxObjectsInRam the maximum number of metrics to keep in memory before spilling to disk
* @param keyfunc method to convert a metric to an ordered key
* @param tmpDir the temporary directory in which to spill to disk
* @param tt the type tag for [[T]]
* @tparam Key the key to use for sorting metrics
* @tparam T the metric type
*/
class MetricSorter[Key <: Ordered[Key], T <: Metric](maxObjectsInRam: Int = MetricSorter.MaxInMemory,
keyfunc: T => Key,
tmpDir: DirPath = Io.tmpDir,

)(implicit tt: ru.TypeTag[T]) extends Sorter[T, Key](
maxObjectsInRam = maxObjectsInRam,
codec = new MetricSorter.MetricSorterCodec[T](),
keyfunc = keyfunc,
tmpDir = tmpDir
)

object MetricSorter {
/** The default maximum # of records to keep and sort in memory. */
val MaxInMemory: Int = 1e6.toInt

/** The codec for encoding and decoding a metric */
class MetricSorterCodec[T <: Metric]()(implicit tt: ru.TypeTag[T])
extends Sorter.Codec[T] {
private val builder = new MetricBuilder[T]()

/** Encode the metric into an array of bytes. */
def encode(metric: T): Array[Byte] = metric.values.mkString(Metric.DelimiterAsString).getBytes

/** Decode a metric from an array of bytes. */
def decode(bs: Array[Byte], start: Int, length: Int): T = {
val fields = new String(bs.slice(from = start, until = start + length)).split(Metric.DelimiterAsString)
builder.fromValues(fields)
}
}
}
43 changes: 43 additions & 0 deletions src/test/scala/com/fulcrumgenomics/util/MetricBuilderTest.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* The MIT License
*
* Copyright (c) 2022 Fulcrum Genomics
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/

package com.fulcrumgenomics.util

import com.fulcrumgenomics.testing.UnitSpec


case class MetricBuilderTestMetric(name: String, count: Long = 1) extends Metric

class MetricBuilderTest extends UnitSpec {
private val builder = new MetricBuilder[MetricBuilderTestMetric]()

"MetricBuilder.fromArgMap" should "build a metric from an argmap with all value specified" in {
builder.fromArgMap(Map("name" -> "foo", "count" -> "2")) shouldBe MetricBuilderTestMetric(name="foo", count=2)
}

it should "build a metric from an argmap with only required values specified" in {
builder.fromArgMap(Map("name" -> "foo")) shouldBe MetricBuilderTestMetric(name="foo")
}
}
Loading