Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up (and debug) serial json output #786

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import org.apache.commons.io.FilenameUtils
import org.clulab.reach.assembly.relations.corpus.{CorpusReader, EventPair}
import org.clulab.odin.Mention
import org.clulab.reach.PaperReader
import org.clulab.reach.mentions._
import org.clulab.reach.mentions.serialization.json._
import org.clulab.reach.mentions.{CorefMention, MentionOps => ImplicitMentionOps}
import org.clulab.reach.mentions.serialization.json.MentionsOps
import org.clulab.utils.Serializer

import com.typesafe.config.ConfigFactory
Expand Down Expand Up @@ -127,9 +127,6 @@ object RunAnnotationEval extends App with LazyLogging {
* Serialize each paper in a directory to json
*/
object SerializePapersToJSON extends App with LazyLogging {

import org.clulab.reach.mentions.serialization.json._

val config = ConfigFactory.load()
val papersDir = new File(config.getString("papersDir"))
val outDir = new File(config.getString("outDir"))
Expand All @@ -150,7 +147,8 @@ object SerializePapersToJSON extends App with LazyLogging {
val mentions = PaperReader.getMentionsFromPaper(paper)
val cms: Seq[CorefMention] = mentions.map(_.toCorefMention)
logger.info(s"extracted ${mentions.size} mentions for $paperID")
cms.saveJSON(outFile, pretty = true)

MentionsOps(cms).saveJSON(outFile, pretty = true)
logger.info(s"saved json to $outFile")
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@ import org.clulab.processors.Document
import org.clulab.reach.assembly.relations.classifier.AssemblyRelationClassifier
import org.clulab.reach.assembly.sieves.Constraints
import org.clulab.reach.mentions.CorefMention
import org.clulab.reach.mentions.serialization.json.{MentionJSONOps, REACHMentionSeq, JSONSerializer}
import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionOps, MentionsOps}
import org.clulab.serialization.json.JSONSerialization
import org.json4s.jackson.JsonMethods._
import org.json4s.JsonDSL._
import org.json4s._

import scala.util.hashing.MurmurHash3._
import com.typesafe.scalalogging.LazyLogging
import org.apache.commons.io.FileUtils.forceMkdir
import ai.lum.common.FileUtils._

import java.io.File


Expand Down Expand Up @@ -44,8 +46,8 @@ case class EventPair(
// the seed (not counted in the length of finalizeHash)
val h0 = stringHash("org.clulab.assembly.TrainingInstance")
// get hashes for each event
val h1 = mix(h0, e1.equivalenceHash)
val h2 = mix(h1, e2.equivalenceHash)
val h1 = mix(h0, MentionOps(e1).equivalenceHash)
val h2 = mix(h1, MentionOps(e2).equivalenceHash)
// is it cross-sentence?
val h3 = mix(h2, isCrossSentence.hashCode)
// the text of the sentences containing the two event mentions
Expand All @@ -66,34 +68,37 @@ case class EventPair(


def jsonAST: JValue = {
val e1EventOps = new EventOps(e1)
val e2EventOps = new EventOps(e2)

// build json
("id" -> this.equivalenceHash) ~
("text" -> this.text) ~
("coref" -> this.coref) ~
// event 1
("e1-id" -> this.e1.id) ~
("e1-label" -> this.e1.eventLabel) ~
("e1-sentence-text" -> this.e1.sentenceText) ~
("e1-id" -> MentionOps(this.e1).id) ~
("e1-label" -> e1EventOps.eventLabel) ~
("e1-sentence-text" -> e1EventOps.sentenceText) ~
("e1-sentence-index" -> this.e1.sentence) ~
("e1-sentence-tokens" -> this.e1.sentenceObj.words.toList) ~
// can be used to highlight event span in annotation UI
("e1-start" -> this.e1.start) ~
("e1-end" -> this.e1.end) ~
("e1-trigger" -> this.e1.trigger.text) ~
("e1-trigger-start" -> this.e1.trigger.start) ~
("e1-trigger-end" -> this.e1.trigger.end) ~
("e1-trigger" -> e1EventOps.trigger.text) ~
("e1-trigger-start" -> e1EventOps.trigger.start) ~
("e1-trigger-end" -> e1EventOps.trigger.end) ~
// event 2
("e2-id" -> this.e2.id) ~
("e2-label" -> this.e2.eventLabel) ~
("e2-sentence-text" -> this.e2.sentenceText) ~
("e2-id" -> MentionOps(this.e2).id) ~
("e2-label" -> e2EventOps.eventLabel) ~
("e2-sentence-text" -> e2EventOps.sentenceText) ~
("e2-sentence-index" -> this.e2.sentence) ~
("e2-sentence-tokens" -> this.e2.sentenceObj.words.toList) ~
// can be used to highlight event span in annotation UI
("e2-start" -> this.e2.start) ~
("e2-end" -> this.e2.end) ~
("e2-trigger" -> this.e2.trigger.text) ~
("e2-trigger-start" -> this.e2.trigger.start) ~
("e2-trigger-end" -> this.e2.trigger.end) ~
("e2-trigger" -> e2EventOps.trigger.text) ~
("e2-trigger-start" -> e2EventOps.trigger.start) ~
("e2-trigger-end" -> e2EventOps.trigger.end) ~
// these will be filled out during annotation
("annotator-id" -> this.annotatorID) ~
("relation" -> this.relation) ~
Expand Down Expand Up @@ -155,7 +160,7 @@ case class Corpus(instances: Seq[EventPair]) extends JSONSerialization {
// for each doc, write doc + mentions to a json file
for ((paperID, cms) <- dmLUT) {
val of = new File(mentionDataDir, s"$paperID-mention-data.json")
of.writeString(cms.json(pretty), java.nio.charset.StandardCharsets.UTF_8)
of.writeString(MentionsOps(cms).json(pretty), java.nio.charset.StandardCharsets.UTF_8)
}
// write event pair info to json file
val epf = new File(corpusDir, s"${Corpus.EVENT_PAIRS}.json")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ object CorpusBuilder extends LazyLogging {
// create training instance
ep = EventPair(Set(m1, m2))
// triggers should not be the same
if ep.e1.trigger != ep.e2.trigger
if new EventOps(ep.e1).trigger != new EventOps(ep.e2).trigger
} yield ep

distinctEventPairs(eps.toSeq)
Expand All @@ -133,7 +133,7 @@ object CorpusBuilder extends LazyLogging {
def distinctEventPairs(eps: Seq[EventPair]): Seq[EventPair] = {
eps.distinct.groupBy(ep =>
// distinct by...
(ep.e1.sentence, ep.e2.trigger, ep.e1.label, ep.e1.text, ep.e2.sentence, ep.e2.trigger, ep.e2.label, ep.e2.text)
(ep.e1.sentence, new EventOps(ep.e2).trigger, ep.e1.label, ep.e1.text, ep.e2.sentence, new EventOps(ep.e2).trigger, ep.e2.label, ep.e2.text)
).values.map(_.head) // get one value for each key
.toSeq
.sortBy{ ep => (ep.doc.id.getOrElse(""), ep.sentenceIndices.head) }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ package org.clulab.reach.assembly.relations

import org.clulab.odin.Mention
import org.clulab.reach.assembly.sieves.SieveUtils
import org.clulab.reach.mentions.serialization.json.{ CorefMentionOps, JSONSerializer => ReachJsonSerializer }
import org.clulab.reach.mentions._
import org.clulab.reach.mentions.CorefMention
import org.clulab.reach.mentions.serialization.json.JSONSerializer
import com.typesafe.scalalogging.LazyLogging
import scala.collection.GenSeq
import java.io.File
Expand All @@ -12,7 +12,7 @@ import java.io.File
package object corpus extends LazyLogging {

/** Additional attributes and methods for a [[CorefMention]] */
implicit class EventOps(mention: CorefMention) extends CorefMentionOps(mention) {
class EventOps(mention: CorefMention) {
val eventLabel: String = mention.label
val sentenceText: String = mention.sentenceObj.getSentenceText
// NOTE: if mention is a TB, trigger will simply be the mention (ex. BioProcess)
Expand All @@ -29,7 +29,7 @@ package object corpus extends LazyLogging {
def datasetLUT(jsonFiles: GenSeq[File]): Map[String, Vector[CorefMention]] = {
val docMentionPairs = jsonFiles.filter(_.getName.endsWith(".json")).map{ f: File =>
logger.debug(s"parsing ${f.getName}")
val cms: Vector[CorefMention] = ReachJsonSerializer.toCorefMentions(f).toVector
val cms: Vector[CorefMention] = JSONSerializer.toCorefMentions(f).toVector
if (cms.nonEmpty) logger.debug(s"successfully parsed ${f.getName}")
val paperID = getPMID(cms.head)
paperID -> cms
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import com.typesafe.scalalogging.Logger
import org.clulab.odin.Mention
import org.clulab.reach.FriesEntry
import org.clulab.reach.ReachConstants._
import org.clulab.odin.serialization.json._
import org.clulab.odin.serialization.json.MentionOps
import org.json4s.jackson.Serialization
import org.slf4j.LoggerFactory

Expand Down Expand Up @@ -154,7 +154,7 @@ object JsonOutputter {
else {
// "Gene_or_gene_product" is another possibility.
// Also "Family", "Disease", "Simple_chemical"
val json = mention.json(pretty = true)
val json = MentionOps(mention).json(pretty = true)
val message = s"""Unknown event type "$label" in event:\n$json"""
// throw new RuntimeException(message)
logger.warn(message)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,17 @@ package org.clulab.reach.export.indexcards
import java.io.File
import java.util.Date
import java.util.regex.Pattern

import scala.collection.mutable
import scala.collection.mutable.ListBuffer

import com.typesafe.scalalogging.LazyLogging
import org.clulab.odin.Mention
import org.clulab.reach.ReachConstants._
import org.clulab.reach.{FriesEntry, display}
import org.clulab.reach.export.JsonOutputter._
import org.clulab.reach.export.{JsonOutputter, OutputDegrader}
import org.clulab.reach.grounding.KBResolution
import org.clulab.reach.mentions._
import org.clulab.reach.mentions.serialization.json.mentionToJSON
import org.clulab.reach.mentions.{BioEventMention, CorefMention, Mutant, PTM, MentionOps => ImplicitMentionOps}
import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionOps}
import org.clulab.reach.utils.MentionManager
import IndexCardOutput._

Expand Down Expand Up @@ -173,7 +171,7 @@ class IndexCardOutput extends JsonOutputter with LazyLogging {
case "amount" => mkSimpleEventIndexCard(mention, mention.label)
case _ =>
// "conversion" is one example of an eventType not handled.
val json = mentionToJSON(mention, pretty = true)
val json = MentionOps(mention).json(pretty = true)
val message = s"""Event type "$eventType" is not supported for indexcard output:\n$json"""
// throw new RuntimeException(message)
logger.warn(message)
Expand Down Expand Up @@ -211,7 +209,7 @@ class IndexCardOutput extends JsonOutputter with LazyLogging {
case "complex" => Some(new PropMapOrFrameList(mkComplexArgument(derefArg))) // FrameList
case _ => {
// "event" is a typical culprit.
val json = mentionToJSON(arg, pretty = true)
val json = MentionOps(arg).json(pretty = true)
val message = s"""Argument type "$argType" is not supported for indexcard output:\n$json"""
logger.warn(message)
None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,16 @@ package org.clulab.reach.export.serial
import java.io.File
import java.util.Date
import java.util.regex.Pattern

import java.nio.charset.Charset
import java.nio.charset.StandardCharsets.UTF_8

import ai.lum.common.FileUtils._

import com.typesafe.scalalogging.LazyLogging

import org.clulab.odin.Mention
import org.clulab.processors.Document
import org.clulab.reach.FriesEntry
import org.clulab.reach.export.JsonOutputter
import org.clulab.reach.mentions._
import org.clulab.reach.mentions.serialization.json._
import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
import org.clulab.reach.mentions.serialization.json.{EquivalenceHashes, MentionsOps}

/**
* Defines classes and methods used to output the serial-json output format.
Expand All @@ -39,9 +36,19 @@ class SerialJsonOutput (
outFilePrefix:String
): String = {
val mentions = allMentions.map(_.toCorefMention)
mentions.json(true) // true = pretty print
MentionsOps(mentions).json(true) // true = pretty print
}

/**
 * Evaluates `f` and afterwards, whether `f` returned normally or threw,
 * passes the given document (if there is one) to `EquivalenceHashes.remove`.
 *
 * @param documentOpt the document to hand to `EquivalenceHashes.remove` once `f`
 *                    has been evaluated; nothing is removed when this is `None`
 * @param f           by-name computation, evaluated exactly once inside the `try`
 * @tparam T          result type of `f`
 * @return the value produced by `f`; an exception thrown by `f` propagates to the
 *         caller after the cleanup has run
 */
def withDocument[T](documentOpt: Option[Document])(f: => T): T =
  try f
  // The cleanup lives in `finally` so that it also runs when `f` throws.
  finally documentOpt.foreach(EquivalenceHashes.remove)


/**
* Writes the given mentions to an output file in Mention-JSON format.
* The output file is prefixed with the given prefix string.
Expand All @@ -56,13 +63,25 @@ class SerialJsonOutput (
): Unit = {
val f: File = new File(outFilePrefix + ".json")
val mentions = allMentions.map(_.toCorefMention)
// The mentions are all going to a single file, so they should all have originated
// from the same single document. Verify this and then make sure the document's
// equivalency hash gets removed from the cache so that cached hashes don't pile up.
val documentOpt =
if (mentions.isEmpty) None
else {
val document = mentions.head.document

f.writeString(
string = mentions.json(true),
charset = encoding,
append = false,
gzipSupport = false
)
}
require(mentions.forall(_.document.eq(document)))
Some(document)
}

withDocument(documentOpt) {
f.writeString(
string = MentionsOps(mentions).json(true),
charset = encoding,
append = false,
gzipSupport = false
)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ import akka.stream.{ActorMaterializer, Materializer}
import akka.stream.scaladsl._
import akka.util.ByteString

import org.clulab.reach.mentions._
import org.clulab.reach.mentions.serialization.json._
import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
import org.clulab.reach.mentions.serialization.json.MentionsOps
import org.clulab.reach.PaperReader


Expand Down Expand Up @@ -102,7 +102,7 @@ object FileProcessorWebUI extends App with FileUpload {
def processFile(tempFile: File, outputType: String): String = {
val cms = PaperReader.getMentionsFromPaper(tempFile).map(_.toCorefMention)
outputType match {
case JSON => cms.json(false)
case JSON => MentionsOps(cms).json(false)
}
}

Expand Down
4 changes: 2 additions & 2 deletions main/src/main/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# Default top-level root directory for input and output files and subdirectories.
# All other paths are based on this path but any or all can be changed individually:
rootDir = ${user.home}/Documents/reach
rootDir = ../corpora/wetransfer_nxml-files_2023-02-22_0826

# this is where the brat standoff and text files are dumped
# if this directory does not exist it will be created
Expand All @@ -20,7 +20,7 @@ outDir = ${rootDir}/output

# this is the directory that stores the raw nxml, .csv, and/or .tsv files
# this directory *must* exist
papersDir = ${rootDir}/papers
papersDir = ${rootDir}/input


# the encoding of input and output files
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package org.clulab.polarity.ml.data

import java.io.PrintWriter
import java.util.{Calendar, Date}

import com.typesafe.scalalogging.LazyLogging
import org.clulab.polarity.{NegativePolarity, Polarity, PositivePolarity}
import org.clulab.reach.{PaperReader, ReachSystem}
Expand All @@ -11,8 +10,8 @@ import org.clulab.reach.mentions.{BioEventMention, BioMention, CorefEventMention
import scala.collection.mutable.ArrayBuffer
import scala.io.Source
import scala.util.{Failure, Success, Try}
import org.clulab.reach.mentions.serialization.json._
import org.clulab.reach.mentions.{MentionOps => MOps}
import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionsOps}
import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
import org.json4s.JsonAST.JValue
import org.json4s.JsonDSL._
import org.json4s._
Expand Down Expand Up @@ -119,7 +118,7 @@ object PolarityDatasetPreprocessor extends App with LazyLogging{
def saveOutput(digestedData: Seq[(BioEventMention, Polarity)], outputPath: String): Unit = {
val (evts, labels) = digestedData.unzip

val jsonEvts = evts.jsonAST
val jsonEvts = MentionsOps(evts).jsonAST


val json =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import org.clulab.odin._
import org.clulab.polarity.PolarityEngine
import org.clulab.reach._
import org.clulab.reach.mentions._
import org.clulab.reach.mentions.serialization.json.BioTextBoundMention
import org.clulab.struct.DirectedGraph

import scala.annotation.tailrec
Expand Down
Loading