diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/RunAssembly.scala b/assembly/src/main/scala/org/clulab/reach/assembly/RunAssembly.scala
index 643a5e432..0bdaf988e 100644
--- a/assembly/src/main/scala/org/clulab/reach/assembly/RunAssembly.scala
+++ b/assembly/src/main/scala/org/clulab/reach/assembly/RunAssembly.scala
@@ -4,8 +4,8 @@ import org.apache.commons.io.FilenameUtils
import org.clulab.reach.assembly.relations.corpus.{CorpusReader, EventPair}
import org.clulab.odin.Mention
import org.clulab.reach.PaperReader
-import org.clulab.reach.mentions._
-import org.clulab.reach.mentions.serialization.json._
+import org.clulab.reach.mentions.{CorefMention, MentionOps => ImplicitMentionOps}
+import org.clulab.reach.mentions.serialization.json.MentionsOps
import org.clulab.utils.Serializer
import com.typesafe.config.ConfigFactory
@@ -127,9 +127,6 @@ object RunAnnotationEval extends App with LazyLogging {
* Serialize each paper in a directory to json
*/
object SerializePapersToJSON extends App with LazyLogging {
-
- import org.clulab.reach.mentions.serialization.json._
-
val config = ConfigFactory.load()
val papersDir = new File(config.getString("papersDir"))
val outDir = new File(config.getString("outDir"))
@@ -150,7 +147,8 @@ object SerializePapersToJSON extends App with LazyLogging {
val mentions = PaperReader.getMentionsFromPaper(paper)
val cms: Seq[CorefMention] = mentions.map(_.toCorefMention)
logger.info(s"extracted ${mentions.size} mentions for $paperID")
- cms.saveJSON(outFile, pretty = true)
+
+ MentionsOps(cms).saveJSON(outFile, pretty = true)
logger.info(s"saved json to $outFile")
}
}
\ No newline at end of file
diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/Corpus.scala b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/Corpus.scala
index 79c90473c..f0b1712c9 100644
--- a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/Corpus.scala
+++ b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/Corpus.scala
@@ -4,15 +4,17 @@ import org.clulab.processors.Document
import org.clulab.reach.assembly.relations.classifier.AssemblyRelationClassifier
import org.clulab.reach.assembly.sieves.Constraints
import org.clulab.reach.mentions.CorefMention
-import org.clulab.reach.mentions.serialization.json.{MentionJSONOps, REACHMentionSeq, JSONSerializer}
+import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionOps, MentionsOps}
import org.clulab.serialization.json.JSONSerialization
import org.json4s.jackson.JsonMethods._
import org.json4s.JsonDSL._
import org.json4s._
+
import scala.util.hashing.MurmurHash3._
import com.typesafe.scalalogging.LazyLogging
import org.apache.commons.io.FileUtils.forceMkdir
import ai.lum.common.FileUtils._
+
import java.io.File
@@ -44,8 +46,8 @@ case class EventPair(
// the seed (not counted in the length of finalizeHash)
val h0 = stringHash("org.clulab.assembly.TrainingInstance")
// get hashes for each event
- val h1 = mix(h0, e1.equivalenceHash)
- val h2 = mix(h1, e2.equivalenceHash)
+ val h1 = mix(h0, MentionOps(e1).equivalenceHash)
+ val h2 = mix(h1, MentionOps(e2).equivalenceHash)
// is it cross-sentence?
val h3 = mix(h2, isCrossSentence.hashCode)
// the text of the sentences containing the two event mentions
@@ -66,34 +68,37 @@ case class EventPair(
def jsonAST: JValue = {
+ val e1EventOps = new EventOps(e1)
+ val e2EventOps = new EventOps(e2)
+
// build json
("id" -> this.equivalenceHash) ~
("text" -> this.text) ~
("coref" -> this.coref) ~
// event 1
- ("e1-id" -> this.e1.id) ~
- ("e1-label" -> this.e1.eventLabel) ~
- ("e1-sentence-text" -> this.e1.sentenceText) ~
+ ("e1-id" -> MentionOps(this.e1).id) ~
+ ("e1-label" -> e1EventOps.eventLabel) ~
+ ("e1-sentence-text" -> e1EventOps.sentenceText) ~
("e1-sentence-index" -> this.e1.sentence) ~
("e1-sentence-tokens" -> this.e1.sentenceObj.words.toList) ~
// can be used to highlight event span in annotation UI
("e1-start" -> this.e1.start) ~
("e1-end" -> this.e1.end) ~
- ("e1-trigger" -> this.e1.trigger.text) ~
- ("e1-trigger-start" -> this.e1.trigger.start) ~
- ("e1-trigger-end" -> this.e1.trigger.end) ~
+ ("e1-trigger" -> e1EventOps.trigger.text) ~
+ ("e1-trigger-start" -> e1EventOps.trigger.start) ~
+ ("e1-trigger-end" -> e1EventOps.trigger.end) ~
// event 2
- ("e2-id" -> this.e2.id) ~
- ("e2-label" -> this.e2.eventLabel) ~
- ("e2-sentence-text" -> this.e2.sentenceText) ~
+ ("e2-id" -> MentionOps(this.e2).id) ~
+ ("e2-label" -> e2EventOps.eventLabel) ~
+ ("e2-sentence-text" -> e2EventOps.sentenceText) ~
("e2-sentence-index" -> this.e2.sentence) ~
("e2-sentence-tokens" -> this.e2.sentenceObj.words.toList) ~
// can be used to highlight event span in annotation UI
("e2-start" -> this.e2.start) ~
("e2-end" -> this.e2.end) ~
- ("e2-trigger" -> this.e2.trigger.text) ~
- ("e2-trigger-start" -> this.e2.trigger.start) ~
- ("e2-trigger-end" -> this.e2.trigger.end) ~
+ ("e2-trigger" -> e2EventOps.trigger.text) ~
+ ("e2-trigger-start" -> e2EventOps.trigger.start) ~
+ ("e2-trigger-end" -> e2EventOps.trigger.end) ~
// these will be filled out during annotation
("annotator-id" -> this.annotatorID) ~
("relation" -> this.relation) ~
@@ -155,7 +160,7 @@ case class Corpus(instances: Seq[EventPair]) extends JSONSerialization {
// for each doc, write doc + mentions to a json file
for ((paperID, cms) <- dmLUT) {
val of = new File(mentionDataDir, s"$paperID-mention-data.json")
- of.writeString(cms.json(pretty), java.nio.charset.StandardCharsets.UTF_8)
+ of.writeString(MentionsOps(cms).json(pretty), java.nio.charset.StandardCharsets.UTF_8)
}
// write event pair info to json file
val epf = new File(corpusDir, s"${Corpus.EVENT_PAIRS}.json")
diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/CorpusBuilder.scala b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/CorpusBuilder.scala
index 1e687632a..593159cbd 100644
--- a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/CorpusBuilder.scala
+++ b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/CorpusBuilder.scala
@@ -124,7 +124,7 @@ object CorpusBuilder extends LazyLogging {
// create training instance
ep = EventPair(Set(m1, m2))
// triggers should not be the same
- if ep.e1.trigger != ep.e2.trigger
+ if new EventOps(ep.e1).trigger != new EventOps(ep.e2).trigger
} yield ep
distinctEventPairs(eps.toSeq)
@@ -133,7 +133,7 @@ object CorpusBuilder extends LazyLogging {
def distinctEventPairs(eps: Seq[EventPair]): Seq[EventPair] = {
eps.distinct.groupBy(ep =>
// distinct by...
- (ep.e1.sentence, ep.e2.trigger, ep.e1.label, ep.e1.text, ep.e2.sentence, ep.e2.trigger, ep.e2.label, ep.e2.text)
+ // each event contributes its own sentence, trigger, label and text to the key:
+ // e1's trigger for the first half, e2's trigger for the second half
+ (ep.e1.sentence, new EventOps(ep.e1).trigger, ep.e1.label, ep.e1.text, ep.e2.sentence, new EventOps(ep.e2).trigger, ep.e2.label, ep.e2.text)
).values.map(_.head) // get one value for each key
.toSeq
.sortBy{ ep => (ep.doc.id.getOrElse(""), ep.sentenceIndices.head) }
diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/package.scala b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/package.scala
index 9d14ebfcd..9dda156a4 100644
--- a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/package.scala
+++ b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/package.scala
@@ -2,8 +2,8 @@ package org.clulab.reach.assembly.relations
import org.clulab.odin.Mention
import org.clulab.reach.assembly.sieves.SieveUtils
-import org.clulab.reach.mentions.serialization.json.{ CorefMentionOps, JSONSerializer => ReachJsonSerializer }
-import org.clulab.reach.mentions._
+import org.clulab.reach.mentions.CorefMention
+import org.clulab.reach.mentions.serialization.json.JSONSerializer
import com.typesafe.scalalogging.LazyLogging
import scala.collection.GenSeq
import java.io.File
@@ -12,7 +12,7 @@ import java.io.File
package object corpus extends LazyLogging {
/** Additional attributes and methods for a [[CorefMention]] */
- implicit class EventOps(mention: CorefMention) extends CorefMentionOps(mention) {
+ class EventOps(mention: CorefMention) {
val eventLabel: String = mention.label
val sentenceText: String = mention.sentenceObj.getSentenceText
// NOTE: if mention is a TB, trigger will simply be the mention (ex. BioProcess)
@@ -29,7 +29,7 @@ package object corpus extends LazyLogging {
def datasetLUT(jsonFiles: GenSeq[File]): Map[String, Vector[CorefMention]] = {
val docMentionPairs = jsonFiles.filter(_.getName.endsWith(".json")).map{ f: File =>
logger.debug(s"parsing ${f.getName}")
- val cms: Vector[CorefMention] = ReachJsonSerializer.toCorefMentions(f).toVector
+ val cms: Vector[CorefMention] = JSONSerializer.toCorefMentions(f).toVector
if (cms.nonEmpty) logger.debug(s"successfully parsed ${f.getName}")
val paperID = getPMID(cms.head)
paperID -> cms
diff --git a/export/src/main/scala/org/clulab/reach/export/JsonOutputter.scala b/export/src/main/scala/org/clulab/reach/export/JsonOutputter.scala
index d590a9f7f..5d3fb7dbe 100644
--- a/export/src/main/scala/org/clulab/reach/export/JsonOutputter.scala
+++ b/export/src/main/scala/org/clulab/reach/export/JsonOutputter.scala
@@ -8,7 +8,7 @@ import com.typesafe.scalalogging.Logger
import org.clulab.odin.Mention
import org.clulab.reach.FriesEntry
import org.clulab.reach.ReachConstants._
-import org.clulab.odin.serialization.json._
+import org.clulab.odin.serialization.json.MentionOps
import org.json4s.jackson.Serialization
import org.slf4j.LoggerFactory
@@ -154,7 +154,7 @@ object JsonOutputter {
else {
// "Gene_or_gene_product" is another possibility.
// Also "Family", "Disease", "Simple_chemical"
- val json = mention.json(pretty = true)
+ val json = MentionOps(mention).json(pretty = true)
val message = s"""Unknown event type "$label" in event:\n$json"""
// throw new RuntimeException(message)
logger.warn(message)
diff --git a/export/src/main/scala/org/clulab/reach/export/indexcards/IndexCardOutput.scala b/export/src/main/scala/org/clulab/reach/export/indexcards/IndexCardOutput.scala
index 50085275d..67bcb3345 100644
--- a/export/src/main/scala/org/clulab/reach/export/indexcards/IndexCardOutput.scala
+++ b/export/src/main/scala/org/clulab/reach/export/indexcards/IndexCardOutput.scala
@@ -3,10 +3,8 @@ package org.clulab.reach.export.indexcards
import java.io.File
import java.util.Date
import java.util.regex.Pattern
-
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
-
import com.typesafe.scalalogging.LazyLogging
import org.clulab.odin.Mention
import org.clulab.reach.ReachConstants._
@@ -14,8 +12,8 @@ import org.clulab.reach.{FriesEntry, display}
import org.clulab.reach.export.JsonOutputter._
import org.clulab.reach.export.{JsonOutputter, OutputDegrader}
import org.clulab.reach.grounding.KBResolution
-import org.clulab.reach.mentions._
-import org.clulab.reach.mentions.serialization.json.mentionToJSON
+import org.clulab.reach.mentions.{BioEventMention, CorefMention, Mutant, PTM, MentionOps => ImplicitMentionOps}
+import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionOps}
import org.clulab.reach.utils.MentionManager
import IndexCardOutput._
@@ -173,7 +171,7 @@ class IndexCardOutput extends JsonOutputter with LazyLogging {
case "amount" => mkSimpleEventIndexCard(mention, mention.label)
case _ =>
// "conversion" is one example of an eventType not handled.
- val json = mentionToJSON(mention, pretty = true)
+ val json = MentionOps(mention).json(pretty = true)
val message = s"""Event type "$eventType" is not supported for indexcard output:\n$json"""
// throw new RuntimeException(message)
logger.warn(message)
@@ -211,7 +209,7 @@ class IndexCardOutput extends JsonOutputter with LazyLogging {
case "complex" => Some(new PropMapOrFrameList(mkComplexArgument(derefArg))) // FrameList
case _ => {
// "event" is a typical culprit.
- val json = mentionToJSON(arg, pretty = true)
+ val json = MentionOps(arg).json(pretty = true)
val message = s"""Argument type "$argType" is not supported for indexcard output:\n$json"""
logger.warn(message)
None
diff --git a/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala b/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala
index d285c2a04..49e444c1e 100644
--- a/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala
+++ b/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala
@@ -1,21 +1,22 @@
package org.clulab.reach.export.serial
-import java.io.File
+import java.io.{File, PrintWriter}
import java.util.Date
-import java.util.regex.Pattern
-
import java.nio.charset.Charset
import java.nio.charset.StandardCharsets.UTF_8
-
import ai.lum.common.FileUtils._
-
+import com.fasterxml.jackson.databind.ObjectWriter
import com.typesafe.scalalogging.LazyLogging
-
import org.clulab.odin.Mention
import org.clulab.reach.FriesEntry
import org.clulab.reach.export.JsonOutputter
-import org.clulab.reach.mentions._
-import org.clulab.reach.mentions.serialization.json._
+import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
+import org.clulab.reach.mentions.serialization.json.MentionsOps
+import org.clulab.serialization.json.stringify
+import org.clulab.utils.Closer.AutoCloser
+import org.clulab.utils.Sink
+import org.json4s.JValue
+import org.json4s.jackson.{JsonMethods, prettyJson, renderJValue}
/**
* Defines classes and methods used to output the serial-json output format.
@@ -28,6 +29,7 @@ class SerialJsonOutput (
encoding: Charset = UTF_8
) extends JsonOutputter with LazyLogging {
+ val objectWriter = JsonMethods.mapper.writerWithDefaultPrettyPrinter()
/** Returns the given mentions in the serial-json format, as one big string. */
override def toJSON (
@@ -39,7 +41,7 @@ class SerialJsonOutput (
outFilePrefix:String
): String = {
val mentions = allMentions.map(_.toCorefMention)
- mentions.json(true) // true = pretty print
+ MentionsOps(mentions).json(pretty = true)
}
/**
@@ -54,15 +56,17 @@ class SerialJsonOutput (
endTime:Date,
outFilePrefix:String
): Unit = {
- val f: File = new File(outFilePrefix + ".json")
val mentions = allMentions.map(_.toCorefMention)
+ val jsonAST = MentionsOps(mentions).jsonAST
+ // Code here has been modified so that no json string is produced.
+ // String lengths max out at 2GB, unlike files, and with large inputs
+ // we were crashing when output could not be stuffed into a string.
+ val renderedJsonAST = JsonMethods.render(jsonAST)
+ val file = new File(outFilePrefix + ".json")
+ val printWriter = new PrintWriter(new Sink(file, encoding.name, append = false))
- f.writeString(
- string = mentions.json(true),
- charset = encoding,
- append = false,
- gzipSupport = false
- )
+ printWriter.autoClose { printWriter =>
+ objectWriter.writeValue(printWriter, renderedJsonAST)
+ }
}
-
}
diff --git a/export/src/main/scala/org/clulab/reach/export/server/FileProcessorWebUI.scala b/export/src/main/scala/org/clulab/reach/export/server/FileProcessorWebUI.scala
index bdac4b6cd..79bbd1a22 100644
--- a/export/src/main/scala/org/clulab/reach/export/server/FileProcessorWebUI.scala
+++ b/export/src/main/scala/org/clulab/reach/export/server/FileProcessorWebUI.scala
@@ -19,8 +19,8 @@ import akka.stream.{ActorMaterializer, Materializer}
import akka.stream.scaladsl._
import akka.util.ByteString
-import org.clulab.reach.mentions._
-import org.clulab.reach.mentions.serialization.json._
+import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
+import org.clulab.reach.mentions.serialization.json.MentionsOps
import org.clulab.reach.PaperReader
@@ -102,7 +102,7 @@ object FileProcessorWebUI extends App with FileUpload {
def processFile(tempFile: File, outputType: String): String = {
val cms = PaperReader.getMentionsFromPaper(tempFile).map(_.toCorefMention)
outputType match {
- case JSON => cms.json(false)
+ case JSON => MentionsOps(cms).json(false)
}
}
diff --git a/main/src/main/scala/org/clulab/polarity/ml/data/PolarityDatasetPreprocessor.scala b/main/src/main/scala/org/clulab/polarity/ml/data/PolarityDatasetPreprocessor.scala
index eea90bd91..d4ee98cbd 100644
--- a/main/src/main/scala/org/clulab/polarity/ml/data/PolarityDatasetPreprocessor.scala
+++ b/main/src/main/scala/org/clulab/polarity/ml/data/PolarityDatasetPreprocessor.scala
@@ -2,7 +2,6 @@ package org.clulab.polarity.ml.data
import java.io.PrintWriter
import java.util.{Calendar, Date}
-
import com.typesafe.scalalogging.LazyLogging
import org.clulab.polarity.{NegativePolarity, Polarity, PositivePolarity}
import org.clulab.reach.{PaperReader, ReachSystem}
@@ -11,8 +10,8 @@ import org.clulab.reach.mentions.{BioEventMention, BioMention, CorefEventMention
import scala.collection.mutable.ArrayBuffer
import scala.io.Source
import scala.util.{Failure, Success, Try}
-import org.clulab.reach.mentions.serialization.json._
-import org.clulab.reach.mentions.{MentionOps => MOps}
+import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionsOps}
+import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
import org.json4s.JsonAST.JValue
import org.json4s.JsonDSL._
import org.json4s._
@@ -119,7 +118,7 @@ object PolarityDatasetPreprocessor extends App with LazyLogging{
def saveOutput(digestedData: Seq[(BioEventMention, Polarity)], outputPath: String): Unit = {
val (evts, labels) = digestedData.unzip
- val jsonEvts = evts.jsonAST
+ val jsonEvts = MentionsOps(evts).jsonAST
val json =
diff --git a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala
new file mode 100644
index 000000000..6c71ab3ca
--- /dev/null
+++ b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala
@@ -0,0 +1,59 @@
+package org.clulab.reach.apps
+
+import org.clulab.odin.Mention
+import org.clulab.reach.PaperReader
+import org.clulab.reach.mentions.serialization.json.MentionsOps
+import org.clulab.serialization.json.stringify
+import org.clulab.utils.FileUtils
+import org.json4s.jackson.JsonMethods.parse
+
+/**
+  * Serializes the mentions of each paper to JSON, reads them back, serializes
+  * them again, and reports the papers whose two JSON strings differ.
+  *
+  * The first command-line argument names the directory to search for .nxml
+  * files; without it "../corpora/nxml" is used.
+  */
+object RoundTripApp extends App {
+  val directoryName = args.headOption.getOrElse("../corpora/nxml")
+  val files = FileUtils.findFiles(directoryName, ".nxml")
+
+  /**
+    * Round-trips the mentions through the odin (processors) JSON serializer.
+    *
+    * @param mentions the mentions to serialize
+    * @return true when the JSON produced before and after the round trip is identical
+    */
+  def testProcessorsSerialization(mentions: Seq[Mention]): Boolean = {
+    import org.clulab.odin.serialization.json.JSONSerializer
+
+    val jValue = JSONSerializer.jsonAST(mentions)
+    val json = stringify(jValue, pretty = true)
+
+    val mentions2 = JSONSerializer.toMentions(parse(json))
+    val jValue2 = JSONSerializer.jsonAST(mentions2)
+    val json2 = stringify(jValue2, pretty = true)
+    val result = json == json2
+
+    if (!result)
+      println("Processors mentions are not equal! Do something about it!")
+    result
+  }
+
+  /**
+    * Round-trips the mentions through the Reach JSON serializer.
+    *
+    * @param mentions the mentions to serialize
+    * @return true when the JSON produced before and after the round trip is identical
+    */
+  def testReachSerialization(mentions: Seq[Mention]): Boolean = {
+    import org.clulab.reach.mentions.serialization.json.JSONSerializer
+
+    val json = MentionsOps(mentions).json(pretty = true)
+
+    val mentions2 = JSONSerializer.toCorefMentions2(parse(json))
+    val json2 = MentionsOps(mentions2).json(pretty = true)
+    val result = json == json2
+
+    if (!result)
+      println("Reach mentions are not equal! Do something about it!")
+    result
+  }
+
+  // The files are processed in parallel.  Incrementing a shared var from inside
+  // a parallel foreach is an unsynchronized read-modify-write and can lose
+  // updates, so the failures are tallied by the parallel collection itself.
+  val failCount = files.par.count { file =>
+    val entry = PaperReader.getEntryFromPaper(file)
+    val mentions = PaperReader.getMentionsFromEntry(entry)
+
+    // Both checks always run so that each one can print its own message.
+    val result1 = testProcessorsSerialization(mentions)
+    val result2 = testReachSerialization(mentions)
+    val failed = !(result1 && result2)
+
+    if (failed)
+      println(s"File ${file.getName} failed.")
+    failed
+  }
+  println(s"There were $failCount failures!")
+}
diff --git a/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala b/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala
index ef215fdba..6994f18bc 100644
--- a/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala
+++ b/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala
@@ -5,7 +5,6 @@ import org.clulab.odin._
import org.clulab.polarity.PolarityEngine
import org.clulab.reach._
import org.clulab.reach.mentions._
-import org.clulab.reach.mentions.serialization.json.BioTextBoundMention
import org.clulab.struct.DirectedGraph
import scala.annotation.tailrec
diff --git a/main/src/main/scala/org/clulab/reach/mentions/package.scala b/main/src/main/scala/org/clulab/reach/mentions/package.scala
index c643a3043..ad14edc28 100644
--- a/main/src/main/scala/org/clulab/reach/mentions/package.scala
+++ b/main/src/main/scala/org/clulab/reach/mentions/package.scala
@@ -25,9 +25,9 @@ package object mentions {
case m: EventMention =>
new BioEventMention(
m.labels,
- m.trigger,
+ m.trigger.toBioMention.asInstanceOf[BioTextBoundMention],
convertArguments(m.arguments),
- m.paths,
+ m.paths, // TODO: these paths should be converted as well
m.sentence,
m.document,
m.keep,
@@ -37,7 +37,7 @@ package object mentions {
new BioRelationMention(
m.labels,
convertArguments(m.arguments),
- m.paths,
+ m.paths, // TODO: these paths should be converted as well
m.sentence,
m.document,
m.keep,
@@ -62,9 +62,9 @@ package object mentions {
case m: BioEventMention => {
val ev = new CorefEventMention(
m.labels,
- m.trigger,
+ m.trigger.toCorefMention.asInstanceOf[CorefTextBoundMention],
corefArguments(m.arguments),
- m.paths,
+ m.paths, // TODO: these paths should be converted as well
m.sentence,
m.document,
m.keep,
diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/EquivalenceHashes.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/EquivalenceHashes.scala
new file mode 100644
index 000000000..a73e9846a
--- /dev/null
+++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/EquivalenceHashes.scala
@@ -0,0 +1,24 @@
+package org.clulab.reach.mentions.serialization.json
+
+import org.clulab.processors.Document
+
+import java.util
+
+/**
+  * Remembers the equivalenceHash of Documents so that it is computed once per
+  * Document instance.  Entries stay in the map until remove is called for them.
+  */
+object EquivalenceHashes {
+  // An IdentityHashMap compares its keys by reference rather than by equals.
+  protected val equivalenceHashes = new util.IdentityHashMap[Document, Int]()
+
+  /**
+    * Returns the hash recorded for this Document instance, computing and
+    * recording it first when the instance has not been seen before.
+    */
+  def get(document: Document): Int = synchronized {
+    if (!equivalenceHashes.containsKey(document))
+      equivalenceHashes.put(document, document.equivalenceHash)
+    equivalenceHashes.get(document)
+  }
+
+  /**
+    * Forgets this Document instance and hands back whatever the map's remove
+    * reported for it, wrapped with Option(...).
+    */
+  def remove(document: Document): Option[Int] = synchronized {
+    // NOTE(review): the Java call is deliberately left directly inside Option(...);
+    // presumably binding it to an Int first would change what a missing key yields -- confirm.
+    Option(equivalenceHashes.remove(document))
+  }
+}
diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala
index 55d541336..4ed06d755 100644
--- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala
+++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala
@@ -1,23 +1,33 @@
package org.clulab.reach.mentions.serialization.json
-import org.clulab.serialization.json.DocOps
-import org.clulab.odin.serialization.json.JSONSerializer._
-import org.clulab.odin.serialization.json.{ MentionOps => OdinMentionOps }
+import com.typesafe.scalalogging.LazyLogging
import org.clulab.odin
import org.clulab.odin._
+import org.clulab.odin.serialization.json.{JSONSerializer => OdinJSONSerializer}
+import org.clulab.processors.Document
import org.clulab.reach.grounding.KBResolution
-import org.clulab.reach.mentions._
+import org.clulab.reach.mentions.{Anaphoric, EventSite, Hypothesis, Modification, Mutant, Negation, PTM}
+import org.clulab.reach.mentions.{BioEventMention, BioMention, BioRelationMention, BioTextBoundMention}
+import org.clulab.reach.mentions.{CHEMtrigger, DNtrigger, KDtrigger, KOtrigger, OEtrigger, UnassignedTrigger}
+import org.clulab.reach.mentions.{CorefEventMention, CorefMention, CorefRelationMention, CorefTextBoundMention}
+import org.clulab.reach.mentions.{MentionOps => OdinMentionOps}
+import org.clulab.serialization.json.{DocOps, stringify}
import org.clulab.struct.{DirectedGraph, Edge, Interval}
import org.json4s.JsonDSL._
import org.json4s._
import org.json4s.jackson.JsonMethods._
+import org.json4s.jackson.{prettyJson, renderJValue}
+
import java.io.File
-import com.typesafe.scalalogging.LazyLogging
-import org.clulab.processors.Document
/** JSON serialization utilities */
object JSONSerializer extends LazyLogging {
+  // json4s formats used by the extract[...] calls in this object.  The type is
+  // spelled out because implicit members should not rely on type inference.
+  implicit val formats: org.json4s.DefaultFormats.type = org.json4s.DefaultFormats
+
+  /** Renders the json AST with renderJValue and pretty-prints the result with prettyJson. */
+  def prettify(json: JValue): String = prettyJson(renderJValue(json))
+
+  /**
+    * Converts the json AST to a String by delegating to stringify.
+    *
+    * @param jsonAST the AST to convert
+    * @param pretty passed through to stringify; defaults to false
+    */
+  def json(jsonAST: JValue, pretty: Boolean = false): String = stringify(jsonAST, pretty)
private def mentionsToDocsJMap(mentions: Seq[Mention]): Map[String, JValue] = {
docsToDocsJMap(mentions.map(m => m.document))
@@ -34,14 +44,13 @@ object JSONSerializer extends LazyLogging {
/** Creates a Map of a Document.equivalenceHash (as String) -> Document
* Used for deserialization of mention JSON
*/
- def docsToDocumentMap(docs: Seq[Document]): Map[String, Document] = mkDocumentMap(docsToDocsJMap(docs))
+ def docsToDocumentMap(docs: Seq[Document]): Map[String, Document] = OdinJSONSerializer.mkDocumentMap(docsToDocsJMap(docs))
- def jsonAST(mentions: Seq[Mention]): JValue = {
+ def jsonAST(mention: Mention): JValue = jsonAST(Seq(mention))
- val mentionList: List[JValue] = mentions.map{
- case cm: CorefMention => CorefMentionOps(cm).jsonAST
- case bm: BioMention => BioMentionOps(bm).jsonAST
- case m: Mention => OdinMentionOps(m).jsonAST
+ def jsonAST(mentions: Seq[Mention]): JValue = {
+ val mentionList: List[JValue] = mentions.map { mention =>
+ MentionOps(mention).jsonAST
}.toList
val docMap: Map[String, JValue] = mentionsToDocsJMap(mentions)
("documents" -> docMap) ~ ("mentions" -> mentionList)
@@ -59,7 +68,7 @@ object JSONSerializer extends LazyLogging {
require(json \ "mentions" != JNothing, "\"mentions\" key missing from json")
// build the documents once
- val docMap = mkDocumentMap((json \ "documents").asInstanceOf[JObject])
+ val docMap = OdinJSONSerializer.mkDocumentMap((json \ "documents").asInstanceOf[JObject])
val mmjson = (json \ "mentions").asInstanceOf[JArray]
mmjson.arr.map(mjson => toBioMention(mjson, docMap))
@@ -84,7 +93,7 @@ object JSONSerializer extends LazyLogging {
// NOTE: while it would be cleaner to create a Mention and THEN add the needed bio and coref attributes,
// it would not be easy to transform the arguments & trigger post-hoc using the json...
val m = mjson \ "type" match {
- case JString(BioEventMention.string) =>
+ case JString(BioEventMentionOps.string) =>
new BioEventMention(
labels,
// trigger must be (Bio)TextBoundMention
@@ -98,7 +107,7 @@ object JSONSerializer extends LazyLogging {
isDirect = getIsDirect(mjson)
)
- case JString(BioRelationMention.string) =>
+ case JString(BioRelationMentionOps.string) =>
new BioRelationMention(
labels,
mkArgumentsFromJsonAST(mjson \ "arguments", docMap),
@@ -109,7 +118,7 @@ object JSONSerializer extends LazyLogging {
foundBy
)
- case JString(BioTextBoundMention.string) =>
+ case JString(BioTextBoundMentionOps.string) =>
new BioTextBoundMention(
labels,
tokInterval,
@@ -150,20 +159,29 @@ object JSONSerializer extends LazyLogging {
toCorefMentionsMap(json: JValue).values.toSeq
}
+  /**
+    * Produce the mentions from json as a sequence, one per (id, mention) pair
+    * returned by toCorefIdsAndMentions and in the same order; the ids are dropped.
+    */
+  def toCorefMentions2(json: JValue): Seq[CorefMention] =
+    toCorefIdsAndMentions(json).map { case (_, mention) => mention }
+
/** Produce a Map of id -> mentions from a json file */
def toCorefMentionsMap(file: File): Map[String, CorefMention] = toCorefMentionsMap(jsonAST(file))
/** Produce a Map of id -> mentions from json */
- def toCorefMentionsMap(json: JValue): Map[String, CorefMention] = {
+ def toCorefMentionsMap(json: JValue): Map[String, CorefMention] = toCorefIdsAndMentions(json).toMap
+ def toCorefIdsAndMentions(json: JValue): Seq[(String, CorefMention)] = {
require(json \ "documents" != JNothing, "\"documents\" key missing from json")
require(json \ "mentions" != JNothing, "\"mentions\" key missing from json")
// build the documents once
- val docMap = mkDocumentMap((json \ "documents").asInstanceOf[JObject])
+ val docMap = OdinJSONSerializer.mkDocumentMap((json \ "documents").asInstanceOf[JObject])
val mmjson = (json \ "mentions").asInstanceOf[JArray]
+ val idsAndMentions = mmjson.arr.map(mjson => toCorefMentionWithId(mjson, docMap))
- mmjson.arr.map(mjson => toCorefMentionWithId(mjson, docMap)).toMap
+ idsAndMentions
}
/** Build mention from json of mention and corresponding json map of documents
@@ -191,7 +209,7 @@ object JSONSerializer extends LazyLogging {
// it would not be easy to transform the arguments & trigger post-hoc using the json...
val mentionId: String = (mjson \ "id").extract[String]
val m = mjson \ "type" match {
- case JString(CorefEventMention.string) =>
+ case JString(CorefEventMentionOps.string) =>
new CorefEventMention(
labels,
// trigger must be (Bio)TextBoundMention
@@ -205,7 +223,7 @@ object JSONSerializer extends LazyLogging {
isDirect = getIsDirect(mjson)
)
- case JString(CorefRelationMention.string) =>
+ case JString(CorefRelationMentionOps.string) =>
new CorefRelationMention(
labels,
mkArgumentsFromJsonAST(mjson \ "arguments", docMap),
@@ -216,7 +234,7 @@ object JSONSerializer extends LazyLogging {
foundBy
)
- case JString(CorefTextBoundMention.string) =>
+ case JString(CorefTextBoundMentionOps.string) =>
new CorefTextBoundMention(
labels,
tokInterval,
@@ -255,11 +273,11 @@ object JSONSerializer extends LazyLogging {
private def toAntecedents(mjson: JValue, docMap: Map[String, Document]): Set[Anaphoric] = mjson \ "antecedents" match {
case JNothing => Set.empty[Anaphoric]
case antecedents =>
- antecedents
- .asInstanceOf[JArray]
- .arr
- .map(mjson => toCorefMention(mjson, docMap)).map(_.toCorefMention)
- .toSet
+ val arr = antecedents.asInstanceOf[JArray].arr
+ val list = arr.map(mjson => toCorefMention(mjson, docMap)).map(_.toCorefMention)
+ val set: Set[Anaphoric] = list.toSet
+
+ set
}
private def toModifications(mjson: JValue, docMap: Map[String, Document]): Set[Modification] = mjson \ "modifications" match {
@@ -425,17 +443,17 @@ object JSONSerializer extends LazyLogging {
def toMentionByType(mjson: JValue, docMap: Map[String, Document]): Option[Mention] = mjson \ "type" match {
// CorefMentions
- case JString(CorefTextBoundMention.string) => Some(toCorefMention(mjson, docMap))
- case JString(CorefEventMention.string) => Some(toCorefMention(mjson, docMap))
- case JString(CorefRelationMention.string) => Some(toCorefMention(mjson, docMap))
+ case JString(CorefTextBoundMentionOps.string) => Some(toCorefMention(mjson, docMap))
+ case JString(CorefEventMentionOps.string) => Some(toCorefMention(mjson, docMap))
+ case JString(CorefRelationMentionOps.string) => Some(toCorefMention(mjson, docMap))
// BioMentions
- case JString(BioTextBoundMention.string) => Some(toBioMention(mjson, docMap))
- case JString(BioEventMention.string) => Some(toBioMention(mjson, docMap))
- case JString(BioRelationMention.string) => Some(toBioMention(mjson, docMap))
+ case JString(BioTextBoundMentionOps.string) => Some(toBioMention(mjson, docMap))
+ case JString(BioEventMentionOps.string) => Some(toBioMention(mjson, docMap))
+ case JString(BioRelationMentionOps.string) => Some(toBioMention(mjson, docMap))
// Mentions
- case JString(org.clulab.odin.serialization.json.TextBoundMention.string) => Some(toMention(mjson, docMap))
- case JString(org.clulab.odin.serialization.json.EventMention.string) => Some(toMention(mjson, docMap))
- case JString(org.clulab.odin.serialization.json.RelationMention.string) => Some(toMention(mjson, docMap))
+ case JString(org.clulab.odin.serialization.json.TextBoundMentionOps.string) => Some(OdinJSONSerializer.toMention(mjson, docMap))
+ case JString(org.clulab.odin.serialization.json.EventMentionOps.string) => Some(OdinJSONSerializer.toMention(mjson, docMap))
+ case JString(org.clulab.odin.serialization.json.RelationMentionOps.string) => Some(OdinJSONSerializer.toMention(mjson, docMap))
// failure
case _ => None
}
diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala
new file mode 100644
index 000000000..bce677d7e
--- /dev/null
+++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala
@@ -0,0 +1,240 @@
+package org.clulab.reach.mentions.serialization.json
+
+import org.clulab.odin.Mention
+import org.clulab.odin.serialization.json.{EventMentionOps, RelationMentionOps, TextBoundMentionOps}
+import org.clulab.odin.serialization.json.{MentionOps => OdinMentionOps}
+import org.clulab.reach.context.Context
+import org.clulab.reach.grounding.KBResolution
+import org.clulab.reach.mentions.{Anaphoric, BioEventMention, BioRelationMention, BioTextBoundMention, CorefEventMention, CorefMention, CorefRelationMention, CorefTextBoundMention, Display, EventSite, Grounding, Modification, Modifications, Mutant, PTM, SimpleModification}
+import org.clulab.serialization.json.JSONSerialization
+import org.clulab.utils.Unordered.OrderingOrElseBy
+import org.json4s._
+import org.json4s.JsonDSL._
+import org.json4s.jackson._
+
+import scala.math.Ordering.Implicits._ // Allow Seqs to be compared to each other.
+
+import ReachImplicits._
+
+object MentionOps {
+  implicit val formats: org.json4s.DefaultFormats.type = org.json4s.DefaultFormats
+  // Odin's ordering, extended via orElseBy with the sorted token intervals of any antecedents.
+  implicit val mentionOrdering: Ordering[Mention] = OdinMentionOps.mentionOrdering
+    .orElseBy { mention =>
+      mention match {
+        case anaphoric: Anaphoric =>
+          // As with the arguments in processors, use sorted token intervals.
+          val tokenIntervals = anaphoric.antecedents.toSeq.map(_.asInstanceOf[Mention].tokenInterval)
+          tokenIntervals.sorted
+        case _ => Seq.empty
+      }
+    }
+
+  /** Wraps a mention in the ops class matching its runtime type so that it can be serialized. */
+  def apply(mention: Mention): OdinMentionOps = {
+    mention match {
+      // Corefs must be on top because they inherit from the Bio classes.
+      case mention: CorefTextBoundMention => new CorefTextBoundMentionOps(mention)
+      case mention: CorefEventMention => new CorefEventMentionOps(mention)
+      case mention: CorefRelationMention => new CorefRelationMentionOps(mention)
+
+      case mention: BioTextBoundMention => new BioTextBoundMentionOps(mention)
+      case mention: BioEventMention => new BioEventMentionOps(mention)
+      case mention: BioRelationMention => new BioRelationMentionOps(mention)
+
+      case mention => OdinMentionOps(mention) // Let odin figure it out.
+    }
+  }
+}
+
+object MentionsOps {
+  /** Wraps a sequence of mentions in a SeqMentionOps so that they can be serialized together. */
+  def apply(mentions: Seq[Mention]): JSONSerialization = new SeqMentionOps(mentions)
+}
+
+trait BioMentionOps {
+  this: OdinMentionOps =>
+  // Mixed into odin's ops classes (see the self type) to add the bio-specific parts of the JSON.
+  type BioMention = Modifications with Grounding with Display with Context
+
+  // The mention is accessible through the OdinMentionOps which records it as a val.
+  override lazy val documentEquivalenceHash: Int = EquivalenceHashes.get(mention.document)
+  // If the equivalenceHash should ever need the ID of this subclass of MentionOps,
+  // then this stringCode can be used to dynamically get the right value.
+  // override val stringCode = s"org.clulab.odin.$longString"
+  // Route other mentions through this package's MentionOps so Bio and Coref types get their own ops.
+  override def asMentionOps(mention: Mention): OdinMentionOps = MentionOps(mention)
+  // Builds the fields that the Bio*MentionOps classes merge into the JSON of their superclass.
+  def bioJsonAST(bioMention: BioMention): JObject = {
+    ("modifications" -> bioMention.modifications.jsonAST) ~
+    // grounding is optional
+    ("grounding" -> bioMention.grounding.map(_.jsonAST)) ~
+    // context is optional
+    ("context" -> bioMention.contextOpt.map(_.jsonAST)) ~
+    // usually just labels.head...
+    ("displayLabel" -> bioMention.displayLabel)
+  }
+}
+
+class BioTextBoundMentionOps(tb: BioTextBoundMention) extends TextBoundMentionOps(tb) with BioMentionOps {
+  // Type name for this kind of mention; presumably what the deserializer matches on.
+  override def longString: String = BioTextBoundMentionOps.string
+  // Start from the superclass's JSON and merge in the bio-specific fields.
+  override def jsonAST: JValue = super.jsonAST.merge(bioJsonAST(tb))
+}
+
+class BioEventMentionOps(em: BioEventMention) extends EventMentionOps(em) with BioMentionOps {
+  // Type name for this kind of mention; presumably what the deserializer matches on.
+  override def longString: String = BioEventMentionOps.string
+  // Start from the superclass's JSON, then merge in the bio-specific fields and isDirect.
+  override def jsonAST: JValue = super.jsonAST
+    .merge(bioJsonAST(em))
+    .merge(JObject(List(JField("isDirect", em.isDirect))))
+}
+
+class BioRelationMentionOps(rm: BioRelationMention) extends RelationMentionOps(rm) with BioMentionOps {
+  // Type name for this kind of mention; presumably what the deserializer matches on.
+  override def longString: String = BioRelationMentionOps.string
+  // Start from the superclass's JSON and merge in the bio-specific fields.
+  override def jsonAST: JValue = super.jsonAST.merge(bioJsonAST(rm))
+}
+
+trait CorefMentionOps {
+  type CorefMention = Modifications with Grounding with Display with Context with Anaphoric
+  // Builds the fields that coref mentions add to the JSON: their antecedents and sieves.
+  def corefJsonAST(corefMention: CorefMention): JObject = {
+    ("antecedents" -> corefMention.antecedents.jsonAST) ~
+    ("sieves" -> corefMention.sieves.jsonAST)
+  }
+}
+
+class CorefTextBoundMentionOps(tb: CorefTextBoundMention) extends BioTextBoundMentionOps(tb) with CorefMentionOps {
+  // Type name for this kind of mention; presumably what the deserializer matches on.
+  override def longString: String = CorefTextBoundMentionOps.string
+  // Start from the bio superclass's JSON and merge in the coref-specific fields.
+  override def jsonAST: JValue = super.jsonAST.merge(corefJsonAST(tb))
+}
+
+class CorefEventMentionOps(em: CorefEventMention) extends BioEventMentionOps(em) with CorefMentionOps {
+  // Type name for this kind of mention; presumably what the deserializer matches on.
+  override def longString: String = CorefEventMentionOps.string
+  // Start from the bio superclass's JSON and merge in the coref-specific fields.
+  override def jsonAST: JValue = super.jsonAST.merge(corefJsonAST(em))
+}
+
+class CorefRelationMentionOps(rm: CorefRelationMention) extends BioRelationMentionOps(rm) with CorefMentionOps {
+  // Type name for this kind of mention; presumably what the deserializer matches on.
+  override def longString: String = CorefRelationMentionOps.string
+  // Start from the bio superclass's JSON and merge in the coref-specific fields.
+  override def jsonAST: JValue = super.jsonAST.merge(corefJsonAST(rm))
+}
+
+object ReachImplicits {
+
+  implicit class SeqMentionOps(mentions: Seq[Mention]) extends JSONSerialization {
+    // The mentions are sorted with MentionOps.mentionOrdering before being passed to JSONSerializer.
+    def jsonAST: JValue = JSONSerializer.jsonAST(mentions.sorted(MentionOps.mentionOrdering))
+  }
+
+  implicit class ModificationOps(mod: Modification) extends JSONSerialization {
+    def jsonAST: JValue = mod match {
+      case PTM(label, evidenceOp, siteOp, negated) =>
+        ("modification-type" -> "PTM") ~
+        ("label" -> label) ~
+        // evidence is optional
+        ("evidence" -> evidenceOp.map(MentionOps(_).jsonAST)) ~
+        // site is optional
+        ("site" -> siteOp.map(MentionOps(_).jsonAST)) ~
+        ("negated" -> negated)
+      case Mutant(evidence, foundBy) =>
+        ("modification-type" -> "Mutant") ~
+        ("evidence" -> MentionOps(evidence).jsonAST) ~
+        ("foundBy" -> foundBy)
+      case eventSite: EventSite =>
+        // Unlike the other modifications, the mention is stored under "site" rather than "evidence".
+        ("modification-type" -> eventSite.label) ~
+        ("site" -> MentionOps(eventSite.site).jsonAST)
+      case simpleModification: SimpleModification =>
+        ("modification-type" -> simpleModification.label) ~
+        ("evidence" -> MentionOps(simpleModification.mention).jsonAST)
+    }
+  }
+
+  implicit class ModificationsOps(mods: Set[Modification]) extends JSONSerialization {
+    // An empty set becomes JNothing.  Otherwise the modifications are
+    // sorted by label and written as a JSON array.
+    def jsonAST: JValue =
+      if (mods.isEmpty) JNothing
+      else {
+        val byLabel = mods.toSeq.sortBy(_.label).toList
+        JArray(byLabel.map(_.jsonAST))
+      }
+  }
+
+  implicit class KBResolutionOps(kbr: KBResolution) extends JSONSerialization {
+    // Writes the components needed to construct a KBResolution.
+    def jsonAST: JValue = {
+      ("text" -> kbr.text) ~
+      ("namespace" -> kbr.namespace) ~
+      ("id" -> kbr.id) ~
+      ("species" -> kbr.species)
+    }
+  }
+
+  implicit class ContextOps(context: Map[String, Seq[String]]) extends JSONSerialization {
+    // An empty context becomes JNothing.  Otherwise the entries are written as an
+    // object whose fields are sorted by key, each holding an array of strings.
+    def jsonAST: JValue =
+      if (context.isEmpty) JNothing
+      else {
+        val fields = context.toList.sortBy(_._1).map { case (name, values) => name -> JArray(values.toList.map(JString)) }
+        JObject(fields)
+      }
+  }
+
+  implicit class AnaphoricOps(antecedents: Set[Anaphoric]) extends JSONSerialization {
+    // Antecedents are written as an array of mentions, or JNothing if there are none.  They use the
+    // same ordering as SeqMentionOps so that ties under odin's ordering alone are presumably resolved.
+    def jsonAST: JValue = antecedents match {
+      case hasAntecedents if hasAntecedents.nonEmpty =>
+        val corefMentions = hasAntecedents.toSeq.map(_.asInstanceOf[CorefMention])
+        JArray(corefMentions.sorted(MentionOps.mentionOrdering).toList.map(MentionOps(_).jsonAST))
+      case _ => JNothing
+    }
+  }
+
+  implicit class StringSetOps(ss: Set[String]) extends JSONSerialization {
+    // An empty set becomes JNothing.  Otherwise the strings are
+    // sorted and written as a JSON array.
+    def jsonAST: JValue =
+      if (ss.isEmpty) JNothing
+      else {
+        val ordered = ss.toList.sorted
+        JArray(ordered.map(JString))
+      }
+  }
+}
+
+object BioTextBoundMentionOps {
+ val string = "BioTextBoundMention"
+}
+
+object BioEventMentionOps {
+ val string = "BioEventMention"
+}
+
+object BioRelationMentionOps {
+ val string = "BioRelationMention"
+}
+
+object CorefTextBoundMentionOps {
+ val string = "CorefTextBoundMention"
+}
+
+object CorefEventMentionOps {
+ val string = "CorefEventMention"
+}
+
+object CorefRelationMentionOps {
+ val string = "CorefRelationMention"
+}
diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/package.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/package.scala
deleted file mode 100644
index 94d80e84c..000000000
--- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/package.scala
+++ /dev/null
@@ -1,294 +0,0 @@
-package org.clulab.reach.mentions.serialization
-
-import org.clulab.odin
-import org.clulab.odin._
-import org.clulab.odin.serialization.json.{ TextBoundMentionOps, RelationMentionOps, EventMentionOps }
-import org.clulab.odin.serialization.json.{ MentionOps => OdinMentionOps, OdinPathOps }
-import org.clulab.serialization.json.{ JSONSerialization }
-import org.clulab.reach.mentions.serialization.json.{ JSONSerializer => ReachJSONSerializer }
-import org.clulab.reach.mentions._
-import org.clulab.reach.grounding.KBResolution
-import org.json4s._
-import org.json4s.JsonDSL._
-import org.json4s.jackson._
-
-
-package object json {
-
- implicit val formats = org.json4s.DefaultFormats
-
- /** generate the appropriate AST according to Mention type */
- private def mentionToJsonAST(m: Mention): JValue = m match {
- // NOTE: order matters due to inheritance
- case cm: CorefMention => CorefMentionOps(cm).jsonAST
- case bm: BioMention => BioMentionOps(bm).jsonAST
- case m: Mention => OdinMentionOps(m).jsonAST
- }
-
- implicit class MentionJSONOps(m: Mention) extends OdinMentionOps(m) {
-
- /** Without "documents" field **/
- override def jsonAST: JValue = mentionToJsonAST(m)
-
- /** Includes "documents" field for simple deserialization **/
- override def completeAST: JValue = REACHMentionSeq(Seq(m)).jsonAST
-
- }
-
- /** For Seq[BioMention], Seq[CorefMention], etc */
- implicit class REACHMentionSeq(mentions: Seq[Mention]) extends JSONSerialization {
-
- override def jsonAST: JValue = ReachJSONSerializer.jsonAST(mentions)
-
- }
-
- /** generate a json string from the given ast */
- def astToJSON(jsonast: JValue, pretty: Boolean): String = {
- val jsonDoc = renderJValue(jsonast)
- pretty match {
- case true => prettyJson(jsonDoc)
- case false => compactJson(jsonDoc)
- }
- }
-
- /** generate a json string from a mention
- * Note that this is incomplete for deserialization purposes,
- * as only a reference to the Document is included
- * */
- def mentionToJSON(m: Mention, pretty: Boolean): String = astToJSON(mentionToJsonAST(m), pretty)
-
- /** args -> coref representation -> json */
- private def argsAST(arguments: Map[String, Seq[Mention]]): JObject = {
- val args = arguments.map {
- case (name, mentions) => name -> JArray(mentions.map(mentionToJsonAST).toList)
- }
- JObject(args.toList)
- }
-
- /** BioMention -> json */
- implicit class BioMentionOps(m: BioMention) extends JSONSerialization {
-
- override def jsonAST: JValue = m match {
- case tb: BioTextBoundMention => BioTextBoundMentionOps(tb).jsonAST
- case em: BioEventMention => BioEventMentionOps(em).jsonAST
- case rm: BioRelationMention => BioRelationMentionOps(rm).jsonAST
- }
- }
-
- /** CorefMention -> json */
- implicit class CorefMentionOps(m: CorefMention) extends JSONSerialization {
-
- override def jsonAST: JValue = m match {
- case tb: CorefTextBoundMention => CorefTextBoundMentionOps(tb).jsonAST
- case em: CorefEventMention => CorefEventMentionOps(em).jsonAST
- case rm: CorefRelationMention => CorefRelationMentionOps(rm).jsonAST
- }
- }
-
- def pathsAST(paths: Map[String, Map[Mention, odin.SynPath]]): JValue = paths match {
- case gps if gps.nonEmpty => gps.jsonAST
- case _ => JNothing
- }
-
- implicit class BioTextBoundMentionOps(tb: BioTextBoundMention) extends TextBoundMentionOps(tb) {
-
-// override val stringCode = s"org.clulab.odin.${BioTextBoundMention.string}"
-// override def id: String = s"${BioTextBoundMention.shortString}:$equivalenceHash"
-
- override def jsonAST: JValue = {
-
- val ast = TextBoundMentionOps(tb).jsonAST replace
- (List("type"), BioTextBoundMention.string) replace
- (List("id"), tb.id)
-
- ast merge (
- ("modifications" -> tb.modifications.jsonAST) ~
- // grounding is optional
- ("grounding" -> tb.grounding.map(_.jsonAST)) ~
- // context is optional
- ("context" -> tb.contextOpt.map(_.jsonAST)) ~
- // usually just labels.head...
- ("displayLabel" -> tb.displayLabel)
- )
- }
- }
-
- implicit class BioEventMentionOps(em: BioEventMention) extends EventMentionOps(em) {
- override def jsonAST: JValue = {
-
- val ast = EventMentionOps(em).jsonAST replace
- (List("type"), BioEventMention.string) replace
- (List("id"), em.id) replace
- (List("arguments"), argsAST(em.arguments))
-
- ast merge (
- ("modifications" -> em.modifications.jsonAST) ~
- // grounding is optional
- ("grounding" -> em.grounding.map(_.jsonAST)) ~
- // context is optional
- ("context" -> em.contextOpt.map(_.jsonAST)) ~
- // usually just labels.head...
- ("displayLabel" -> em.displayLabel) ~
- ("isDirect" -> em.isDirect)
- )
- }
- }
-
- implicit class BioRelationMentionOps(rm: BioRelationMention) extends RelationMentionOps(rm) {
- override def jsonAST: JValue = {
-
- val ast = RelationMentionOps(rm).jsonAST replace
- (List("type"), BioRelationMention.string) replace
- (List("id"), rm.id) replace
- (List("arguments"), argsAST(rm.arguments))
-
- ast merge (
- ("modifications" -> rm.modifications.jsonAST) ~
- // grounding is optional
- ("grounding" -> rm.grounding.map(_.jsonAST)) ~
- // context is optional
- ("context" -> rm.contextOpt.map(_.jsonAST)) ~
- // usually just labels.head...
- ("displayLabel" -> rm.displayLabel)
- )
- }
- }
-
- implicit class CorefTextBoundMentionOps(tb: CorefTextBoundMention) extends BioTextBoundMentionOps(tb) {
- override def jsonAST: JValue = {
-
- val ast = BioTextBoundMentionOps(tb).jsonAST replace
- (List("type"), CorefTextBoundMention.string) replace
- (List("id"), tb.id)
-
- ast merge (
- ("antecedents" -> tb.antecedents.jsonAST) ~
- ("sieves" -> tb.sieves.jsonAST)
- )
- }
- }
-
- implicit class CorefEventMentionOps(em: CorefEventMention) extends BioEventMentionOps(em) {
- override def jsonAST: JValue = {
-
- val ast = BioEventMentionOps(em).jsonAST replace
- (List("type"), CorefEventMention.string) replace
- (List("id"), em.id) replace
- (List("arguments"), argsAST(em.arguments))
-
- ast merge (
- ("antecedents" -> em.antecedents.jsonAST) ~
- ("sieves" -> em.sieves.jsonAST)
- )
- }
- }
-
- implicit class CorefRelationMentionOps(rm: CorefRelationMention) extends BioRelationMentionOps(rm) {
- override def jsonAST: JValue = {
-
- val ast = BioRelationMentionOps(rm).jsonAST replace
- (List("type"), CorefRelationMention.string) replace
- (List("id"), rm.id) replace
- (List("arguments"), argsAST(rm.arguments))
-
- ast merge (
- ("antecedents" -> rm.antecedents.jsonAST) ~
- ("sieves" -> rm.sieves.jsonAST)
- )
- }
- }
-
- implicit class ModificationOps(mod: Modification) extends JSONSerialization {
- def jsonAST: JValue = mod match {
- case PTM(label, evidenceOp, siteOp, negated) =>
- ("modification-type" -> "PTM") ~
- ("label" -> label) ~
- // evidence is optional
- ("evidence" -> evidenceOp.map(mentionToJsonAST)) ~
- // site is optional
- ("site" -> siteOp.map(mentionToJsonAST)) ~
- ("negated" -> negated)
- case Mutant(evidence, foundBy) =>
- ("modification-type" -> "Mutant") ~
- ("evidence" -> mentionToJsonAST(evidence)) ~
- ("foundBy" -> foundBy)
- case eventSite: EventSite =>
- // This is an exception because of the "site" label.
- ("modification-type" -> eventSite.label) ~
- ("site" -> mentionToJsonAST(eventSite.site))
- case simpleModification: SimpleModification =>
- ("modification-type" -> simpleModification.label) ~
- ("evidence" -> mentionToJsonAST(simpleModification.mention))
- }
- }
-
- implicit class ModificationsOps(mods: Set[Modification]) extends JSONSerialization {
- def jsonAST: JValue = mods match {
- case hasMods if hasMods.nonEmpty => hasMods.map(_.jsonAST).toList
- case _ => JNothing
- }
- }
-
- implicit class KBResolutionOps(kbr: KBResolution) extends JSONSerialization {
- def jsonAST: JValue = {
- // components needed to construct KBResolution
- ("text" -> kbr.text) ~
- ("namespace" -> kbr.namespace) ~
- ("id" -> kbr.id) ~
- ("species" -> kbr.species)
- }
- }
-
- implicit class ContextOps(context: Map[String, Seq[String]]) extends JSONSerialization {
- def jsonAST: JValue = context match {
- case hasContext if hasContext.nonEmpty => hasContext
- case _ => JNothing
- }
- }
-
- implicit class AnaphoricOps(antecedents: Set[Anaphoric]) extends JSONSerialization {
- def jsonAST: JValue = antecedents match {
- case hasAntecedents if hasAntecedents.nonEmpty => hasAntecedents.map(m => m.asInstanceOf[CorefMention].jsonAST)
- case _ => JNothing
- }
- }
-
- implicit class StringSetOps(ss: Set[String]) extends JSONSerialization {
- def jsonAST: JValue = ss match {
- case contents if contents.nonEmpty => contents
- case _ => JNothing
- }
- }
-
- def prettify(json: JValue): String = prettyJson(renderJValue(json))
-
- object BioTextBoundMention {
- val string = "BioTextBoundMention"
- val shortString = "T"
- }
-
- object BioEventMention {
- val string = "BioEventMention"
- val shortString = "E"
- }
-
- object BioRelationMention {
- val string = "BioRelationMention"
- val shortString = "R"
- }
-
- object CorefTextBoundMention {
- val string = "CorefTextBoundMention"
- val shortString = "T"
- }
-
- object CorefEventMention {
- val string = "CorefEventMention"
- val shortString = "E"
- }
-
- object CorefRelationMention {
- val string = "CorefRelationMention"
- val shortString = "R"
- }
-}
diff --git a/main/src/test/scala/org/clulab/reach/TestModifications.scala b/main/src/test/scala/org/clulab/reach/TestModifications.scala
index 6001fc563..0bfc88665 100644
--- a/main/src/test/scala/org/clulab/reach/TestModifications.scala
+++ b/main/src/test/scala/org/clulab/reach/TestModifications.scala
@@ -4,7 +4,6 @@ import org.clulab.reach.mentions._
import org.scalatest._
import TestUtils._
import org.clulab.reach.mentions.serialization.json.JSONSerializer
-import org.clulab.reach.mentions.serialization.json.prettify
/**
* Unit tests to ensure PTM rules are matching correctly
@@ -1196,7 +1195,7 @@ class TestModifications extends FlatSpec with Matchers {
koTriggers1 should not be ('empty)
val jValue = JSONSerializer.jsonAST(bioMentions1)
- val json = prettify(jValue)
+ val json = JSONSerializer.prettify(jValue)
json should include ("KOtrigger")
diff --git a/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala b/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala
new file mode 100644
index 000000000..8dd5a0560
--- /dev/null
+++ b/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala
@@ -0,0 +1,51 @@
+package org.clulab.reach
+
+import org.clulab.utils.Closer.AutoCloser
+import org.clulab.utils.Sink
+import org.json4s.{JArray, JObject, JValue}
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods
+import org.scalatest.{FlatSpec, Matchers}
+
+import java.io.{File, PrintWriter}
+import java.nio.charset.StandardCharsets
+
+class TestObjectWriter extends FlatSpec with Matchers {
+
+  // Pretty-prints jValue to a file with the given name and checks that the file
+  // exists.  The file is deleted afterwards, even if writing or the check fails.
+  private def writeAndCheck(fileName: String, jValue: JValue): Unit = {
+    val renderedJValue = JsonMethods.render(jValue)
+    val objectWriter = JsonMethods.mapper.writerWithDefaultPrettyPrinter()
+    val file = new File(fileName)
+
+    try {
+      val printWriter = new PrintWriter(new Sink(file, StandardCharsets.UTF_8.name, append = false))
+      printWriter.autoClose { printWriter =>
+        objectWriter.writeValue(printWriter, renderedJValue)
+      }
+      file.exists should be (true)
+    }
+    finally {
+      file.delete()
+    }
+  }
+
+  behavior of "ObjectWriter"
+
+  it should "produce a small file" in {
+    val json = """{ "boolean": true, "integer": 42, "float": 3.14, "string": "Hello, world!" }"""
+
+    writeAndCheck("SmallTestObjectWriter.json", JsonMethods.parse(json))
+  }
+
+  // This will produce a file of about 8GB!
+  ignore should "produce a very large file" in {
+    val jObject =
+      ("string" -> "The quick brown fox jumped over the lazy dog.") ~
+      ("number" -> 42)
+    val list = 1.to(100000000).map(_ => jObject).toList
+
+    writeAndCheck("LargeTestObjectWriter.json", JArray(list))
+  }
+}
diff --git a/main/src/test/scala/org/clulab/reach/mentions/serialization/TestJSONSerializer.scala b/main/src/test/scala/org/clulab/reach/mentions/serialization/TestJSONSerializer.scala
index 1055eec70..b40aafb1f 100644
--- a/main/src/test/scala/org/clulab/reach/mentions/serialization/TestJSONSerializer.scala
+++ b/main/src/test/scala/org/clulab/reach/mentions/serialization/TestJSONSerializer.scala
@@ -1,22 +1,22 @@
package org.clulab.reach.mentions.serialization
-import org.scalatest._
-import scala.util.Try // do not remove: needed for debugging
import org.clulab.reach.TestUtils._
-import org.clulab.reach.mentions.{MentionOps => MOps}
-import org.clulab.reach.mentions.serialization.json._
+import org.clulab.reach.mentions.serialization.json.JSONSerializer
+import org.clulab.reach.mentions.serialization.json.{MentionOps, MentionsOps}
+import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps}
import org.json4s._
import org.json4s.jackson.JsonMethods._
-
+import org.scalatest._
class TestJSONSerializer extends FlatSpec with Matchers {
+ implicit val formats = org.json4s.DefaultFormats
val text = "Phosphorylated MEK activates K-RAS."
val mentions = getMentionsFromText(text)
val corefmentions = mentions.map(_.toCorefMention)
- val corefJAST = corefmentions.jsonAST
+ val corefJAST = MentionsOps(corefmentions).jsonAST
val biomentions = mentions.map(_.toBioMention)
- val bioJAST = biomentions.jsonAST
+ val bioJAST = MentionsOps(biomentions).jsonAST
// printMentions(Try(biomentions), true) // DEBUGGING
@@ -36,7 +36,7 @@ class TestJSONSerializer extends FlatSpec with Matchers {
it should "serialize/deserialize a BioMention to/from json correctly " in {
val Seq(m) = biomentions.filter(m => (m matches "Entity") && (m.text == "MEK"))
- val mns = JSONSerializer.toBioMentions(m.completeAST)
+ val mns = JSONSerializer.toBioMentions(MentionsOps(Seq(m)).jsonAST)
mns should have size (1)
val Seq(deserializedm) = mns
m.document.equivalenceHash should equal (deserializedm.document.equivalenceHash)
@@ -56,7 +56,7 @@ class TestJSONSerializer extends FlatSpec with Matchers {
// coref mention tests
it should "serialize a Seq[CorefMention] to json correctly" in {
- val mentions2 = JSONSerializer.toCorefMentions(corefmentions.jsonAST)
+ val mentions2 = JSONSerializer.toCorefMentions(MentionsOps(corefmentions).jsonAST)
mentions2 should have size corefmentions.size
mentions2.map(_.label) should equal (corefmentions.map(_.label))
mentions2.map(_.document.equivalenceHash) should equal (corefmentions.map(_.document.equivalenceHash))
@@ -71,7 +71,7 @@ class TestJSONSerializer extends FlatSpec with Matchers {
it should "serialize/deserialize a CorefMention to/from json correctly " in {
val Seq(cm) = corefmentions.filter(m => (m matches "Entity") && (m.text == "MEK"))
- val mns = JSONSerializer.toCorefMentions(cm.completeAST)
+ val mns = JSONSerializer.toCorefMentions(MentionsOps(Seq(cm)).jsonAST)
mns should have size 1
val Seq(m) = mns
m.document.equivalenceHash should equal (cm.document.equivalenceHash)
@@ -80,29 +80,33 @@ class TestJSONSerializer extends FlatSpec with Matchers {
}
it should "produce identical json for a Seq[CorefMention] before and after serialization/deserialization" in {
- corefmentions.json(true) == JSONSerializer.toCorefMentions(corefmentions.jsonAST).json(true)
+ val json = MentionsOps(corefmentions).json(pretty = true)
+ val rejson = MentionsOps(JSONSerializer.toCorefMentions(MentionsOps(corefmentions).jsonAST)).json(pretty = true)
+ rejson should equal (json) // This was not checked before and had been failing!
}
s"json for '$text'" should "be identical before and after serialization/deserialzation" in {
val mekmns = corefmentions.filter(_.text == "MEK")
mekmns should have size 1
val mek = mekmns.head
- mek.json(true) should equal (JSONSerializer.toCorefMentions(Seq(mek).jsonAST).head.json(true))
+ MentionOps(mek).json(pretty = true) should equal (
+ MentionOps(JSONSerializer.toCorefMentions(MentionsOps(Seq(mek)).jsonAST).head).json(pretty = true)
+ )
}
it should "contain a modification with \"modification-type\" PTM" in {
val mekmns = corefmentions.filter(_.text == "MEK")
mekmns should have size 1
val mek = mekmns.head
- (mek.jsonAST \ "modifications" \\ "modification-type").extract[String] should equal ("PTM")
+ (MentionOps(mek).jsonAST \ "modifications" \\ "modification-type").extract[String] should equal ("PTM")
}
it should "still contain a PTM after serialization/deserialization" in {
- val deserializedCorefMentions = JSONSerializer.toCorefMentions(corefmentions.jsonAST)
+ val deserializedCorefMentions = JSONSerializer.toCorefMentions(MentionsOps(corefmentions).jsonAST)
val mekmns = deserializedCorefMentions.filter(_.text == "MEK")
mekmns should have size 1
val mek = mekmns.head
- (mek.jsonAST \ "modifications" \\ "modification-type").extract[String] should equal ("PTM")
+ (MentionOps(mek).jsonAST \ "modifications" \\ "modification-type").extract[String] should equal ("PTM")
}
val text2 = "MEK activates K-RAS."
@@ -110,6 +114,6 @@ class TestJSONSerializer extends FlatSpec with Matchers {
val mekmns = getCorefmentionsFromText(text2).filter(_.text == "MEK")
mekmns should have size 1
val mek = mekmns.head
- (mek.jsonAST \ "modifications") should equal(JNothing)
+ (MentionOps(mek).jsonAST \ "modifications") should equal(JNothing)
}
}
diff --git a/processors/build.sbt b/processors/build.sbt
index d7f8f8e0d..80d04c258 100644
--- a/processors/build.sbt
+++ b/processors/build.sbt
@@ -5,7 +5,7 @@ resolvers += "clulab" at "https://artifactory.clulab.org/artifactory/sbt-release
libraryDependencies ++= {
- val procVer = "8.5.3"
+ val procVer = "8.5.4-SNAPSHOT"
Seq(
"com.typesafe" % "config" % "1.3.1",
diff --git a/processors/src/main/scala/org/clulab/processors/bionlp/ner/ReachStandardKbSource.scala b/processors/src/main/scala/org/clulab/processors/bionlp/ner/ReachStandardKbSource.scala
index faea5d422..61579c3d9 100644
--- a/processors/src/main/scala/org/clulab/processors/bionlp/ner/ReachStandardKbSource.scala
+++ b/processors/src/main/scala/org/clulab/processors/bionlp/ner/ReachStandardKbSource.scala
@@ -7,12 +7,12 @@ import org.clulab.processors.bionlp.ner.KBGenerator.tokenizeResourceLine
import org.clulab.processors.clu.tokenizer.Tokenizer
import org.clulab.sequences.StandardKbSource
import org.clulab.utils.Files
-import org.clulab.utils.Serializer
import java.io.File
import java.util.function.Consumer
import scala.language.reflectiveCalls // required to access consumer.lineCount
import scala.util.Try
+import scala.util.Using
abstract class ReachStandardKbSource(caseInsensitiveMatching: Boolean) extends StandardKbSource(caseInsensitiveMatching)
@@ -54,7 +54,7 @@ class ReachSingleStandardKbSource(kbEntry: KBEntry, caseInsensitiveMatching: Boo
)
)
- Serializer.using(bufferedReader) { bufferedReader =>
+ Using.resource(bufferedReader) { bufferedReader =>
bufferedReader.lines.forEach(consumer)
}
logger.info(s"Done. Read ${consumer.lineCount} lines from ${new File(kbEntry.path).getName}")
diff --git a/src/main/scala/org/clulab/reach/ReachCLI.scala b/src/main/scala/org/clulab/reach/ReachCLI.scala
index 162d3d750..c1a85cdc5 100644
--- a/src/main/scala/org/clulab/reach/ReachCLI.scala
+++ b/src/main/scala/org/clulab/reach/ReachCLI.scala
@@ -11,6 +11,8 @@ import java.nio.charset.Charset
import java.nio.charset.StandardCharsets.UTF_8
import ai.lum.common.FileUtils._
import ai.lum.common.ConfigUtils._
+import org.clulab.reach.mentions.serialization.json.EquivalenceHashes
+import org.clulab.utils.Timer
//import jline.internal.InputStreamReader
import org.clulab.odin._
import org.clulab.processors.Document
@@ -116,21 +118,51 @@ class ReachCLI (
logger.debug(s" ${ durationToS(startNS, System.nanoTime) }s: $paperId: finished reading")
+    // The mentions come from a single file, so they should all have originated
+    // with the same single document.  Verify this and then make sure the document's
+    // equivalence hash is added to the cache before f runs and removed afterwards.
+    def withDocument[T](mentions: Seq[Mention])(f: => T): T = {
+      val documentOpt = mentions.headOption.map { head =>
+        val document = head.document
+
+        require(mentions.forall(_.document.eq(document)), s"Mentions of ${file.getName} must share one document.")
+        document
+      }
+
+      try {
+        // Called only for its effect on EquivalenceHashes; the returned hash is not used here.
+        documentOpt.foreach(EquivalenceHashes.get)
+
+        val timer = new Timer("ReachCLI")
+        val result = timer.time {
+          f
+        }
+        println(s"Time to output ${file.getName} is ${timer.elapsedToString()}.")
+        result
+      }
+      finally {
+        // Runs even when f throws, so the document's entry never outlives this call.
+        documentOpt.foreach(EquivalenceHashes.remove)
+      }
+    }
+
// generate outputs
// NOTE: Assembly can't be run before calling this method without additional refactoring,
// as different output formats apply different filters before running assembly
- val errorCount = outputFormats
- .map { outputFormat =>
- try {
- outputMentions(mentions, entry, paperId, startTime, outputDir, outputFormat, withAssembly)
- 0
- }
- catch {
- case throwable: Throwable =>
- reportException(file, throwable)
- 1
- }
- }.sum
+ val errorCount = withDocument(mentions) {
+ outputFormats
+ .map { outputFormat =>
+ try {
+ outputMentions(mentions, entry, paperId, startTime, outputDir, outputFormat, withAssembly)
+ 0
+ }
+ catch {
+ case throwable: Throwable =>
+ reportException(file, throwable)
+ 1
+ }
+ }.sum
+ }
// elapsed time: processing + writing output
val endTime = ReachCLI.now