From d6aee3b2a53d6911e0244ec671ae33b2fb7acc48 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 9 Mar 2023 13:51:30 -0700 Subject: [PATCH 01/12] Consolidate JSON changes Remove local withDocument Remove println Put withDocument here instead --- .../clulab/reach/assembly/RunAssembly.scala | 10 +- .../assembly/relations/corpus/Corpus.scala | 37 ++- .../relations/corpus/CorpusBuilder.scala | 4 +- .../assembly/relations/corpus/package.scala | 8 +- .../clulab/reach/export/JsonOutputter.scala | 4 +- .../export/indexcards/IndexCardOutput.scala | 10 +- .../export/serial/SerialJsonOutput.scala | 18 +- .../export/server/FileProcessorWebUI.scala | 6 +- .../ml/data/PolarityDatasetPreprocessor.scala | 7 +- .../org/clulab/reach/darpa/DarpaActions.scala | 1 - .../org/clulab/reach/mentions/package.scala | 10 +- .../json/EquivalenceHashes.scala | 24 ++ .../serialization/json/JSONSerializer.scala | 66 ++-- .../serialization/json/MentionOps.scala | 209 +++++++++++++ .../mentions/serialization/json/package.scala | 294 ------------------ .../org/clulab/reach/TestModifications.scala | 3 +- .../serialization/TestJSONSerializer.scala | 36 ++- processors/build.sbt | 2 +- .../bionlp/ner/ReachStandardKbSource.scala | 4 +- .../scala/org/clulab/reach/ReachCLI.scala | 56 +++- 20 files changed, 393 insertions(+), 416 deletions(-) create mode 100644 main/src/main/scala/org/clulab/reach/mentions/serialization/json/EquivalenceHashes.scala create mode 100644 main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala delete mode 100644 main/src/main/scala/org/clulab/reach/mentions/serialization/json/package.scala diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/RunAssembly.scala b/assembly/src/main/scala/org/clulab/reach/assembly/RunAssembly.scala index 643a5e432..0bdaf988e 100644 --- a/assembly/src/main/scala/org/clulab/reach/assembly/RunAssembly.scala +++ b/assembly/src/main/scala/org/clulab/reach/assembly/RunAssembly.scala @@ -4,8 +4,8 @@ import 
org.apache.commons.io.FilenameUtils import org.clulab.reach.assembly.relations.corpus.{CorpusReader, EventPair} import org.clulab.odin.Mention import org.clulab.reach.PaperReader -import org.clulab.reach.mentions._ -import org.clulab.reach.mentions.serialization.json._ +import org.clulab.reach.mentions.{CorefMention, MentionOps => ImplicitMentionOps} +import org.clulab.reach.mentions.serialization.json.MentionsOps import org.clulab.utils.Serializer import com.typesafe.config.ConfigFactory @@ -127,9 +127,6 @@ object RunAnnotationEval extends App with LazyLogging { * Serialize each paper in a directory to json */ object SerializePapersToJSON extends App with LazyLogging { - - import org.clulab.reach.mentions.serialization.json._ - val config = ConfigFactory.load() val papersDir = new File(config.getString("papersDir")) val outDir = new File(config.getString("outDir")) @@ -150,7 +147,8 @@ object SerializePapersToJSON extends App with LazyLogging { val mentions = PaperReader.getMentionsFromPaper(paper) val cms: Seq[CorefMention] = mentions.map(_.toCorefMention) logger.info(s"extracted ${mentions.size} mentions for $paperID") - cms.saveJSON(outFile, pretty = true) + + MentionsOps(cms).saveJSON(outFile, pretty = true) logger.info(s"saved json to $outFile") } } \ No newline at end of file diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/Corpus.scala b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/Corpus.scala index 79c90473c..f0b1712c9 100644 --- a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/Corpus.scala +++ b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/Corpus.scala @@ -4,15 +4,17 @@ import org.clulab.processors.Document import org.clulab.reach.assembly.relations.classifier.AssemblyRelationClassifier import org.clulab.reach.assembly.sieves.Constraints import org.clulab.reach.mentions.CorefMention -import org.clulab.reach.mentions.serialization.json.{MentionJSONOps, 
REACHMentionSeq, JSONSerializer} +import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionOps, MentionsOps} import org.clulab.serialization.json.JSONSerialization import org.json4s.jackson.JsonMethods._ import org.json4s.JsonDSL._ import org.json4s._ + import scala.util.hashing.MurmurHash3._ import com.typesafe.scalalogging.LazyLogging import org.apache.commons.io.FileUtils.forceMkdir import ai.lum.common.FileUtils._ + import java.io.File @@ -44,8 +46,8 @@ case class EventPair( // the seed (not counted in the length of finalizeHash) val h0 = stringHash("org.clulab.assembly.TrainingInstance") // get hashes for each event - val h1 = mix(h0, e1.equivalenceHash) - val h2 = mix(h1, e2.equivalenceHash) + val h1 = mix(h0, MentionOps(e1).equivalenceHash) + val h2 = mix(h1, MentionOps(e2).equivalenceHash) // is it cross-sentence? val h3 = mix(h2, isCrossSentence.hashCode) // the text of the sentences containing the two event mentions @@ -66,34 +68,37 @@ case class EventPair( def jsonAST: JValue = { + val e1EventOps = new EventOps(e1) + val e2EventOps = new EventOps(e2) + // build json ("id" -> this.equivalenceHash) ~ ("text" -> this.text) ~ ("coref" -> this.coref) ~ // event 1 - ("e1-id" -> this.e1.id) ~ - ("e1-label" -> this.e1.eventLabel) ~ - ("e1-sentence-text" -> this.e1.sentenceText) ~ + ("e1-id" -> MentionOps(this.e1).id) ~ + ("e1-label" -> e1EventOps.eventLabel) ~ + ("e1-sentence-text" -> e1EventOps.sentenceText) ~ ("e1-sentence-index" -> this.e1.sentence) ~ ("e1-sentence-tokens" -> this.e1.sentenceObj.words.toList) ~ // can be used to highlight event span in annotation UI ("e1-start" -> this.e1.start) ~ ("e1-end" -> this.e1.end) ~ - ("e1-trigger" -> this.e1.trigger.text) ~ - ("e1-trigger-start" -> this.e1.trigger.start) ~ - ("e1-trigger-end" -> this.e1.trigger.end) ~ + ("e1-trigger" -> e1EventOps.trigger.text) ~ + ("e1-trigger-start" -> e1EventOps.trigger.start) ~ + ("e1-trigger-end" -> e1EventOps.trigger.end) ~ // event 2 - ("e2-id" -> 
this.e2.id) ~ - ("e2-label" -> this.e2.eventLabel) ~ - ("e2-sentence-text" -> this.e2.sentenceText) ~ + ("e2-id" -> MentionOps(this.e2).id) ~ + ("e2-label" -> e2EventOps.eventLabel) ~ + ("e2-sentence-text" -> e2EventOps.sentenceText) ~ ("e2-sentence-index" -> this.e2.sentence) ~ ("e2-sentence-tokens" -> this.e2.sentenceObj.words.toList) ~ // can be used to highlight event span in annotation UI ("e2-start" -> this.e2.start) ~ ("e2-end" -> this.e2.end) ~ - ("e2-trigger" -> this.e2.trigger.text) ~ - ("e2-trigger-start" -> this.e2.trigger.start) ~ - ("e2-trigger-end" -> this.e2.trigger.end) ~ + ("e2-trigger" -> e2EventOps.trigger.text) ~ + ("e2-trigger-start" -> e2EventOps.trigger.start) ~ + ("e2-trigger-end" -> e2EventOps.trigger.end) ~ // these will be filled out during annotation ("annotator-id" -> this.annotatorID) ~ ("relation" -> this.relation) ~ @@ -155,7 +160,7 @@ case class Corpus(instances: Seq[EventPair]) extends JSONSerialization { // for each doc, write doc + mentions to a json file for ((paperID, cms) <- dmLUT) { val of = new File(mentionDataDir, s"$paperID-mention-data.json") - of.writeString(cms.json(pretty), java.nio.charset.StandardCharsets.UTF_8) + of.writeString(MentionsOps(cms).json(pretty), java.nio.charset.StandardCharsets.UTF_8) } // write event pair info to json file val epf = new File(corpusDir, s"${Corpus.EVENT_PAIRS}.json") diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/CorpusBuilder.scala b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/CorpusBuilder.scala index 1e687632a..593159cbd 100644 --- a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/CorpusBuilder.scala +++ b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/CorpusBuilder.scala @@ -124,7 +124,7 @@ object CorpusBuilder extends LazyLogging { // create training instance ep = EventPair(Set(m1, m2)) // triggers should not be the same - if ep.e1.trigger != ep.e2.trigger + if new 
EventOps(ep.e1).trigger != new EventOps(ep.e2).trigger } yield ep distinctEventPairs(eps.toSeq) @@ -133,7 +133,7 @@ object CorpusBuilder extends LazyLogging { def distinctEventPairs(eps: Seq[EventPair]): Seq[EventPair] = { eps.distinct.groupBy(ep => // distinct by... - (ep.e1.sentence, ep.e2.trigger, ep.e1.label, ep.e1.text, ep.e2.sentence, ep.e2.trigger, ep.e2.label, ep.e2.text) + (ep.e1.sentence, new EventOps(ep.e2).trigger, ep.e1.label, ep.e1.text, ep.e2.sentence, new EventOps(ep.e2).trigger, ep.e2.label, ep.e2.text) ).values.map(_.head) // get one value for each key .toSeq .sortBy{ ep => (ep.doc.id.getOrElse(""), ep.sentenceIndices.head) } diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/package.scala b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/package.scala index 9d14ebfcd..9dda156a4 100644 --- a/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/package.scala +++ b/assembly/src/main/scala/org/clulab/reach/assembly/relations/corpus/package.scala @@ -2,8 +2,8 @@ package org.clulab.reach.assembly.relations import org.clulab.odin.Mention import org.clulab.reach.assembly.sieves.SieveUtils -import org.clulab.reach.mentions.serialization.json.{ CorefMentionOps, JSONSerializer => ReachJsonSerializer } -import org.clulab.reach.mentions._ +import org.clulab.reach.mentions.CorefMention +import org.clulab.reach.mentions.serialization.json.JSONSerializer import com.typesafe.scalalogging.LazyLogging import scala.collection.GenSeq import java.io.File @@ -12,7 +12,7 @@ import java.io.File package object corpus extends LazyLogging { /** Additional attributes and methods for a [[CorefMention]] */ - implicit class EventOps(mention: CorefMention) extends CorefMentionOps(mention) { + class EventOps(mention: CorefMention) { val eventLabel: String = mention.label val sentenceText: String = mention.sentenceObj.getSentenceText // NOTE: if mention is a TB, trigger will simply be the mention (ex. 
BioProcess) @@ -29,7 +29,7 @@ package object corpus extends LazyLogging { def datasetLUT(jsonFiles: GenSeq[File]): Map[String, Vector[CorefMention]] = { val docMentionPairs = jsonFiles.filter(_.getName.endsWith(".json")).map{ f: File => logger.debug(s"parsing ${f.getName}") - val cms: Vector[CorefMention] = ReachJsonSerializer.toCorefMentions(f).toVector + val cms: Vector[CorefMention] = JSONSerializer.toCorefMentions(f).toVector if (cms.nonEmpty) logger.debug(s"successfully parsed ${f.getName}") val paperID = getPMID(cms.head) paperID -> cms diff --git a/export/src/main/scala/org/clulab/reach/export/JsonOutputter.scala b/export/src/main/scala/org/clulab/reach/export/JsonOutputter.scala index d590a9f7f..5d3fb7dbe 100644 --- a/export/src/main/scala/org/clulab/reach/export/JsonOutputter.scala +++ b/export/src/main/scala/org/clulab/reach/export/JsonOutputter.scala @@ -8,7 +8,7 @@ import com.typesafe.scalalogging.Logger import org.clulab.odin.Mention import org.clulab.reach.FriesEntry import org.clulab.reach.ReachConstants._ -import org.clulab.odin.serialization.json._ +import org.clulab.odin.serialization.json.MentionOps import org.json4s.jackson.Serialization import org.slf4j.LoggerFactory @@ -154,7 +154,7 @@ object JsonOutputter { else { // "Gene_or_gene_product" is another possibility. 
// Also "Family", "Disease", "Simple_chemical" - val json = mention.json(pretty = true) + val json = MentionOps(mention).json(pretty = true) val message = s"""Unknown event type "$label" in event:\n$json""" // throw new RuntimeException(message) logger.warn(message) diff --git a/export/src/main/scala/org/clulab/reach/export/indexcards/IndexCardOutput.scala b/export/src/main/scala/org/clulab/reach/export/indexcards/IndexCardOutput.scala index 50085275d..67bcb3345 100644 --- a/export/src/main/scala/org/clulab/reach/export/indexcards/IndexCardOutput.scala +++ b/export/src/main/scala/org/clulab/reach/export/indexcards/IndexCardOutput.scala @@ -3,10 +3,8 @@ package org.clulab.reach.export.indexcards import java.io.File import java.util.Date import java.util.regex.Pattern - import scala.collection.mutable import scala.collection.mutable.ListBuffer - import com.typesafe.scalalogging.LazyLogging import org.clulab.odin.Mention import org.clulab.reach.ReachConstants._ @@ -14,8 +12,8 @@ import org.clulab.reach.{FriesEntry, display} import org.clulab.reach.export.JsonOutputter._ import org.clulab.reach.export.{JsonOutputter, OutputDegrader} import org.clulab.reach.grounding.KBResolution -import org.clulab.reach.mentions._ -import org.clulab.reach.mentions.serialization.json.mentionToJSON +import org.clulab.reach.mentions.{BioEventMention, CorefMention, Mutant, PTM, MentionOps => ImplicitMentionOps} +import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionOps} import org.clulab.reach.utils.MentionManager import IndexCardOutput._ @@ -173,7 +171,7 @@ class IndexCardOutput extends JsonOutputter with LazyLogging { case "amount" => mkSimpleEventIndexCard(mention, mention.label) case _ => // "conversion" is one example of an eventType not handled. 
- val json = mentionToJSON(mention, pretty = true) + val json = MentionOps(mention).json(pretty = true) val message = s"""Event type "$eventType" is not supported for indexcard output:\n$json""" // throw new RuntimeException(message) logger.warn(message) @@ -211,7 +209,7 @@ class IndexCardOutput extends JsonOutputter with LazyLogging { case "complex" => Some(new PropMapOrFrameList(mkComplexArgument(derefArg))) // FrameList case _ => { // "event" is a typical culprit. - val json = mentionToJSON(arg, pretty = true) + val json = MentionOps(arg).json(pretty = true) val message = s"""Argument type "$argType" is not supported for indexcard output:\n$json""" logger.warn(message) None diff --git a/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala b/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala index d285c2a04..8ad0eb29f 100644 --- a/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala +++ b/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala @@ -2,20 +2,15 @@ package org.clulab.reach.export.serial import java.io.File import java.util.Date -import java.util.regex.Pattern - import java.nio.charset.Charset import java.nio.charset.StandardCharsets.UTF_8 - import ai.lum.common.FileUtils._ - import com.typesafe.scalalogging.LazyLogging - import org.clulab.odin.Mention import org.clulab.reach.FriesEntry import org.clulab.reach.export.JsonOutputter -import org.clulab.reach.mentions._ -import org.clulab.reach.mentions.serialization.json._ +import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps} +import org.clulab.reach.mentions.serialization.json.MentionsOps /** * Defines classes and methods used to output the serial-json output format. 
@@ -39,7 +34,7 @@ class SerialJsonOutput ( outFilePrefix:String ): String = { val mentions = allMentions.map(_.toCorefMention) - mentions.json(true) // true = pretty print + MentionsOps(mentions).json(pretty = true) } /** @@ -58,11 +53,10 @@ class SerialJsonOutput ( val mentions = allMentions.map(_.toCorefMention) f.writeString( - string = mentions.json(true), - charset = encoding, - append = false, + string = MentionsOps(mentions).json(true), + charset = encoding, + append = false, gzipSupport = false ) } - } diff --git a/export/src/main/scala/org/clulab/reach/export/server/FileProcessorWebUI.scala b/export/src/main/scala/org/clulab/reach/export/server/FileProcessorWebUI.scala index bdac4b6cd..79bbd1a22 100644 --- a/export/src/main/scala/org/clulab/reach/export/server/FileProcessorWebUI.scala +++ b/export/src/main/scala/org/clulab/reach/export/server/FileProcessorWebUI.scala @@ -19,8 +19,8 @@ import akka.stream.{ActorMaterializer, Materializer} import akka.stream.scaladsl._ import akka.util.ByteString -import org.clulab.reach.mentions._ -import org.clulab.reach.mentions.serialization.json._ +import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps} +import org.clulab.reach.mentions.serialization.json.MentionsOps import org.clulab.reach.PaperReader @@ -102,7 +102,7 @@ object FileProcessorWebUI extends App with FileUpload { def processFile(tempFile: File, outputType: String): String = { val cms = PaperReader.getMentionsFromPaper(tempFile).map(_.toCorefMention) outputType match { - case JSON => cms.json(false) + case JSON => MentionsOps(cms).json(false) } } diff --git a/main/src/main/scala/org/clulab/polarity/ml/data/PolarityDatasetPreprocessor.scala b/main/src/main/scala/org/clulab/polarity/ml/data/PolarityDatasetPreprocessor.scala index eea90bd91..d4ee98cbd 100644 --- a/main/src/main/scala/org/clulab/polarity/ml/data/PolarityDatasetPreprocessor.scala +++ b/main/src/main/scala/org/clulab/polarity/ml/data/PolarityDatasetPreprocessor.scala @@ -2,7 +2,6 @@ 
package org.clulab.polarity.ml.data import java.io.PrintWriter import java.util.{Calendar, Date} - import com.typesafe.scalalogging.LazyLogging import org.clulab.polarity.{NegativePolarity, Polarity, PositivePolarity} import org.clulab.reach.{PaperReader, ReachSystem} @@ -11,8 +10,8 @@ import org.clulab.reach.mentions.{BioEventMention, BioMention, CorefEventMention import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.util.{Failure, Success, Try} -import org.clulab.reach.mentions.serialization.json._ -import org.clulab.reach.mentions.{MentionOps => MOps} +import org.clulab.reach.mentions.serialization.json.{JSONSerializer, MentionsOps} +import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps} import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ import org.json4s._ @@ -119,7 +118,7 @@ object PolarityDatasetPreprocessor extends App with LazyLogging{ def saveOutput(digestedData: Seq[(BioEventMention, Polarity)], outputPath: String): Unit = { val (evts, labels) = digestedData.unzip - val jsonEvts = evts.jsonAST + val jsonEvts = MentionsOps(evts).jsonAST val json = diff --git a/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala b/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala index ef215fdba..6994f18bc 100644 --- a/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala +++ b/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala @@ -5,7 +5,6 @@ import org.clulab.odin._ import org.clulab.polarity.PolarityEngine import org.clulab.reach._ import org.clulab.reach.mentions._ -import org.clulab.reach.mentions.serialization.json.BioTextBoundMention import org.clulab.struct.DirectedGraph import scala.annotation.tailrec diff --git a/main/src/main/scala/org/clulab/reach/mentions/package.scala b/main/src/main/scala/org/clulab/reach/mentions/package.scala index c643a3043..ad14edc28 100644 --- a/main/src/main/scala/org/clulab/reach/mentions/package.scala +++ 
b/main/src/main/scala/org/clulab/reach/mentions/package.scala @@ -25,9 +25,9 @@ package object mentions { case m: EventMention => new BioEventMention( m.labels, - m.trigger, + m.trigger.toBioMention.asInstanceOf[BioTextBoundMention], convertArguments(m.arguments), - m.paths, + m.paths, // TODO: these paths should be converted as well m.sentence, m.document, m.keep, @@ -37,7 +37,7 @@ package object mentions { new BioRelationMention( m.labels, convertArguments(m.arguments), - m.paths, + m.paths, // TODO: these paths should be converted as well m.sentence, m.document, m.keep, @@ -62,9 +62,9 @@ package object mentions { case m: BioEventMention => { val ev = new CorefEventMention( m.labels, - m.trigger, + m.trigger.toCorefMention.asInstanceOf[CorefTextBoundMention], corefArguments(m.arguments), - m.paths, + m.paths, // TODO: these paths should be converted as well m.sentence, m.document, m.keep, diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/EquivalenceHashes.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/EquivalenceHashes.scala new file mode 100644 index 000000000..a73e9846a --- /dev/null +++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/EquivalenceHashes.scala @@ -0,0 +1,24 @@ +package org.clulab.reach.mentions.serialization.json + +import org.clulab.processors.Document + +import java.util + +object EquivalenceHashes { + protected val equivalenceHashes: util.IdentityHashMap[Document, Int] = new util.IdentityHashMap[Document, Int]() + + def get(document: Document): Int = synchronized { + if (equivalenceHashes.containsKey(document)) + equivalenceHashes.get(document) + else { + val equivalencyHash = document.equivalenceHash + + equivalenceHashes.put(document, equivalencyHash) + equivalencyHash + } + } + + def remove(document: Document): Option[Int] = synchronized { + Option(equivalenceHashes.remove(document)) + } +} diff --git 
a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala index 55d541336..e79194168 100644 --- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala +++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala @@ -1,23 +1,33 @@ package org.clulab.reach.mentions.serialization.json -import org.clulab.serialization.json.DocOps -import org.clulab.odin.serialization.json.JSONSerializer._ -import org.clulab.odin.serialization.json.{ MentionOps => OdinMentionOps } +import com.typesafe.scalalogging.LazyLogging import org.clulab.odin import org.clulab.odin._ +import org.clulab.odin.serialization.json.{JSONSerializer => OdinJSONSerializer} +import org.clulab.processors.Document import org.clulab.reach.grounding.KBResolution -import org.clulab.reach.mentions._ +import org.clulab.reach.mentions.{Anaphoric, EventSite, Hypothesis, Modification, Mutant, Negation, PTM} +import org.clulab.reach.mentions.{BioEventMention, BioMention, BioRelationMention, BioTextBoundMention} +import org.clulab.reach.mentions.{CHEMtrigger, DNtrigger, KDtrigger, KOtrigger, OEtrigger, UnassignedTrigger} +import org.clulab.reach.mentions.{CorefEventMention, CorefMention, CorefRelationMention, CorefTextBoundMention} +import org.clulab.reach.mentions.{MentionOps => OdinMentionOps} +import org.clulab.serialization.json.{DocOps, stringify} import org.clulab.struct.{DirectedGraph, Edge, Interval} import org.json4s.JsonDSL._ import org.json4s._ import org.json4s.jackson.JsonMethods._ +import org.json4s.jackson.{prettyJson, renderJValue} + import java.io.File -import com.typesafe.scalalogging.LazyLogging -import org.clulab.processors.Document /** JSON serialization utilities */ object JSONSerializer extends LazyLogging { + implicit val formats = org.json4s.DefaultFormats + + def prettify(json: JValue): String = 
prettyJson(renderJValue(json)) + + def json(jsonAST: JValue, pretty: Boolean = false): String = stringify(jsonAST, pretty) private def mentionsToDocsJMap(mentions: Seq[Mention]): Map[String, JValue] = { docsToDocsJMap(mentions.map(m => m.document)) @@ -34,14 +44,14 @@ object JSONSerializer extends LazyLogging { /** Creates a Map of a Document.equivalenceHash (as String) -> Document
* Used for deserialization of mention JSON */ - def docsToDocumentMap(docs: Seq[Document]): Map[String, Document] = mkDocumentMap(docsToDocsJMap(docs)) + def docsToDocumentMap(docs: Seq[Document]): Map[String, Document] = OdinJSONSerializer.mkDocumentMap(docsToDocsJMap(docs)) + + def jsonAST(mention: Mention): JValue = jsonAST(Seq(mention)) def jsonAST(mentions: Seq[Mention]): JValue = { - val mentionList: List[JValue] = mentions.map{ - case cm: CorefMention => CorefMentionOps(cm).jsonAST - case bm: BioMention => BioMentionOps(bm).jsonAST - case m: Mention => OdinMentionOps(m).jsonAST + val mentionList: List[JValue] = mentions.zipWithIndex.map { case (mention, index) => + MentionOps(mention).jsonAST }.toList val docMap: Map[String, JValue] = mentionsToDocsJMap(mentions) ("documents" -> docMap) ~ ("mentions" -> mentionList) @@ -59,7 +69,7 @@ object JSONSerializer extends LazyLogging { require(json \ "mentions" != JNothing, "\"mentions\" key missing from json") // build the documents once - val docMap = mkDocumentMap((json \ "documents").asInstanceOf[JObject]) + val docMap = OdinJSONSerializer.mkDocumentMap((json \ "documents").asInstanceOf[JObject]) val mmjson = (json \ "mentions").asInstanceOf[JArray] mmjson.arr.map(mjson => toBioMention(mjson, docMap)) @@ -84,7 +94,7 @@ object JSONSerializer extends LazyLogging { // NOTE: while it would be cleaner to create a Mention and THEN add the needed bio and coref attributes, // it would not be easy to transform the arguments & trigger post-hoc using the json... 
val m = mjson \ "type" match { - case JString(BioEventMention.string) => + case JString(BioEventMentionOps.string) => new BioEventMention( labels, // trigger must be (Bio)TextBoundMention @@ -98,7 +108,7 @@ object JSONSerializer extends LazyLogging { isDirect = getIsDirect(mjson) ) - case JString(BioRelationMention.string) => + case JString(BioRelationMentionOps.string) => new BioRelationMention( labels, mkArgumentsFromJsonAST(mjson \ "arguments", docMap), @@ -109,7 +119,7 @@ object JSONSerializer extends LazyLogging { foundBy ) - case JString(BioTextBoundMention.string) => + case JString(BioTextBoundMentionOps.string) => new BioTextBoundMention( labels, tokInterval, @@ -160,7 +170,7 @@ object JSONSerializer extends LazyLogging { require(json \ "mentions" != JNothing, "\"mentions\" key missing from json") // build the documents once - val docMap = mkDocumentMap((json \ "documents").asInstanceOf[JObject]) + val docMap = OdinJSONSerializer.mkDocumentMap((json \ "documents").asInstanceOf[JObject]) val mmjson = (json \ "mentions").asInstanceOf[JArray] mmjson.arr.map(mjson => toCorefMentionWithId(mjson, docMap)).toMap @@ -191,7 +201,7 @@ object JSONSerializer extends LazyLogging { // it would not be easy to transform the arguments & trigger post-hoc using the json... 
val mentionId: String = (mjson \ "id").extract[String] val m = mjson \ "type" match { - case JString(CorefEventMention.string) => + case JString(CorefEventMentionOps.string) => new CorefEventMention( labels, // trigger must be (Bio)TextBoundMention @@ -205,7 +215,7 @@ object JSONSerializer extends LazyLogging { isDirect = getIsDirect(mjson) ) - case JString(CorefRelationMention.string) => + case JString(CorefRelationMentionOps.string) => new CorefRelationMention( labels, mkArgumentsFromJsonAST(mjson \ "arguments", docMap), @@ -216,7 +226,7 @@ object JSONSerializer extends LazyLogging { foundBy ) - case JString(CorefTextBoundMention.string) => + case JString(CorefTextBoundMentionOps.string) => new CorefTextBoundMention( labels, tokInterval, @@ -425,17 +435,17 @@ object JSONSerializer extends LazyLogging { def toMentionByType(mjson: JValue, docMap: Map[String, Document]): Option[Mention] = mjson \ "type" match { // CorefMentions - case JString(CorefTextBoundMention.string) => Some(toCorefMention(mjson, docMap)) - case JString(CorefEventMention.string) => Some(toCorefMention(mjson, docMap)) - case JString(CorefRelationMention.string) => Some(toCorefMention(mjson, docMap)) + case JString(CorefTextBoundMentionOps.string) => Some(toCorefMention(mjson, docMap)) + case JString(CorefEventMentionOps.string) => Some(toCorefMention(mjson, docMap)) + case JString(CorefRelationMentionOps.string) => Some(toCorefMention(mjson, docMap)) // BioMentions - case JString(BioTextBoundMention.string) => Some(toBioMention(mjson, docMap)) - case JString(BioEventMention.string) => Some(toBioMention(mjson, docMap)) - case JString(BioRelationMention.string) => Some(toBioMention(mjson, docMap)) + case JString(BioTextBoundMentionOps.string) => Some(toBioMention(mjson, docMap)) + case JString(BioEventMentionOps.string) => Some(toBioMention(mjson, docMap)) + case JString(BioRelationMentionOps.string) => Some(toBioMention(mjson, docMap)) // Mentions - case 
JString(org.clulab.odin.serialization.json.TextBoundMention.string) => Some(toMention(mjson, docMap)) - case JString(org.clulab.odin.serialization.json.EventMention.string) => Some(toMention(mjson, docMap)) - case JString(org.clulab.odin.serialization.json.RelationMention.string) => Some(toMention(mjson, docMap)) + case JString(org.clulab.odin.serialization.json.TextBoundMentionOps.string) => Some(OdinJSONSerializer.toMention(mjson, docMap)) + case JString(org.clulab.odin.serialization.json.EventMentionOps.string) => Some(OdinJSONSerializer.toMention(mjson, docMap)) + case JString(org.clulab.odin.serialization.json.RelationMentionOps.string) => Some(OdinJSONSerializer.toMention(mjson, docMap)) // failure case _ => None } diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala new file mode 100644 index 000000000..944ba58b2 --- /dev/null +++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala @@ -0,0 +1,209 @@ +package org.clulab.reach.mentions.serialization.json + +import org.clulab.odin.Mention +import org.clulab.odin.serialization.json.{EventMentionOps, RelationMentionOps, TextBoundMentionOps} +import org.clulab.odin.serialization.json.{MentionOps => OdinMentionOps} +import org.clulab.reach.context.Context +import org.clulab.reach.grounding.KBResolution +import org.clulab.reach.mentions.{Anaphoric, BioEventMention, BioRelationMention, BioTextBoundMention, CorefEventMention, CorefMention, CorefRelationMention, CorefTextBoundMention, Display, EventSite, Grounding, Modification, Modifications, Mutant, PTM, SimpleModification} +import org.clulab.serialization.json.JSONSerialization +import org.json4s._ +import org.json4s.JsonDSL._ +import org.json4s.jackson._ + +import ReachImplicits._ + +object MentionOps { + implicit val formats = org.json4s.DefaultFormats + + def apply(mention: Mention): OdinMentionOps = { + 
mention match { + // Corefs must be on top because they inherit from the Bio classes. + case mention: CorefTextBoundMention => new CorefTextBoundMentionOps(mention) + case mention: CorefEventMention => new CorefEventMentionOps(mention) + case mention: CorefRelationMention => new CorefRelationMentionOps(mention) + + case mention: BioTextBoundMention => new BioTextBoundMentionOps(mention) + case mention: BioEventMention => new BioEventMentionOps(mention) + case mention: BioRelationMention => new BioRelationMentionOps(mention) + + case mention => OdinMentionOps(mention) // Let odin figure it out. + } + } +} + +object MentionsOps { + + def apply(mentions: Seq[Mention]): JSONSerialization = new SeqMentionOps(mentions) +} + +trait BioMentionOps { + this: OdinMentionOps => + + type BioMention = Modifications with Grounding with Display with Context + + // The mention is accessible through the OdinMentionOps which records it as a val. + override lazy val documentEquivalenceHash: Int = EquivalenceHashes.get(mention.document) + // If the equivalenceHash should ever need the ID of this subclass of MentionOps, + // then this stringCode can be used to dynamically get the right value. + // override val stringCode = s"org.clulab.odin.$longString" + + override def asMentionOps(mention: Mention): OdinMentionOps = MentionOps(mention) + + def bioJsonAST(bioMention: BioMention): JObject = { + ("modifications" -> bioMention.modifications.jsonAST) ~ + // grounding is optional + ("grounding" -> bioMention.grounding.map(_.jsonAST)) ~ + // context is optional + ("context" -> bioMention.contextOpt.map(_.jsonAST)) ~ + // usually just labels.head... 
+ ("displayLabel" -> bioMention.displayLabel) + } +} + +class BioTextBoundMentionOps(tb: BioTextBoundMention) extends TextBoundMentionOps(tb) with BioMentionOps { + + override def longString: String = BioTextBoundMentionOps.string + + override def jsonAST: JValue = super.jsonAST.merge(bioJsonAST(tb)) +} + +class BioEventMentionOps(em: BioEventMention) extends EventMentionOps(em) with BioMentionOps { + + override def longString: String = BioEventMentionOps.string + + override def jsonAST: JValue = super.jsonAST + .merge(bioJsonAST(em)) + .merge(JObject(List(JField("isDirect", em.isDirect)))) +} + +class BioRelationMentionOps(rm: BioRelationMention) extends RelationMentionOps(rm) with BioMentionOps { + + override def longString: String = BioRelationMentionOps.string + + override def jsonAST: JValue = super.jsonAST.merge(bioJsonAST(rm)) +} + +trait CorefMentionOps { + type CorefMention = Modifications with Grounding with Display with Context with Anaphoric + + def corefJsonAST(corefMention: CorefMention): JObject = { + ("antecedents" -> corefMention.antecedents.jsonAST) ~ + ("sieves" -> corefMention.sieves.jsonAST) + } +} + +class CorefTextBoundMentionOps(tb: CorefTextBoundMention) extends BioTextBoundMentionOps(tb) with CorefMentionOps { + + override def longString: String = CorefTextBoundMentionOps.string + + override def jsonAST: JValue = super.jsonAST.merge(corefJsonAST(tb)) +} + +class CorefEventMentionOps(em: CorefEventMention) extends BioEventMentionOps(em) with CorefMentionOps { + + override def longString: String = CorefEventMentionOps.string + + override def jsonAST: JValue = super.jsonAST.merge(corefJsonAST(em)) +} + +class CorefRelationMentionOps(rm: CorefRelationMention) extends BioRelationMentionOps(rm) with CorefMentionOps { + + override def longString: String = CorefRelationMentionOps.string + + override def jsonAST: JValue = super.jsonAST.merge(corefJsonAST(rm)) +} + +object ReachImplicits { + + implicit class SeqMentionOps(mentions: Seq[Mention]) 
extends JSONSerialization { + + def jsonAST: JValue = JSONSerializer.jsonAST(mentions) + } + + implicit class ModificationOps(mod: Modification) extends JSONSerialization { + def jsonAST: JValue = mod match { + case PTM(label, evidenceOp, siteOp, negated) => + ("modification-type" -> "PTM") ~ + ("label" -> label) ~ + // evidence is optional + ("evidence" -> evidenceOp.map(MentionOps(_).jsonAST)) ~ + // site is optional + ("site" -> siteOp.map(MentionOps(_).jsonAST)) ~ + ("negated" -> negated) + case Mutant(evidence, foundBy) => + ("modification-type" -> "Mutant") ~ + ("evidence" -> MentionOps(evidence).jsonAST) ~ + ("foundBy" -> foundBy) + case eventSite: EventSite => + // This is an exception because of the "site" label. + ("modification-type" -> eventSite.label) ~ + ("site" -> MentionOps(eventSite.site).jsonAST) + case simpleModification: SimpleModification => + ("modification-type" -> simpleModification.label) ~ + ("evidence" -> MentionOps(simpleModification.mention).jsonAST) + } + } + + implicit class ModificationsOps(mods: Set[Modification]) extends JSONSerialization { + def jsonAST: JValue = mods match { + case hasMods if hasMods.nonEmpty => hasMods.map(_.jsonAST).toList + case _ => JNothing + } + } + + implicit class KBResolutionOps(kbr: KBResolution) extends JSONSerialization { + def jsonAST: JValue = { + // components needed to construct KBResolution + ("text" -> kbr.text) ~ + ("namespace" -> kbr.namespace) ~ + ("id" -> kbr.id) ~ + ("species" -> kbr.species) + } + } + + implicit class ContextOps(context: Map[String, Seq[String]]) extends JSONSerialization { + def jsonAST: JValue = context match { + case hasContext if hasContext.nonEmpty => hasContext + case _ => JNothing + } + } + + implicit class AnaphoricOps(antecedents: Set[Anaphoric]) extends JSONSerialization { + def jsonAST: JValue = antecedents match { + case hasAntecedents if hasAntecedents.nonEmpty => hasAntecedents.map(m => MentionOps(m.asInstanceOf[CorefMention]).jsonAST) + case _ => JNothing + 
} + } + + implicit class StringSetOps(ss: Set[String]) extends JSONSerialization { + def jsonAST: JValue = ss match { + case contents if contents.nonEmpty => contents + case _ => JNothing + } + } +} + +object BioTextBoundMentionOps { + val string = "BioTextBoundMention" +} + +object BioEventMentionOps { + val string = "BioEventMention" +} + +object BioRelationMentionOps { + val string = "BioRelationMention" +} + +object CorefTextBoundMentionOps { + val string = "CorefTextBoundMention" +} + +object CorefEventMentionOps { + val string = "CorefEventMention" +} + +object CorefRelationMentionOps { + val string = "CorefRelationMention" +} diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/package.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/package.scala deleted file mode 100644 index 94d80e84c..000000000 --- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/package.scala +++ /dev/null @@ -1,294 +0,0 @@ -package org.clulab.reach.mentions.serialization - -import org.clulab.odin -import org.clulab.odin._ -import org.clulab.odin.serialization.json.{ TextBoundMentionOps, RelationMentionOps, EventMentionOps } -import org.clulab.odin.serialization.json.{ MentionOps => OdinMentionOps, OdinPathOps } -import org.clulab.serialization.json.{ JSONSerialization } -import org.clulab.reach.mentions.serialization.json.{ JSONSerializer => ReachJSONSerializer } -import org.clulab.reach.mentions._ -import org.clulab.reach.grounding.KBResolution -import org.json4s._ -import org.json4s.JsonDSL._ -import org.json4s.jackson._ - - -package object json { - - implicit val formats = org.json4s.DefaultFormats - - /** generate the appropriate AST according to Mention type */ - private def mentionToJsonAST(m: Mention): JValue = m match { - // NOTE: order matters due to inheritance - case cm: CorefMention => CorefMentionOps(cm).jsonAST - case bm: BioMention => BioMentionOps(bm).jsonAST - case m: Mention => 
OdinMentionOps(m).jsonAST - } - - implicit class MentionJSONOps(m: Mention) extends OdinMentionOps(m) { - - /** Without "documents" field **/ - override def jsonAST: JValue = mentionToJsonAST(m) - - /** Includes "documents" field for simple deserialization **/ - override def completeAST: JValue = REACHMentionSeq(Seq(m)).jsonAST - - } - - /** For Seq[BioMention], Seq[CorefMention], etc */ - implicit class REACHMentionSeq(mentions: Seq[Mention]) extends JSONSerialization { - - override def jsonAST: JValue = ReachJSONSerializer.jsonAST(mentions) - - } - - /** generate a json string from the given ast */ - def astToJSON(jsonast: JValue, pretty: Boolean): String = { - val jsonDoc = renderJValue(jsonast) - pretty match { - case true => prettyJson(jsonDoc) - case false => compactJson(jsonDoc) - } - } - - /** generate a json string from a mention
- * Note that this is incomplete for deserialization purposes, - * as only a reference to the Document is included - * */ - def mentionToJSON(m: Mention, pretty: Boolean): String = astToJSON(mentionToJsonAST(m), pretty) - - /** args -> coref representation -> json */ - private def argsAST(arguments: Map[String, Seq[Mention]]): JObject = { - val args = arguments.map { - case (name, mentions) => name -> JArray(mentions.map(mentionToJsonAST).toList) - } - JObject(args.toList) - } - - /** BioMention -> json */ - implicit class BioMentionOps(m: BioMention) extends JSONSerialization { - - override def jsonAST: JValue = m match { - case tb: BioTextBoundMention => BioTextBoundMentionOps(tb).jsonAST - case em: BioEventMention => BioEventMentionOps(em).jsonAST - case rm: BioRelationMention => BioRelationMentionOps(rm).jsonAST - } - } - - /** CorefMention -> json */ - implicit class CorefMentionOps(m: CorefMention) extends JSONSerialization { - - override def jsonAST: JValue = m match { - case tb: CorefTextBoundMention => CorefTextBoundMentionOps(tb).jsonAST - case em: CorefEventMention => CorefEventMentionOps(em).jsonAST - case rm: CorefRelationMention => CorefRelationMentionOps(rm).jsonAST - } - } - - def pathsAST(paths: Map[String, Map[Mention, odin.SynPath]]): JValue = paths match { - case gps if gps.nonEmpty => gps.jsonAST - case _ => JNothing - } - - implicit class BioTextBoundMentionOps(tb: BioTextBoundMention) extends TextBoundMentionOps(tb) { - -// override val stringCode = s"org.clulab.odin.${BioTextBoundMention.string}" -// override def id: String = s"${BioTextBoundMention.shortString}:$equivalenceHash" - - override def jsonAST: JValue = { - - val ast = TextBoundMentionOps(tb).jsonAST replace - (List("type"), BioTextBoundMention.string) replace - (List("id"), tb.id) - - ast merge ( - ("modifications" -> tb.modifications.jsonAST) ~ - // grounding is optional - ("grounding" -> tb.grounding.map(_.jsonAST)) ~ - // context is optional - ("context" -> 
tb.contextOpt.map(_.jsonAST)) ~ - // usually just labels.head... - ("displayLabel" -> tb.displayLabel) - ) - } - } - - implicit class BioEventMentionOps(em: BioEventMention) extends EventMentionOps(em) { - override def jsonAST: JValue = { - - val ast = EventMentionOps(em).jsonAST replace - (List("type"), BioEventMention.string) replace - (List("id"), em.id) replace - (List("arguments"), argsAST(em.arguments)) - - ast merge ( - ("modifications" -> em.modifications.jsonAST) ~ - // grounding is optional - ("grounding" -> em.grounding.map(_.jsonAST)) ~ - // context is optional - ("context" -> em.contextOpt.map(_.jsonAST)) ~ - // usually just labels.head... - ("displayLabel" -> em.displayLabel) ~ - ("isDirect" -> em.isDirect) - ) - } - } - - implicit class BioRelationMentionOps(rm: BioRelationMention) extends RelationMentionOps(rm) { - override def jsonAST: JValue = { - - val ast = RelationMentionOps(rm).jsonAST replace - (List("type"), BioRelationMention.string) replace - (List("id"), rm.id) replace - (List("arguments"), argsAST(rm.arguments)) - - ast merge ( - ("modifications" -> rm.modifications.jsonAST) ~ - // grounding is optional - ("grounding" -> rm.grounding.map(_.jsonAST)) ~ - // context is optional - ("context" -> rm.contextOpt.map(_.jsonAST)) ~ - // usually just labels.head... 
- ("displayLabel" -> rm.displayLabel) - ) - } - } - - implicit class CorefTextBoundMentionOps(tb: CorefTextBoundMention) extends BioTextBoundMentionOps(tb) { - override def jsonAST: JValue = { - - val ast = BioTextBoundMentionOps(tb).jsonAST replace - (List("type"), CorefTextBoundMention.string) replace - (List("id"), tb.id) - - ast merge ( - ("antecedents" -> tb.antecedents.jsonAST) ~ - ("sieves" -> tb.sieves.jsonAST) - ) - } - } - - implicit class CorefEventMentionOps(em: CorefEventMention) extends BioEventMentionOps(em) { - override def jsonAST: JValue = { - - val ast = BioEventMentionOps(em).jsonAST replace - (List("type"), CorefEventMention.string) replace - (List("id"), em.id) replace - (List("arguments"), argsAST(em.arguments)) - - ast merge ( - ("antecedents" -> em.antecedents.jsonAST) ~ - ("sieves" -> em.sieves.jsonAST) - ) - } - } - - implicit class CorefRelationMentionOps(rm: CorefRelationMention) extends BioRelationMentionOps(rm) { - override def jsonAST: JValue = { - - val ast = BioRelationMentionOps(rm).jsonAST replace - (List("type"), CorefRelationMention.string) replace - (List("id"), rm.id) replace - (List("arguments"), argsAST(rm.arguments)) - - ast merge ( - ("antecedents" -> rm.antecedents.jsonAST) ~ - ("sieves" -> rm.sieves.jsonAST) - ) - } - } - - implicit class ModificationOps(mod: Modification) extends JSONSerialization { - def jsonAST: JValue = mod match { - case PTM(label, evidenceOp, siteOp, negated) => - ("modification-type" -> "PTM") ~ - ("label" -> label) ~ - // evidence is optional - ("evidence" -> evidenceOp.map(mentionToJsonAST)) ~ - // site is optional - ("site" -> siteOp.map(mentionToJsonAST)) ~ - ("negated" -> negated) - case Mutant(evidence, foundBy) => - ("modification-type" -> "Mutant") ~ - ("evidence" -> mentionToJsonAST(evidence)) ~ - ("foundBy" -> foundBy) - case eventSite: EventSite => - // This is an exception because of the "site" label. 
- ("modification-type" -> eventSite.label) ~ - ("site" -> mentionToJsonAST(eventSite.site)) - case simpleModification: SimpleModification => - ("modification-type" -> simpleModification.label) ~ - ("evidence" -> mentionToJsonAST(simpleModification.mention)) - } - } - - implicit class ModificationsOps(mods: Set[Modification]) extends JSONSerialization { - def jsonAST: JValue = mods match { - case hasMods if hasMods.nonEmpty => hasMods.map(_.jsonAST).toList - case _ => JNothing - } - } - - implicit class KBResolutionOps(kbr: KBResolution) extends JSONSerialization { - def jsonAST: JValue = { - // components needed to construct KBResolution - ("text" -> kbr.text) ~ - ("namespace" -> kbr.namespace) ~ - ("id" -> kbr.id) ~ - ("species" -> kbr.species) - } - } - - implicit class ContextOps(context: Map[String, Seq[String]]) extends JSONSerialization { - def jsonAST: JValue = context match { - case hasContext if hasContext.nonEmpty => hasContext - case _ => JNothing - } - } - - implicit class AnaphoricOps(antecedents: Set[Anaphoric]) extends JSONSerialization { - def jsonAST: JValue = antecedents match { - case hasAntecedents if hasAntecedents.nonEmpty => hasAntecedents.map(m => m.asInstanceOf[CorefMention].jsonAST) - case _ => JNothing - } - } - - implicit class StringSetOps(ss: Set[String]) extends JSONSerialization { - def jsonAST: JValue = ss match { - case contents if contents.nonEmpty => contents - case _ => JNothing - } - } - - def prettify(json: JValue): String = prettyJson(renderJValue(json)) - - object BioTextBoundMention { - val string = "BioTextBoundMention" - val shortString = "T" - } - - object BioEventMention { - val string = "BioEventMention" - val shortString = "E" - } - - object BioRelationMention { - val string = "BioRelationMention" - val shortString = "R" - } - - object CorefTextBoundMention { - val string = "CorefTextBoundMention" - val shortString = "T" - } - - object CorefEventMention { - val string = "CorefEventMention" - val shortString = "E" - } 
- - object CorefRelationMention { - val string = "CorefRelationMention" - val shortString = "R" - } -} diff --git a/main/src/test/scala/org/clulab/reach/TestModifications.scala b/main/src/test/scala/org/clulab/reach/TestModifications.scala index 6001fc563..0bfc88665 100644 --- a/main/src/test/scala/org/clulab/reach/TestModifications.scala +++ b/main/src/test/scala/org/clulab/reach/TestModifications.scala @@ -4,7 +4,6 @@ import org.clulab.reach.mentions._ import org.scalatest._ import TestUtils._ import org.clulab.reach.mentions.serialization.json.JSONSerializer -import org.clulab.reach.mentions.serialization.json.prettify /** * Unit tests to ensure PTM rules are matching correctly @@ -1196,7 +1195,7 @@ class TestModifications extends FlatSpec with Matchers { koTriggers1 should not be ('empty) val jValue = JSONSerializer.jsonAST(bioMentions1) - val json = prettify(jValue) + val json = JSONSerializer.prettify(jValue) json should include ("KOtrigger") diff --git a/main/src/test/scala/org/clulab/reach/mentions/serialization/TestJSONSerializer.scala b/main/src/test/scala/org/clulab/reach/mentions/serialization/TestJSONSerializer.scala index 1055eec70..b40aafb1f 100644 --- a/main/src/test/scala/org/clulab/reach/mentions/serialization/TestJSONSerializer.scala +++ b/main/src/test/scala/org/clulab/reach/mentions/serialization/TestJSONSerializer.scala @@ -1,22 +1,22 @@ package org.clulab.reach.mentions.serialization -import org.scalatest._ -import scala.util.Try // do not remove: needed for debugging import org.clulab.reach.TestUtils._ -import org.clulab.reach.mentions.{MentionOps => MOps} -import org.clulab.reach.mentions.serialization.json._ +import org.clulab.reach.mentions.serialization.json.JSONSerializer +import org.clulab.reach.mentions.serialization.json.{MentionOps, MentionsOps} +import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps} import org.json4s._ import org.json4s.jackson.JsonMethods._ - +import org.scalatest._ class TestJSONSerializer extends 
FlatSpec with Matchers { + implicit val formats = org.json4s.DefaultFormats val text = "Phosphorylated MEK activates K-RAS." val mentions = getMentionsFromText(text) val corefmentions = mentions.map(_.toCorefMention) - val corefJAST = corefmentions.jsonAST + val corefJAST = MentionsOps(corefmentions).jsonAST val biomentions = mentions.map(_.toBioMention) - val bioJAST = biomentions.jsonAST + val bioJAST = MentionsOps(biomentions).jsonAST // printMentions(Try(biomentions), true) // DEBUGGING @@ -36,7 +36,7 @@ class TestJSONSerializer extends FlatSpec with Matchers { it should "serialize/deserialize a BioMention to/from json correctly " in { val Seq(m) = biomentions.filter(m => (m matches "Entity") && (m.text == "MEK")) - val mns = JSONSerializer.toBioMentions(m.completeAST) + val mns = JSONSerializer.toBioMentions(MentionsOps(Seq(m)).jsonAST) mns should have size (1) val Seq(deserializedm) = mns m.document.equivalenceHash should equal (deserializedm.document.equivalenceHash) @@ -56,7 +56,7 @@ class TestJSONSerializer extends FlatSpec with Matchers { // coref mention tests it should "serialize a Seq[CorefMention] to json correctly" in { - val mentions2 = JSONSerializer.toCorefMentions(corefmentions.jsonAST) + val mentions2 = JSONSerializer.toCorefMentions(MentionsOps(corefmentions).jsonAST) mentions2 should have size corefmentions.size mentions2.map(_.label) should equal (corefmentions.map(_.label)) mentions2.map(_.document.equivalenceHash) should equal (corefmentions.map(_.document.equivalenceHash)) @@ -71,7 +71,7 @@ class TestJSONSerializer extends FlatSpec with Matchers { it should "serialize/deserialize a CorefMention to/from json correctly " in { val Seq(cm) = corefmentions.filter(m => (m matches "Entity") && (m.text == "MEK")) - val mns = JSONSerializer.toCorefMentions(cm.completeAST) + val mns = JSONSerializer.toCorefMentions(MentionsOps(Seq(cm)).jsonAST) mns should have size 1 val Seq(m) = mns m.document.equivalenceHash should equal 
(cm.document.equivalenceHash) @@ -80,29 +80,33 @@ class TestJSONSerializer extends FlatSpec with Matchers { } it should "produce identical json for a Seq[CorefMention] before and after serialization/deserialization" in { - corefmentions.json(true) == JSONSerializer.toCorefMentions(corefmentions.jsonAST).json(true) + val json = MentionsOps(corefmentions).json(pretty = true) + val rejson = MentionsOps(JSONSerializer.toCorefMentions(MentionsOps(corefmentions).jsonAST)).json(pretty = true) + rejson should equal (json) // This was not checked before and had been failing! } s"json for '$text'" should "be identical before and after serialization/deserialzation" in { val mekmns = corefmentions.filter(_.text == "MEK") mekmns should have size 1 val mek = mekmns.head - mek.json(true) should equal (JSONSerializer.toCorefMentions(Seq(mek).jsonAST).head.json(true)) + MentionOps(mek).json(pretty = true) should equal ( + MentionOps(JSONSerializer.toCorefMentions(MentionsOps(Seq(mek)).jsonAST).head).json(pretty = true) + ) } it should "contain a modification with \"modification-type\" PTM" in { val mekmns = corefmentions.filter(_.text == "MEK") mekmns should have size 1 val mek = mekmns.head - (mek.jsonAST \ "modifications" \\ "modification-type").extract[String] should equal ("PTM") + (MentionOps(mek).jsonAST \ "modifications" \\ "modification-type").extract[String] should equal ("PTM") } it should "still contain a PTM after serialization/deserialization" in { - val deserializedCorefMentions = JSONSerializer.toCorefMentions(corefmentions.jsonAST) + val deserializedCorefMentions = JSONSerializer.toCorefMentions(MentionsOps(corefmentions).jsonAST) val mekmns = deserializedCorefMentions.filter(_.text == "MEK") mekmns should have size 1 val mek = mekmns.head - (mek.jsonAST \ "modifications" \\ "modification-type").extract[String] should equal ("PTM") + (MentionOps(mek).jsonAST \ "modifications" \\ "modification-type").extract[String] should equal ("PTM") } val text2 = "MEK activates 
K-RAS." @@ -110,6 +114,6 @@ class TestJSONSerializer extends FlatSpec with Matchers { val mekmns = getCorefmentionsFromText(text2).filter(_.text == "MEK") mekmns should have size 1 val mek = mekmns.head - (mek.jsonAST \ "modifications") should equal(JNothing) + (MentionOps(mek).jsonAST \ "modifications") should equal(JNothing) } } diff --git a/processors/build.sbt b/processors/build.sbt index d7f8f8e0d..80d04c258 100644 --- a/processors/build.sbt +++ b/processors/build.sbt @@ -5,7 +5,7 @@ resolvers += "clulab" at "https://artifactory.clulab.org/artifactory/sbt-release libraryDependencies ++= { - val procVer = "8.5.3" + val procVer = "8.5.4-SNAPSHOT" Seq( "com.typesafe" % "config" % "1.3.1", diff --git a/processors/src/main/scala/org/clulab/processors/bionlp/ner/ReachStandardKbSource.scala b/processors/src/main/scala/org/clulab/processors/bionlp/ner/ReachStandardKbSource.scala index faea5d422..61579c3d9 100644 --- a/processors/src/main/scala/org/clulab/processors/bionlp/ner/ReachStandardKbSource.scala +++ b/processors/src/main/scala/org/clulab/processors/bionlp/ner/ReachStandardKbSource.scala @@ -7,12 +7,12 @@ import org.clulab.processors.bionlp.ner.KBGenerator.tokenizeResourceLine import org.clulab.processors.clu.tokenizer.Tokenizer import org.clulab.sequences.StandardKbSource import org.clulab.utils.Files -import org.clulab.utils.Serializer import java.io.File import java.util.function.Consumer import scala.language.reflectiveCalls // required to access consumer.lineCount import scala.util.Try +import scala.util.Using abstract class ReachStandardKbSource(caseInsensitiveMatching: Boolean) extends StandardKbSource(caseInsensitiveMatching) @@ -54,7 +54,7 @@ class ReachSingleStandardKbSource(kbEntry: KBEntry, caseInsensitiveMatching: Boo ) ) - Serializer.using(bufferedReader) { bufferedReader => + Using.resource(bufferedReader) { bufferedReader => bufferedReader.lines.forEach(consumer) } logger.info(s"Done. 
Read ${consumer.lineCount} lines from ${new File(kbEntry.path).getName}") diff --git a/src/main/scala/org/clulab/reach/ReachCLI.scala b/src/main/scala/org/clulab/reach/ReachCLI.scala index 162d3d750..c1a85cdc5 100644 --- a/src/main/scala/org/clulab/reach/ReachCLI.scala +++ b/src/main/scala/org/clulab/reach/ReachCLI.scala @@ -11,6 +11,8 @@ import java.nio.charset.Charset import java.nio.charset.StandardCharsets.UTF_8 import ai.lum.common.FileUtils._ import ai.lum.common.ConfigUtils._ +import org.clulab.reach.mentions.serialization.json.EquivalenceHashes +import org.clulab.utils.Timer //import jline.internal.InputStreamReader import org.clulab.odin._ import org.clulab.processors.Document @@ -116,21 +118,51 @@ class ReachCLI ( logger.debug(s" ${ durationToS(startNS, System.nanoTime) }s: $paperId: finished reading") + // The mentions come from a single file, so they should all have originated + // with the same single document. Verify this and then make sure the document's + // equivalency hash gets added and removed from the cache. 
+ def withDocument[T](mentions: Seq[Mention])(f: => T): T = { + val documentOpt = + if (mentions.isEmpty) None + else { + val document = mentions.head.document + + require(mentions.forall(_.document.eq(document))) + Some(document) + } + + try { + documentOpt.foreach(EquivalenceHashes.get) + + val timer = new Timer("ReachCLI") + val result = timer.time { + f + } + println(s"Time to output ${file.getName} is ${timer.elapsedToString()}.") + result + } + finally { + documentOpt.foreach(EquivalenceHashes.remove) + } + } + // generate outputs // NOTE: Assembly can't be run before calling this method without additional refactoring, // as different output formats apply different filters before running assembly - val errorCount = outputFormats - .map { outputFormat => - try { - outputMentions(mentions, entry, paperId, startTime, outputDir, outputFormat, withAssembly) - 0 - } - catch { - case throwable: Throwable => - reportException(file, throwable) - 1 - } - }.sum + val errorCount = withDocument(mentions) { + outputFormats + .map { outputFormat => + try { + outputMentions(mentions, entry, paperId, startTime, outputDir, outputFormat, withAssembly) + 0 + } + catch { + case throwable: Throwable => + reportException(file, throwable) + 1 + } + }.sum + } // elapsed time: processing + writing output val endTime = ReachCLI.now From 3d73651df821e6bfffdfb554def49a08f42e4231 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Tue, 14 Mar 2023 09:52:20 -0700 Subject: [PATCH 02/12] Test file writing --- .../org/clulab/reach/TestObjectWriter.scala | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 main/src/test/scala/org/clulab/reach/TestObjectWriter.scala diff --git a/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala b/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala new file mode 100644 index 000000000..fac2d086c --- /dev/null +++ b/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala @@ -0,0 +1,29 @@ +package org.clulab.reach + 
+import com.fasterxml.jackson.databind.ObjectMapper +import org.clulab.utils.Sink +import org.json4s.jackson.JsonMethods +import org.scalatest.{FlatSpec, Matchers} + +import java.io.{File, PrintWriter} +import java.nio.charset.StandardCharsets + +class TestObjectWriter extends FlatSpec with Matchers { + + behavior of "ObjectWriter" + + it should "produce a file" in { + val json = """{ "boolean": true, "integer": 42, "float": 3.14, "string": "Hello, world!" }""" + val jValue = JsonMethods.parse(json) + println(jValue) + val prettyJson = JsonMethods.pretty(JsonMethods.render(jValue)) + println(prettyJson) + + val file = new File("TestObjectWriter.json") + val printWriter = new PrintWriter(new Sink(file, StandardCharsets.UTF_8.name, append = false)) + val objectMapper = new ObjectMapper() + val objectWriter = objectMapper.writerWithDefaultPrettyPrinter() + + objectWriter.writeValue(printWriter, jValue) + } +} From 36d5f69cc1301629577ac557455195fa554cf1ee Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Tue, 14 Mar 2023 10:17:50 -0700 Subject: [PATCH 03/12] Test writing large files --- .../org/clulab/reach/TestObjectWriter.scala | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala b/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala index fac2d086c..7aa4b206c 100644 --- a/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala +++ b/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala @@ -2,7 +2,7 @@ package org.clulab.reach import com.fasterxml.jackson.databind.ObjectMapper import org.clulab.utils.Sink -import org.json4s.jackson.JsonMethods +import org.json4s.jackson.{JsonMethods, renderJValue} import org.scalatest.{FlatSpec, Matchers} import java.io.{File, PrintWriter} @@ -15,15 +15,24 @@ class TestObjectWriter extends FlatSpec with Matchers { it should "produce a file" in { val json = """{ "boolean": true, "integer": 42, "float": 3.14, "string": "Hello, 
world!" }""" val jValue = JsonMethods.parse(json) + val renderedJValue = JsonMethods.render(jValue) println(jValue) - val prettyJson = JsonMethods.pretty(JsonMethods.render(jValue)) + println(renderedJValue) + + val prettyJson = JsonMethods.pretty(renderedJValue) println(prettyJson) val file = new File("TestObjectWriter.json") val printWriter = new PrintWriter(new Sink(file, StandardCharsets.UTF_8.name, append = false)) - val objectMapper = new ObjectMapper() - val objectWriter = objectMapper.writerWithDefaultPrettyPrinter() +// val objectMapper = new ObjectMapper() +// val objectWriter = objectMapper.writerWithDefaultPrettyPrinter() + // This actually writes to the file, but not pretty. +// val objectWriter = JsonMethods.mapper.writer() + val objectWriter = JsonMethods.mapper.writerWithDefaultPrettyPrinter() + + objectWriter.writeValue(printWriter, renderedJValue) - objectWriter.writeValue(printWriter, jValue) + val string = JsonMethods.pretty(renderedJValue) + println(string) } } From 5dc11ef94cb6d24d308117906dfd4d851b102f35 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Tue, 14 Mar 2023 13:28:20 -0700 Subject: [PATCH 04/12] clean up large files --- .../export/serial/SerialJsonOutput.scala | 26 ++++++---- .../org/clulab/reach/TestObjectWriter.scala | 47 ++++++++++++------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala b/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala index 8ad0eb29f..49e444c1e 100644 --- a/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala +++ b/export/src/main/scala/org/clulab/reach/export/serial/SerialJsonOutput.scala @@ -1,16 +1,22 @@ package org.clulab.reach.export.serial -import java.io.File +import java.io.{File, PrintWriter} import java.util.Date import java.nio.charset.Charset import java.nio.charset.StandardCharsets.UTF_8 import ai.lum.common.FileUtils._ +import 
com.fasterxml.jackson.databind.ObjectWriter import com.typesafe.scalalogging.LazyLogging import org.clulab.odin.Mention import org.clulab.reach.FriesEntry import org.clulab.reach.export.JsonOutputter import org.clulab.reach.mentions.{MentionOps => ImplicitMentionOps} import org.clulab.reach.mentions.serialization.json.MentionsOps +import org.clulab.serialization.json.stringify +import org.clulab.utils.Closer.AutoCloser +import org.clulab.utils.Sink +import org.json4s.JValue +import org.json4s.jackson.{JsonMethods, prettyJson, renderJValue} /** * Defines classes and methods used to output the serial-json output format. @@ -23,6 +29,7 @@ class SerialJsonOutput ( encoding: Charset = UTF_8 ) extends JsonOutputter with LazyLogging { + val objectWriter = JsonMethods.mapper.writerWithDefaultPrettyPrinter() /** Returns the given mentions in the serial-json format, as one big string. */ override def toJSON ( @@ -49,14 +56,17 @@ class SerialJsonOutput ( endTime:Date, outFilePrefix:String ): Unit = { - val f: File = new File(outFilePrefix + ".json") val mentions = allMentions.map(_.toCorefMention) + val jsonAST = MentionsOps(mentions).jsonAST + // Code here has been modified so that no json string is produced. + // String lengths max out at 2GB, unlike files, and with large inputs + // we were crashing when output could not be stuffed into a string. 
+ val renderedJsonAST = JsonMethods.render(jsonAST) + val file = new File(outFilePrefix + ".json") + val printWriter = new PrintWriter(new Sink(file, encoding.name, append = false)) - f.writeString( - string = MentionsOps(mentions).json(true), - charset = encoding, - append = false, - gzipSupport = false - ) + printWriter.autoClose { printWriter => + objectWriter.writeValue(printWriter, renderedJsonAST) + } } } diff --git a/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala b/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala index 7aa4b206c..8dd5a0560 100644 --- a/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala +++ b/main/src/test/scala/org/clulab/reach/TestObjectWriter.scala @@ -1,8 +1,10 @@ package org.clulab.reach -import com.fasterxml.jackson.databind.ObjectMapper +import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.Sink -import org.json4s.jackson.{JsonMethods, renderJValue} +import org.json4s.{JArray, JObject, JValue} +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods import org.scalatest.{FlatSpec, Matchers} import java.io.{File, PrintWriter} @@ -12,27 +14,38 @@ class TestObjectWriter extends FlatSpec with Matchers { behavior of "ObjectWriter" - it should "produce a file" in { + it should "produce a small file" in { val json = """{ "boolean": true, "integer": 42, "float": 3.14, "string": "Hello, world!" 
}""" val jValue = JsonMethods.parse(json) val renderedJValue = JsonMethods.render(jValue) - println(jValue) - println(renderedJValue) + val objectWriter = JsonMethods.mapper.writerWithDefaultPrettyPrinter() + val file = new File("SmallTestObjectWriter.json") + val printWriter = new PrintWriter(new Sink(file, StandardCharsets.UTF_8.name, append = false)) - val prettyJson = JsonMethods.pretty(renderedJValue) - println(prettyJson) + printWriter.autoClose { printWriter => + objectWriter.writeValue(printWriter, renderedJValue) + } + file.exists should be (true) + file.delete() + } - val file = new File("TestObjectWriter.json") - val printWriter = new PrintWriter(new Sink(file, StandardCharsets.UTF_8.name, append = false)) -// val objectMapper = new ObjectMapper() -// val objectWriter = objectMapper.writerWithDefaultPrettyPrinter() - // This actually writes to the file, but not pretty. -// val objectWriter = JsonMethods.mapper.writer() + // This will produce a file of about 8GB! + ignore should "produce a very large file" in { + val jObject = + ("string" -> "The quick brown fox jumped over the lazy dog.") ~ + ("number" -> 42) + val list = 1.to(100000000).map(_ => jObject).toList + val jArray = JArray(list) + val jValue = jArray + val renderedJValue = JsonMethods.render(jValue) val objectWriter = JsonMethods.mapper.writerWithDefaultPrettyPrinter() + val file = new File("LargeTestObjectWriter.json") + val printWriter = new PrintWriter(new Sink(file, StandardCharsets.UTF_8.name, append = false)) - objectWriter.writeValue(printWriter, renderedJValue) - - val string = JsonMethods.pretty(renderedJValue) - println(string) + printWriter.autoClose { printWriter => + objectWriter.writeValue(printWriter, renderedJValue) + } + file.exists should be(true) + file.delete() } } From 0c97fa7f7f52ba6955aa62cb00975e627be52b27 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Tue, 28 Mar 2023 17:40:53 -0700 Subject: [PATCH 05/12] Add round trip test --- 
.../org/clulab/reach/apps/RoundTripApp.scala | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala diff --git a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala new file mode 100644 index 000000000..cc564ce2a --- /dev/null +++ b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala @@ -0,0 +1,57 @@ +package org.clulab.reach.apps + +import org.clulab.odin.Mention +import org.clulab.reach.PaperReader +import org.clulab.reach.mentions.serialization.json.MentionsOps +import org.clulab.serialization.json.stringify +import org.clulab.utils.FileUtils +import org.json4s.jackson.JsonMethods.parse + +object RoundTripApp extends App { + val directoryName = args.headOption.getOrElse("../corpora/nxml") + val files = FileUtils.findFiles(directoryName, ".nxml") + + def testProcessorsSerialization(mentions: Seq[Mention]): Boolean = { + import org.clulab.odin.serialization.json.JSONSerializer + + val jValue = JSONSerializer.jsonAST(mentions) + val json = stringify(jValue, pretty = true) + + val mentions2 = JSONSerializer.toMentions(parse(json)) + val jValue2 = JSONSerializer.jsonAST(mentions2) + val json2 = stringify(jValue2, pretty = true) + val result = json == json2 + + if (json != json2) { + println("Processors mentions are not equal! Do something about it!") + println(json) + println(json2) + } + result + } + + def testReachSerialization(mentions: Seq[Mention]): Boolean = { + import org.clulab.reach.mentions.serialization.json.JSONSerializer + + val json = MentionsOps(mentions).json(pretty = true) + + val mentions2 = JSONSerializer.toCorefMentions(parse(json)) + val json2 = MentionsOps(mentions2).json(pretty = true) + val result = json == json2 + + if (json != json2) { + println("Reach mentions are not equal! 
Do something about it!") + println(json) + println(json2) + } + result + } + + files.foreach { file => + val entry = PaperReader.getEntryFromPaper(file) + val mentions = PaperReader.getMentionsFromEntry(entry) + + testProcessorsSerialization(mentions) + testReachSerialization(mentions) + } +} From ac19c25b190580b2b0fb4cb38dca5d284c8780ec Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Wed, 29 Mar 2023 18:54:14 -0700 Subject: [PATCH 06/12] Sort the reach mentions --- .../clulab/reach/mentions/serialization/json/MentionOps.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala index 944ba58b2..a22ac9fb9 100644 --- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala +++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala @@ -118,7 +118,7 @@ object ReachImplicits { implicit class SeqMentionOps(mentions: Seq[Mention]) extends JSONSerialization { - def jsonAST: JValue = JSONSerializer.jsonAST(mentions) + def jsonAST: JValue = JSONSerializer.jsonAST(mentions.sorted(OdinMentionOps.mentionOrdering)) } implicit class ModificationOps(mod: Modification) extends JSONSerialization { From 104943f55295aad376fecea62704ca14925e4e87 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Wed, 29 Mar 2023 20:21:20 -0700 Subject: [PATCH 07/12] Working except for antecedents --- main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala | 2 ++ .../reach/mentions/serialization/json/JSONSerializer.scala | 1 + 2 files changed, 3 insertions(+) diff --git a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala index cc564ce2a..1498611df 100644 --- a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala +++ b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala @@ -14,6
+14,8 @@ object RoundTripApp extends App { def testProcessorsSerialization(mentions: Seq[Mention]): Boolean = { import org.clulab.odin.serialization.json.JSONSerializer + val sortedMentions = mentions.sorted(org.clulab.odin.serialization.json.MentionOps.mentionOrdering) + val jValue = JSONSerializer.jsonAST(mentions) val json = stringify(jValue, pretty = true) diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala index e79194168..aab537e0f 100644 --- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala +++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala @@ -51,6 +51,7 @@ object JSONSerializer extends LazyLogging { def jsonAST(mentions: Seq[Mention]): JValue = { val mentionList: List[JValue] = mentions.zipWithIndex.map { case (mention, index) => + println(s"Serializing mention $index.") MentionOps(mention).jsonAST }.toList val docMap: Map[String, JValue] = mentionsToDocsJMap(mentions) From f9cb3752b72d298b5f22345f6caad966905cec0b Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 30 Mar 2023 11:10:31 -0700 Subject: [PATCH 08/12] Did round trip --- .../org/clulab/reach/apps/RoundTripApp.scala | 2 +- .../serialization/json/JSONSerializer.scala | 36 +++++++++++++++---- .../serialization/json/MentionOps.scala | 24 ++++++++++--- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala index 1498611df..db646b8be 100644 --- a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala +++ b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala @@ -37,7 +37,7 @@ object RoundTripApp extends App { val json = MentionsOps(mentions).json(pretty = true) - val mentions2 = JSONSerializer.toCorefMentions(parse(json)) + val mentions2 = 
JSONSerializer.toCorefMentions2(parse(json)) val json2 = MentionsOps(mentions2).json(pretty = true) val result = json == json2 diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala index aab537e0f..511749e44 100644 --- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala +++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala @@ -161,20 +161,29 @@ object JSONSerializer extends LazyLogging { toCorefMentionsMap(json: JValue).values.toSeq } + def toCorefMentions2(json: JValue): Seq[CorefMention] = { + val idsAndMentions = toCorefIdsAndMentions(json) + val mentions = idsAndMentions.map(_._2) + + mentions + } + /** Produce a Map of id -> mentions from a json file */ def toCorefMentionsMap(file: File): Map[String, CorefMention] = toCorefMentionsMap(jsonAST(file)) /** Produce a Map of id -> mentions from json */ - def toCorefMentionsMap(json: JValue): Map[String, CorefMention] = { + def toCorefMentionsMap(json: JValue): Map[String, CorefMention] = toCorefIdsAndMentions(json).toMap + def toCorefIdsAndMentions(json: JValue): Seq[(String, CorefMention)] = { require(json \ "documents" != JNothing, "\"documents\" key missing from json") require(json \ "mentions" != JNothing, "\"mentions\" key missing from json") // build the documents once val docMap = OdinJSONSerializer.mkDocumentMap((json \ "documents").asInstanceOf[JObject]) val mmjson = (json \ "mentions").asInstanceOf[JArray] + val idsAndMentions = mmjson.arr.map(mjson => toCorefMentionWithId(mjson, docMap)) - mmjson.arr.map(mjson => toCorefMentionWithId(mjson, docMap)).toMap + idsAndMentions } /** Build mention from json of mention and corresponding json map of documents
@@ -241,6 +250,9 @@ object JSONSerializer extends LazyLogging { case other => toMentionByType(mjson, docMap).get.toCorefMention } + val antecedentsOpt = (mjson \ "antecedents").extractOpt[JArray] + if (antecedentsOpt.nonEmpty) + println("How can these be read in!") m.antecedents = toAntecedents(mjson, docMap) m.sieves = (mjson \ "sieves").extract[Set[String]] @@ -266,11 +278,21 @@ object JSONSerializer extends LazyLogging { private def toAntecedents(mjson: JValue, docMap: Map[String, Document]): Set[Anaphoric] = mjson \ "antecedents" match { case JNothing => Set.empty[Anaphoric] case antecedents => - antecedents - .asInstanceOf[JArray] - .arr - .map(mjson => toCorefMention(mjson, docMap)).map(_.toCorefMention) - .toSet + val arr = antecedents.asInstanceOf[JArray].arr + val ids = arr.map { elem => (elem \ "id").extract[String] } + if (ids.contains("T:-401089887") || ids.contains("T:-463773910")) + println("It's about to happen") + val list = arr.map(mjson => toCorefMention(mjson, docMap)) + val listLength = list.length + val next = list.map(_.toCorefMention) + val nextLength = next.length + val set: Set[Anaphoric] = next.toSet + val setLength = set.size + + if (listLength != nextLength || listLength != setLength) + println("This isn't supposed to happen!") + + set } private def toModifications(mjson: JValue, docMap: Map[String, Document]): Set[Modification] = mjson \ "modifications" match { diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala index a22ac9fb9..547d99c56 100644 --- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala +++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala @@ -147,7 +147,11 @@ object ReachImplicits { implicit class ModificationsOps(mods: Set[Modification]) extends JSONSerialization { def jsonAST: JValue = mods match { - case hasMods if 
hasMods.nonEmpty => hasMods.map(_.jsonAST).toList + case hasMods if hasMods.nonEmpty => + val sortedMods = hasMods.toSeq.sortBy(_.label) + val jValue = sortedMods.toList.map(_.jsonAST) + + JArray(jValue) case _ => JNothing } } @@ -164,21 +168,33 @@ object ReachImplicits { implicit class ContextOps(context: Map[String, Seq[String]]) extends JSONSerialization { def jsonAST: JValue = context match { - case hasContext if hasContext.nonEmpty => hasContext + case hasContext if hasContext.nonEmpty => + val sortedContext = context.toSeq.sortBy(_._1).toList + val mappedContext = sortedContext.map { case (key, value) => key -> JArray(value.toList.map(JString)) } + + JObject(mappedContext) case _ => JNothing } } implicit class AnaphoricOps(antecedents: Set[Anaphoric]) extends JSONSerialization { def jsonAST: JValue = antecedents match { - case hasAntecedents if hasAntecedents.nonEmpty => hasAntecedents.map(m => MentionOps(m.asInstanceOf[CorefMention]).jsonAST) + case hasAntecedents if hasAntecedents.nonEmpty => + val sortedCorefMentions = hasAntecedents.toSeq.map(_.asInstanceOf[CorefMention]).sorted(OdinMentionOps.mentionOrdering) + val jValues = sortedCorefMentions.toList.map(MentionOps(_).jsonAST) + + JArray(jValues) case _ => JNothing } } implicit class StringSetOps(ss: Set[String]) extends JSONSerialization { def jsonAST: JValue = ss match { - case contents if contents.nonEmpty => contents + case contents if contents.nonEmpty => + val sortedStrings = ss.toSeq.sorted + val jValues = sortedStrings.toList.map(JString) + + JArray(jValues) case _ => JNothing } } From a714451522a800bb0032582f3883316d2d5f93f7 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 31 Mar 2023 08:26:48 -0700 Subject: [PATCH 09/12] Sort mentions locally --- .../serialization/json/MentionOps.scala | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala 
b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala index 547d99c56..bce677d7e 100644 --- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala +++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/MentionOps.scala @@ -7,15 +7,30 @@ import org.clulab.reach.context.Context import org.clulab.reach.grounding.KBResolution import org.clulab.reach.mentions.{Anaphoric, BioEventMention, BioRelationMention, BioTextBoundMention, CorefEventMention, CorefMention, CorefRelationMention, CorefTextBoundMention, Display, EventSite, Grounding, Modification, Modifications, Mutant, PTM, SimpleModification} import org.clulab.serialization.json.JSONSerialization +import org.clulab.utils.Unordered.OrderingOrElseBy import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson._ +import scala.math.Ordering.Implicits._ // Allow Seqs to be compared to each other. + import ReachImplicits._ object MentionOps { implicit val formats = org.json4s.DefaultFormats + implicit val mentionOrdering: Ordering[Mention] = OdinMentionOps.mentionOrdering + .orElseBy { mention => + if (mention.isInstanceOf[Anaphoric]) { + val antecedents = mention.asInstanceOf[Anaphoric].antecedents + // As with the arguments in processors, use sorted token intervals. + val tokenIntervals = antecedents.toSeq.map(_.asInstanceOf[Mention].tokenInterval) + + tokenIntervals.sorted + } + else Seq.empty + } + def apply(mention: Mention): OdinMentionOps = { mention match { // Corefs must be on top because they inherit from the Bio classes. 
@@ -118,7 +133,7 @@ object ReachImplicits { implicit class SeqMentionOps(mentions: Seq[Mention]) extends JSONSerialization { - def jsonAST: JValue = JSONSerializer.jsonAST(mentions.sorted(OdinMentionOps.mentionOrdering)) + def jsonAST: JValue = JSONSerializer.jsonAST(mentions.sorted(MentionOps.mentionOrdering)) } implicit class ModificationOps(mod: Modification) extends JSONSerialization { From d0f2f8bd180a8536fd53e77f7f226543c44c6163 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 31 Mar 2023 08:35:02 -0700 Subject: [PATCH 10/12] Clean up round trip --- .../org/clulab/reach/apps/RoundTripApp.scala | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala index db646b8be..8b37b4762 100644 --- a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala +++ b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala @@ -14,8 +14,6 @@ object RoundTripApp extends App { def testProcessorsSerialization(mentions: Seq[Mention]): Boolean = { import org.clulab.odin.serialization.json.JSONSerializer - val sortedMentions = mentions.sorted(org.clulab.odin.serialization.json.MentionOps.mentionOrdering) - val jValue = JSONSerializer.jsonAST(mentions) val json = stringify(jValue, pretty = true) @@ -24,11 +22,8 @@ object RoundTripApp extends App { val json2 = stringify(jValue2, pretty = true) val result = json == json2 - if (json != json2) { + if (!result) println("Processors mentions are not equal! Do something about it!") - println(json) - println(json2) - } result } @@ -41,19 +36,25 @@ object RoundTripApp extends App { val json2 = MentionsOps(mentions2).json(pretty = true) val result = json == json2 - if (json != json2) { + if (!result) println("Reach mentions are not equal! 
Do something about it!") - println(json) - println(json2) - } result } - files.foreach { file => + var failCount = 0 + + files.par.foreach { file => val entry = PaperReader.getEntryFromPaper(file) val mentions = PaperReader.getMentionsFromEntry(entry) - testProcessorsSerialization(mentions) - testReachSerialization(mentions) + val result1 = testProcessorsSerialization(mentions) + val result2 = testReachSerialization(mentions) + + if (!(result1 && result2)) { + println(s"File ${file.getName} failed.") + failCount += 1 + } } + if (failCount != 0) + println(s"There were $failCount failures!") } From 7f37db155a1097f05eddd002369f72bf3b645174 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 31 Mar 2023 09:22:46 -0700 Subject: [PATCH 11/12] Tidy --- .../serialization/json/JSONSerializer.scala | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala index 511749e44..4ed06d755 100644 --- a/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala +++ b/main/src/main/scala/org/clulab/reach/mentions/serialization/json/JSONSerializer.scala @@ -49,9 +49,7 @@ object JSONSerializer extends LazyLogging { def jsonAST(mention: Mention): JValue = jsonAST(Seq(mention)) def jsonAST(mentions: Seq[Mention]): JValue = { - - val mentionList: List[JValue] = mentions.zipWithIndex.map { case (mention, index) => - println(s"Serializing mention $index.") + val mentionList: List[JValue] = mentions.map { mention => MentionOps(mention).jsonAST }.toList val docMap: Map[String, JValue] = mentionsToDocsJMap(mentions) @@ -250,9 +248,6 @@ object JSONSerializer extends LazyLogging { case other => toMentionByType(mjson, docMap).get.toCorefMention } - val antecedentsOpt = (mjson \ "antecedents").extractOpt[JArray] - if (antecedentsOpt.nonEmpty) - println("How can these be 
read in!") m.antecedents = toAntecedents(mjson, docMap) m.sieves = (mjson \ "sieves").extract[Set[String]] @@ -279,18 +274,8 @@ object JSONSerializer extends LazyLogging { case JNothing => Set.empty[Anaphoric] case antecedents => val arr = antecedents.asInstanceOf[JArray].arr - val ids = arr.map { elem => (elem \ "id").extract[String] } - if (ids.contains("T:-401089887") || ids.contains("T:-463773910")) - println("It's about to happen") - val list = arr.map(mjson => toCorefMention(mjson, docMap)) - val listLength = list.length - val next = list.map(_.toCorefMention) - val nextLength = next.length - val set: Set[Anaphoric] = next.toSet - val setLength = set.size - - if (listLength != nextLength || listLength != setLength) - println("This isn't supposed to happen!") + val list = arr.map(mjson => toCorefMention(mjson, docMap)).map(_.toCorefMention) + val set: Set[Anaphoric] = list.toSet set } From ab4aa6171d90a9c690f0e7cc1097961d95979db4 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 31 Mar 2023 09:23:55 -0700 Subject: [PATCH 12/12] Add output --- main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala index 8b37b4762..6c71ab3ca 100644 --- a/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala +++ b/main/src/main/scala/org/clulab/reach/apps/RoundTripApp.scala @@ -55,6 +55,5 @@ object RoundTripApp extends App { failCount += 1 } } - if (failCount != 0) - println(s"There were $failCount failures!") + println(s"There were $failCount failures!") }