Skip to content

Commit

Permalink
Merge pull request #3 from ratschlab/feature/conversionCommand
Browse files Browse the repository at this point in the history
Feature/conversion command
  • Loading branch information
Marc Zimmermann authored Oct 8, 2021
2 parents 60d7c94 + 67bc3bc commit 86a37a4
Show file tree
Hide file tree
Showing 31 changed files with 490 additions and 264 deletions.
27 changes: 14 additions & 13 deletions deidentifier-pipeline/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<junitVersion>5.8.1</junitVersion>
</properties>
<dependencies>
<!-- https://mvnrepository.com/artifact/uk.ac.gate/gate-core -->
Expand All @@ -33,35 +34,41 @@
<dependency>
<groupId>uk.ac.gate</groupId>
<artifactId>gate-plugin-test-utils</artifactId>
<version>8.6.1</version>
<version>9.0.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.9.7</version>
<version>2.9.10.8</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>5.3.1</version>
<version>${junitVersion}</version>
<!-- <scope>test</scope> -->
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<version>${junitVersion}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>5.3.1</version>
<version>${junitVersion}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-launcher</artifactId>
<version>1.1.0</version>
<version>1.7.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-runner</artifactId>
<version>1.1.0</version>
<version>1.7.2</version>
<scope>test</scope>
</dependency>
<dependency>
Expand All @@ -76,12 +83,6 @@
<version>1.4.200</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<version>5.3.1</version>
<!-- <scope>test</scope> -->
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-library</artifactId>
Expand Down Expand Up @@ -137,7 +138,7 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>20.0</version>
<version>30.1.1-jre</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
Expand Down
11 changes: 11 additions & 0 deletions deidentifier-pipeline/src/main/java/org/ratschlab/DeidCmd.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package org.ratschlab;

import java.util.concurrent.Callable;

/**
 * Common base for the command classes of the de-identification tool.
 *
 * <p>Subclasses are expected to invoke {@code super.call()} at the start of their own
 * {@link #call()} implementation so that the shared setup below runs before any
 * command-specific work.
 */
public class DeidCmd implements Callable<Integer> {

    /**
     * Runs the setup shared by all commands.
     *
     * <p>Delegates to {@code Utils.tieSystemOutAndErrToLog()} (presumably this routes
     * {@code System.out}/{@code System.err} into the logging framework; confirm in
     * {@code org.ratschlab.util.Utils}).
     *
     * @return always {@code null}; this base implementation produces no exit code of
     *         its own, so callers must not unbox the result
     */
    @Override
    public Integer call() {
        org.ratschlab.util.Utils.tieSystemOutAndErrToLog();
        return null;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import gate.util.GateException;
import org.ratschlab.deidentifier.pipelines.PipelineFactory;
import org.ratschlab.deidentifier.sources.ImportCmd;
import org.ratschlab.deidentifier.sources.KisimFormat;
import org.ratschlab.deidentifier.sources.KisimSource;
import org.ratschlab.deidentifier.utils.DbCommands;
import org.ratschlab.deidentifier.utils.paths.PathConstraint;
Expand All @@ -28,12 +29,15 @@
import java.util.stream.Stream;

@CommandLine.Command(description = "Annotate Corpus", name = "annotate")
public class AnnotationCmd extends DbCommands implements Callable<Integer> {
public class AnnotationCmd extends DbCommands {
private static final Logger log = LoggerFactory.getLogger(AnnotationCmd.class);

@CommandLine.Option(names = {"-i"}, description = "Input corpus dir")
private String corpusInputDirPath = null;

@CommandLine.Option(names = {"--json-input"}, description = "Assumes input dir consists of json files, one per report")
private boolean jsonInput = false;

@CommandLine.Option(names = {"--xml-input"}, description = "Assumes input dir consists of xml files, one per report (testing purposes)")
private boolean xmlInput = false;

Expand Down Expand Up @@ -62,8 +66,48 @@ public static void main(String[] args) {
System.exit(CommandLine.call(new AnnotationCmd(), args));
}

/**
 * Builds the workflow that reads its input documents from {@code corpusInputDirPath}.
 *
 * <p>The input layout is selected by the command line flags:
 * <ul>
 *   <li>neither {@code --xml-input} nor {@code --json-input}: the directory is handed to
 *       {@code GateTools.readDocsInCorpus};</li>
 *   <li>{@code --xml-input} (takes precedence if both flags are set): every file in the
 *       directory is read with {@code GateTools.readDocumentFromFile};</li>
 *   <li>{@code --json-input}: every file is converted with {@code KisimFormat.jsonToDocument};
 *       the document name and its {@code "reportnr"} feature are set to the file name
 *       without a trailing {@code .json} extension.</li>
 * </ul>
 * In the two per-file modes the file stream is passed through {@code docsLimiting}.
 *
 * @param concerns   workflow concerns forwarded to the created workflow
 * @param controller pipeline controller forwarded to the created workflow
 * @return the workflow, not yet started
 * @throws IllegalArgumentException if a per-file mode is selected and the input directory
 *                                  cannot be listed (missing, not a directory, or unreadable)
 * @throws Exception                if setting up the document source fails
 */
private PipelineWorkflow<?> readFromFiles(List<WorkflowConcern> concerns, SerialAnalyserController controller) throws Exception {
    File inputDir = new File(corpusInputDirPath);

    if (!xmlInput && !jsonInput) {
        return new PipelineWorkflow<>(
                GateTools.readDocsInCorpus(inputDir),
                d -> d,
                controller,
                threads,
                concerns);
    }

    List<File> inputFiles = listInputFiles(inputDir);

    if (xmlInput) {
        return new PipelineWorkflow<>(
                docsLimiting(inputFiles.stream()),
                org.ratschlab.util.Utils.exceptionWrapper(f -> Optional.of(GateTools.readDocumentFromFile(f))),
                controller,
                threads,
                concerns);
    }

    KisimFormat ksf = new KisimFormat();
    return new PipelineWorkflow<>(
            docsLimiting(inputFiles.stream()),
            org.ratschlab.util.Utils.exceptionWrapper(f -> {
                Document doc = ksf.jsonToDocument(f);

                // Strip only the trailing extension. The previous replaceAll(".json", "")
                // was a regex: '.' matched any character and every occurrence was removed,
                // which mangled names such as "12json3.json".
                String reportName = stripJsonSuffix(f.getName());
                doc.setName(reportName);
                doc.getFeatures().put("reportnr", reportName);
                return Optional.of(doc);
            }),
            controller,
            threads,
            concerns);
}

/** Lists the entries of {@code dir}, failing with a descriptive message instead of an NPE. */
private static List<File> listInputFiles(File dir) {
    // File.listFiles() returns null (not an empty array) when dir is not a directory
    // or an I/O error occurs.
    File[] entries = dir.listFiles();
    if (entries == null) {
        throw new IllegalArgumentException("Cannot list input directory: " + dir);
    }
    return Lists.newArrayList(entries);
}

/** Returns {@code fileName} without a trailing {@code .json}; other names are returned unchanged. */
private static String stripJsonSuffix(String fileName) {
    final String suffix = ".json";
    return fileName.endsWith(suffix)
            ? fileName.substring(0, fileName.length() - suffix.length())
            : fileName;
}

@Override
public Integer call() {
super.call();

if(corpusInputDirPath == null && databaseConfigPath == null) {
System.err.println("Need at least -i or -d");
return 1;
Expand Down Expand Up @@ -128,45 +172,24 @@ public Integer call() {
concerns.add(new EvaluateCorpus(String.format("%s-manual", PipelineFactory.finalASName), PipelineFactory.finalASName, PipelineFactory.annotationTypes, corpusOutputDir, reportOutput));
}

if(corpusInputDirPath != null && !xmlInput) {
PipelineWorkflow<Optional<Document>> workflow = new PipelineWorkflow<>(
GateTools.readDocsInCorpus(new File(corpusInputDirPath)),
d -> d,
myController,
threads,
concerns);

workflow.run();
}
else if(corpusInputDirPath != null) {
List<File> inputFiles = Lists.newArrayList(new File(corpusInputDirPath).listFiles());

if(maxDocs > 0 && inputFiles.size() > maxDocs) {
inputFiles = inputFiles.subList(0, maxDocs);
}

PipelineWorkflow<File> workflow = new PipelineWorkflow<>(
inputFiles.stream(),
f -> GateTools.readDocumentFromFile(f),
myController,
threads,
concerns);

workflow.run();
PipelineWorkflow<?> workflow;
if (corpusInputDirPath != null) {
workflow = readFromFiles(concerns, myController);
} else {
KisimSource ks = new KisimSource(new File(databaseConfigPath));
Stream<Map<String, Object>> records = docsLimiting(ImportCmd.documentRecordsStream(ks, Optional.ofNullable(docTypeFilterPath),
Optional.ofNullable(docIdFilterPath)));

PipelineWorkflow<Map<String, Object>> workflow = new PipelineWorkflow<>(
workflow = new PipelineWorkflow<>(
records,
p -> ImportCmd.kisimDocConversion(p, ks),
org.ratschlab.util.Utils.exceptionWrapper(p -> Optional.of(ImportCmd.kisimDocConversion(p, ks))),
myController,
threads,
concerns);

workflow.run();
}

workflow.run();

} catch (GateException e) {
e.printStackTrace();
return 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.ratschlab.deidentifier.dev.DiagnosticsCmd;
import org.ratschlab.deidentifier.pipelines.testing.PipelineTesterCmd;
import org.ratschlab.deidentifier.sources.ConversionCmd;
import org.ratschlab.deidentifier.sources.ImportCmd;
import org.ratschlab.deidentifier.substitution.SubstitutionCmd;
import picocli.CommandLine;
Expand All @@ -13,6 +14,7 @@
SubstitutionCmd.class,
AnnotationCmd.class,
ImportCmd.class,
ConversionCmd.class,
DiagnosticsCmd.class,
PipelineTesterCmd.class
}
Expand All @@ -25,8 +27,10 @@ public Integer call() {
}

public static void main(String[] args) {
org.ratschlab.util.Utils.tieSystemOutAndErrToLog();
int exitCode = CommandLine.call(new DeidMain(), args);
Integer exitCode = CommandLine.call(new DeidMain(), args);
if(exitCode == null) {
System.exit(2);
}
System.exit(exitCode);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import com.typesafe.config.Config;
import gate.Document;
import gate.Gate;
import gate.creole.ResourceInstantiationException;
import gate.creole.SerialAnalyserController;
import gate.util.GateException;
import org.apache.commons.lang3.tuple.Pair;
Expand Down Expand Up @@ -51,22 +52,15 @@ public static void main(String[] args) throws IOException, SQLException, GateExc

PipelineWorkflow<Pair<String, String>> workflow = new PipelineWorkflow<>(
records,
p -> {
try {
ObjectMapper om = new ObjectMapper();
// parse and emit string again to not have to deal with formatting issues during assert
String jsonStr = om.writeValueAsString(om.reader().readTree(p.getRight()));

//System.out.println(p.getLeft());
Files.write(new File(String.format("/home/marczim/data/deid_poc/sets/kisim/kisim_json/%s.json", p.getLeft())).toPath(), jsonStr.getBytes(StandardCharsets.UTF_8));

return Optional.of(checkConversion(p.getRight(), myController));
} catch (IOException e) {
e.printStackTrace();
}

return Optional.empty();
},
org.ratschlab.util.Utils.exceptionWrapper(p -> {
ObjectMapper om = new ObjectMapper();
// parse and emit string again to not have to deal with formatting issues during assert
String jsonStr = om.writeValueAsString(om.reader().readTree(p.getRight()));

Files.write(new File(String.format("/home/marczim/data/deid_poc/sets/kisim/kisim_json/%s.json", p.getLeft())).toPath(), jsonStr.getBytes(StandardCharsets.UTF_8));

return Optional.of(checkConversion(p.getRight(), myController));
}),
PipelineFactory.NoOpController(),
threads,
new ArrayList<>());
Expand All @@ -75,7 +69,7 @@ public static void main(String[] args) throws IOException, SQLException, GateExc

}

public static Document checkConversion(String kisimJson, SerialAnalyserController myController) throws IOException {
public static Document checkConversion(String kisimJson, SerialAnalyserController myController) throws IOException, ResourceInstantiationException {
ObjectMapper om = new ObjectMapper();
// parse and emit string again to not have to deal with formatting issues during assert
String jsonStr = om.writeValueAsString(om.reader().readTree(kisimJson));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@

import gate.Gate;
import gate.util.GateException;
import org.ratschlab.DeidCmd;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import picocli.CommandLine;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.Callable;

@CommandLine.Command(description = "Tests a pipeline", name = "test")
public class PipelineTesterCmd implements Callable<Integer> {
public class PipelineTesterCmd extends DeidCmd {
private static final Logger log = LoggerFactory.getLogger(PipelineTesterCmd.class);

@CommandLine.Parameters(index = "0", description = "Pipeline Configuration File")
Expand All @@ -29,6 +29,8 @@ public static void main(String[] args) {

@Override
public Integer call() {
super.call();

try {
Gate.init();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import gate.Document;
import gate.Gate;
import gate.creole.ResourceInstantiationException;
import gate.util.GateException;
import org.apache.commons.lang3.tuple.Pair;
import org.ratschlab.deidentifier.utils.DbCommands;
Expand All @@ -20,7 +21,7 @@
import java.util.stream.Stream;

@CommandLine.Command(mixinStandardHelpOptions = true, description = "Roundtrip test between JSON <--> GATE format", name = "conversioncheck")
public class ConversionCheckCmd extends DbCommands implements Callable<Integer> {
public class ConversionCheckCmd extends DbCommands {

private static final Logger log = LoggerFactory.getLogger(ConversionCheckCmd.class);

Expand All @@ -29,6 +30,8 @@ public class ConversionCheckCmd extends DbCommands implements Callable<Integer>

@Override
public Integer call() {
super.call();

try {
Gate.init();

Expand All @@ -47,9 +50,8 @@ public Integer call() {
}

return !checkConversion(p.getLeft(), p.getRight(), out);
} catch (IOException e) {
log.error("Exception thrown", e);
return true;
} catch (IOException|ResourceInstantiationException e) {
throw new RuntimeException(e);
}
}).count();

Expand All @@ -62,7 +64,7 @@ public Integer call() {
return 0;
}

public static boolean checkConversion(String docId, String kisimJson, PrintStream out) throws IOException {
public static boolean checkConversion(String docId, String kisimJson, PrintStream out) throws IOException, ResourceInstantiationException {
ObjectMapper om = new ObjectMapper();
// parse and emit string again to not have to deal with formatting issues during assert
String jsonStr = om.writeValueAsString(om.reader().readTree(kisimJson));
Expand Down
Loading

0 comments on commit 86a37a4

Please sign in to comment.