Skip to content

Commit

Permalink
miniocr: Make page fragment parsing more robust (#447)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbaiter committed Sep 30, 2024
1 parent 1eb778f commit 8224fee
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 10 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>de.digitalcollections</groupId>
<artifactId>solr-ocrhighlighting</artifactId>
- <version>0.9.1</version>
+ <version>0.9.2-SNAPSHOT</version>

<name>Solr OCR Highlighting Plugin</name>
<description>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
import javax.xml.stream.XMLStreamException;

public class MiniOcrFormat implements OcrFormat {
- private static final Pattern pagePat =
- Pattern.compile(
- "<p (?:xml)?:id=(\"|')(?<pageId>.+?)(\"|') ?(?:wh=(\"|')(?<width>\\d+) (?<height>\\d+)(\"|'))?>");
+ private static final Pattern pageIdPat =
+ Pattern.compile("(?:xml)?:id=[\"'](?<pageId>.+?)[\"']");
+ private static final Pattern pageDimPat =
+ Pattern.compile("wh=[\"'](?<width>\\d+) (?<height>\\d+)[\"']");
private static final Map<OcrBlock, String> blockTagMapping =
ImmutableMap.of(
OcrBlock.PAGE, "p",
Expand Down Expand Up @@ -50,15 +51,16 @@ public OcrParser getParser(Reader input, OcrParser.ParsingFeature... features) {

@Override
public OcrPage parsePageFragment(String pageFragment) {
- Matcher m = pagePat.matcher(pageFragment);
- if (!m.find()) {
- return null;
- }
+ String pageId = null;
  Dimension dims = null;
- if (m.group("width") != null && m.group("height") != null) {
+ Matcher m = pageIdPat.matcher(pageFragment);
+ if (m.find()) {
+ pageId = m.group("pageId");
+ }
+ m = pageDimPat.matcher(pageFragment);
+ if (m.find()) {
  dims = new Dimension(Integer.parseInt(m.group("width")), Integer.parseInt(m.group("height")));
  }
- String pageId = m.group("pageId");
return new OcrPage(pageId, dims);
}

Expand Down

0 comments on commit 8224fee

Please sign in to comment.