diff --git a/pom.xml b/pom.xml index 9217187e..97b69d75 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ <groupId>de.digitalcollections</groupId> <artifactId>solr-ocrhighlighting</artifactId> - <version>0.9.1</version> + <version>0.9.2-SNAPSHOT</version> <name>Solr OCR Highlighting Plugin</name> diff --git a/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java b/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java index 5f82b3e2..581ea5e6 100644 --- a/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java +++ b/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java @@ -16,9 +16,10 @@ import javax.xml.stream.XMLStreamException; public class MiniOcrFormat implements OcrFormat { - private static final Pattern pagePat = - Pattern.compile( - "<p (?:xml)?:id=(\"|')(?<pageId>

.+?)(\"|') ?(?:wh=(\"|')(?<width>\\d+) (?<height>\\d+)(\"|'))?>"); + private static final Pattern pageIdPat = + Pattern.compile("(?:xml)?:id=[\"'](?<pageId>.+?)[\"']"); + private static final Pattern pageDimPat = + Pattern.compile("wh=[\"'](?<width>\\d+) (?<height>\\d+)[\"']"); private static final Map<OcrBlock, String> blockTagMapping = ImmutableMap.of( OcrBlock.PAGE, "p", @@ -50,15 +51,16 @@ public OcrParser getParser(Reader input, OcrParser.ParsingFeature... features) { @Override public OcrPage parsePageFragment(String pageFragment) { - Matcher m = pagePat.matcher(pageFragment); - if (!m.find()) { - return null; - } + String pageId = null; Dimension dims = null; - if (m.group("width") != null && m.group("height") != null) { + Matcher m = pageIdPat.matcher(pageFragment); + if (m.find()) { + pageId = m.group("pageId"); + } + m = pageDimPat.matcher(pageFragment); + if (m.find()) { dims = new Dimension(Integer.parseInt(m.group("width")), Integer.parseInt(m.group("height"))); } - String pageId = m.group("pageId"); return new OcrPage(pageId, dims); }