diff --git a/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java b/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java
index 5f82b3e2..581ea5e6 100644
--- a/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java
+++ b/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java
@@ -16,9 +16,10 @@
import javax.xml.stream.XMLStreamException;
public class MiniOcrFormat implements OcrFormat {
- private static final Pattern pagePat =
- Pattern.compile(
- "<p (?:xml)?:id=(\"|')(?<pageId>.+?)(\"|') ?(?:wh=(\"|')(?<width>\\d+) (?<height>\\d+)(\"|'))?>");
+ private static final Pattern pageIdPat =
+ Pattern.compile("(?:xml)?:id=[\"'](?<pageId>.+?)[\"']");
+ private static final Pattern pageDimPat =
+ Pattern.compile("wh=[\"'](?<width>\\d+) (?<height>\\d+)[\"']");
private static final Map<OcrBlock, String> blockTagMapping =
ImmutableMap.of(
OcrBlock.PAGE, "p",
@@ -50,15 +51,16 @@ public OcrParser getParser(Reader input, OcrParser.ParsingFeature... features) {
@Override
public OcrPage parsePageFragment(String pageFragment) {
- Matcher m = pagePat.matcher(pageFragment);
- if (!m.find()) {
- return null;
- }
+ String pageId = null;
Dimension dims = null;
- if (m.group("width") != null && m.group("height") != null) {
+ Matcher m = pageIdPat.matcher(pageFragment);
+ if (m.find()) {
+ pageId = m.group("pageId");
+ }
+ m = pageDimPat.matcher(pageFragment);
+ if (m.find()) {
dims = new Dimension(Integer.parseInt(m.group("width")), Integer.parseInt(m.group("height")));
}
- String pageId = m.group("pageId");
return new OcrPage(pageId, dims);
}