From 42278683a42cb369f079bc64fd26600126de894f Mon Sep 17 00:00:00 2001 From: Katharina Schmid Date: Tue, 18 Jun 2024 15:13:40 +0200 Subject: [PATCH] Determine regions independent of sectionReadSize --- .../formats/hocr/HocrClassBreakLocator.java | 22 ++++++++++++++----- .../solrocr/reader/BaseSourceReader.java | 2 +- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/github/dbmdz/solrocr/formats/hocr/HocrClassBreakLocator.java b/src/main/java/com/github/dbmdz/solrocr/formats/hocr/HocrClassBreakLocator.java index d3ae0255..f8b66576 100644 --- a/src/main/java/com/github/dbmdz/solrocr/formats/hocr/HocrClassBreakLocator.java +++ b/src/main/java/com/github/dbmdz/solrocr/formats/hocr/HocrClassBreakLocator.java @@ -135,6 +135,7 @@ protected int getPreceding(int offset) throws IOException { /** Find a match for one of the break classes in the given String, seeking forward. */ private int findForwardMatch(String text, int fromOffset, int toOffset) { + int match = Integer.MAX_VALUE; for (String breakClass : this.breakClasses) { // Where to start looking from for a break in the next iteration int fromIdx = fromOffset; @@ -169,11 +170,17 @@ private int findForwardMatch(String text, int fromOffset, int toOffset) { fromIdx = closeIdx; continue; } - // Found a match - return openIdx; + // Found a match, try next class to see whether there is a match closer to the offset + if (openIdx < match) { + match = openIdx; + } + break; } } - return -1; + if (match == Integer.MAX_VALUE) { + return -1; + } + return match; } /** Find a match for one of the break classes in the given String, seeking backwards. */ @@ -185,6 +192,7 @@ private int findBackwardMatch(String text, int fromOffset, int toOffset) { assert fromOffset > toOffset : "fromOffset must be greater than toOffset, we're looking backwards!"; + int match = -1; for (String breakClass : this.breakClasses) { // Look for the class in the block while (fromOffset > toOffset) { @@ -202,9 +210,13 @@ private int findBackwardMatch(String text, int fromOffset, int toOffset) { fromOffset = Math.max(previousClose, elemOpen); continue; } - return elemOpen; + // Found match, try next class to see whether there is a match closer to the offset + if (elemOpen > match) { + match = elemOpen; + } + break; } } - return -1; + return match; } } diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java index e56a6b50..f93b3eea 100644 --- a/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java +++ b/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java @@ -217,7 +217,7 @@ public Section getAsciiSection(int offset) throws IOException { int startOffset = sectionIndex * sectionSize; int readLen = Math.min(sectionSize, this.length() - startOffset); int numRead = 0; - while(numRead < readLen) { + while (numRead < readLen) { numRead += this.readBytes(copyBuf, numRead, startOffset + numRead, readLen - numRead); } // Construct a String without going through a decoder to save on CPU.