From e3be7dff205ffcbed0a965f47a5a7a7f15df6a3f Mon Sep 17 00:00:00 2001 From: Johannes Baiter Date: Tue, 9 Jul 2024 13:17:46 +0200 Subject: [PATCH] Fix incorrect output offset for multi-char codepoints in ExternalUtf8ContentFilter#correctOffsets --- .../solrocr/lucene/filters/ExternalUtf8ContentFilter.java | 5 +++-- src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java index b790fdab..e715db73 100644 --- a/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java +++ b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java @@ -138,10 +138,11 @@ private void correctOffsets(char[] decodedChars, int bufOffset, int numChars) { this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference); lastCharHadMultipleBytes = false; } - currentOutCharOffset += 1; int cp = Character.codePointAt(decodedChars, i); - i += Character.charCount(cp); int encodedLen = Utf8.encodedLength(cp); + int charLen = Character.charCount(cp); + i += charLen; + currentOutCharOffset += charLen; currentInByteOffset += encodedLen; if (encodedLen > 1) { cumulativeOffsetDifference += (encodedLen - 1); diff --git a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java index 8b4ce919..68a831a6 100644 --- a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java +++ b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java @@ -68,7 +68,7 @@ public Reader create(Reader input) { + ptrStr); } adjustRegions(pointer); - // Section size and cache size dont't matter, since we don't use sectioned reads during + // Section size and cache size don't matter, since we don't use sectioned reads during // indexing. 
SourceReader r = pointer.getReader(512 * 1024, 0); List regions =