ExternalUtf8ContentFilterFactory: Don't convert pointer offsets to char

Instead, calculate the difference between byte offsets and char offsets while we read through the input during parsing. This saves us a whole pass through the input files, which should improve performance for non-filesystem-based sources (where we don't have the page cache to help us out).
dbmdz · Jul 8, 2024 · 7623d2b · 7623d2b
1 parent d4c4b8b
commit 7623d2b
Show file tree

Hide file tree

Showing 6 changed files with 511 additions and 155 deletions.
diff --git a/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ByteSeekableReader.java b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ByteSeekableReader.java
@@ -0,0 +1,36 @@
+package com.github.dbmdz.solrocr.lucene.filters;
+
+import com.github.dbmdz.solrocr.reader.StreamDecoder;
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.channels.SeekableByteChannel;
+import java.nio.charset.StandardCharsets;
+
+public class ByteSeekableReader extends Reader {
+  private final SeekableByteChannel channel;
+  private StreamDecoder decoder;
+
+  public ByteSeekableReader(SeekableByteChannel channel) {
+    this.channel = channel;
+    this.decoder = StreamDecoder.forDecoder(channel, StandardCharsets.UTF_8.newDecoder(), -1);
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return this.decoder.read(cbuf, off, len);
+  }
+
+  public int position() throws IOException {
+    return (int) this.channel.position();
+  }
+
+  public void position(int newPosition) throws IOException {
+    this.channel.position(newPosition);
+    this.decoder = StreamDecoder.forDecoder(channel, StandardCharsets.UTF_8.newDecoder(), -1);
+  }
+
+  @Override
+  public void close() throws IOException {
+    this.channel.close();
+  }
+}
diff --git a/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java
@@ -1,10 +1,13 @@
 package com.github.dbmdz.solrocr.lucene.filters;
 
 import com.github.dbmdz.solrocr.model.SourcePointer;
+import com.github.dbmdz.solrocr.model.SourcePointer.Region;
 import com.github.dbmdz.solrocr.util.SourceAwareReader;
 import com.github.dbmdz.solrocr.util.Utf8;
+import com.google.common.collect.ImmutableList;
 import java.io.IOException;
-import java.io.Reader;
+import java.nio.CharBuffer;
+import java.nio.channels.SeekableByteChannel;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Optional;
@@ -18,37 +21,46 @@ public class ExternalUtf8ContentFilter extends BaseCharFilter implements SourceA
    * current position.
    *
    * <pre>
-   * current actual byte offset in input = currentOutOffset + cumulative
+   * current actual __byte__ offset in input = currentOutCharOffset + cumulativeOffsetDifference
    * </pre>
    */
-  private int cumulative;
+  private int cumulativeOffsetDifference;
 
-  /** The current <strong>char</strong> offset in the full file; */
-  private int currentInOffset;
+  /** The current <strong>byte</strong> offset in the <strong>full</strong> input. */
+  private int currentInByteOffset;
 
-  /** The current <strong>char</strong> offset in the output. */
-  private int currentOutOffset;
+  /**
+   * The current <strong>char</strong> offset in the output, i.e. the concatenated region content.
+   */
+  private int currentOutCharOffset;
 
   /** Source pointer of this reader, used for debugging and error reporting. */
   private final String pointer;
 
+  /** Whether an offset correction is pending because the previous char was multi-byte in UTF-8. */
   private boolean nextIsOffset = false;
+
   private final Queue<SourcePointer.Region> remainingRegions;
   private SourcePointer.Region currentRegion;
 
-  public ExternalUtf8ContentFilter(Reader input, List<SourcePointer.Region> regions, String pointer)
+  public ExternalUtf8ContentFilter(
+      SeekableByteChannel channel, List<SourcePointer.Region> regions, String pointer)
       throws IOException {
-    super(input);
+    super(new ByteSeekableReader(channel));
+    if (regions == null || regions.isEmpty()) {
+      regions = ImmutableList.of(new Region(0, (int) channel.size()));
+    }
     this.pointer = pointer;
-    this.currentOutOffset = 0;
-    this.currentInOffset = 0;
-    this.cumulative = 0;
+    this.currentOutCharOffset = 0;
+    this.currentInByteOffset = 0;
+    this.cumulativeOffsetDifference = 0;
     this.remainingRegions = new LinkedList<>(regions);
     currentRegion = remainingRegions.remove();
     if (currentRegion.start > 0) {
-      this.addOffCorrectMap(currentOutOffset, currentRegion.startOffset);
-      this.cumulative += currentRegion.startOffset;
-      this.currentInOffset = (int) this.input.skip(currentRegion.start);
+      this.addOffCorrectMap(currentOutCharOffset, currentRegion.start);
+      this.cumulativeOffsetDifference += currentRegion.start;
+      this.currentInByteOffset = currentRegion.start;
+      ((ByteSeekableReader) this.input).position(currentInByteOffset);
     }
   }
 
@@ -58,39 +70,41 @@ public ExternalUtf8ContentFilter(Reader input, List<SourcePointer.Region> region
    */
   @Override
   public int read(char[] cbuf, int off, int len) throws IOException {
-    if (currentInOffset == currentRegion.end) {
+    if (currentInByteOffset == currentRegion.end) {
       return -1;
     }
 
     int numCharsRead = 0;
     while (len - numCharsRead > 0) {
-      int charsRemainingInRegion = currentRegion.end - currentInOffset;
+      int bytesRemainingInRegion = currentRegion.end - currentInByteOffset;
       int charsToRead = len - numCharsRead;
-      if (charsToRead > charsRemainingInRegion) {
-        charsToRead = charsRemainingInRegion;
+      if (charsToRead > bytesRemainingInRegion) {
+        charsToRead = bytesRemainingInRegion;
       }
 
-      int read = this.input.read(cbuf, off, charsToRead);
-      if (read < 0) {
+      int charsRead = this.input.read(cbuf, off, charsToRead);
+      if (charsRead < 0) {
         break;
       }
-      correctOffsets(cbuf, off, read);
-      numCharsRead += read;
-      off += read;
+      while (Utf8.encodedLength(CharBuffer.wrap(cbuf, off, charsRead)) > bytesRemainingInRegion) {
+        charsRead--;
+      }
+      correctOffsets(cbuf, off, charsRead);
+      numCharsRead += charsRead;
+      off += charsRead;
 
-      if (currentInOffset == currentRegion.end) {
+      if (currentInByteOffset == currentRegion.end) {
         if (remainingRegions.isEmpty()) {
           break;
         }
         currentRegion = remainingRegions.remove();
 
-        cumulative = currentRegion.startOffset - currentOutOffset;
-        this.addOffCorrectMap(currentOutOffset, cumulative);
-        int toSkip = this.currentRegion.start - this.currentInOffset;
-        if (toSkip > 0) {
-          this.input.skip(this.currentRegion.start - this.currentInOffset);
+        cumulativeOffsetDifference = currentRegion.start - currentOutCharOffset;
+        this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference);
+        if (this.currentRegion.start > this.currentInByteOffset) {
+          this.currentInByteOffset = currentRegion.start;
         }
-        this.currentInOffset = currentRegion.start;
+        ((ByteSeekableReader) this.input).position(this.currentInByteOffset);
       }
     }
     return numCharsRead > 0 ? numCharsRead : -1;
@@ -99,15 +113,15 @@ public int read(char[] cbuf, int off, int len) throws IOException {
   private void correctOffsets(char[] cbuf, int off, int len) {
     for (int i = off; i < off + len; i++) {
       if (nextIsOffset) {
-        this.addOffCorrectMap(currentOutOffset, cumulative);
+        this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference);
         nextIsOffset = false;
       }
-      currentInOffset += 1;
-      currentOutOffset += 1;
+      currentOutCharOffset += 1;
       int cp = Character.codePointAt(cbuf, i);
-      int increment = Utf8.encodedLength(cp) - 1;
-      if (increment > 0) {
-        cumulative += increment;
+      int encodedLen = Utf8.encodedLength(cp);
+      currentInByteOffset += encodedLen;
+      if (encodedLen > 1) {
+        cumulativeOffsetDifference += (encodedLen - 1);
         nextIsOffset = true;
       }
     }

diff --git a/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java b/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java
@@ -3,13 +3,13 @@
 import com.github.dbmdz.solrocr.reader.FileSourceReader;
 import com.github.dbmdz.solrocr.reader.MultiFileSourceReader;
 import com.github.dbmdz.solrocr.reader.SourceReader;
-import com.google.common.collect.ImmutableList;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.List;
@@ -75,7 +75,7 @@ static Source parse(String pointer) {
         throw new RuntimeException("Could not parse source pointer from '" + pointer + ".");
       }
       String target = m.group("target");
-      List<Region> regions = ImmutableList.of();
+      List<Region> regions = new ArrayList<>();
       if (m.group("regions") != null) {
         regions =
             Arrays.stream(m.group("regions").split(","))
@@ -124,7 +124,6 @@ public static class Region {
 
     public int start;
     public int end;
-    public int startOffset = 0;
 
     public static Region parse(String r) {
       if (r.startsWith(":")) {
@@ -142,11 +141,6 @@ public Region(int start, int end) {
       this.end = end;
     }
 
-    public Region(int start, int end, int startOffset) {
-      this(start, end);
-      this.startOffset = startOffset;
-    }
-
     @Override
     public String toString() {
       return start + ":" + end;