Skip to content

Commit

Permalink
ExternalUtf8ContentFilterFactory: Don't convert pointer offsets to char
Browse files Browse the repository at this point in the history
Instead, calculate the difference between byte and char offsets while we
read through the input during parsing. This saves us a whole pass through
the input files, which should improve performance for non-filesystem-based
sources (where we don't have the page cache to help us out).
  • Loading branch information
jbaiter committed Jul 8, 2024
1 parent d4c4b8b commit 7623d2b
Show file tree
Hide file tree
Showing 6 changed files with 511 additions and 155 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package com.github.dbmdz.solrocr.lucene.filters;

import com.github.dbmdz.solrocr.reader.StreamDecoder;
import java.io.IOException;
import java.io.Reader;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;

/**
 * A {@link Reader} that decodes UTF-8 text from a {@link SeekableByteChannel} and can be
 * repositioned to an arbitrary <strong>byte</strong> offset of that channel.
 *
 * <p>Repositioning discards the current decoder and creates a fresh one, so no decoder state
 * from before the seek is carried over to reads after it.
 */
public class ByteSeekableReader extends Reader {
  private final SeekableByteChannel channel;

  /** Decoder for the channel; replaced whenever the channel is repositioned. */
  private StreamDecoder decoder;

  /**
   * Creates a reader that decodes the given channel as UTF-8, starting at the channel's current
   * position.
   *
   * @param channel the channel to read bytes from
   */
  public ByteSeekableReader(SeekableByteChannel channel) {
    this.channel = channel;
    this.decoder = newDecoder();
  }

  /**
   * Reads decoded characters into {@code cbuf} by delegating to the current decoder.
   *
   * @param cbuf destination buffer
   * @param off offset in {@code cbuf} at which to start storing characters
   * @param len maximum number of characters to read
   * @return whatever the decoder's {@code read} returns for this request
   * @throws IOException if the decoder fails to read
   */
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    return this.decoder.read(cbuf, off, len);
  }

  /**
   * Returns the current byte position of the underlying channel.
   *
   * <p>NOTE(review): this is the position of the channel itself; if the decoder reads ahead of
   * the characters it has handed out, this value is larger than the offset of the last consumed
   * character -- confirm against {@code StreamDecoder}.
   *
   * @return the channel's position in bytes
   * @throws IOException if the channel cannot report its position
   * @throws ArithmeticException if the position does not fit into an {@code int}
   */
  public int position() throws IOException {
    // A bare (int) cast would silently wrap for positions beyond Integer.MAX_VALUE and hand the
    // caller a wrong (possibly negative) offset, so fail loudly instead.
    return Math.toIntExact(this.channel.position());
  }

  /**
   * Moves the underlying channel to the given byte position and starts decoding afresh from
   * there.
   *
   * @param newPosition the byte offset to seek to
   * @throws IOException if the channel cannot be repositioned
   */
  public void position(int newPosition) throws IOException {
    this.channel.position(newPosition);
    // The old decoder must not be reused after a seek, so replace it with a fresh one.
    this.decoder = newDecoder();
  }

  /**
   * Closes the underlying channel.
   *
   * @throws IOException if closing the channel fails
   */
  @Override
  public void close() throws IOException {
    this.channel.close();
  }

  /** Builds a fresh UTF-8 decoder on top of the channel. */
  private StreamDecoder newDecoder() {
    // NOTE(review): -1 is passed through unchanged from the original code; presumably it selects
    // the decoder's default buffer size -- confirm against StreamDecoder.forDecoder.
    return StreamDecoder.forDecoder(channel, StandardCharsets.UTF_8.newDecoder(), -1);
  }
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package com.github.dbmdz.solrocr.lucene.filters;

import com.github.dbmdz.solrocr.model.SourcePointer;
import com.github.dbmdz.solrocr.model.SourcePointer.Region;
import com.github.dbmdz.solrocr.util.SourceAwareReader;
import com.github.dbmdz.solrocr.util.Utf8;
import com.google.common.collect.ImmutableList;
import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.channels.SeekableByteChannel;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
Expand All @@ -18,37 +21,46 @@ public class ExternalUtf8ContentFilter extends BaseCharFilter implements SourceA
* current position.
*
* <pre>
* current actual byte offset in input = currentOutOffset + cumulative
* current actual __byte__ offset in input = currentOutCharOffset + cumulativeOffsetDifference
* </pre>
*/
private int cumulative;
private int cumulativeOffsetDifference;

/** The current <strong>char</strong> offset in the full file; */
private int currentInOffset;
/** The current <strong>byte</strong> offset in the <strong>full</strong> input; */
private int currentInByteOffset;

/** The current <strong>char</strong> offset in the output. */
private int currentOutOffset;
/**
* The current <strong>char</strong> offset in the output, i.e. the concatenated region content.
*/
private int currentOutCharOffset;

/** Source pointer of this reader, used for debugging and error reporting. */
private final String pointer;

/** Whether an offset correction must be registered before the next char, because the previous char was encoded as more than one byte in UTF-8 */
private boolean nextIsOffset = false;

private final Queue<SourcePointer.Region> remainingRegions;
private SourcePointer.Region currentRegion;

public ExternalUtf8ContentFilter(Reader input, List<SourcePointer.Region> regions, String pointer)
public ExternalUtf8ContentFilter(
SeekableByteChannel channel, List<SourcePointer.Region> regions, String pointer)
throws IOException {
super(input);
super(new ByteSeekableReader(channel));
if (regions == null || regions.isEmpty()) {
regions = ImmutableList.of(new Region(0, (int) channel.size()));
}
this.pointer = pointer;
this.currentOutOffset = 0;
this.currentInOffset = 0;
this.cumulative = 0;
this.currentOutCharOffset = 0;
this.currentInByteOffset = 0;
this.cumulativeOffsetDifference = 0;
this.remainingRegions = new LinkedList<>(regions);
currentRegion = remainingRegions.remove();
if (currentRegion.start > 0) {
this.addOffCorrectMap(currentOutOffset, currentRegion.startOffset);
this.cumulative += currentRegion.startOffset;
this.currentInOffset = (int) this.input.skip(currentRegion.start);
this.addOffCorrectMap(currentOutCharOffset, currentRegion.start);
this.cumulativeOffsetDifference += currentRegion.start;
this.currentInByteOffset = currentRegion.start;
((ByteSeekableReader) this.input).position(currentInByteOffset);
}
}

Expand All @@ -58,39 +70,41 @@ public ExternalUtf8ContentFilter(Reader input, List<SourcePointer.Region> region
*/
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
if (currentInOffset == currentRegion.end) {
if (currentInByteOffset == currentRegion.end) {
return -1;
}

int numCharsRead = 0;
while (len - numCharsRead > 0) {
int charsRemainingInRegion = currentRegion.end - currentInOffset;
int bytesRemainingInRegion = currentRegion.end - currentInByteOffset;
int charsToRead = len - numCharsRead;
if (charsToRead > charsRemainingInRegion) {
charsToRead = charsRemainingInRegion;
if (charsToRead > bytesRemainingInRegion) {
charsToRead = bytesRemainingInRegion;
}

int read = this.input.read(cbuf, off, charsToRead);
if (read < 0) {
int charsRead = this.input.read(cbuf, off, charsToRead);
if (charsRead < 0) {
break;
}
correctOffsets(cbuf, off, read);
numCharsRead += read;
off += read;
while (Utf8.encodedLength(CharBuffer.wrap(cbuf, off, charsRead)) > bytesRemainingInRegion) {
charsRead--;
}
correctOffsets(cbuf, off, charsRead);
numCharsRead += charsRead;
off += charsRead;

if (currentInOffset == currentRegion.end) {
if (currentInByteOffset == currentRegion.end) {
if (remainingRegions.isEmpty()) {
break;
}
currentRegion = remainingRegions.remove();

cumulative = currentRegion.startOffset - currentOutOffset;
this.addOffCorrectMap(currentOutOffset, cumulative);
int toSkip = this.currentRegion.start - this.currentInOffset;
if (toSkip > 0) {
this.input.skip(this.currentRegion.start - this.currentInOffset);
cumulativeOffsetDifference = currentRegion.start - currentOutCharOffset;
this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference);
if (this.currentRegion.start > this.currentInByteOffset) {
this.currentInByteOffset = currentRegion.start;
}
this.currentInOffset = currentRegion.start;
((ByteSeekableReader) this.input).position(this.currentInByteOffset);
}
}
return numCharsRead > 0 ? numCharsRead : -1;
Expand All @@ -99,15 +113,15 @@ public int read(char[] cbuf, int off, int len) throws IOException {
private void correctOffsets(char[] cbuf, int off, int len) {
for (int i = off; i < off + len; i++) {
if (nextIsOffset) {
this.addOffCorrectMap(currentOutOffset, cumulative);
this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference);
nextIsOffset = false;
}
currentInOffset += 1;
currentOutOffset += 1;
currentOutCharOffset += 1;
int cp = Character.codePointAt(cbuf, i);
int increment = Utf8.encodedLength(cp) - 1;
if (increment > 0) {
cumulative += increment;
int encodedLen = Utf8.encodedLength(cp);
currentInByteOffset += encodedLen;
if (encodedLen > 1) {
cumulativeOffsetDifference += (encodedLen - 1);
nextIsOffset = true;
}
}
Expand Down
10 changes: 2 additions & 8 deletions src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
import com.github.dbmdz.solrocr.reader.FileSourceReader;
import com.github.dbmdz.solrocr.reader.MultiFileSourceReader;
import com.github.dbmdz.solrocr.reader.SourceReader;
import com.google.common.collect.ImmutableList;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
Expand Down Expand Up @@ -75,7 +75,7 @@ static Source parse(String pointer) {
throw new RuntimeException("Could not parse source pointer from '" + pointer + ".");
}
String target = m.group("target");
List<Region> regions = ImmutableList.of();
List<Region> regions = new ArrayList<>();
if (m.group("regions") != null) {
regions =
Arrays.stream(m.group("regions").split(","))
Expand Down Expand Up @@ -124,7 +124,6 @@ public static class Region {

public int start;
public int end;
public int startOffset = 0;

public static Region parse(String r) {
if (r.startsWith(":")) {
Expand All @@ -142,11 +141,6 @@ public Region(int start, int end) {
this.end = end;
}

public Region(int start, int end, int startOffset) {
this(start, end);
this.startOffset = startOffset;
}

@Override
public String toString() {
return start + ":" + end;
Expand Down
Loading

0 comments on commit 7623d2b

Please sign in to comment.