Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ExternalUtf8ContentFilterFactory: Don't convert pointer offsets to char #444

Merged
merged 2 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package com.github.dbmdz.solrocr.lucene.filters;

import com.github.dbmdz.solrocr.reader.StreamDecoder;
import java.io.IOException;
import java.io.Reader;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;

/**
 * A {@link Reader} that decodes UTF-8 text from a {@link SeekableByteChannel} and can be
 * repositioned to an arbitrary <strong>byte</strong> offset in that channel.
 *
 * <p>Repositioning replaces the decoder with a fresh one, so any bytes or partially decoded
 * sequences held by the previous decoder are discarded and decoding restarts at the new channel
 * position.
 */
public class ByteSeekableReader extends Reader {
  private final SeekableByteChannel channel;

  /** Decoder for the channel; replaced whenever the reader is repositioned. */
  private StreamDecoder decoder;

  /**
   * Create a reader that decodes UTF-8 from the channel, starting at the channel's current
   * position.
   *
   * @param channel channel to read bytes from; it is closed when this reader is closed
   */
  public ByteSeekableReader(SeekableByteChannel channel) {
    this.channel = channel;
    this.decoder = newUtf8Decoder();
  }

  /**
   * Read decoded characters into the given buffer by delegating to the current decoder.
   *
   * @param cbuf destination buffer
   * @param off index in {@code cbuf} at which to start storing characters
   * @param len maximum number of characters to read
   * @return whatever the decoder's {@code read} returns (presumably the number of characters
   *     read, or -1 at the end of the input, as for any {@link Reader} -- confirm against
   *     StreamDecoder)
   * @throws IOException if the decoder fails to read from the channel
   */
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    return this.decoder.read(cbuf, off, len);
  }

  /**
   * Return the current byte position in the underlying channel.
   *
   * <p>NOTE(review): this is the position of the channel itself; if the decoder reads ahead of
   * the characters it has handed out, this value will be larger than the byte offset of the last
   * returned character -- confirm against StreamDecoder before relying on it for offsets.
   *
   * @return the channel position in bytes
   * @throws IOException if the channel position cannot be determined
   * @throws ArithmeticException if the channel position does not fit into an {@code int}
   */
  public int position() throws IOException {
    // A plain (int) cast would silently wrap for positions beyond Integer.MAX_VALUE and hand the
    // caller a bogus (possibly negative) offset; fail loudly instead.
    return Math.toIntExact(this.channel.position());
  }

  /**
   * Reposition the reader to the given byte position.
   *
   * <p>This will also reset the decoder, so decoding restarts at {@code newPosition}.
   *
   * @param newPosition new byte position in the underlying channel
   * @throws IOException if the channel cannot be repositioned
   */
  public void position(int newPosition) throws IOException {
    this.channel.position(newPosition);
    // The old decoder's state no longer matches the channel position, so start with a fresh one.
    this.decoder = newUtf8Decoder();
  }

  /**
   * Close the underlying channel.
   *
   * @throws IOException if closing the channel fails
   */
  @Override
  public void close() throws IOException {
    this.channel.close();
  }

  /**
   * Build a fresh UTF-8 decoder that reads from the channel at its current position.
   *
   * <p>NOTE(review): -1 presumably requests the decoder's default buffer size -- confirm against
   * StreamDecoder.forDecoder.
   */
  private StreamDecoder newUtf8Decoder() {
    return StreamDecoder.forDecoder(this.channel, StandardCharsets.UTF_8.newDecoder(), -1);
  }
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package com.github.dbmdz.solrocr.lucene.filters;

import com.github.dbmdz.solrocr.model.SourcePointer;
import com.github.dbmdz.solrocr.model.SourcePointer.Region;
import com.github.dbmdz.solrocr.util.SourceAwareReader;
import com.github.dbmdz.solrocr.util.Utf8;
import com.google.common.collect.ImmutableList;
import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.channels.SeekableByteChannel;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
Expand All @@ -17,98 +20,133 @@ public class ExternalUtf8ContentFilter extends BaseCharFilter implements SourceA
* The cumulative offset difference between the input (bytes) and the output (chars) at the
* current position.
*
* <p>Used to calculate the <strong>byte</strong> offset in the input, given a
* <strong>char</strong> offset from the output of this filter.
*
* <pre>
* current actual byte offset in input = currentOutOffset + cumulative
* currentInputByteOffset = currentOutCharOffset + cumulativeOffsetDifference
* </pre>
*/
private int cumulative;
private int cumulativeOffsetDifference;

/** The current <strong>char</strong> offset in the full file; */
private int currentInOffset;
/**
* The current <strong>byte</strong> offset in the <strong>full</strong> input, i.e. the
* concatenated content of all files in the source pointer.
*/
private int currentInByteOffset;

/** The current <strong>char</strong> offset in the output. */
private int currentOutOffset;
/**
* The current <strong>char</strong> offset in the output, i.e. the concatenated and decoded
* content of all regions in the source pointer.
*/
private int currentOutCharOffset;

/** Source pointer of this reader, used for debugging and error reporting. */
private final String pointer;

private boolean nextIsOffset = false;
/** Whether the most recently processed code point was encoded with more than one byte. */
private boolean lastCharHadMultipleBytes = false;

private final Queue<SourcePointer.Region> remainingRegions;
private SourcePointer.Region currentRegion;

public ExternalUtf8ContentFilter(Reader input, List<SourcePointer.Region> regions, String pointer)
public ExternalUtf8ContentFilter(
SeekableByteChannel channel, List<SourcePointer.Region> regions, String pointer)
throws IOException {
super(input);
// We need to be able to reposition the underlying reader, so we use our own implementation
// based on a SeekableByteChannel.
super(new ByteSeekableReader(channel));
if (regions == null || regions.isEmpty()) {
regions = ImmutableList.of(new Region(0, (int) channel.size()));
}
this.pointer = pointer;
this.currentOutOffset = 0;
this.currentInOffset = 0;
this.cumulative = 0;
this.currentOutCharOffset = 0;
this.currentInByteOffset = 0;
this.cumulativeOffsetDifference = 0;
this.remainingRegions = new LinkedList<>(regions);
currentRegion = remainingRegions.remove();
if (currentRegion.start > 0) {
this.addOffCorrectMap(currentOutOffset, currentRegion.startOffset);
this.cumulative += currentRegion.startOffset;
this.currentInOffset = (int) this.input.skip(currentRegion.start);
this.addOffCorrectMap(currentOutCharOffset, currentRegion.start);
this.cumulativeOffsetDifference += currentRegion.start;
this.currentInByteOffset = currentRegion.start;
((ByteSeekableReader) this.input).position(currentInByteOffset);
}
}

/**
* Read <tt>len</tt> <tt>char</tt>s into <tt>cbuf</tt>, starting from character index <tt>off</tt>
* relative to the beginning of <tt>cbuf</tt> and return the number of <tt>char</tt>s read.
* Read <tt>requestedCharLen</tt> <tt>char</tt>s into <tt>outputBuffer</tt>, starting from
* character index <tt>outputCharOffset</tt> relative to the beginning of <tt>outputBuffer</tt>
* and return the number of <tt>char</tt>s read.
*
* <p>Keeps track of the current byte offset in the input and the current char offset in the
* output.
*/
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
if (currentInOffset == currentRegion.end) {
public int read(char[] outputBuffer, int outputCharOffset, int requestedCharLen)
throws IOException {
if (currentInByteOffset == currentRegion.end) {
return -1;
}

int numCharsRead = 0;
while (len - numCharsRead > 0) {
int charsRemainingInRegion = currentRegion.end - currentInOffset;
int charsToRead = len - numCharsRead;
if (charsToRead > charsRemainingInRegion) {
charsToRead = charsRemainingInRegion;
while (requestedCharLen - numCharsRead > 0) {
int bytesRemainingInRegion = currentRegion.end - currentInByteOffset;
int charsToRead = requestedCharLen - numCharsRead;
if (charsToRead > bytesRemainingInRegion) {
charsToRead = bytesRemainingInRegion;
}

int read = this.input.read(cbuf, off, charsToRead);
if (read < 0) {
int charsRead = this.input.read(outputBuffer, outputCharOffset, charsToRead);
if (charsRead < 0) {
break;
}
correctOffsets(cbuf, off, read);
numCharsRead += read;
off += read;
while (Utf8.encodedLength(CharBuffer.wrap(outputBuffer, outputCharOffset, charsRead))
> bytesRemainingInRegion) {
charsRead--;
}
correctOffsets(outputBuffer, outputCharOffset, charsRead);
numCharsRead += charsRead;
outputCharOffset += charsRead;

if (currentInOffset == currentRegion.end) {
if (currentInByteOffset == currentRegion.end) {
if (remainingRegions.isEmpty()) {
break;
}
currentRegion = remainingRegions.remove();

cumulative = currentRegion.startOffset - currentOutOffset;
this.addOffCorrectMap(currentOutOffset, cumulative);
int toSkip = this.currentRegion.start - this.currentInOffset;
if (toSkip > 0) {
this.input.skip(this.currentRegion.start - this.currentInOffset);
cumulativeOffsetDifference = currentRegion.start - currentOutCharOffset;
this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference);
if (this.currentRegion.start > this.currentInByteOffset) {
this.currentInByteOffset = currentRegion.start;
}
this.currentInOffset = currentRegion.start;
((ByteSeekableReader) this.input).position(this.currentInByteOffset);
}
}
return numCharsRead > 0 ? numCharsRead : -1;
}

private void correctOffsets(char[] cbuf, int off, int len) {
for (int i = off; i < off + len; i++) {
if (nextIsOffset) {
this.addOffCorrectMap(currentOutOffset, cumulative);
nextIsOffset = false;
/**
* Updates the current input and output offsets based on the characters read from the input.
*
* @param decodedChars Buffer of characters that were read from the input
* @param bufOffset Offset in decodedChars, where the stored characters start
* @param numChars Number of characters stored in decodedChars
*/
private void correctOffsets(char[] decodedChars, int bufOffset, int numChars) {
for (int i = bufOffset; i < bufOffset + numChars; ) {
if (lastCharHadMultipleBytes) {
this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference);
lastCharHadMultipleBytes = false;
}
currentInOffset += 1;
currentOutOffset += 1;
int cp = Character.codePointAt(cbuf, i);
int increment = Utf8.encodedLength(cp) - 1;
if (increment > 0) {
cumulative += increment;
nextIsOffset = true;
int cp = Character.codePointAt(decodedChars, i);
int encodedLen = Utf8.encodedLength(cp);
int charLen = Character.charCount(cp);
i += charLen;
currentOutCharOffset += charLen;
currentInByteOffset += encodedLen;
if (encodedLen > 1) {
cumulativeOffsetDifference += (encodedLen - 1);
lastCharHadMultipleBytes = true;
}
}
}
Expand Down
10 changes: 2 additions & 8 deletions src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
import com.github.dbmdz.solrocr.reader.FileSourceReader;
import com.github.dbmdz.solrocr.reader.MultiFileSourceReader;
import com.github.dbmdz.solrocr.reader.SourceReader;
import com.google.common.collect.ImmutableList;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
Expand Down Expand Up @@ -75,7 +75,7 @@ static Source parse(String pointer) {
throw new RuntimeException("Could not parse source pointer from '" + pointer + ".");
}
String target = m.group("target");
List<Region> regions = ImmutableList.of();
List<Region> regions = new ArrayList<>();
if (m.group("regions") != null) {
regions =
Arrays.stream(m.group("regions").split(","))
Expand Down Expand Up @@ -124,7 +124,6 @@ public static class Region {

public int start;
public int end;
public int startOffset = 0;

public static Region parse(String r) {
if (r.startsWith(":")) {
Expand All @@ -142,11 +141,6 @@ public Region(int start, int end) {
this.end = end;
}

public Region(int start, int end, int startOffset) {
this(start, end);
this.startOffset = startOffset;
}

@Override
public String toString() {
return start + ":" + end;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
import java.nio.channels.SeekableByteChannel;

/** API for reading data from a source. */
public interface SourceReader {
public interface SourceReader extends AutoCloseable {
/** Close the resources associated with this reader. */
@Override
void close() throws IOException;

/** Get the pointer this reader is reading from. */
Expand Down
Loading
Loading