diff --git a/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ByteSeekableReader.java b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ByteSeekableReader.java new file mode 100644 index 00000000..e6e3a44e --- /dev/null +++ b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ByteSeekableReader.java @@ -0,0 +1,46 @@ +package com.github.dbmdz.solrocr.lucene.filters; + +import com.github.dbmdz.solrocr.reader.StreamDecoder; +import java.io.IOException; +import java.io.Reader; +import java.nio.channels.SeekableByteChannel; +import java.nio.charset.StandardCharsets; + +/** + * A Reader implementation that reads from a SeekableByteChannel and allows repositioning the + * reader. + */ +public class ByteSeekableReader extends Reader { + private final SeekableByteChannel channel; + private StreamDecoder decoder; + + public ByteSeekableReader(SeekableByteChannel channel) { + this.channel = channel; + this.decoder = StreamDecoder.forDecoder(channel, StandardCharsets.UTF_8.newDecoder(), -1); + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + return this.decoder.read(cbuf, off, len); + } + + /** Return the current byte position in the underlying channel. */ + public int position() throws IOException { + return (int) this.channel.position(); + } + + /** + * Reposition the reader to the given byte position. + * + *

This will also reset the decoder. + */ + public void position(int newPosition) throws IOException { + this.channel.position(newPosition); + this.decoder = StreamDecoder.forDecoder(channel, StandardCharsets.UTF_8.newDecoder(), -1); + } + + @Override + public void close() throws IOException { + this.channel.close(); + } +} diff --git a/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java index 3570446f..b790fdab 100644 --- a/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java +++ b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/ExternalUtf8ContentFilter.java @@ -1,10 +1,13 @@ package com.github.dbmdz.solrocr.lucene.filters; import com.github.dbmdz.solrocr.model.SourcePointer; +import com.github.dbmdz.solrocr.model.SourcePointer.Region; import com.github.dbmdz.solrocr.util.SourceAwareReader; import com.github.dbmdz.solrocr.util.Utf8; +import com.google.common.collect.ImmutableList; import java.io.IOException; -import java.io.Reader; +import java.nio.CharBuffer; +import java.nio.channels.SeekableByteChannel; import java.util.LinkedList; import java.util.List; import java.util.Optional; @@ -17,98 +20,132 @@ public class ExternalUtf8ContentFilter extends BaseCharFilter implements SourceA * The cumulative offset difference between the input (bytes) and the output (chars) at the * current position. * + *

Used to calculate the byte offset in the input, given a + * char offset from the output of this filter. + * *

-   * current actual byte offset in input = currentOutOffset + cumulative
+   * currentInputByteOffset = currentOutCharOffset + cumulativeOffsetDifference
    * 
*/ - private int cumulative; + private int cumulativeOffsetDifference; - /** The current char offset in the full file; */ - private int currentInOffset; + /** + * The current byte offset in the full input, i.e. the + * concatenated content of all files in the source pointer. + */ + private int currentInByteOffset; - /** The current char offset in the output. */ - private int currentOutOffset; + /** + * The current char offset in the output, i.e. the concatenated and decoded + * content of all regions in the source pointer. + */ + private int currentOutCharOffset; /** Source pointer of this reader, used for debugging and error reporting. */ private final String pointer; - private boolean nextIsOffset = false; + /** Whether the last seen character had more than 1 byte for a char */ + private boolean lastCharHadMultipleBytes = false; + private final Queue remainingRegions; private SourcePointer.Region currentRegion; - public ExternalUtf8ContentFilter(Reader input, List regions, String pointer) + public ExternalUtf8ContentFilter( + SeekableByteChannel channel, List regions, String pointer) throws IOException { - super(input); + // We need to be able to reposition the underlying reader, so we use our own implementation + // based on a SeekableByteChannel. + super(new ByteSeekableReader(channel)); + if (regions == null || regions.isEmpty()) { + regions = ImmutableList.of(new Region(0, (int) channel.size())); + } this.pointer = pointer; - this.currentOutOffset = 0; - this.currentInOffset = 0; - this.cumulative = 0; + this.currentOutCharOffset = 0; + this.currentInByteOffset = 0; + this.cumulativeOffsetDifference = 0; this.remainingRegions = new LinkedList<>(regions); currentRegion = remainingRegions.remove(); if (currentRegion.start > 0) { - this.addOffCorrectMap(currentOutOffset, currentRegion.startOffset); - this.cumulative += currentRegion.startOffset; - this.currentInOffset = (int) this.input.skip(currentRegion.start); + this.addOffCorrectMap(currentOutCharOffset, currentRegion.start); + this.cumulativeOffsetDifference += currentRegion.start; + this.currentInByteOffset = currentRegion.start; + ((ByteSeekableReader) this.input).position(currentInByteOffset); } } /** - * Read len chars into cbuf, starting from character index off - * relative to the beginning of cbuf and return the number of chars read. + * Read requestedCharLen chars into outputBuffer, starting from + * character index outputCharOffset relative to the beginning of outputBuffer + * and return the number of chars read. + * + *

Keeps track of the current byte offset in the input and the current char offset in the + * output. */ @Override - public int read(char[] cbuf, int off, int len) throws IOException { - if (currentInOffset == currentRegion.end) { + public int read(char[] outputBuffer, int outputCharOffset, int requestedCharLen) + throws IOException { + if (currentInByteOffset == currentRegion.end) { return -1; } int numCharsRead = 0; - while (len - numCharsRead > 0) { - int charsRemainingInRegion = currentRegion.end - currentInOffset; - int charsToRead = len - numCharsRead; - if (charsToRead > charsRemainingInRegion) { - charsToRead = charsRemainingInRegion; + while (requestedCharLen - numCharsRead > 0) { + int bytesRemainingInRegion = currentRegion.end - currentInByteOffset; + int charsToRead = requestedCharLen - numCharsRead; + if (charsToRead > bytesRemainingInRegion) { + charsToRead = bytesRemainingInRegion; } - int read = this.input.read(cbuf, off, charsToRead); - if (read < 0) { + int charsRead = this.input.read(outputBuffer, outputCharOffset, charsToRead); + if (charsRead < 0) { break; } - correctOffsets(cbuf, off, read); - numCharsRead += read; - off += read; + while (Utf8.encodedLength(CharBuffer.wrap(outputBuffer, outputCharOffset, charsRead)) + > bytesRemainingInRegion) { + charsRead--; + } + correctOffsets(outputBuffer, outputCharOffset, charsRead); + numCharsRead += charsRead; + outputCharOffset += charsRead; - if (currentInOffset == currentRegion.end) { + if (currentInByteOffset == currentRegion.end) { if (remainingRegions.isEmpty()) { break; } currentRegion = remainingRegions.remove(); - cumulative = currentRegion.startOffset - currentOutOffset; - this.addOffCorrectMap(currentOutOffset, cumulative); - int toSkip = this.currentRegion.start - this.currentInOffset; - if (toSkip > 0) { - this.input.skip(this.currentRegion.start - this.currentInOffset); + cumulativeOffsetDifference = currentRegion.start - currentOutCharOffset; + this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference); + if (this.currentRegion.start > this.currentInByteOffset) { + this.currentInByteOffset = currentRegion.start; } - this.currentInOffset = currentRegion.start; + ((ByteSeekableReader) this.input).position(this.currentInByteOffset); } } return numCharsRead > 0 ? numCharsRead : -1; } - private void correctOffsets(char[] cbuf, int off, int len) { - for (int i = off; i < off + len; i++) { - if (nextIsOffset) { - this.addOffCorrectMap(currentOutOffset, cumulative); - nextIsOffset = false; + /** + * Updates the current input and output offsets based on the characters read from the input. + * + * @param decodedChars Buffer of characters that were read from the input + * @param bufOffset Offset in decodedChars, where the stored characters start + * @param numChars Number of characters stored in decodedChars + */ + private void correctOffsets(char[] decodedChars, int bufOffset, int numChars) { + for (int i = bufOffset; i < bufOffset + numChars; ) { + if (lastCharHadMultipleBytes) { + this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference); + lastCharHadMultipleBytes = false; } - currentInOffset += 1; - currentOutOffset += 1; - int cp = Character.codePointAt(cbuf, i); - int increment = Utf8.encodedLength(cp) - 1; - if (increment > 0) { - cumulative += increment; - nextIsOffset = true; + currentOutCharOffset += 1; + int cp = Character.codePointAt(decodedChars, i); + i += Character.charCount(cp); + int encodedLen = Utf8.encodedLength(cp); + currentInByteOffset += encodedLen; + if (encodedLen > 1) { + cumulativeOffsetDifference += (encodedLen - 1); + lastCharHadMultipleBytes = true; } } } diff --git a/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java b/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java index fd916695..22c347a3 100644 --- a/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java +++ b/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java @@ -3,13 +3,13 @@ import com.github.dbmdz.solrocr.reader.FileSourceReader; import com.github.dbmdz.solrocr.reader.MultiFileSourceReader; import com.github.dbmdz.solrocr.reader.SourceReader; -import com.google.common.collect.ImmutableList; import java.io.FileNotFoundException; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.List; @@ -75,7 +75,7 @@ static Source parse(String pointer) { throw new RuntimeException("Could not parse source pointer from '" + pointer + "."); } String target = m.group("target"); - List regions = ImmutableList.of(); + List regions = new ArrayList<>(); if (m.group("regions") != null) { regions = Arrays.stream(m.group("regions").split(",")) @@ -124,7 +124,6 @@ public static class Region { public int start; public int end; - public int startOffset = 0; public static Region parse(String r) { if (r.startsWith(":")) { @@ -142,11 +141,6 @@ public Region(int start, int end) { this.end = end; } - public Region(int start, int end, int startOffset) { - this(start, end); - this.startOffset = startOffset; - } - @Override public String toString() { return start + ":" + end; diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java index 0a996396..d25fdac4 100644 --- a/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java +++ b/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java @@ -6,8 +6,9 @@ import java.nio.channels.SeekableByteChannel; /** API for reading data from a source. */ -public interface SourceReader { +public interface SourceReader extends AutoCloseable { /** Close the resources associated with this reader. */ + @Override void close() throws IOException; /** Get the pointer this reader is reading from. */ diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/StreamDecoder.java b/src/main/java/com/github/dbmdz/solrocr/reader/StreamDecoder.java new file mode 100644 index 00000000..a23f30cf --- /dev/null +++ b/src/main/java/com/github/dbmdz/solrocr/reader/StreamDecoder.java @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2001, 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package com.github.dbmdz.solrocr.reader; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.ReadableByteChannel; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; + +/** + * Vendored version of {@link sun.nio.cs.StreamDecoder} from OpenJDK 22+36. + * + *

Vendoring was necessary due to package-private constructors that don't allow subclassing. + */ +public class StreamDecoder extends Reader { + + private static final int MIN_BYTE_BUFFER_SIZE = 32; + private static final int DEFAULT_BYTE_BUFFER_SIZE = 8192; + + private volatile boolean closed; + + private void ensureOpen() throws IOException { + if (closed) throw new IOException("Stream closed"); + } + + // In order to handle surrogates properly we must never try to produce + // fewer than two characters at a time. If we're only asked to return one + // character then the other is saved here to be returned later. + // + private boolean haveLeftoverChar = false; + private char leftoverChar; + + // Factories for java.io.InputStreamReader + + public static StreamDecoder forInputStreamReader(InputStream in, Object lock, String charsetName) + throws UnsupportedEncodingException { + try { + return new StreamDecoder(in, lock, Charset.forName(charsetName)); + } catch (IllegalCharsetNameException | UnsupportedCharsetException x) { + throw new UnsupportedEncodingException(charsetName); + } + } + + public static StreamDecoder forInputStreamReader(InputStream in, Object lock, Charset cs) { + return new StreamDecoder(in, lock, cs); + } + + public static StreamDecoder forInputStreamReader( + InputStream in, Object lock, CharsetDecoder dec) { + return new StreamDecoder(in, lock, dec); + } + + // Factory for java.nio.channels.Channels.newReader + + public static StreamDecoder forDecoder( + ReadableByteChannel ch, CharsetDecoder dec, int minBufferCap) { + return new StreamDecoder(ch, dec, minBufferCap); + } + + // -- Public methods corresponding to those in InputStreamReader -- + + // All synchronization and state/argument checking is done in these public + // methods; the concrete stream-decoder subclasses defined below need not + // do any such checking. + + public String getEncoding() { + if (isOpen()) return encodingName(); + return null; + } + + public int read() throws IOException { + return read0(); + } + + private int read0() throws IOException { + Object lock = this.lock; + synchronized (lock) { + return lockedRead0(); + } + } + + @SuppressWarnings("fallthrough") + private int lockedRead0() throws IOException { + // Return the leftover char, if there is one + if (haveLeftoverChar) { + haveLeftoverChar = false; + return leftoverChar; + } + + // Convert more bytes + char[] cb = new char[2]; + int n = read(cb, 0, 2); + switch (n) { + case -1: + return -1; + case 2: + leftoverChar = cb[1]; + haveLeftoverChar = true; + // FALL THROUGH + case 1: + return cb[0]; + default: + assert false : n; + return -1; + } + } + + public int read(char[] cbuf, int offset, int length) throws IOException { + Object lock = this.lock; + synchronized (lock) { + return lockedRead(cbuf, offset, length); + } + } + + private int lockedRead(char[] cbuf, int offset, int length) throws IOException { + int off = offset; + int len = length; + + ensureOpen(); + if ((off < 0) + || (off > cbuf.length) + || (len < 0) + || ((off + len) > cbuf.length) + || ((off + len) < 0)) { + throw new IndexOutOfBoundsException(); + } + if (len == 0) return 0; + + int n = 0; + + if (haveLeftoverChar) { + // Copy the leftover char into the buffer + cbuf[off] = leftoverChar; + off++; + len--; + haveLeftoverChar = false; + n = 1; + if ((len == 0) || !implReady()) + // Return now if this is all we can produce w/o blocking + return n; + } + + if (len == 1) { + // Treat single-character array reads just like read() + int c = read0(); + if (c == -1) return (n == 0) ? -1 : n; + cbuf[off] = (char) c; + return n + 1; + } + + // Read remaining characters + int nr = implRead(cbuf, off, off + len); + + // At this point, n is either 1 if a leftover character was read, + // or 0 if no leftover character was read. If n is 1 and nr is -1, + // indicating EOF, then we don't return their sum as this loses data. + return (nr < 0) ? (n == 1 ? 1 : nr) : (n + nr); + } + + public boolean ready() throws IOException { + Object lock = this.lock; + synchronized (lock) { + return lockedReady(); + } + } + + private boolean lockedReady() throws IOException { + ensureOpen(); + return haveLeftoverChar || implReady(); + } + + public void close() throws IOException { + Object lock = this.lock; + synchronized (lock) { + lockedClose(); + } + } + + private void lockedClose() throws IOException { + if (closed) return; + try { + implClose(); + } finally { + closed = true; + } + } + + private boolean isOpen() { + return !closed; + } + + public void fillZeroToPosition() throws IOException { + Object lock = this.lock; + synchronized (lock) { + lockedFillZeroToPosition(); + } + } + + private void lockedFillZeroToPosition() { + Arrays.fill(bb.array(), bb.arrayOffset(), bb.arrayOffset() + bb.position(), (byte) 0); + } + + // -- Charset-based stream decoder impl -- + + private final Charset cs; + private final CharsetDecoder decoder; + private final ByteBuffer bb; + + // Exactly one of these is non-null + private final InputStream in; + private final ReadableByteChannel ch; + + StreamDecoder(InputStream in, Object lock, Charset cs) { + this( + in, + lock, + cs.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE)); + } + + StreamDecoder(InputStream in, Object lock, CharsetDecoder dec) { + super(lock); + this.cs = dec.charset(); + this.decoder = dec; + this.in = in; + this.ch = null; + this.bb = ByteBuffer.allocate(DEFAULT_BYTE_BUFFER_SIZE); + bb.flip(); // So that bb is initially empty + } + + StreamDecoder(ReadableByteChannel ch, CharsetDecoder dec, int mbc) { + this.in = null; + this.ch = ch; + this.decoder = dec; + this.cs = dec.charset(); + this.bb = + ByteBuffer.allocate( + mbc < 0 + ? DEFAULT_BYTE_BUFFER_SIZE + : (mbc < MIN_BYTE_BUFFER_SIZE ? MIN_BYTE_BUFFER_SIZE : mbc)); + bb.flip(); + } + + private int readBytes() throws IOException { + bb.compact(); + try { + if (ch != null) { + // Read from the channel + int n = ch.read(bb); + if (n < 0) return n; + } else { + // Read from the input stream, and then update the buffer + int lim = bb.limit(); + int pos = bb.position(); + assert (pos <= lim); + int rem = (pos <= lim ? lim - pos : 0); + int n = in.read(bb.array(), bb.arrayOffset() + pos, rem); + if (n < 0) return n; + if (n == 0) throw new IOException("Underlying input stream returned zero bytes"); + assert (n <= rem) : "n = " + n + ", rem = " + rem; + bb.position(pos + n); + } + } finally { + // Flip even when an IOException is thrown, + // otherwise the stream will stutter + bb.flip(); + } + + int rem = bb.remaining(); + assert (rem != 0) : rem; + return rem; + } + + int implRead(char[] cbuf, int off, int end) throws IOException { + + // In order to handle surrogate pairs, this method requires that + // the invoker attempt to read at least two characters. Saving the + // extra character, if any, at a higher level is easier than trying + // to deal with it here. + assert (end - off > 1); + + CharBuffer cb = CharBuffer.wrap(cbuf, off, end - off); + if (cb.position() != 0) { + // Ensure that cb[0] == cbuf[off] + cb = cb.slice(); + } + + boolean eof = false; + for (; ; ) { + CoderResult cr = decoder.decode(bb, cb, eof); + if (cr.isUnderflow()) { + if (eof) break; + if (!cb.hasRemaining()) break; + if ((cb.position() > 0) && !inReady()) break; // Block at most once + int n = readBytes(); + if (n < 0) { + eof = true; + if ((cb.position() == 0) && (!bb.hasRemaining())) break; + } + continue; + } + if (cr.isOverflow()) { + assert cb.position() > 0; + break; + } + cr.throwException(); + } + + if (eof) { + // ## Need to flush decoder + decoder.reset(); + } + + if (cb.position() == 0) { + if (eof) { + return -1; + } + assert false; + } + return cb.position(); + } + + String encodingName() { + return cs.name(); + } + + private boolean inReady() { + try { + return (((in != null) && (in.available() > 0)) + || (ch instanceof FileChannel)); // ## RBC.available()? + } catch (IOException x) { + return false; + } + } + + boolean implReady() { + return bb.hasRemaining() || inReady(); + } + + void implClose() throws IOException { + if (ch != null) { + ch.close(); + } else { + in.close(); + } + } +} diff --git a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java index 385eb319..8b4ce919 100644 --- a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java +++ b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java @@ -2,21 +2,18 @@ import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter; import com.github.dbmdz.solrocr.model.SourcePointer; +import com.github.dbmdz.solrocr.model.SourcePointer.Region; import com.github.dbmdz.solrocr.model.SourcePointer.Source; import com.github.dbmdz.solrocr.model.SourcePointer.SourceType; import com.github.dbmdz.solrocr.reader.SourceReader; -import com.github.dbmdz.solrocr.util.Utf8; -import com.google.common.collect.ImmutableList; -import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.nio.ByteBuffer; -import java.nio.channels.Channels; import java.nio.channels.SeekableByteChannel; -import java.nio.charset.StandardCharsets; import java.nio.file.Paths; +import java.util.Arrays; import java.util.List; import java.util.Locale; import java.util.Map; @@ -63,11 +60,6 @@ public Reader create(Reader input) { } pointer.sources.forEach(this::validateSource); - // Regions contained in source pointers are defined by byte offsets. - // We need to convert these to Java character offsets, so they can be used by the filter. - // This is very expensive, but we need this since all IO from here on out is character-based. - toCharOffsets(pointer); - if (pointer.sources.isEmpty()) { throw new RuntimeException( "No source files could be determined from pointer. " @@ -75,12 +67,13 @@ public Reader create(Reader input) { + "Pointer was: " + ptrStr); } - Reader r = - Channels.newReader( - pointer.getReader(512 * 1024, 0).getByteChannel(), StandardCharsets.UTF_8.name()); - List charRegions = + adjustRegions(pointer); + // Section size and cache size dont't matter, since we don't use sectioned reads during + // indexing. + SourceReader r = pointer.getReader(512 * 1024, 0); + List regions = pointer.sources.stream().flatMap(s -> s.regions.stream()).collect(Collectors.toList()); - return new ExternalUtf8ContentFilter(new BufferedReader(r), charRegions, ptrStr); + return new ExternalUtf8ContentFilter(r.getByteChannel(), regions, ptrStr); } catch (IOException e) { throw new RuntimeException( String.format( @@ -107,109 +100,48 @@ private void validateSource(Source src) { } } - private static long getUtf8DecodedLength(SeekableByteChannel chan, ByteBuffer buf, long numBytes) - throws IOException { - long numRead = 0; - long decodedLength = 0; - while (numRead < numBytes) { - if (buf.remaining() > (numBytes - numRead)) { - buf.limit((int) (numBytes - numRead)); - } - int read = chan.read(buf); - if (read < 0) { - break; - } - numRead += read; - buf.flip(); - decodedLength += Utf8.decodedLength(buf); - buf.clear(); - } - if (numRead < numBytes) { - throw new IOException( - String.format( - Locale.US, - "Read fewer bytes than expected (%d vs %d), check your source pointer!", - numRead, - numBytes)); - } - return decodedLength; - } - - private void toCharOffsets(SourcePointer ptr) throws IOException { - int byteOffset = 0; - int charOffset = 0; - ByteBuffer buf = ByteBuffer.allocateDirect(1024 * 1024 /* 1 MiB */); - // TODO: Use a queue for the file sources so we don't have to read until the end of the last - // file every time - // TODO: Think about building the UTF8 -> UTF16 offset map right here if the mapping part should - // become a bottle neck + /** + * Adjust regions to account for UTF BOM, if present, and to make them relative to the + * concatenated inputs. + * + *

UTF8-encoded files may contain a 3 byte byte-order-marker at the beginning of the file. Its + * use is discouraged and not needed for UTF8 (since the byte order is pre-defined), but we've + * encountered OCR files in the wild that have it, so we check for it and adjust regions starting + * on the beginning of the file to account for it. + */ + private void adjustRegions(SourcePointer ptr) throws IOException { + int outByteOffset = 0; + byte[] bomBuf = new byte[3]; for (SourcePointer.Source src : ptr.sources) { - SourceReader reader = src.getReader(512 * 1024, 0); - try { - SeekableByteChannel chan = reader.getByteChannel(); - final int size = (int) chan.size(); + // Again, section size and cache size don't matter, since we don't use sectioned reads during + // indexing. + try (SourceReader reader = src.getReader(512, 0)) { + int inputLen = reader.length(); - int bomOffset = 0; - if (!src.isAscii) { - // Check for BOM without modifying channel position, we need to skip it as to not break - // mult-file parsing - ByteBuffer bomBuf = ByteBuffer.allocate(3); - chan.read(bomBuf); + if (src.regions.isEmpty()) { + src.regions.add(new Region(0, inputLen)); + } + + if (!src.isAscii && src.regions.stream().anyMatch(r -> r.start == 0)) { + SeekableByteChannel chan = reader.getByteChannel(); + chan.read(ByteBuffer.wrap(bomBuf)); chan.position(0); - bomBuf.flip(); - if (bomBuf.equals(ByteBuffer.wrap(new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}))) { - bomOffset = 3; + boolean hasBOM = + Arrays.equals(bomBuf, new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}); + if (hasBOM) { + src.regions.stream().filter(r -> r.start == 0).forEach(r -> r.start += 3); } } - // Byte offset of the current file from the beginning of the first file - final int baseOffset = byteOffset; - if (src.regions.isEmpty()) { - src.regions = ImmutableList.of(new SourcePointer.Region(0, size)); - } - for (SourcePointer.Region region : src.regions) { - if (src.isAscii) { - // Optimization for pure-ASCII sources, where we don't need to do any mapping - region.start += baseOffset; - region.end = Math.min(region.end + baseOffset, size + baseOffset); - continue; - } - if (region.start == 0) { - // Skip the BOM at the start of a file, if present - region.start += bomOffset; - } - // Make region offsets relative to the beginning of the first file - region.start += baseOffset; - if (region.end < 0) { - region.end = size; + for (Region region : src.regions) { + if (region.end == -1) { + region.end = inputLen; } - region.end = Math.min(region.end + baseOffset, size + baseOffset); - // Read until the start of the region - if (byteOffset != region.start) { - // Read the data between the current offset and the start of the region - int len = region.start - byteOffset; - charOffset += (int) getUtf8DecodedLength(chan, buf, len); - byteOffset += len; - } - - int regionSize = region.end - region.start; - region.start = charOffset; - region.startOffset = byteOffset; - // Read region, determine character offset of region end - charOffset += (int) getUtf8DecodedLength(chan, buf, regionSize); - byteOffset += regionSize; - region.end = charOffset; + region.start += outByteOffset; + region.end += outByteOffset; } - // Determine character offset of the end of the file - if (src.isAscii) { - byteOffset += size; - } else if (byteOffset != baseOffset + size) { - int len = (baseOffset + size) - byteOffset; - charOffset += (int) getUtf8DecodedLength(chan, buf, len); - byteOffset += len; - } - } finally { - reader.close(); + + outByteOffset += inputLen; } } } diff --git a/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java b/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java index ce8b4d50..db3463b6 100644 --- a/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java @@ -6,8 +6,6 @@ import com.github.dbmdz.solrocr.model.SourcePointer.Region; import com.github.dbmdz.solrocr.util.Utf8; import com.google.common.collect.ImmutableList; -import java.io.BufferedReader; -import java.io.FileReader; import java.io.IOException; import java.io.StringReader; import java.nio.ByteBuffer; @@ -57,7 +55,7 @@ public void extractFully() throws IOException { Path p = Paths.get("src/test/resources/data/hocr.html"); CharFilter filter = new ExternalUtf8ContentFilter( - new BufferedReader(new FileReader(p.toFile())), + Files.newByteChannel(p), ImmutableList.of(new Region(0, (int) p.toFile().length())), p.toString()); String full = new String(Files.readAllBytes(p), StandardCharsets.UTF_8);