Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't fail the whole request when a source pointer can't be read. #424

Merged
merged 1 commit into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.google.common.collect.ImmutableList;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
Expand All @@ -12,8 +13,11 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SourcePointer {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

public static class FileSource {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,14 @@ private void addOcrSnippets(
SimpleOrderedMap<Object> docMap = (SimpleOrderedMap<Object>) out.get(docId);
if (docMap == null) {
docMap = new SimpleOrderedMap<>();
out.add(docId, docMap);
}
if (ocrSnippets[k] == null) {
continue;
}
docMap.addAll(ocrSnippets[k].toNamedList());
if (docMap.size() > 0) {
out.add(docId, docMap);
}
}
}

Expand Down
10 changes: 8 additions & 2 deletions src/main/java/solrocr/OcrHighlighter.java
Original file line number Diff line number Diff line change
Expand Up @@ -528,14 +528,20 @@ protected List<IterableCharSequence[]> loadOcrFieldValues(
ocrVals[fieldIdx] = IterableCharSequence.fromString(fieldValue);
continue;
}
SourcePointer sourcePointer = SourcePointer.parse(fieldValue);
SourcePointer sourcePointer = null;
try {
sourcePointer = SourcePointer.parse(fieldValue);
} catch (RuntimeException e) {
log.error("Could not parse OCR pointer for document {}: {}", docId, fieldValue, e);
}
if (sourcePointer == null) {
// None of the files in the pointer exist or were readable, log should have warnings
ocrVals[fieldIdx] = null;
continue;
}
// If preloading is enabled, start warming the cache for the pointer
PageCacheWarmer.getInstance().ifPresent(w -> w.preload(sourcePointer));
final SourcePointer finalPtr = sourcePointer;
PageCacheWarmer.getInstance().ifPresent(w -> w.preload(finalPtr));
schmika marked this conversation as resolved.
Show resolved Hide resolved
if (sourcePointer.sources.size() == 1) {
ocrVals[fieldIdx] =
new FileBytesCharIterator(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ public void testCombinedHighlightingWorks() throws Exception {
"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea <em>commodo consequat</em>. Duis aute irure dolor in "
+ "reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. ");
NamedList<?> ocrHls = (NamedList<?>) resp.getResponse().get("ocrHighlighting");
assertEquals(2, ocrHls.size());
assertEquals(1, ocrHls.size());
assertEquals(1, ((NamedList<?>) ocrHls.get("31337")).size());
}
}
30 changes: 27 additions & 3 deletions src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -229,13 +229,12 @@ public void testMaskedDocumentIsIndexed() {
@Test
public void testHighlightingTimeout() {
// This test can only check for the worst case, since checking for partial results is unlikely
// to be stable across
// multiple environments due to timing issues.
// to be stable across multiple environments due to timing issues.
SolrQueryRequest req = xmlQ("q", "Vögelchen", "hl.ocr.timeAllowed", "1");
assertQ(
req,
"//bool[@name='partialOcrHighlights']='true'",
"count(//lst[@name='ocrHighlighting']/lst)=2",
"count(//lst[@name='ocrHighlighting']/lst)=0",
"count(//arr[@name='snippets'])=0");
}

Expand Down Expand Up @@ -646,4 +645,29 @@ public void testHlQParserParam() {
"count(//arr[@name='snippets']/lst)='1'",
"contains(//arr[@name='snippets']/lst/str[@name='text'], '<em>Nathanael Brush</em>')");
}

public void testMissingFileDoesNotFailWholeQuery() throws IOException {
// Create a copy of of a document, with the OCR residing in a temporary directory
Path tmpDir = createTempDir();
Files.copy(Paths.get("src/test/resources/data/hocr.html"), tmpDir.resolve("hocr.html"));
assertU(
adoc("ocr_text", tmpDir.resolve("hocr.html").toAbsolutePath().toString(), "id", "999999"));
assertU(commit());

// With indexing complete, we delete the referenced hOCR in order to cause an error during
// highlighting
Files.delete(tmpDir.resolve("hocr.html"));
Files.delete(tmpDir);

try {
SolrQueryRequest req = xmlQ("q", "ocr_text:Nedereien");
assertQ(
req,
"count(//lst[@name='ocrHighlighting']/lst)=1",
"count(//lst[@name='ocrHighlighting']/lst[@name='999999'])=0");
} finally {
assertU(delI("999999"));
assertU(commit());
}
}
}
Loading