diff --git a/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java b/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java index b4cec1b0..3265f735 100644 --- a/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java +++ b/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java @@ -3,6 +3,7 @@ import com.google.common.collect.ImmutableList; import java.io.FileNotFoundException; import java.io.IOException; +import java.lang.invoke.MethodHandles; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; @@ -12,8 +13,11 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class SourcePointer { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); public static class FileSource { diff --git a/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java b/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java index adcb9d89..34583e67 100644 --- a/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java +++ b/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java @@ -100,12 +100,14 @@ private void addOcrSnippets( SimpleOrderedMap docMap = (SimpleOrderedMap) out.get(docId); if (docMap == null) { docMap = new SimpleOrderedMap<>(); - out.add(docId, docMap); } if (ocrSnippets[k] == null) { continue; } docMap.addAll(ocrSnippets[k].toNamedList()); + if (docMap.size() > 0) { + out.add(docId, docMap); + } } } diff --git a/src/main/java/solrocr/OcrHighlighter.java b/src/main/java/solrocr/OcrHighlighter.java index 78436d18..7b3c3e40 100644 --- a/src/main/java/solrocr/OcrHighlighter.java +++ b/src/main/java/solrocr/OcrHighlighter.java @@ -528,14 +528,20 @@ protected List loadOcrFieldValues( ocrVals[fieldIdx] = IterableCharSequence.fromString(fieldValue); continue; } - SourcePointer sourcePointer = 
SourcePointer.parse(fieldValue); + SourcePointer sourcePointer = null; + try { + sourcePointer = SourcePointer.parse(fieldValue); + } catch (RuntimeException e) { + log.error("Could not parse OCR pointer for document {}: {}", docId, fieldValue, e); + } if (sourcePointer == null) { // None of the files in the pointer exist or were readable, log should have warnings ocrVals[fieldIdx] = null; continue; } // If preloading is enabled, start warming the cache for the pointer - PageCacheWarmer.getInstance().ifPresent(w -> w.preload(sourcePointer)); + final SourcePointer finalPtr = sourcePointer; + PageCacheWarmer.getInstance().ifPresent(w -> w.preload(finalPtr)); if (sourcePointer.sources.size() == 1) { ocrVals[fieldIdx] = new FileBytesCharIterator( diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java index 4a4705dd..751797f4 100644 --- a/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java @@ -140,7 +140,7 @@ public void testCombinedHighlightingWorks() throws Exception { "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in " + "reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
"); NamedList ocrHls = (NamedList) resp.getResponse().get("ocrHighlighting"); - assertEquals(2, ocrHls.size()); + assertEquals(1, ocrHls.size()); assertEquals(1, ((NamedList) ocrHls.get("31337")).size()); } } diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java index 222390bc..6df5d71e 100644 --- a/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java @@ -229,13 +229,12 @@ public void testMaskedDocumentIsIndexed() { @Test public void testHighlightingTimeout() { // This test can only check for the worst case, since checking for partial results is unlikely - // to be stable across - // multiple environments due to timing issues. + // to be stable across multiple environments due to timing issues. SolrQueryRequest req = xmlQ("q", "Vögelchen", "hl.ocr.timeAllowed", "1"); assertQ( req, "//bool[@name='partialOcrHighlights']='true'", - "count(//lst[@name='ocrHighlighting']/lst)=2", + "count(//lst[@name='ocrHighlighting']/lst)=0", "count(//arr[@name='snippets'])=0"); } @@ -646,4 +645,29 @@ public void testHlQParserParam() { "count(//arr[@name='snippets']/lst)='1'", "contains(//arr[@name='snippets']/lst/str[@name='text'], 'Nathanael Brush')"); } + + public void testMissingFileDoesNotFailWholeQuery() throws IOException { + // Create a copy of a document, with the OCR residing in a temporary directory + Path tmpDir = createTempDir(); + Files.copy(Paths.get("src/test/resources/data/hocr.html"), tmpDir.resolve("hocr.html")); + assertU( + adoc("ocr_text", tmpDir.resolve("hocr.html").toAbsolutePath().toString(), "id", "999999")); + assertU(commit()); + + // With indexing complete, we delete the referenced hOCR in order to cause an error during + // highlighting + Files.delete(tmpDir.resolve("hocr.html")); + Files.delete(tmpDir); + + try { + SolrQueryRequest req = xmlQ("q", "ocr_text:Nedereien"); + assertQ( + req, 
"count(//lst[@name='ocrHighlighting']/lst)=1", + "count(//lst[@name='ocrHighlighting']/lst[@name='999999'])=0"); + } finally { + assertU(delI("999999")); + assertU(commit()); + } + } }