dbmdz · jbaiter · Apr 25, 2024 · Apr 25, 2024
diff --git a/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java b/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java
@@ -3,6 +3,7 @@
 import com.google.common.collect.ImmutableList;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.lang.invoke.MethodHandles;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Arrays;
@@ -12,8 +13,11 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class SourcePointer {
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   public static class FileSource {
 

diff --git a/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java b/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java
@@ -100,12 +100,14 @@ private void addOcrSnippets(
       SimpleOrderedMap<Object> docMap = (SimpleOrderedMap<Object>) out.get(docId);
       if (docMap == null) {
         docMap = new SimpleOrderedMap<>();
-        out.add(docId, docMap);
       }
       if (ocrSnippets[k] == null) {
         continue;
       }
       docMap.addAll(ocrSnippets[k].toNamedList());
+      if (docMap.size() > 0) {
+        out.add(docId, docMap);
+      }
     }
   }
 

diff --git a/src/main/java/solrocr/OcrHighlighter.java b/src/main/java/solrocr/OcrHighlighter.java
@@ -528,14 +528,20 @@ protected List<IterableCharSequence[]> loadOcrFieldValues(
           ocrVals[fieldIdx] = IterableCharSequence.fromString(fieldValue);
           continue;
         }
-        SourcePointer sourcePointer = SourcePointer.parse(fieldValue);
+        SourcePointer sourcePointer = null;
+        try {
+          sourcePointer = SourcePointer.parse(fieldValue);
+        } catch (RuntimeException e) {
+          log.error("Could not parse OCR pointer for document {}: {}", docId, fieldValue, e);
+        }
         if (sourcePointer == null) {
           // None of the files in the pointer exist or were readable, log should have warnings
           ocrVals[fieldIdx] = null;
           continue;
         }
         // If preloading is enabled, start warming the cache for the pointer
-        PageCacheWarmer.getInstance().ifPresent(w -> w.preload(sourcePointer));
+        final SourcePointer finalPtr = sourcePointer;
+        PageCacheWarmer.getInstance().ifPresent(w -> w.preload(finalPtr));
         if (sourcePointer.sources.size() == 1) {
           ocrVals[fieldIdx] =
               new FileBytesCharIterator(

diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java
@@ -140,7 +140,7 @@ public void testCombinedHighlightingWorks() throws Exception {
         "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea <em>commodo consequat</em>. Duis aute irure dolor in "
             + "reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. ");
     NamedList<?> ocrHls = (NamedList<?>) resp.getResponse().get("ocrHighlighting");
-    assertEquals(2, ocrHls.size());
+    assertEquals(1, ocrHls.size());
     assertEquals(1, ((NamedList<?>) ocrHls.get("31337")).size());
   }
 }
diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java
@@ -229,13 +229,12 @@ public void testMaskedDocumentIsIndexed() {
   @Test
   public void testHighlightingTimeout() {
     // This test can only check for the worst case, since checking for partial results is unlikely
-    // to be stable across
-    // multiple environments due to timing issues.
+    // to be stable across multiple environments due to timing issues.
     SolrQueryRequest req = xmlQ("q", "Vögelchen", "hl.ocr.timeAllowed", "1");
     assertQ(
         req,
         "//bool[@name='partialOcrHighlights']='true'",
-        "count(//lst[@name='ocrHighlighting']/lst)=2",
+        "count(//lst[@name='ocrHighlighting']/lst)=0",
         "count(//arr[@name='snippets'])=0");
   }
 
@@ -646,4 +645,36 @@ public void testHlQParserParam() {
         "count(//arr[@name='snippets']/lst)='1'",
         "contains(//arr[@name='snippets']/lst/str[@name='text'], '<em>Nathanael Brush</em>')");
   }
+
+  /**
+   * Checks that a document whose referenced hOCR file was deleted after indexing is left out of
+   * the {@code ocrHighlighting} section, while exactly one other entry is still returned.
+   *
+   * @throws IOException if the temporary hOCR copy cannot be created or deleted
+   */
+  @Test
+  public void testMissingFileDoesNotFailWholeQuery() throws IOException {
+    // Create a copy of a document, with the OCR residing in a temporary directory
+    Path tmpDir = createTempDir();
+    Path ocrPath = tmpDir.resolve("hocr.html");
+    Files.copy(Paths.get("src/test/resources/data/hocr.html"), ocrPath);
+    assertU(adoc("ocr_text", ocrPath.toAbsolutePath().toString(), "id", "999999"));
+    assertU(commit());
+
+    try {
+      // With indexing complete, we delete the referenced hOCR in order to cause an error during
+      // highlighting. This happens inside the try so the document is removed even if it fails.
+      Files.delete(ocrPath);
+      Files.delete(tmpDir);
+
+      SolrQueryRequest req = xmlQ("q", "ocr_text:Nedereien");
+      assertQ(
+          req,
+          "count(//lst[@name='ocrHighlighting']/lst)=1",
+          "count(//lst[@name='ocrHighlighting']/lst[@name='999999'])=0");
+    } finally {
+      assertU(delI("999999"));
+      assertU(commit());
+    }
+  }
 }