From 6c14c34807f2fd32e2e8c2788e83dfe26d9f79ae Mon Sep 17 00:00:00 2001 From: Johannes Baiter Date: Wed, 29 May 2024 10:48:14 +0200 Subject: [PATCH] Fix bad total snippets counts. Turns out we were mixing up the counts between multiple documents by keying according to the leaf-reader-relative docId instead of the index-absolute docId. This led to differing total counts between different runs of one and the same query, which is how this was discovered. Total counts should be accurate now. --- example/bench.py | 6 ++- .../solrocr/lucene/OcrFieldHighlighter.java | 14 +++--- src/main/java/solrocr/OcrHighlighter.java | 48 +++++++++++-------- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/example/bench.py b/example/bench.py index 4ec508c8..8a0bc716 100755 --- a/example/bench.py +++ b/example/bench.py @@ -155,6 +155,7 @@ def run_query( ) -> tuple[float, float, dict]: query_params = { "q": f"ocr_text:{query}", + "echoParams": "explicit", "hl": "on", "hl.ocr.fl": "ocr_text", "hl.snippets": num_snippets, @@ -260,7 +261,7 @@ def _run_query(query): type=str, default=None, metavar="PATH", - help="Path to save the responses for every query to as a JSON file (optional)", + help="Path to save the responses for every query to as a JSONL file (optional)", ) parser.add_argument( "--num-rows", @@ -332,4 +333,5 @@ def _run_query(query): if args.save_responses: with cast(TextIO, gzip.open(args.save_responses, "wt", compresslevel=9)) as f: - json.dump(results.responses, f) + for response in results.responses: + f.write(json.dumps(response) + "\n") diff --git a/src/main/java/com/github/dbmdz/solrocr/lucene/OcrFieldHighlighter.java b/src/main/java/com/github/dbmdz/solrocr/lucene/OcrFieldHighlighter.java index 4b8dd5ad..100ac456 100644 --- a/src/main/java/com/github/dbmdz/solrocr/lucene/OcrFieldHighlighter.java +++ b/src/main/java/com/github/dbmdz/solrocr/lucene/OcrFieldHighlighter.java @@ -66,7 +66,8 @@ public OcrFieldHighlighter( */ public OcrSnippet[] highlightFieldForDoc( 
LeafReader reader, - int docId, + int indexDocId, // relative to the whole index + int readerDocId, // relative to the current leafReader BreakLocator breakLocator, OcrPassageFormatter formatter, SourceReader content, @@ -82,10 +83,11 @@ public OcrSnippet[] highlightFieldForDoc( } Passage[] passages; - try (OffsetsEnum offsetsEnums = fieldOffsetStrategy.getOffsetsEnum(reader, docId, null)) { + try (OffsetsEnum offsetsEnums = + fieldOffsetStrategy.getOffsetsEnum(reader, readerDocId, null)) { passages = highlightOffsetsEnums( - offsetsEnums, docId, breakLocator, formatter, pageId, snippetLimit, scorePassages); + offsetsEnums, indexDocId, breakLocator, formatter, pageId, snippetLimit, scorePassages); } // Format the resulting Passages. @@ -121,7 +123,7 @@ protected Passage[] highlightOffsetsEnums(OffsetsEnum off) { */ protected Passage[] highlightOffsetsEnums( OffsetsEnum off, - int docId, + int indexDocId, BreakLocator breakLocator, OcrPassageFormatter formatter, String pageId, @@ -178,7 +180,7 @@ protected Passage[] highlightOffsetsEnums( continue; } // Since building passages is expensive when using external files, we forego it past a certain - // limit (which can be set by the user) and just update the total count, counting each matc + // limit (which can be set by the user) and just update the total count, counting each match // as a single passage. 
if (limitReached || numTotal > snippetLimit) { numTotal++; @@ -213,7 +215,7 @@ protected Passage[] highlightOffsetsEnums( } maybeAddPassage(passageQueue, passageScorer, passage, contentLength, scorePassages); - this.numMatches.put(docId, numTotal); + this.numMatches.put(indexDocId, numTotal); Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]); // sort in ascending order Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset)); diff --git a/src/main/java/solrocr/OcrHighlighter.java b/src/main/java/solrocr/OcrHighlighter.java index dd12e93e..328929bb 100644 --- a/src/main/java/solrocr/OcrHighlighter.java +++ b/src/main/java/solrocr/OcrHighlighter.java @@ -321,9 +321,11 @@ public OcrHighlightResult[] highlightOcrFields( // Sort docs & fields for sequential i/o // Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place) - int[] docIds = new int[docIDs.length]; - int[] docInIndexes = new int[docIds.length]; // fill in ascending order; points into docIdsIn[] - copyAndSortDocIdsWithIndex(docIDs, docIds, docInIndexes); // latter 2 are "out" params + int[] sortedDocIds = new int[docIDs.length]; + // Contains the index in `docIDs` for every position in `sortedDocIds` + int[] docInIndexes = + new int[sortedDocIds.length]; // fill in ascending order; points into docIdsIn[] + copyAndSortDocIdsWithIndex(docIDs, sortedDocIds, docInIndexes); // latter 2 are "out" params // Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place) final String[] fields = new String[ocrFieldNames.length]; @@ -365,14 +367,14 @@ public OcrHighlightResult[] highlightOcrFields( (numTermVectors >= 2) ? 
TermVectorReusingLeafReader.wrap(searcher.getIndexReader()) : null; // [fieldIdx][docIdInIndex] of highlightDoc result - OcrSnippet[][][] highlightDocsInByField = new OcrSnippet[fields.length][docIds.length][]; - int[][] snippetCountsByField = new int[fields.length][docIds.length]; + OcrSnippet[][][] highlightDocsInByField = new OcrSnippet[fields.length][sortedDocIds.length][]; + int[][] snippetCountsByField = new int[fields.length][sortedDocIds.length]; // Highlight in doc batches determined by loadFieldValues (consumes from docIdIter) - DocIdSetIterator docIdIter = asDocIdSetIterator(docIds); + DocIdSetIterator docIdIter = asDocIdSetIterator(sortedDocIds); List> hlFuts = new ArrayList<>(); docLoop: - for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) { + for (int batchDocIdx = 0; batchDocIdx < sortedDocIds.length; ) { List fieldValsByDoc = loadOcrFieldValues(fields, docIdIter); // Highlight in per-field order first, then by doc (better I/O pattern) @@ -380,7 +382,11 @@ public OcrHighlightResult[] highlightOcrFields( OcrSnippet[][] resultByDocIn = highlightDocsInByField[fieldIdx]; // parallel to docIdsIn OcrFieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx]; for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) { - int docId = docIds[docIdx]; // sorted order + // This docId is potentially going to be made relative to the corresponding leaf reader + // later on, i.e. it's not always going to be the index-wide docId, hence we store the + // absolute value in another variable. 
+ int readerDocId = sortedDocIds[docIdx]; // sorted order + int indexDocId = sortedDocIds[docIdx]; SourceReader content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx]; if (content == null) { continue; @@ -401,9 +407,10 @@ public OcrHighlightResult[] highlightOcrFields( leafReader = (LeafReader) indexReader; } else { List leaves = indexReader.leaves(); - LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves)); + LeafReaderContext leafReaderContext = + leaves.get(ReaderUtil.subIndex(readerDocId, leaves)); leafReader = leafReaderContext.reader(); - docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader + readerDocId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader } int docInIndex = docInIndexes[docIdx]; // original input order assert resultByDocIn[docInIndex] == null; @@ -414,14 +421,15 @@ public OcrHighlightResult[] highlightOcrFields( params.getInt(OcrHighlightParams.MAX_OCR_PASSAGES, DEFAULT_SNIPPET_LIMIT)); // Final aliases for lambda - final int docIdFinal = docId; + final int readerDocIdFinal = readerDocId; final int fieldIdxFinal = fieldIdx; final SourceReader contentFinal = content; Runnable hlFn = () -> { try { highlightDocField( - docIdFinal, + indexDocId, + readerDocIdFinal, docInIndex, fieldIdxFinal, contentFinal, @@ -440,7 +448,7 @@ public OcrHighlightResult[] highlightOcrFields( if (contentFinal.getPointer() != null) { log.error( "Could not highlight OCR content for document {} at '{}'", - docIdFinal, + indexDocId, contentFinal.getPointer(), e); } else { @@ -452,7 +460,7 @@ public OcrHighlightResult[] highlightOcrFields( } log.error( "Could not highlight OCR for document {} with OCR markup '{}...'", - docIdFinal, + indexDocId, debugStr, e); } @@ -506,8 +514,8 @@ public OcrHighlightResult[] highlightOcrFields( } } - OcrHighlightResult[] out = new OcrHighlightResult[docIds.length]; - for (int d = 0; d < docIds.length; d++) { + OcrHighlightResult[] out = new 
OcrHighlightResult[sortedDocIds.length]; + for (int d = 0; d < sortedDocIds.length; d++) { OcrHighlightResult hl = new OcrHighlightResult(); for (int f = 0; f < fields.length; f++) { if (snippetCountsByField[f][d] <= 0) { @@ -525,7 +533,8 @@ public OcrHighlightResult[] highlightOcrFields( } private void highlightDocField( - int docId, + int indexDocId, // index-wide docId + int readerDocId, // docId relative to the leaf reader int docInIndex, int fieldIdx, SourceReader reader, @@ -572,14 +581,15 @@ private void highlightDocField( resultByDocIn[docInIndex] = fieldHighlighter.highlightFieldForDoc( leafReader, - docId, + indexDocId, + readerDocId, breakLocator, formatter, reader, params.get(OcrHighlightParams.PAGE_ID), snippetLimit, scorePassages); - snippetCountsByField[fieldIdx][docInIndex] = fieldHighlighter.getNumMatches(docId); + snippetCountsByField[fieldIdx][docInIndex] = fieldHighlighter.getNumMatches(indexDocId); } protected List loadOcrFieldValues(String[] fields, DocIdSetIterator docIter)