Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bad total snippets counts. #433

Merged
merged 1 commit into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions example/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def run_query(
) -> tuple[float, float, dict]:
query_params = {
"q": f"ocr_text:{query}",
"echoParams": "explicit",
"hl": "on",
"hl.ocr.fl": "ocr_text",
"hl.snippets": num_snippets,
Expand Down Expand Up @@ -260,7 +261,7 @@ def _run_query(query):
type=str,
default=None,
metavar="PATH",
help="Path to save the responses for every query to as a JSON file (optional)",
help="Path to save the responses for every query to as a JSONL file (optional)",
)
parser.add_argument(
"--num-rows",
Expand Down Expand Up @@ -332,4 +333,5 @@ def _run_query(query):

if args.save_responses:
with cast(TextIO, gzip.open(args.save_responses, "wt", compresslevel=9)) as f:
json.dump(results.responses, f)
for response in results.responses:
f.write(json.dumps(response) + "\n")
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ public OcrFieldHighlighter(
*/
public OcrSnippet[] highlightFieldForDoc(
LeafReader reader,
int docId,
int indexDocId, // relative to the whole index
int readerDocId, // relative to the current leafReader
BreakLocator breakLocator,
OcrPassageFormatter formatter,
SourceReader content,
Expand All @@ -82,10 +83,16 @@ public OcrSnippet[] highlightFieldForDoc(
}

Passage[] passages;
try (OffsetsEnum offsetsEnums = fieldOffsetStrategy.getOffsetsEnum(reader, docId, null)) {
try (OffsetsEnum offsetsEnums = fieldOffsetStrategy.getOffsetsEnum(reader, readerDocId, null)) {
passages =
highlightOffsetsEnums(
offsetsEnums, docId, breakLocator, formatter, pageId, snippetLimit, scorePassages);
offsetsEnums,
indexDocId,
breakLocator,
formatter,
pageId,
snippetLimit,
scorePassages);
}

// Format the resulting Passages.
Expand Down Expand Up @@ -121,7 +128,7 @@ protected Passage[] highlightOffsetsEnums(OffsetsEnum off) {
*/
protected Passage[] highlightOffsetsEnums(
OffsetsEnum off,
int docId,
int indexDocId,
BreakLocator breakLocator,
OcrPassageFormatter formatter,
String pageId,
Expand Down Expand Up @@ -178,7 +185,7 @@ protected Passage[] highlightOffsetsEnums(
continue;
}
// Since building passages is expensive when using external files, we forego it past a certain
// limit (which can be set by the user) and just update the total count, counting each matc
// limit (which can be set by the user) and just update the total count, counting each match
// as a single passage.
if (limitReached || numTotal > snippetLimit) {
numTotal++;
Expand Down Expand Up @@ -213,7 +220,7 @@ protected Passage[] highlightOffsetsEnums(
}
maybeAddPassage(passageQueue, passageScorer, passage, contentLength, scorePassages);

this.numMatches.put(docId, numTotal);
this.numMatches.put(indexDocId, numTotal);
Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
// sort in ascending order
Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset));
Expand Down
48 changes: 29 additions & 19 deletions src/main/java/solrocr/OcrHighlighter.java
Original file line number Diff line number Diff line change
Expand Up @@ -321,9 +321,11 @@ public OcrHighlightResult[] highlightOcrFields(

// Sort docs & fields for sequential i/o
// Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place)
int[] docIds = new int[docIDs.length];
int[] docInIndexes = new int[docIds.length]; // fill in ascending order; points into docIdsIn[]
copyAndSortDocIdsWithIndex(docIDs, docIds, docInIndexes); // latter 2 are "out" params
int[] sortedDocIds = new int[docIDs.length];
// Contains the index in `docIDs` for every position in `sortedDocIds`
int[] docInIndexes =
new int[sortedDocIds.length]; // fill in ascending order; points into docIdsIn[]
copyAndSortDocIdsWithIndex(docIDs, sortedDocIds, docInIndexes); // latter 2 are "out" params

// Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place)
final String[] fields = new String[ocrFieldNames.length];
Expand Down Expand Up @@ -365,22 +367,26 @@ public OcrHighlightResult[] highlightOcrFields(
(numTermVectors >= 2) ? TermVectorReusingLeafReader.wrap(searcher.getIndexReader()) : null;

// [fieldIdx][docIdInIndex] of highlightDoc result
OcrSnippet[][][] highlightDocsInByField = new OcrSnippet[fields.length][docIds.length][];
int[][] snippetCountsByField = new int[fields.length][docIds.length];
OcrSnippet[][][] highlightDocsInByField = new OcrSnippet[fields.length][sortedDocIds.length][];
int[][] snippetCountsByField = new int[fields.length][sortedDocIds.length];
// Highlight in doc batches determined by loadFieldValues (consumes from docIdIter)
DocIdSetIterator docIdIter = asDocIdSetIterator(docIds);
DocIdSetIterator docIdIter = asDocIdSetIterator(sortedDocIds);

List<CompletableFuture<Void>> hlFuts = new ArrayList<>();
docLoop:
for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) {
for (int batchDocIdx = 0; batchDocIdx < sortedDocIds.length; ) {
List<SourceReader[]> fieldValsByDoc = loadOcrFieldValues(fields, docIdIter);

// Highlight in per-field order first, then by doc (better I/O pattern)
for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
OcrSnippet[][] resultByDocIn = highlightDocsInByField[fieldIdx]; // parallel to docIdsIn
OcrFieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx];
for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) {
int docId = docIds[docIdx]; // sorted order
// This docId is potentially going to be made relative to the corresponding leaf reader
// later on, i.e. it's not always going to be the index-wide docId, hence we store the
// absolute value in another variable.
int readerDocId = sortedDocIds[docIdx]; // sorted order
int indexDocId = sortedDocIds[docIdx];
SourceReader content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx];
if (content == null) {
continue;
Expand All @@ -401,9 +407,10 @@ public OcrHighlightResult[] highlightOcrFields(
leafReader = (LeafReader) indexReader;
} else {
List<LeafReaderContext> leaves = indexReader.leaves();
LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves));
LeafReaderContext leafReaderContext =
leaves.get(ReaderUtil.subIndex(readerDocId, leaves));
leafReader = leafReaderContext.reader();
docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
readerDocId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
}
int docInIndex = docInIndexes[docIdx]; // original input order
assert resultByDocIn[docInIndex] == null;
Expand All @@ -414,14 +421,15 @@ public OcrHighlightResult[] highlightOcrFields(
params.getInt(OcrHighlightParams.MAX_OCR_PASSAGES, DEFAULT_SNIPPET_LIMIT));

// Final aliases for lambda
final int docIdFinal = docId;
final int readerDocIdFinal = readerDocId;
final int fieldIdxFinal = fieldIdx;
final SourceReader contentFinal = content;
Runnable hlFn =
() -> {
try {
highlightDocField(
docIdFinal,
indexDocId,
readerDocIdFinal,
docInIndex,
fieldIdxFinal,
contentFinal,
Expand All @@ -440,7 +448,7 @@ public OcrHighlightResult[] highlightOcrFields(
if (contentFinal.getPointer() != null) {
log.error(
"Could not highlight OCR content for document {} at '{}'",
docIdFinal,
indexDocId,
contentFinal.getPointer(),
e);
} else {
Expand All @@ -452,7 +460,7 @@ public OcrHighlightResult[] highlightOcrFields(
}
log.error(
"Could not highlight OCR for document {} with OCR markup '{}...'",
docIdFinal,
indexDocId,
debugStr,
e);
}
Expand Down Expand Up @@ -506,8 +514,8 @@ public OcrHighlightResult[] highlightOcrFields(
}
}

OcrHighlightResult[] out = new OcrHighlightResult[docIds.length];
for (int d = 0; d < docIds.length; d++) {
OcrHighlightResult[] out = new OcrHighlightResult[sortedDocIds.length];
for (int d = 0; d < sortedDocIds.length; d++) {
OcrHighlightResult hl = new OcrHighlightResult();
for (int f = 0; f < fields.length; f++) {
if (snippetCountsByField[f][d] <= 0) {
Expand All @@ -525,7 +533,8 @@ public OcrHighlightResult[] highlightOcrFields(
}

private void highlightDocField(
int docId,
int indexDocId, // index-wide docId
int readerDocId, // docId relative to the leaf reader
int docInIndex,
int fieldIdx,
SourceReader reader,
Expand Down Expand Up @@ -572,14 +581,15 @@ private void highlightDocField(
resultByDocIn[docInIndex] =
fieldHighlighter.highlightFieldForDoc(
leafReader,
docId,
indexDocId,
readerDocId,
breakLocator,
formatter,
reader,
params.get(OcrHighlightParams.PAGE_ID),
snippetLimit,
scorePassages);
snippetCountsByField[fieldIdx][docInIndex] = fieldHighlighter.getNumMatches(docId);
snippetCountsByField[fieldIdx][docInIndex] = fieldHighlighter.getNumMatches(indexDocId);
}

protected List<SourceReader[]> loadOcrFieldValues(String[] fields, DocIdSetIterator docIter)
Expand Down
Loading