diff --git a/docs/changes.md b/docs/changes.md index 09c53056..81b7372c 100644 --- a/docs/changes.md +++ b/docs/changes.md @@ -1,3 +1,12 @@ +## Unreleased + +**Changed:** +- Added support for multithreaded highlighting. Uses all available logical CPU cores by default and + can be tweaked with the `numHighlightingThreads` and `maxQueuedPerThread` attributes on the + `OcrHighlightComponent` in `solrconfig.xml`. +- Removed `PageCacheWarmer`, which is no longer needed due to the multithreading support. + + ## 0.8.5 (2024-04-25) [GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.8.5) diff --git a/docs/performance.md b/docs/performance.md index 043f7c37..3811fcd2 100644 --- a/docs/performance.md +++ b/docs/performance.md @@ -16,13 +16,17 @@ Before you start tuning the plugin, it is important to spend some time on analyz - Check Solr queries with `debug=timing`: How much of the response time is actually spent in the OCR highlighting component? +- On newer Linux kernels, check the Pressure Stall Information (PSI) metrics with `htop` or by looking + at `/proc/pressure/{io,cpu}`. This can give you an indication of whether the system is I/O-bottlenecked or + CPU-bottlenecked. - On the operating system level (if you're on a Linux system), use [BCC Tools](https://github.com/iovisor/bcc), especially `{nfs/xfs/ext/...}slower` and `{nfs/xfs/ext/...}dist` to check if the performance issues are due to I/O latency. ## Storage Layer -The plugin spends a lot of time on randomly reading small sections of the target files from disk. This means that -the performance characteristics of the underlying storage system have a huge effect on the performance of the plugin. +The plugin spends a lot of time reading small sections of the target files from disk. This means that +the performance characteristics of the underlying storage system have a huge effect on the performance +of the plugin. Important factors include: @@ -33,36 +37,20 @@ Generally speaking, local storage is better than remote storage (like NFS or CIF flash-based storage is better than disk-based storage, due to the lower random read latency and the possibility to do parallel reads. A RAID1/10 setup is preferred over a RAID0/JBOD setup, due to the increased potential for parallel reads. -## Plugin configuration -The plugin offers the possibility to perform a **concurrent read-ahead of highlighting target files**. This will perform -"dummy" reads on multiple parallel threads, with the intent to fill the operating system's page cache with the contents -of the highlighting targets, so that the actual highlighting process is performed on data read from the cache (which -resides in main memory). This is mainly useful for storage layers that benefit from parallel reads, since the highlighting -process is strongly sequential and performing the read-ahead concurrently can reduce latency. - -To enable it, add the `enablePreload=true` attribute on the OCR highlighting component in your core's `solrconfig.xml`. -It is important to accompany this with benchmarking and monitoring, the available settings should be tuned to the -environment: - -- `preloadReadSize`: Size in bytes of read-ahead block reads, should be aligned with file system block size - (or `rsize` for NFS file systems). Defaults to `32768`. -- `preloadConcurrency`: Number of threads to perform read-ahead. Optimal settings have to be determined via - experimentation. Defaults to `8`.
- -This approach relies on the OS-level page cache, so make sure you have enough spare RAM available on your machine to -actually benefit from this! Use BCC's `*slower` tools to verify that it's a `solr-ocrhighlight` thread that performs -most of the reads and not the actual query thread (`qtp....`). If you run the same query twice, you shouldn't see a lot -of reads from either the `qtp...` or `solr-ocrhlighight` threads on the second run. - -Example configuration tuned for remote NFS storage mounted with `rsize=65536`: -```xml - -``` - +## Concurrency +The plugin can read multiple files in parallel and also process them concurrently. By default, it will +use as many threads as there are available logical CPU cores on the machine, but this can be tweaked +with the `numHighlightingThreads` and `maxQueuedPerThread` parameters on the `OcrHighlightComponent` +in your `solrconfig.xml`. Tune these parameters to match your hardware and storage layer. + +- `numHighlightingThreads`: The number of threads that will be used to read and process the OCR files. + Defaults to the number of logical CPU cores. Set this higher if you're I/O-bottlenecked and your storage can + support more parallel reads than you have logical CPU cores (very likely for modern NVMe drives). +- `maxQueuedPerThread`: By default, we queue only a limited number of documents per thread so as not to + stall other requests. If the queue is full, the overflow documents are highlighted single-threaded on + the request thread. You usually don't have to touch this setting, but if you have large result sets + with many concurrent requests, this can help to reduce the number of threads that are active at + the same time, at least as a stopgap. ## Runtime configuration Another option to influence the performance of the plugin is to tune some runtime options for highlighting. diff --git a/example/bench.py b/example/bench.py new file mode 100755 index 00000000..47bb1f44 --- /dev/null +++ b/example/bench.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +"""Small benchmarking script for Solr highlighting performance. + +Generates a set of common phrase queries (two to eight terms) from the Google 1000 dataset +and runs them against the Dockerized example setup. The script measures the time +spent on query execution and highlighting and prints the results to stdout.
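+
+A typical invocation against the example setup might look like this (all values other than
+`--concurrency` are simply the parser defaults defined below; raising the concurrency puts
+parallel load on Solr):
+```
+./bench.py --solr-handler http://localhost:8983/solr/ocr/select \
+    --iterations 3 --concurrency 4 --num-rows 50 --num-snippets 5
+```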
+ +If you want to profile the plugin: +- Download async-profiler: https://github.com/async-profiler/async-profiler +- Mount the async-profiler directory to the same location in the container as + on your system +- Add these lines to the `solr` service in `docker-compose.yml`: + ``` + security_opt: + - seccomp:unconfined + cap_add: + - SYS_ADMIN + ``` +- Launch the container +- Find the PID of the Solr process on the host machine (use `ps` or `htop`) +- Launch the profiler: `${ASYNC_PROFILER_DIR}/asprof -d 60 -f /tmp/flamegraph.svg ${SOLRPID}` +""" + +import argparse +import json +import os +import random +import statistics +import string +import sys +import xml.etree.ElementTree as etree + +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed +from multiprocessing import cpu_count +from pathlib import Path +from typing import Iterable, Mapping, NamedTuple, TextIO, cast +from urllib.parse import urlencode +from urllib.request import Request, urlopen +from collections import Counter + +NSMAP = {"mets": "http://www.loc.gov/METS/", "mods": "http://www.loc.gov/mods/v3"} +STRIP_PUNCTUATION_TBL = str.maketrans("", "", string.punctuation) + + +class BenchmarkResult(NamedTuple): + query_times_ms: list[Mapping[str, float]] + hl_times_ms: list[Mapping[str, float]] + + def mean_query_time(self) -> float: + return statistics.mean( + statistics.mean(query_times.values()) for query_times in self.query_times_ms + ) + + def mean_query_times(self) -> tuple[float, ...]: + return tuple( + statistics.mean(query_times.values()) for query_times in self.query_times_ms + ) + + def mean_hl_time(self) -> float: + return statistics.mean( + statistics.mean(hl_times.values()) for hl_times in self.hl_times_ms + ) + + def mean_hl_times(self) -> tuple[float, ...]: + return tuple( + statistics.mean(hl_times.values()) for hl_times in self.hl_times_ms + ) + + +def analyze_phrases( + sections: Iterable[tuple[str, ...]], + min_len=2, + max_len=8, + min_word_len=4, + sample_size: int = 512, +) -> Mapping[tuple[str, ...], int]: + counter: Counter[tuple[str, ...]] = Counter() + for words in sections: + for length in range(min_len, max_len + 1): + for i in range(len(words) - length + 1): + phrase = tuple(words[i : i + length]) + if all(len(word) < min_word_len for word in phrase): + continue + counter[phrase] += 1 + filtered = [(phrase, count) for phrase, count in counter.items() if count > 4] + return dict( + random.sample( + filtered, + min(sample_size, len(filtered)), + ) + ) + + +def parse_hocr(hocr_path: Path) -> Iterable[tuple[str, ...]]: + tree = etree.parse(hocr_path) + for block in tree.findall('.//div[@class="ocrx_block"]'): + words = [w for w in block.findall('.//span[@class="ocrx_word"]')] + if len(words) == 0: + continue + passage = tuple( + filtered + for filtered in ( + w.text.translate(STRIP_PUNCTUATION_TBL) + for w in words + if w is not None and w.text is not None + ) + if filtered + ) + if len(passage) == 0: + continue + yield passage + + +def build_query_set( + hocr_base_path: Path, min_count=8, max_count=256 +) -> Iterable[tuple[str, int]]: + # Counts in how many documents a phrase occurs + phrase_counter = Counter() + with ProcessPoolExecutor(max_workers=cpu_count()) as pool: + futs = [ + pool.submit(lambda p: analyze_phrases(parse_hocr(p)), hocr_path) + for hocr_path in hocr_base_path.glob("**/*.hocr") + ] + num_completed = 0 + for fut in as_completed(futs): + num_completed += 1 + print( + f"Analyzed {num_completed:>5}/{len(futs)} documents", + file=sys.stderr, + end="\r", + ) 
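+            # Each future's result maps a sampled phrase to its count within a single document,
+            # so a phrase counts towards its document frequency at most once per document below.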
+ for phrase, count in fut.result().items(): + if count < 4: + continue + phrase_counter[phrase] += 1 + + # Selects phrases that occur in at least min_count documents + for phrase, count in phrase_counter.items(): + if count < min_count or count > max_count: + continue + if len(phrase) == 1: + yield phrase[0] + else: + yield f'"{" ".join(phrase)}"', count + + +def run_query( + query: str, solr_handler: str, num_rows: int, num_snippets: int +) -> tuple[float, float]: + query_params = { + "q": f"ocr_text:{query}", + "hl": "on", + "hl.ocr.fl": "ocr_text", + "hl.snippets": num_snippets, + "rows": num_rows, + "debug": "timing", + "hl.weightMatches": "true", + } + req = Request(f"{solr_handler}?{urlencode(query_params)}") + with urlopen(req) as http_resp: + solr_resp = json.load(http_resp) + hl_duration = solr_resp["debug"]["timing"]["process"]["ocrHighlight"]["time"] + query_duration = solr_resp["debug"]["timing"]["time"] + return query_duration, hl_duration + + +def run_benchmark( + solr_handler: str, + queries: set[str], + iterations=3, + concurrency=1, + num_rows=50, + num_snippets=5, +) -> BenchmarkResult: + print( + f"Running benchmark for {num_rows} rows with {num_snippets} snippets across {iterations} iterations and {concurrency} parallel requests", + file=sys.stderr, + ) + + with ThreadPoolExecutor(max_workers=concurrency) as pool: + print(f"Running {iterations} benchmark iterations", file=sys.stderr) + all_query_times = [] + all_hl_times = [] + + def _run_query(query): + return query, run_query(query, solr_handler, num_rows, num_snippets) + + for iteration_idx in range(iterations): + iter_futs = [pool.submit(_run_query, query) for query in queries] + + query_times = {} + hl_times = {} + for idx, fut in enumerate(as_completed(iter_futs)): + try: + query, (query_time, hl_time) = fut.result() + except Exception as e: + print(f"\nError: {e}", file=sys.stderr) + continue + query_times[query] = query_time + hl_times[query] = hl_time + hl_factor = statistics.mean(hl_times.values()) / statistics.mean( + query_times.values() + ) + print( + f"Iteration {iteration_idx+1}: {idx+1:>4}/{len(queries)}, " + f"øq={statistics.mean(query_times.values()):.2f}ms, " + f"øhl={statistics.mean(hl_times.values()):.2f}ms, " + f"hl_share={hl_factor:.2f}", + file=sys.stderr, + end="\r", + ) + all_query_times.append(query_times) + all_hl_times.append(hl_times) + + return BenchmarkResult(all_query_times, all_hl_times) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawTextHelpFormatter + ) + parser.add_argument( + "--iterations", + type=int, + default=3, + metavar="N", + help="Number of benchmark iterations", + ) + parser.add_argument( + "--concurrency", + type=int, + default=1, + metavar="N", + help="Number of concurrent requests", + ) + parser.add_argument( + "--queries-path", + type=str, + default="./benchqueries.txt.gz", + metavar="PATH", + help="Path to the file containing the queries", + ) + parser.add_argument( + "--save-results", + type=str, + default=None, + metavar="PATH", + help="Path to save the results to as a JSON file (optional)", + ) + parser.add_argument( + "--num-rows", + type=int, + default=50, + metavar="N", + help="Number of rows to request from Solr", + ) + parser.add_argument( + "--num-snippets", + type=int, + default=5, + metavar="N", + help="Number of snippets to request from Solr", + ) + parser.add_argument( + "--solr-handler", + type=str, + default="http://localhost:8983/solr/ocr/select", + help="URL to the Solr handler", 
+ ) + args = parser.parse_args() + + if os.path.exists(args.queries_path): + if args.queries_path.endswith(".gz"): + import gzip + + with gzip.open(args.queries_path, "rt") as f: + queries = set( + q for q in (line.strip() for line in cast(TextIO, f)) if q + ) + else: + with open(args.queries_path, "rt") as f: + queries = set(q for q in (line.strip() for line in f) if q) + else: + hocr_base_path = Path("./data/google1000") + queries = set(q for q, _ in build_query_set(hocr_base_path)) + if args.queries_path.endswith(".gz"): + import gzip + + with cast(TextIO, gzip.open(args.queries_path, "wt", compresslevel=9)) as f: + f.write("\n".join(queries)) + else: + with open(args.queries_path, "w") as f: + f.write("\n".join(queries)) + + results = run_benchmark( + args.solr_handler, + queries, + iterations=args.iterations, + concurrency=args.concurrency, + num_rows=args.num_rows, + num_snippets=args.num_snippets, + ) + + print("\n\n=====================================") + print(f"Mean query time: {results.mean_query_time():.2f}ms") + print(f"Mean highlighting time: {results.mean_hl_time():.2f}ms") + print( + f"Percent of time spent on highlighting: {results.mean_hl_time() / results.mean_query_time() * 100:.2f}%" + ) + print("=====================================\n\n") + + if args.save_results: + with open(args.save_results, "w") as f: + json.dump( + { + "query_times": results.query_times_ms, + "hl_times": results.hl_times_ms, + }, + f, + ) diff --git a/example/benchqueries.txt.gz b/example/benchqueries.txt.gz new file mode 100644 index 00000000..84cd3063 Binary files /dev/null and b/example/benchqueries.txt.gz differ diff --git a/example/docker-compose.yml b/example/docker-compose.yml index 43d74a39..48b6e0ba 100644 --- a/example/docker-compose.yml +++ b/example/docker-compose.yml @@ -11,10 +11,12 @@ services: - index-data:/var/solr - ./data:/data - ../target:/build + - ./flightrecords:/flightrecords environment: - ENABLE_REMOTE_JMX_OPTS=true - SOLR_HEAP=4g - - ADDITIONAL_CMD_OPTS=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:1044 + - ADDITIONAL_CMD_OPTS=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:1044 -XX:StartFlightRecording=settings=profile,filename=/flightrecords/profile.jfr,maxage=30m -XX:+UnlockDiagnosticVMOptions -XX:+DebugNonSafepoints -XX:+PreserveFramePointer + - SOLR_SECURITY_MANAGER_ENABLED=false entrypoint: - docker-entrypoint.sh diff --git a/example/solr/core/schema.xml b/example/solr/core/conf/schema.xml similarity index 100% rename from example/solr/core/schema.xml rename to example/solr/core/conf/schema.xml diff --git a/example/solr/core/solrconfig.xml b/example/solr/core/conf/solrconfig.xml similarity index 95% rename from example/solr/core/solrconfig.xml rename to example/solr/core/conf/solrconfig.xml index 05ee8291..8ead7b50 100644 --- a/example/solr/core/solrconfig.xml +++ b/example/solr/core/conf/solrconfig.xml @@ -12,6 +12,7 @@ query ocrHighlight highlight + debug diff --git a/pom.xml b/pom.xml index 80844fe9..995dbfa4 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ de.digitalcollections solr-ocrhighlighting - 0.8.6 + 0.9.0-SNAPSHOT Solr OCR Highlighting Plugin @@ -64,7 +64,7 @@ 3.6.3 3.5.1 3.3.0 - 1.1.7 + 3.2.5 1.6.13 @@ -181,18 +181,11 @@ - de.sormuras.junit - junit-platform-maven-plugin - ${version.junit-platform-maven} - true + org.apache.maven.plugins + maven-surefire-plugin + ${version.surefire} - JAVA - - true - - -Djava.security.egd=file:/dev/./urandom - - + -Djava.security.egd=file:/dev/./urandom diff --git 
a/src/main/java/com/github/dbmdz/solrocr/lucene/OcrFieldHighlighter.java b/src/main/java/com/github/dbmdz/solrocr/lucene/OcrFieldHighlighter.java index 7ccdced0..5731f3a5 100644 --- a/src/main/java/com/github/dbmdz/solrocr/lucene/OcrFieldHighlighter.java +++ b/src/main/java/com/github/dbmdz/solrocr/lucene/OcrFieldHighlighter.java @@ -27,14 +27,12 @@ import com.github.dbmdz.solrocr.iter.BreakLocator; import com.github.dbmdz.solrocr.iter.IterableCharSequence; import com.github.dbmdz.solrocr.model.OcrSnippet; -import com.github.dbmdz.solrocr.util.PageCacheWarmer; import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; -import java.util.HashMap; -import java.util.Map; import java.util.PriorityQueue; +import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.index.LeafReader; import org.apache.lucene.search.uhighlight.FieldHighlighter; import org.apache.lucene.search.uhighlight.FieldOffsetStrategy; @@ -45,7 +43,7 @@ /** A customization of {@link FieldHighlighter} to support OCR fields */ public class OcrFieldHighlighter extends FieldHighlighter { - private final Map numMatches; + private final ConcurrentHashMap numMatches; public OcrFieldHighlighter( String field, @@ -55,7 +53,7 @@ public OcrFieldHighlighter( int maxNoHighlightPassages) { super( field, fieldOffsetStrategy, null, passageScorer, maxPassages, maxNoHighlightPassages, null); - this.numMatches = new HashMap<>(); + this.numMatches = new ConcurrentHashMap<>(); } /** @@ -79,8 +77,6 @@ public OcrSnippet[] highlightFieldForDoc( // note: it'd be nice to accept a CharSequence for content, but we need a CharacterIterator impl // for it. - // If page cache pre-warming is enabled, cancel it, since we're doing the I/O ourselves now - PageCacheWarmer.getInstance().ifPresent(w -> w.cancelPreload(content.getPointer())); if (content.length() == 0) { return null; // nothing to do } diff --git a/src/main/java/com/github/dbmdz/solrocr/lucene/filters/SanitizingXmlFilter.java b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/SanitizingXmlFilter.java index a204de19..4ad5a7b1 100644 --- a/src/main/java/com/github/dbmdz/solrocr/lucene/filters/SanitizingXmlFilter.java +++ b/src/main/java/com/github/dbmdz/solrocr/lucene/filters/SanitizingXmlFilter.java @@ -90,7 +90,7 @@ public int read(char[] cbuf, int off, int len) throws IOException { while (idx < (off + numRead)) { // Check for invalid entities and try to fix them while (advancedFixing && idx < (off + numRead)) { - int match = multiIndexOf(cbuf, idx, '<', '&'); + int match = multiIndexOf(cbuf, idx, (off + numRead), '<', '&'); if (match < 0 || match > (off + numRead)) { // Nothing to do in this buffer break outer; @@ -99,13 +99,13 @@ public int read(char[] cbuf, int off, int len) throws IOException { // Start of element, no more entities to check break; } - int entityEnd = multiIndexOf(cbuf, match + 1, '<', ';'); + int entityEnd = multiIndexOf(cbuf, match + 1, (off + numRead), '<', ';'); if (entityEnd < match + 1) { // Not enough data to determine entity end, we may have to carry over // FIXME: This code is largely identical to the carry-over case below, find a way to // deduplicate - int carryOverSize = numRead - (match + 1); + int carryOverSize = (off + numRead) - (match + 1); if (carryOverSize == numRead) { // Can't carry-over the whole buffer, since the read would return 0 // We bail out of validation since there's no way we can force the caller to request @@ -205,7 +205,7 @@ public int read(char[] cbuf, int off, int len) 
throws IOException { (cbuf[startElem + 1] == '/' || cbuf[startElem + 1] == '?') ? startElem + 2 : startElem + 1; - int endTag = multiIndexOf(cbuf, startTag, ' ', '\n', '\t'); + int endTag = multiIndexOf(cbuf, startTag, (off + numRead), ' ', '\n', '\t'); if (endTag > endElem || endTag < 0) { endTag = cbuf[endElem - 1] == '/' ? endElem - 1 : endElem; } @@ -290,14 +290,14 @@ public int read(char[] cbuf, int off, int len) throws IOException { * Variant of {@link org.apache.commons.lang3.ArrayUtils#indexOf(char[], char)} that supports * looking for multiple values. */ - private static int multiIndexOf(final char[] array, int startIndex, final char... valuesToFind) { + private static int multiIndexOf(final char[] array, int startIndex, int limit, final char... valuesToFind) { if (array == null) { return -1; } if (startIndex < 0) { startIndex = 0; } - for (int i = startIndex; i < array.length; i++) { + for (int i = startIndex; i < limit; i++) { for (char value : valuesToFind) { if (value == array[i]) { return i; diff --git a/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java b/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java index 85c6a1c5..e9312d6b 100644 --- a/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java +++ b/src/main/java/com/github/dbmdz/solrocr/solr/SolrOcrHighlighter.java @@ -25,12 +25,17 @@ package com.github.dbmdz.solrocr.solr; import com.github.dbmdz.solrocr.model.OcrHighlightResult; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.Collection; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; +import java.util.concurrent.Executor; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; import org.apache.lucene.search.Query; import org.apache.solr.common.params.HighlightParams; import org.apache.solr.common.params.SolrParams; @@ -47,6 +52,41 @@ public class SolrOcrHighlighter extends UnifiedSolrHighlighter { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final Executor hlThreadPool; + + public SolrOcrHighlighter() { + this(Runtime.getRuntime().availableProcessors(), 8); + } + + public SolrOcrHighlighter(int numHlThreads, int maxQueuedPerThread) { + super(); + if (numHlThreads > 0) { + this.hlThreadPool = + new ThreadPoolExecutor( + numHlThreads, + numHlThreads, + 120L, + TimeUnit.SECONDS, + new LinkedBlockingQueue<>(numHlThreads * maxQueuedPerThread), + new ThreadFactoryBuilder().setNameFormat("OcrHighlighter-%d").build()); + } else { + // Executors.newDirectExecutorService() for Java 8 + this.hlThreadPool = + new Executor() { + @Override + public void execute(Runnable cmd) { + cmd.run(); + } + }; + } + } + + public void shutdownThreadPool() { + if (hlThreadPool instanceof ThreadPoolExecutor) { + ((ThreadPoolExecutor) hlThreadPool).shutdown(); + } + } + public NamedList doHighlighting( DocList docs, Query query, SolrQueryRequest req, Map respHeader) throws IOException { @@ -75,7 +115,8 @@ public NamedList doHighlighting( OcrHighlighter ocrHighlighter = new OcrHighlighter(req.getSearcher(), req.getSchema().getIndexAnalyzer(), req); OcrHighlightResult[] ocrSnippets = - ocrHighlighter.highlightOcrFields(ocrFieldNames, query, docIDs, maxPassagesOcr, respHeader); + ocrHighlighter.highlightOcrFields( + ocrFieldNames, query, docIDs, maxPassagesOcr, respHeader, hlThreadPool); 
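+    // Note: highlightOcrFields spreads the per-document highlighting work across the shared
+    // hlThreadPool (sized via the numHighlightingThreads/maxQueuedPerThread attributes); if the
+    // pool's queue is full, the overflow work runs synchronously on this request thread.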
// Assemble output data SimpleOrderedMap out = new SimpleOrderedMap<>(); diff --git a/src/main/java/com/github/dbmdz/solrocr/util/PageCacheWarmer.java b/src/main/java/com/github/dbmdz/solrocr/util/PageCacheWarmer.java deleted file mode 100644 index dfdb71c2..00000000 --- a/src/main/java/com/github/dbmdz/solrocr/util/PageCacheWarmer.java +++ /dev/null @@ -1,145 +0,0 @@ -package com.github.dbmdz.solrocr.util; - -import com.github.dbmdz.solrocr.model.SourcePointer; -import com.github.dbmdz.solrocr.model.SourcePointer.FileSource; -import com.github.dbmdz.solrocr.model.SourcePointer.Region; -import com.google.common.collect.ImmutableList; -import com.google.common.util.concurrent.ThreadFactoryBuilder; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.SeekableByteChannel; -import java.nio.file.Files; -import java.nio.file.StandardOpenOption; -import java.util.List; -import java.util.Optional; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.LinkedBlockingDeque; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; - -/** - * Utility to concurrently "warm" the OS page cache with files that will be used for highlighting. - * - *
<p>
Should significantly reduce the I/O latency during the sequential highlighting process, - * especially when using a network storage layer or a RAID system. - * - *
<p>
The idea is that a lot of storage layers can benefit from parallel I/O. Unfortunately, snippet - * generation with the current UHighlighter approach is strongly sequential, which means we give - * away a lot of potential performance, since we're limited by the I/O latency of the underlying - * storage layer. By pre-reading the data we might need in a concurrent way, we pre-populate the - * operating system's page cache, so any I/O performed by the snippet generation process further - * down the line should only hit the page cache and not incur as much of a latency hit. - * - *
<p>
The class also provides a way to cancel the pre-loading of a given source pointer. This is - * called at the beginning of the snippet generation process, since at that point any background I/O - * on the target files will only add to the latency we might experience anyway. - */ -public class PageCacheWarmer { - private static final int MAX_PENDING_JOBS = 128; - - // The singleton instance of the cache warmer - private static PageCacheWarmer instance; - - // This is the read buffer for every worker thread, so we only do as many allocations as necessary - private final ThreadLocal localBuf; - - // Set of pending preload operations for file sources, used to allow the cancelling of preloading - // tasks - private final Set pendingPreloads = ConcurrentHashMap.newKeySet(MAX_PENDING_JOBS); - - private final ExecutorService service; - - /** - * Enable the page cache warmer. - * - * @param readBufSize Size of blocks to read for cache warming. Should match the block size of the - * underlying storage layer for best performance. - * @param numThreads Number of worker threads to use for cache warming. Should match the number of - * parallel I/O operations that are possible with the storage layer - */ - public static void enable(int readBufSize, int numThreads) { - if (instance == null) { - instance = new PageCacheWarmer(readBufSize, numThreads); - } - } - - public static Optional getInstance() { - return Optional.ofNullable(instance); - } - - private PageCacheWarmer(int bufSize, int numThreads) { - this.localBuf = ThreadLocal.withInitial(() -> ByteBuffer.allocate(bufSize)); - this.service = - new ThreadPoolExecutor( - numThreads, - numThreads, - 0, - TimeUnit.MILLISECONDS, - new LinkedBlockingDeque<>(MAX_PENDING_JOBS), - new ThreadFactoryBuilder() - .setNameFormat("solr-ocrhighlighting-cache-warmer-%d") - .build(), - new ThreadPoolExecutor.DiscardOldestPolicy()); - } - - /** - * Reads the file source in 32KiB chunks - * - * @param src file source - */ - private void preload(FileSource src) { - ByteBuffer buf = localBuf.get(); - try (SeekableByteChannel channel = Files.newByteChannel(src.path, StandardOpenOption.READ)) { - List regions; - if (src.regions.isEmpty()) { - regions = ImmutableList.of(new Region(0, (int) channel.size())); - } else { - regions = src.regions; - } - for (Region region : regions) { - channel.position(region.start); - int remainingSize = region.end - region.start; - while (remainingSize > 0) { - // Read and immediately clear the buffer, we don't need the data - remainingSize -= channel.read(buf); - buf.clear(); - if (Thread.interrupted() || !pendingPreloads.contains(src)) { - return; - } - } - } - } catch (IOException e) { - // NOP, this method only serves to populate the page cache, so we don't care about I/O errors. - } finally { - pendingPreloads.remove(src); - } - } - - /** Populate the OS page cache with the targets of the source pointer. */ - public void preload(SourcePointer ptr) { - if (ptr == null) { - return; - } - for (FileSource source : ptr.sources) { - if (pendingPreloads.contains(source)) { - continue; - } - pendingPreloads.add(source); - service.submit(() -> preload(source)); - } - } - - /** Cancel all running and pending preloading tasks for the given source pointer. 
*/ - public void cancelPreload(SourcePointer ptr) { - if (ptr == null) { - return; - } - ptr.sources.forEach(pendingPreloads::remove); - } - - public void shutdown() { - service.shutdownNow(); - } -} diff --git a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java index 53a43913..3a414a1e 100644 --- a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java +++ b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java @@ -63,7 +63,8 @@ public Reader create(Reader input) { pointer.sources.forEach(this::validateSource); // Regions contained in source pointers are defined by byte offsets. - // We need to convert these to Java character offsets so they can be used by the filter. + // We need to convert these to Java character offsets, so they can be used by the filter. + // This is very expensive, but we need this since all IO from here on out is character-based. toCharOffsets(pointer); Reader r; diff --git a/src/main/java/solrocr/OcrHighlightComponent.java b/src/main/java/solrocr/OcrHighlightComponent.java index c98b76cc..20968023 100644 --- a/src/main/java/solrocr/OcrHighlightComponent.java +++ b/src/main/java/solrocr/OcrHighlightComponent.java @@ -2,7 +2,6 @@ import com.github.dbmdz.solrocr.solr.OcrHighlightParams; import com.github.dbmdz.solrocr.solr.SolrOcrHighlighter; -import com.github.dbmdz.solrocr.util.PageCacheWarmer; import com.google.common.base.Strings; import java.io.IOException; import java.util.Arrays; @@ -19,7 +18,6 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; -import org.apache.solr.core.CloseHook; import org.apache.solr.core.PluginInfo; import org.apache.solr.core.SolrCore; import org.apache.solr.handler.component.ResponseBuilder; @@ -80,24 +78,14 @@ public void prepare(ResponseBuilder rb) throws IOException { @Override public void inform(SolrCore core) { - this.ocrHighlighter = new SolrOcrHighlighter(); - if ("true".equals(info.attributes.getOrDefault("enablePreload", "false"))) { - PageCacheWarmer.enable( - Integer.parseInt(info.attributes.getOrDefault("preloadReadSize", "32768")), - Integer.parseInt(info.attributes.getOrDefault("preloadConcurrency", "8"))); - } - - // Shut down the cache warming threads after closing of the core - core.addCloseHook( - new CloseHook() { - @Override - public void preClose(SolrCore core) {} - - @Override - public void postClose(SolrCore core) { - PageCacheWarmer.getInstance().ifPresent(PageCacheWarmer::shutdown); - } - }); + int numHlThreads = + Integer.parseInt( + info.attributes.getOrDefault( + "numHighlightingThreads", + String.valueOf(Runtime.getRuntime().availableProcessors()))); + int maxQueuedPerThread = + Integer.parseInt(info.attributes.getOrDefault("maxQueuedPerThread", "8")); + this.ocrHighlighter = new SolrOcrHighlighter(numHlThreads, maxQueuedPerThread); } @Override diff --git a/src/main/java/solrocr/OcrHighlighter.java b/src/main/java/solrocr/OcrHighlighter.java index 32575d52..71429be6 100644 --- a/src/main/java/solrocr/OcrHighlighter.java +++ b/src/main/java/solrocr/OcrHighlighter.java @@ -43,7 +43,6 @@ import com.github.dbmdz.solrocr.model.SourcePointer; import com.github.dbmdz.solrocr.reader.LegacyBaseCompositeReader; import com.github.dbmdz.solrocr.solr.OcrHighlightParams; -import com.github.dbmdz.solrocr.util.PageCacheWarmer; import com.github.dbmdz.solrocr.util.TimeAllowedLimit; import com.google.common.collect.ImmutableSet; import 
java.io.IOException; @@ -59,6 +58,10 @@ import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; +import java.util.concurrent.Executor; +import java.util.concurrent.RejectedExecutionException; import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -284,7 +287,8 @@ public OcrHighlightResult[] highlightOcrFields( Query query, int[] docIDs, int[] maxPassagesOcr, - Map respHeader) + Map respHeader, + Executor hlThreadPool) throws IOException { if (ocrFieldNames.length < 1) { throw new IllegalArgumentException("ocrFieldNames must not be empty"); @@ -358,6 +362,8 @@ public OcrHighlightResult[] highlightOcrFields( int[][] snippetCountsByField = new int[fields.length][docIds.length]; // Highlight in doc batches determined by loadFieldValues (consumes from docIdIter) DocIdSetIterator docIdIter = asDocIdSetIterator(docIds); + + List> hlFuts = new ArrayList<>(); docLoop: for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) { List fieldValsByDoc = loadOcrFieldValues(fields, docIdIter); @@ -394,85 +400,79 @@ public OcrHighlightResult[] highlightOcrFields( } int docInIndex = docInIndexes[docIdx]; // original input order assert resultByDocIn[docInIndex] == null; - OcrFormat ocrFormat = getFormat(content); - if (ocrFormat == null) { - continue; - } - String limitBlockParam = params.get(OcrHighlightParams.LIMIT_BLOCK, "block"); - OcrBlock[] limitBlocks = null; - if (!limitBlockParam.equalsIgnoreCase("NONE")) { - limitBlocks = - OcrBlock.getHierarchyFrom(OcrBlock.valueOf(limitBlockParam.toUpperCase(Locale.US))) - .toArray(new OcrBlock[0]); - } - OcrBlock contextBlock = - OcrBlock.valueOf( - params.get(OcrHighlightParams.CONTEXT_BLOCK, "line").toUpperCase(Locale.US)); - BreakLocator contextLocator = ocrFormat.getBreakLocator(content, contextBlock); - BreakLocator limitLocator = - limitBlocks == null ? null : ocrFormat.getBreakLocator(content, limitBlocks); - BreakLocator breakLocator = - new ContextBreakLocator( - contextLocator, limitLocator, params.getInt(OcrHighlightParams.CONTEXT_SIZE, 2)); - OcrPassageFormatter formatter = - ocrFormat.getPassageFormatter( - OcrHighlightParams.get(params, OcrHighlightParams.TAG_PRE, ""), - OcrHighlightParams.get(params, OcrHighlightParams.TAG_POST, ""), - params.getBool(OcrHighlightParams.ABSOLUTE_HIGHLIGHTS, false), - params.getBool(OcrHighlightParams.ALIGN_SPANS, false), - params.getBool(OcrHighlightParams.TRACK_PAGES, true)); + int snippetLimit = Math.max( maxPassages[fieldIdx], params.getInt(OcrHighlightParams.MAX_OCR_PASSAGES, DEFAULT_SNIPPET_LIMIT)); - boolean scorePassages = params.getBool(OcrHighlightParams.SCORE_PASSAGES, true); + + // Final aliases for lambda + final int docIdFinal = docId; + final int fieldIdxFinal = fieldIdx; + final IterableCharSequence contentFinal = content; + Runnable hlFn = + () -> { + try { + highlightDocField( + docIdFinal, + docInIndex, + fieldIdxFinal, + contentFinal, + fieldHighlighter, + leafReader, + snippetLimit, + resultByDocIn, + snippetCountsByField); + } catch (ExitingIterCharSeq.ExitingIterCharSeqException + | ExitableDirectoryReader.ExitingReaderException e) { + resultByDocIn[docInIndex] = null; + throw e; + } catch (IOException | RuntimeException e) { + // This catch-all prevents OCR highlighting from failing the complete query, + // instead users get an error message in their Solr log. 
+ if (contentFinal.getPointer() != null) { + log.error( + "Could not highlight OCR content for document {} at '{}'", + docIdFinal, + contentFinal.getPointer(), + e); + } else { + log.error( + "Could not highlight OCR for document {} with OCR markup '{}...'", + docIdFinal, + contentFinal.subSequence(0, 256), + e); + } + } finally { + if (contentFinal instanceof AutoCloseable) { + try { + ((AutoCloseable) contentFinal).close(); + } catch (Exception e) { + log.warn( + "Encountered error while closing content iterator for {}: {}", + contentFinal.getPointer(), + e.getMessage()); + } + } + } + }; try { - resultByDocIn[docInIndex] = - fieldHighlighter.highlightFieldForDoc( - leafReader, - docId, - breakLocator, - formatter, - content, - params.get(OcrHighlightParams.PAGE_ID), - snippetLimit, - scorePassages); - } catch (ExitingIterCharSeq.ExitingIterCharSeqException - | ExitableDirectoryReader.ExitingReaderException e) { - log.warn("OCR Highlighting timed out while handling " + content.getPointer(), e); - respHeader.put(PARTIAL_OCR_HIGHLIGHTS, Boolean.TRUE); - resultByDocIn[docInIndex] = null; - // Stop highlighting - break docLoop; - } catch (RuntimeException e) { - // This catch-all prevents OCR highlighting from failing the complete query, - // instead users get an error message in their Solr log. - if (content.getPointer() != null) { - log.error( - "Could not highlight OCR content for document {} at '{}'", - docId, - content.getPointer(), - e); - } else { - log.error( - "Could not highlight OCR for document {} with OCR markup '{}...'", - docId, - content.subSequence(0, 256), - e); - } - } finally { - if (content instanceof AutoCloseable) { - try { - ((AutoCloseable) content).close(); - } catch (Exception e) { - log.warn( - "Encountered error while closing content iterator for {}: {}", - content.getPointer(), - e.getMessage()); + // Speed up highlighting by parallelizing the work as much as possible + hlFuts.add(CompletableFuture.runAsync(hlFn, hlThreadPool)); + } catch (RejectedExecutionException rejected) { + // If the pool is full, run the task synchronously on the current thread + try { + hlFn.run(); + } catch (ExitingIterCharSeq.ExitingIterCharSeqException + | ExitableDirectoryReader.ExitingReaderException e) { + if (respHeader.get(PARTIAL_OCR_HIGHLIGHTS) == null) { + respHeader.put(PARTIAL_OCR_HIGHLIGHTS, Boolean.TRUE); + log.warn("OCR Highlighting timed out while handling " + content.getPointer(), e); } + resultByDocIn[docInIndex] = null; + break docLoop; } } - snippetCountsByField[fieldIdx][docInIndex] = fieldHighlighter.getNumMatches(docId); } } batchDocIdx += fieldValsByDoc.size(); @@ -480,6 +480,21 @@ public OcrHighlightResult[] highlightOcrFields( assert docIdIter.docID() == DocIdSetIterator.NO_MORE_DOCS || docIdIter.nextDoc() == DocIdSetIterator.NO_MORE_DOCS; + if (!hlFuts.isEmpty()) { + CompletableFuture[] futArray = hlFuts.toArray(new CompletableFuture[0]); + CompletableFuture allFut = CompletableFuture.allOf(futArray); + try { + allFut.join(); + } catch (CompletionException e) { + if (e.getCause() instanceof ExitingIterCharSeq.ExitingIterCharSeqException + || e.getCause() instanceof ExitableDirectoryReader.ExitingReaderException) { + respHeader.put(PARTIAL_OCR_HIGHLIGHTS, Boolean.TRUE); + } else { + log.error("Error while highlighting OCR content", e); + } + } + } + OcrHighlightResult[] out = new OcrHighlightResult[docIds.length]; for (int d = 0; d < docIds.length; d++) { OcrHighlightResult hl = new OcrHighlightResult(); @@ -498,6 +513,64 @@ public OcrHighlightResult[] 
highlightOcrFields( return out; } + private void highlightDocField( + int docId, + int docInIndex, + int fieldIdx, + IterableCharSequence content, + OcrFieldHighlighter fieldHighlighter, + LeafReader leafReader, + int snippetLimit, + OcrSnippet[][] resultByDocIn, + int[][] snippetCountsByField) + throws IOException { + if (content == null) { + return; + } + OcrFormat ocrFormat = getFormat(content); + if (ocrFormat == null) { + return; + } + + String limitBlockParam = params.get(OcrHighlightParams.LIMIT_BLOCK, "block"); + OcrBlock[] limitBlocks = null; + if (!limitBlockParam.equalsIgnoreCase("NONE")) { + limitBlocks = + OcrBlock.getHierarchyFrom(OcrBlock.valueOf(limitBlockParam.toUpperCase(Locale.US))) + .toArray(new OcrBlock[0]); + } + OcrBlock contextBlock = + OcrBlock.valueOf( + params.get(OcrHighlightParams.CONTEXT_BLOCK, "line").toUpperCase(Locale.US)); + + BreakLocator contextLocator = ocrFormat.getBreakLocator(content, contextBlock); + BreakLocator limitLocator = + limitBlocks == null ? null : ocrFormat.getBreakLocator(content, limitBlocks); + BreakLocator breakLocator = + new ContextBreakLocator( + contextLocator, limitLocator, params.getInt(OcrHighlightParams.CONTEXT_SIZE, 2)); + OcrPassageFormatter formatter = + ocrFormat.getPassageFormatter( + OcrHighlightParams.get(params, OcrHighlightParams.TAG_PRE, ""), + OcrHighlightParams.get(params, OcrHighlightParams.TAG_POST, ""), + params.getBool(OcrHighlightParams.ABSOLUTE_HIGHLIGHTS, false), + params.getBool(OcrHighlightParams.ALIGN_SPANS, false), + params.getBool(OcrHighlightParams.TRACK_PAGES, true)); + boolean scorePassages = params.getBool(OcrHighlightParams.SCORE_PASSAGES, true); + + resultByDocIn[docInIndex] = + fieldHighlighter.highlightFieldForDoc( + leafReader, + docId, + breakLocator, + formatter, + content, + params.get(OcrHighlightParams.PAGE_ID), + snippetLimit, + scorePassages); + snippetCountsByField[fieldIdx][docInIndex] = fieldHighlighter.getNumMatches(docId); + } + @Override protected List loadFieldValues( String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException { @@ -542,9 +615,6 @@ protected List loadOcrFieldValues( ocrVals[fieldIdx] = null; continue; } - // If preloading is enabled, start warming the cache for the pointer - final SourcePointer finalPtr = sourcePointer; - PageCacheWarmer.getInstance().ifPresent(w -> w.preload(finalPtr)); if (sourcePointer.sources.size() == 1) { ocrVals[fieldIdx] = new FileBytesCharIterator( @@ -822,6 +892,7 @@ public long cost() { * on the original code. 
*/ private static class TermVectorReusingLeafReader extends FilterLeafReader { + static IndexReader wrap(IndexReader reader) throws IOException { LeafReader[] leafReaders = reader.leaves().stream() diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/AltoMultiTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/AltoMultiTest.java index 6901dbf3..71ddab06 100644 --- a/src/test/java/com/github/dbmdz/solrocr/solr/AltoMultiTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/solr/AltoMultiTest.java @@ -1,5 +1,6 @@ package com.github.dbmdz.solrocr.solr; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import com.google.common.collect.ImmutableMap; import java.nio.file.Files; import java.nio.file.Path; @@ -8,12 +9,21 @@ import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.request.SolrQueryRequest; import org.junit.BeforeClass; import org.junit.Test; +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + HlThreadsFilter.class + }) public class AltoMultiTest extends SolrTestCaseJ4 { @BeforeClass public static void beforeClass() throws Exception { diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/AltoTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/AltoTest.java index 7bb7e1ac..779ec8a5 100644 --- a/src/test/java/com/github/dbmdz/solrocr/solr/AltoTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/solr/AltoTest.java @@ -1,5 +1,6 @@ package com.github.dbmdz.solrocr.solr; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import com.google.common.collect.ImmutableMap; import java.nio.file.Path; import java.nio.file.Paths; @@ -8,12 +9,21 @@ import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.request.SolrQueryRequest; import org.junit.BeforeClass; import org.junit.Test; +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + HlThreadsFilter.class + }) public class AltoTest extends SolrTestCaseJ4 { @BeforeClass public static void beforeClass() throws Exception { diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java index 9d4e2f35..a5be6c5f 100644 --- a/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/solr/DistributedTest.java @@ -1,15 +1,25 @@ package com.github.dbmdz.solrocr.solr; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import java.nio.file.Path; import java.nio.file.Paths; import java.util.List; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; import org.apache.solr.BaseDistributedSearchTestCase; +import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.util.NamedList; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +@ThreadLeakFilters( + 
defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + HlThreadsFilter.class + }) public class DistributedTest extends BaseDistributedSearchTestCase { @Override diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/HlThreadsFilter.java b/src/test/java/com/github/dbmdz/solrocr/solr/HlThreadsFilter.java new file mode 100644 index 00000000..ffe2a857 --- /dev/null +++ b/src/test/java/com/github/dbmdz/solrocr/solr/HlThreadsFilter.java @@ -0,0 +1,11 @@ +package com.github.dbmdz.solrocr.solr; + +import com.carrotsearch.randomizedtesting.ThreadFilter; + +public class HlThreadsFilter implements ThreadFilter { + + @Override + public boolean reject(Thread thread) { + return thread.getName().startsWith("OcrHighlighter-"); + } +} diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java index 45feb707..767e1b07 100644 --- a/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/solr/HocrTest.java @@ -1,5 +1,6 @@ package com.github.dbmdz.solrocr.solr; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -11,12 +12,21 @@ import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.request.SolrQueryRequest; import org.junit.BeforeClass; import org.junit.Test; +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + HlThreadsFilter.class + }) public class HocrTest extends SolrTestCaseJ4 { @BeforeClass public static void beforeClass() throws Exception { diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/MiniOcrTest.java b/src/test/java/com/github/dbmdz/solrocr/solr/MiniOcrTest.java index 497c4541..dfdfd0e0 100644 --- a/src/test/java/com/github/dbmdz/solrocr/solr/MiniOcrTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/solr/MiniOcrTest.java @@ -1,5 +1,6 @@ package com.github.dbmdz.solrocr.solr; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import com.google.common.collect.ImmutableMap; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -8,12 +9,21 @@ import java.util.HashMap; import java.util.Map; import java.util.stream.Stream; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.request.SolrQueryRequest; import org.junit.BeforeClass; import org.junit.Test; +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + HlThreadsFilter.class + }) public class MiniOcrTest extends SolrTestCaseJ4 { @BeforeClass public static void beforeClass() throws Exception {