From d6c29d7766a1aa4860f8b65000e19ad407c73937 Mon Sep 17 00:00:00 2001 From: Johannes Baiter Date: Tue, 31 May 2022 16:41:12 +0200 Subject: [PATCH] Update docs, use material theme - Updated CI config to publish documentation to gh-pages - Updated namespaces in examples to reflect recent change - Updated changelog to always have a link to the GitHub release page - Added references to the new package management repository - mkdocs-material allows us to highlight individual lines in code blocks, which should help with the readability of the configuration snippets. --- .github/workflows/ci.yml | 47 +++++++++++++++++++ docs/alternatives.md | 2 +- docs/changes.md | 69 ++++++++++++++++++++++++++++ docs/example.md | 23 +++++----- docs/formats.md | 2 +- docs/index.md | 12 ++--- docs/indexing.md | 12 ++--- docs/installation.md | 97 ++++++++++++++++++++++++++++------------ docs/performance.md | 11 +++-- mkdocs.yml | 42 ++++++++++------- 10 files changed, 246 insertions(+), 71 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a2c07dcc..01f692b5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,3 +39,50 @@ jobs: run: chmod -R a+rw ./target - name: Run integration tests run: ./integration-tests/run.sh + + publish_mkdocs_latest: + if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + needs: build + env: + GIT_COMMITTER_NAME: mkdocs-mike + GIT_COMMITTER_EMAIL: mkdocs-mike@nowhere.tld + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: '3.10' + cache: 'pip' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install mkdocs singledispatch mkdocs-material mike + - name: Deploy documentation with mike + run: mike deploy ${{github.ref_name}} latest + - name: Push gh-pages branch + run: git push origin gh-pages:gh-pages + + + publish_mkdocs_wip: + if: github.event_name == 'push' && 
contains(github.ref, 'main') + runs-on: ubuntu-latest + needs: build + env: + GIT_COMMITTER_NAME: mkdocs-mike + GIT_COMMITTER_EMAIL: mkdocs-mike@nowhere.tld + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: '3.10' + cache: 'pip' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install mkdocs singledispatch mkdocs-material mike + - name: Deploy documentation with mike + run: mike deploy wip + - name: Push gh-pages branch + run: git push origin gh-pages:gh-pages diff --git a/docs/alternatives.md b/docs/alternatives.md index c44ebb4f..280f7f9b 100644 --- a/docs/alternatives.md +++ b/docs/alternatives.md @@ -43,7 +43,7 @@ chain. This component must to be placed **after the tokenizer**: A full field definition for an OCR field with alternative expansion could look like this: -```xml +```xml hl_lines="3 4 5 6 7 9" diff --git a/docs/changes.md b/docs/changes.md index 4aba2500..44e3cec0 100644 --- a/docs/changes.md +++ b/docs/changes.md @@ -1,16 +1,67 @@ +## 0.8.0 (2022-05-??) +The major improvement in this version is compatibility with Solr 9. + +Due to a number of API changes in Solr and Lucene, we now have to ship two separate releases, +one for Solr 7 and 8 and one for Solr 9, so please take extra care when downloading to pick +the correct release. In the Package Repository, the Solr 7/8 release will always have a version +with the suffix `-solr78`. + +We also **changed the package namespaces** for all user-facing components so they are easier +to identify and write. What this means is that you will need to change the `class="..."` +attributes in your `solrconfig.xml` and `schema.xml` to match the new package namespaces. +Whenever you previously had `de.digitalcollections.solrocr.<subpackage>.ClassName`, you +now have to simply write `solrocr.ClassName`. 
+ +**New Features:** + +- For users running Solr in the Solrcloud mode, the plugin can now be installed via Solr's + [Package Manager](https://solr.apache.org/guide/solr/latest/configuration-guide/package-manager.html): + ``` + $ bin/solr package add-repo dbmdz.github.io https://dbmdz.github.io/solr + $ bin/solr package install ocrhighlighting # For Solr 9 + $ bin/solr package install ocrhighlighting:0.8.0-solr78 # For Solr 7 and 8 + ``` + Note that Solr 7/8 users need to manually specify the version. + +**API changes:** + +- Changed deployment process to use two separate packages, one for Solr 9 and later and one for Solr 7/8, with a `-solr78.jar` suffix +- Changed namespace of all user-facing components to simply `solrocr` and moved all + user-facing component classes to it: + * `de.digitalcollections.solrocr.lucene.filters.OcrCharFilterFactory`
+ → `solrocr.OcrCharFilterFactory` + * `de.digitalcollections.solrocr.lucene.filters.ExternalUtf8ContentFilterFactory`
+ → `solrocr.ExternalUtf8ContentFilterFactory` + * `de.digitalcollections.solrocr.lucene.OcrAlternativesFilterFactory`
+ → `solrocr.OcrAlternativesFilterFactory` + * `de.digitalcollections.solrocr.lucene.OcrHighlightComponent`
+ → `solrocr.OcrHighlightComponent` + +**Bugfixes** + +- Fix handling of quoted property values in hOCR title tags. We deviate a bit from the spec + to be more compatible with existing real-world data: Values like `x_source` can now either + be quoted in single- or double-quotes, or not at all, the parser will handle every case. + ## 0.7.2 (2022-03-22) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.7.2) + And yet another bugfix release. **Bugfixes:** + - Fixed using single-quotes in MiniOCR input, previously these files were not recognized as valid MiniOCR files ([#247](https://github.com/dbmdz/solr-ocrhighlighting/pull/247), thanks @mspalti for the fix!) - Fixed `OutOfBoundsException` when using alternatives with very long tokens ( [#230](https://github.com/dbmdz/solr-ocrhighlighting/pull/230), thanks @fd17 for the report and review) ## 0.7.1 (2021-09-24) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.7.1) + Another bugfix release, upgrading is recommended. **Bugfixes:** + - Fix text display and "number of snippets" slider in demo setup - Fix instances where we were using Java SDK methods that relied on a default locale, which led to hard-to-debug issues in some locales @@ -20,6 +71,8 @@ Another bugfix release, upgrading is recommended. - Fix issue with namespaced ALTO documents ## 0.7.0 (2021-07-12) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.7.0) + This is a bugfix release, especially users with ALTO files are encouraged to upgrade. Other than bugfixes, this is the first release to support Solr 8.9. @@ -30,6 +83,8 @@ bugfixes, this is the first release to support Solr 8.9. - Fix issue when an hOCR file had empty OCR boxes ## 0.6.0 (2021-05-11) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.6.0) + This is a major new release with significant improvements in stability, accuracy and most importantly performance. 
Updating is **highly** recommended, especially for ALTO users, who can expect a speed-up in indexing of up to **6000% (i.e. 60x as fast)**. We also recommend updating your JVM to at least Java 11 (LTS), since Java 9 introduced @@ -77,6 +132,7 @@ significantly. **API changes:** + - **No more need for an explicit `hl.fl` parameter for highlighting non-OCR fields.** By default, if highlighting is enabled and no `hl.fl` parameter is passed by the user, Solr falls back to highlighting every stored field in the document. Previously this did not work with the plugin and @@ -93,6 +149,7 @@ significantly. See the above section unter *New Features* for an explanation of this flag. **Bugfixes:** + - **Improved tolerance for incomplete bounding boxes.** Previously the occurrence of an incomplete bounding box in a snippet (i.e. with one or more missing coordinates) would crash the whole query. We now simply insert a `0` default value in these cases. @@ -103,9 +160,12 @@ significantly. the OCR parsers would try to either load a file from the empty string or parse OCR markup from it. ## 0.5.0 (2020-10-07) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.5.0) + No breaking changes this time around, but a few essential bugfixes, more stability and a new feature. **API changes:** + - **Snippets are now sorted by their descending score/relevancy.** Previously the order was non-deterministic, which broke the use case for dynamically fetching more snippets. - **Add a new boolean `hl.ocr.alignSpans` parameter to align text and image spans.** This new option (disabled by @@ -113,6 +173,7 @@ No breaking changes this time around, but a few essential bugfixes, more stabili to correspond to actual OCR word boundaries. 
**Bugfixes:** + - **Fix regular highlighting in distributed setup.** Regular, non-OCR highlighting was broken in previous versions due to a bad check in the shard response collection phase if users only requested regular highlighting, but not for OCR fields @@ -125,9 +186,12 @@ No breaking changes this time around, but a few essential bugfixes, more stabili ## 0.4.1 (2020-06-02) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.4.1) + This is a patch release with a fix for excessive memory usage during indexing. ## 0.4.0 (2020-05-11) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.4.0) This is a major release with a focus on compatibility and performance. @@ -136,6 +200,7 @@ This is a major release with a focus on compatibility and performance. the future. **Breaking API changes:** + - **Add new `pages` key to snippet response with page dimensions**. This can be helpful if you need to calculate the snippet coordinates relative to the page image dimensions. - **Replace `page` key on regions and highlights with `pageIdx`**. That is, instead of a string with the @@ -146,11 +211,13 @@ This is a major release with a focus on compatibility and performance. disjunct parts of the page or even multiple pages. **Format changes:** + - hocr: Add support for retrieving page identifier from `x_source` an `ppageno` properties - hocr: Strip out title tag during indexing and highlighting - ALTO: The plugin now supports ALTO files with coordinates expressed as floating point numbers (thanks to @mspalti!) **Performance:** + - Add concurrent preloading for highlighting target files. This can result in a nice performance boost, since by the time the plugin gets to actually highlighting the files, their contents are already in the OS' page cache. 
See the [Performance Tuning section in the docs](https://dbmdz.github.io/solr-ocrhighlighting/performance/) for more @@ -159,12 +226,14 @@ This is a major release with a focus on compatibility and performance. compared to previous versions. **Miscellaneous:** + - Log warnings during source pointer parsing - Filter out empty files during indexing - Add new documentation section on performance tuning - Empty regions or regions with only whitespace are no longer included in the output ## 0.3.1 (2019-07-26) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.3.1) This is patch release that fixes compatibility with Solr/Lucene 8.2. diff --git a/docs/example.md b/docs/example.md index b3bb00cd..ca20a6d0 100644 --- a/docs/example.md +++ b/docs/example.md @@ -44,17 +44,17 @@ To run the example setup yourself, you will need: ## Solr Configuration Walkthrough [`solrconfig.xml`](https://github.com/dbmdz/solr-ocrhighlighting/blob/master/example/solr/cores/ocr/conf/solrconfig.xml) -```xml +```xml hl_lines="7 10 11 21" - 7.6 - - + 9.0 - + - @@ -73,11 +73,14 @@ To run the example setup yourself, you will need: ``` [`schema.xml`](https://github.com/dbmdz/solr-ocrhighlighting/blob/master/example/solr/cores/ocr/conf/schema.xml) -```xml - +```xml hl_lines="4 5 6 7" + - - + + diff --git a/docs/formats.md b/docs/formats.md index 3929b7f2..22310956 100644 --- a/docs/formats.md +++ b/docs/formats.md @@ -56,7 +56,7 @@ You should use this format when: - you want to store the OCR in the index (to keep the index size as low) - reusing the existing OCR files is not possible or practical (to keep occupied disk space low) -- you want the best possible performance, highlighting MiniOCR is ~25% faster than ALTO and ~50% faster than hOCR +- you want the best possible performance, highlighting MiniOCR is ~25% faster than ALTO and ~50% faster than hOCR (in an artificial benchmark that is purely CPU-bound) A basic example looks like this: diff --git a/docs/index.md 
b/docs/index.md index 8d568adf..0eb58a60 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,4 +1,4 @@ -# Solr OCR Highlighting +# Solr OCR Highlighting Plugin ![Highlighted OCR snippet](img/snippet.png) @@ -21,8 +21,8 @@ and its position on the page at query time: { "ulx": 196, "uly": 1703, "lrx": 1232, "lry": 1968, "pageIdx": 0 } ], "highlights":[ - [{ "text": "Mason and Jeremiah", "ulx": 675, "uly": 110, "lrx": 1036, "lry": 145, - "parentRegionIdx": 0}, + [{ "text": "Mason and Jeremiah", "ulx": 675, "uly": 110, "lrx": 1036, + "lry": 145, "parentRegionIdx": 0}, { "text": "Dixon,", "ulx": 1, "uly": 167, "lrx": 119, "lry": 204, "parentRegionIdx": 0 }] ] @@ -41,7 +41,7 @@ does not interfere with Solr's standard highlighting component, i.e. it works transparently with non-OCR fields and just lets the default implementation handle those. -The plugin **works with all Solr versions >= 7.x**. +The plugin **works with all Solr versions >= 7.x up to 9.0**. ## Features - Index various [OCR formats](formats.md) directly without any pre-processing @@ -49,7 +49,7 @@ The plugin **works with all Solr versions >= 7.x**. 
* [ALTO](formats.md#alto) * [MiniOCR](formats.md#miniocr) - Retrieve all the information needed to render a highlighted snippet view - directly from Solr, without post-processing + directly from Solr, without post-processing on the client-side - Keep your index size manageable by optionally re-using OCR documents on disk for highlighting @@ -62,5 +62,5 @@ If you want to see the **plugin in action**, you can play around with the [example setup](example.md) hosted at [https://ocrhl.jbaiter.de](https://ocrhl.jbaiter.de) Should you want to **run the example on your own computer** and play around with the -settings, the [Docker-based setup is available on GitHub](https://github.com/dbmdz/solr-ocrhighlighting/tree/master/example) +settings, the [Docker-based setup is available on GitHub](https://github.com/dbmdz/solr-ocrhighlighting/tree/main/example) and instructions for using it are in the [Example Setup chapter](example.md) diff --git a/docs/indexing.md b/docs/indexing.md index 15178000..edfa4a5b 100644 --- a/docs/indexing.md +++ b/docs/indexing.md @@ -1,10 +1,10 @@ # Indexing OCR documents -**If you want to store the OCR in the index itself** you can skip this section: Just put the OCR -content in the field and submit it to Solr for indexing. We recommend using the space-efficient -[MiniOCR format](./formats.md#miniocr) if you decide to go this way. +!!! note "If you want to store the OCR in the index itself you can all but _skip this section_" + Just put the OCR content in the field and submit it to Solr for indexing. We recommend using the space-efficient + [MiniOCR format](./formats.md#miniocr) if you decide to go this way. -Indexing OCR documents without storing the actual content in the index is also relatively simple: +Indexing OCR documents without storing the actual content in the index is relatively simple: When building the index document, instead of putting the actual OCR content into the field, you use a **source pointer**. 
This pointer will tell the plugin from which location to load the OCR content during indexing and highlighting. @@ -26,6 +26,8 @@ the (again, potentially very large) contents themselves in the index. account. To signal to the plugin that a given source path is encoded in ASCII, include the `{ascii}` string after the path, e.g. `/mnt/data/ocrdoc.xml{ascii}[31337:41337]`. + For even more advice on performance tuning, refer to the [corresponding documentation section](./performance.md). + The structure of the source pointers depends on how your actual OCR files on disk map to documents in the Solr index. @@ -111,7 +113,7 @@ The format of the regions is inspired by [Python's slicing syntax](https://docs. - `start:end` → Everything between the byte offsets `start` (inclusive) and `end` (exclusive) - `:end` → Everything from the start of the file to byte offset `end` (exclusive) -!!! caution "Region Requirements"" +!!! caution "Region Requirements" - The concatenated content of your regions must be a half-way valid XML structure. While we tolerate *unclosed tags or unmatched closing tags* (they often can't be avoided), other errors such as partial tags (i.e. a missing `<` or `>`) will lead to an error during indexing. diff --git a/docs/installation.md b/docs/installation.md index f80e7271..402e128e 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,55 +1,83 @@ ## Requirements - Some familiarity with configuring Solr -- Solr >= 7.5 +- Solr ≥ 7.5 - OCR documents need to be in [hOCR](formats.md#hocr), [ALTO](formats.md#alto) or [MiniOCR](formats.md#miniocr) formats, with at least page-, and word-level segmentation -## Obtaining the plugin JAR +## Manually installing the plugin JAR +To use the latest release version, refer to the [GitHub Releases list](https://github.com/dbmdz/solr-ocrhighlighting/releases). From there, download the corresponding JAR file. 
+To make the plugin available to Solr, create a new directory `$SOLR_HOME/contrib/ocrhighlighting/lib` and place the JAR you just downloaded there. -To use the latest release version, refer to the [GitHub Releases list](https://github.com/dbmdz/solr-ocrhighlighting/releases). From there, download the JAR file for the latest version. -To make the plugin available to Solr, create a new directory `$SOLR_HOME/contrib/ocrsearch/lib` and place the JAR you just downloaded there. +## For Solrcloud users: Installation as a Solr Package +Since version 8.4, Solrcloud ships with a package management subsystem that can be used +to conveniently install plugins from the command-line. To install the OCR highlighting +plugin in this way, follow these steps on one of the nodes in +your Solrcloud cluster. All paths are relative to the Solr installation directory: +- **Add repository** to the local package registry:
+ `$ ./bin/solr package add-repo dbmdz.github.io https://dbmdz.github.io/solr` +- **Install package** in the latest version:
+ `$ ./bin/solr package install ocrhighlighting` if you're on Solr 9, otherwise: + `$ ./bin/solr package install ocrhighlighting:0.8.0-solr78` + +!!! caution "Be sure to use the `ocrhighlighting:` prefix when specifying classes in your configuration." + When using the Package Manager, classes from plugins have to be prefixed (separated by a colon) by + their plugin's identifier; for this plugin, the identifier is `ocrhighlighting`. So whenever + you see an attribute like `class="solrocr.SomeClass"`, you have to write + `class="ocrhighlighting:solrocr.SomeClass"` in your config instead. # Core Configuration To enable the use of the plugin for your Solr core, you will have to edit both the `solrconfig.xml` and the `schema.xml` file in your core's `conf` directory. +Additionally, if you have installed the plugin via Solr's Package Management, you will +have to *deploy* the plugin to your collection/core using Solr's CLI: + +```bash +$ bin/solr package deploy ocrhighlighting -collections <collection-name> +``` + ## SolrConfig -In your core's `solrconfig.xml, you need to: +In your core's `solrconfig.xml`, you need to: -1. Instruct the core to load the OCR highlighting plugin, so it can find the classes - needed to perform OCR indexing and highlighting. +1. Enable the plugin for your collection/core by telling it where to + load the plugin classes from (**Skip when using Solrcloud with Package Manager**) 2. Define a search component that will perform the OCR highlighting at query time 3. Add the search component to your request handlers that will trigger the highlighting. 
-```xml +```xml hl_lines="10 16 17 18 33" - - + - - + + @@ -64,6 +92,7 @@ In your core's `solrconfig.xml, you need to: If you run into problems, a look into these sections of the Solr user's guide might be helpful: - [Resource and Plugin Loading](https://lucene.apache.org/solr/guide/8_1/resource-and-plugin-loading.html) +- [Package Manager](https://solr.apache.org/guide/8_11/package-manager.html) - [RequestHandlers and SearchComponents in SolrConfig](https://lucene.apache.org/solr/guide/8_1/requesthandlers-and-searchcomponents-in-solrconfig.html) @@ -74,31 +103,42 @@ In the core's `schema.xml`, you need to: 1. Define a new field type that will hold your indexed OCR text 2. Define which fields are going to hold the indexed OCR text. -The **field type** for OCR text is usually identical to your regular text field, with the +The **field type** for OCR text is very similar to your regular text field, with the difference that there are one or two extra *character filters* at the beginning of your *index analysis chain*: + - `ExternalUtf8ContentFilterFactory` will (optionally) allow you to index and highlight OCR from external sources on the file system. More on this in the [Indexing chapter](./indexing.md). + - `OcrCharFilterFactory` will retrieve the raw OCR data and extract the plain text that is going to pass through the rest of the analysis chain. It will auto-detect the used OCR formats, which means that **you can use different OCR formats alongside each other**. After this filter, Solr will treat the field just like a regular text field for purposes of analysis. -```xml +Additionally, you need to enable the `storeOffsetsWithPositions` option. The plugin uses these +offsets to locate the matching terms in the OCR documents. 
+ +```xml hl_lines="6 11 12 14 15 29" - + + - + - + - + @@ -107,7 +147,8 @@ difference that there are one or two extra *character filters* at the beginning - + ``` diff --git a/docs/performance.md b/docs/performance.md index 0c1aa386..dc58503d 100644 --- a/docs/performance.md +++ b/docs/performance.md @@ -30,8 +30,8 @@ Important factors include: - *Number of possible parallel reads* (see below): Does the storage layer support more than one active reader? Generally speaking, local storage is better than remote storage (like NFS or CIFS), due to the network latency, and -flash-based storage is better than disk-based storage, due to the lower random read latency. A RAID setup is -preferred over a JBOD setup, due to the potential for parallel reads. +flash-based storage is better than disk-based storage, due to the lower random read latency and the possibility to +do parallel reads. A RAID1/10 setup is preferred over a RAID0/JBOD setup, due to the increased potential for parallel reads. ## Plugin configuration The plugin offers the possibility to perform a **concurrent read-ahead of highlighting target files**. 
This will perform @@ -56,8 +56,11 @@ of reads from either the `qtp...` or `solr-ocrhlighight` threads on the second Example configuration tuned for remote NFS storage mounted with `rsize=65536`: ```xml - + ``` diff --git a/mkdocs.yml b/mkdocs.yml index ba90b805..00707f49 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,19 +1,29 @@ site_name: Solr OCR Highlighting Plugin -theme: readthedocs +#theme: readthedocs +theme: material nav: - - Introduction: index.md - - Installation: installation.md - - Indexing: indexing.md - - Indexing Alternative Terms: alternatives.md - - Querying: query.md - - Example Setup: example.md - - Performance Tuning: performance.md - - Supported Formats: formats.md - - Change Log: changes.md + - Introduction: index.md + - Installation: installation.md + - Indexing: indexing.md + - Indexing Alternative Terms: alternatives.md + - Querying: query.md + - Example Setup: example.md + - Performance Tuning: performance.md + - Supported Formats: formats.md + - Change Log: changes.md markdown_extensions: - - def_list - - attr_list - - fenced_code - - admonition - - smarty -repo_url: https://github.com/dbmdz/solr-ocrhighlighting/ \ No newline at end of file + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences + - admonition + - pymdownx.details + - pymdownx.superfences + - def_list +repo_url: https://github.com/dbmdz/solr-ocrhighlighting/ +plugins: + - mike +extra: + version: + provider: mike \ No newline at end of file