From a505a25698f16c7a66729634ce7ff1fc3aa50dac Mon Sep 17 00:00:00 2001 From: Johannes Baiter Date: Fri, 10 Jun 2022 10:52:21 +0200 Subject: [PATCH] Release 0.8.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a bugfix release targeting mainly the MiniOCR and ALTO implementations. **Bugfixes:** - ALTO: Fix handling of empty words. Previously any words after a word element with no text **would be skipped entirely during indexing** 😱😱. - MiniOCR: Fix handling of empty words. Previously a word element with no text would make the parser crash. - MiniOCR: Make the `wh` attribute on `<p>` page elements actually optional. The documentation said it was optional, but the parser would crash when attempting to handle elements without the attribute. **Other Changes:** - A warning will now be logged if none of the fields requested with `hl.ocr.fl` exist or are defined as stored fields. Previously highlighting would just not work, with no indications to users as to why this was the case. --- docs/changes.md | 25 +++++++++++++++++++ docs/installation.md | 2 +- integration-tests/run.sh | 2 +- pom.xml | 2 +- .../formats/miniocr/MiniOcrFormat.java | 4 +-- .../dbmdz/solrocr/solr/MiniOcrTest.java | 3 +-- 6 files changed, 30 insertions(+), 8 deletions(-) diff --git a/docs/changes.md b/docs/changes.md index aeb55650..538beb5e 100644 --- a/docs/changes.md +++ b/docs/changes.md @@ -1,4 +1,29 @@ +## 0.8.1 (2022-06-10) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.8.1) + +This is a bugfix release targeting mainly the MiniOCR and ALTO +implementations. + +**Bugfixes:** + +- ALTO: Fix handling of empty words. Previously any words after a word element + with no text **would be skipped entirely during indexing** 😱😱. +- MiniOCR: Fix handling of empty words. Previously a word element with no text + would make the parser crash. +- MiniOCR: Make the `wh` attribute on `<p>` page elements actually optional. + The documentation said it was optional, but the parser would crash when + attempting to handle elements without the attribute. + +**Other Changes:** + +- A warning will now be logged if none of the fields requested with `hl.ocr.fl` + exist or are defined as stored fields. Previously highlighting would just + not work, with no indications to users as to why this was the case. + + ## 0.8.0 (2022-06-01) +[GitHub Release](https://github.com/dbmdz/solr-ocrhighlighting/releases/tag/0.8.0) + The major improvement in this version is compatibility with Solr 9. Due to a number of API changes in Solr and Lucene, we now have to ship two separate releases, diff --git a/docs/installation.md b/docs/installation.md index 402e128e..76d73874 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -19,7 +19,7 @@ your Solrcloud cluster. All paths are relative to the Solr installation director `$ ./bin/solr package add-repo dbmdz.github.io https://dbmdz.github.io/solr` - **Install package** in the latest version:
`$ ./bin/solr package install ocrhighlighting` if you're on Solr 9, otherwise: - `$ ./bin/solr package install ocrhighlighting:0.8.0-solr78` + `$ ./bin/solr package install ocrhighlighting:0.8.1-solr78` !!! caution "Be sure to use the `ocrhighlighting:` prefix when specifying classes in your configuration." When using the Package Manager, classes from plugins have to be prefixed (separated by a colon) by diff --git a/integration-tests/run.sh b/integration-tests/run.sh index 13810a66..705e2652 100755 --- a/integration-tests/run.sh +++ b/integration-tests/run.sh @@ -51,7 +51,7 @@ for version in $SOLR9_VERSIONS; do -v "$plugin_dir:/build" \ -p "31337:8983" \ solr:$version \ - solr-precreate ocr /opt/core-config & > /dev/null 2>&1 & \ + solr-precreate ocr /opt/core-config > /dev/null 2>&1 & \ wait_for_solr "$container_name" if ! python3 test.py; then printf " !!!FAIL!!!\n" diff --git a/pom.xml b/pom.xml index e2cfdf7b..e41d7158 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ de.digitalcollections solr-ocrhighlighting - 0.8.1-SNAPSHOT + 0.8.1 Solr OCR Highlighting Plugin diff --git a/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java b/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java index 044e5e18..7774532d 100644 --- a/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java +++ b/src/main/java/com/github/dbmdz/solrocr/formats/miniocr/MiniOcrFormat.java @@ -54,9 +54,7 @@ public OcrPage parsePageFragment(String pageFragment) { } Dimension dims = null; if (m.group("width") != null && m.group("height") != null) { - dims = new Dimension( - Integer.parseInt(m.group("width")), - Integer.parseInt(m.group("height"))); + dims = new Dimension(Integer.parseInt(m.group("width")), Integer.parseInt(m.group("height"))); } String pageId = m.group("pageId"); return new OcrPage(pageId, dims); diff --git a/src/test/java/com/github/dbmdz/solrocr/solr/MiniOcrTest.java 
b/src/test/java/com/github/dbmdz/solrocr/solr/MiniOcrTest.java index d0448714..feb455ff 100644 --- a/src/test/java/com/github/dbmdz/solrocr/solr/MiniOcrTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/solr/MiniOcrTest.java @@ -336,8 +336,7 @@ public void testPagesWithoutDimensions() { assertQ( req, "count(//lst[@name='57371']//arr[@name='snippets']/lst)='10'", - "(//lst[@name='57371']//arr[@name='snippets']/lst)[1]/arr[@name='pages']/lst/str[@name='id']/text()='716'" - ); + "(//lst[@name='57371']//arr[@name='snippets']/lst)[1]/arr[@name='pages']/lst/str[@name='id']/text()='716'"); assertU(delI("57371")); assertU(commit()); }