
Commit 723eef0
[app][rfct] mjr re-organization
M3ssman committed May 24, 2024
1 parent 906c083 commit 723eef0
Showing 15 changed files with 434 additions and 482 deletions.
23 changes: 11 additions & 12 deletions README.md
@@ -7,16 +7,16 @@ Python3 Tool to report evaluation outcomes from mass digitalization workflows.

## Features

- * match automatically groundtruth (i.e. reference data) and candidates by filename
+ * match groundtruth (i.e. reference data) and candidates by filename start
  * use geometric information to evaluate only specific frame (i.e. specific column or region from large page) of
    candidates (requires ALTO or PAGE format)
- * aggregate evaluation outcome on domain range (with multiple subdomains)
+ * aggregate evaluation outcomes on domain range (with multiple subdomains) according to folder layout
  * choose from textual metrics based on characters or words plus common Information Retrieval
- * choose between accuracy / error rate and different UTF-8 Python norms
+ * choose from different UTF-8 Python norms
  * formats: ALTO, PAGE or plain text for both groundtruth and candidates
  * speedup with parallel execution
  * additional OCR util:
- * filter custom areas of single OCR files
+ * filter custom areas of single ALTO OCR files
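The "UTF-8 Python norms" feature refers to Unicode normalization forms (NFC, NFD, NFKC, NFKD). A quick illustration with Python's standard `unicodedata` module of why the choice matters for OCR comparison — the forms shown are standard Unicode forms; the exact option name digital-eval uses is not shown in this diff:

```python
import unicodedata

composed = "caf\u00e9"      # 'café' with a single precomposed é (U+00E9)
decomposed = "cafe\u0301"   # 'cafe' + combining acute accent (U+0301)

# Both render identically, yet compare unequal codepoint-wise,
# which would count as a spurious OCR error without normalization
assert composed != decomposed

# Normalizing both sides to NFC makes them compare equal
assert unicodedata.normalize("NFC", decomposed) == composed

# NFKD additionally folds compatibility characters, e.g. the 'fi' ligature
assert unicodedata.normalize("NFKD", "\ufb01") == "fi"
```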

## Installation

@@ -28,13 +28,12 @@ pip install digital-eval

### Metrics

- Calculate similarity (`acc`) or difference (`err`) ratios between single reference/groundtruth and test/candidate item.
+ #### Edit-Distance based String Similarity

- #### Edit-Distance based
-
- Character-based text string minus whitechars (`Cs`, `Characters`) or Letter-based (`Ls`, `Letters`) minus whites,
- punctuation and digits.
- Word/Token-based edit-distance of single tokens identified by whitespaces.
+ Calculate similarity for each single reference/groundtruth and test/candidate item.
+ Complete character-based text string (`Cs`, `Characters`) or Letter-based (`Ls`, `Letters`) minus whitespaces,
+ punctuation and common digits (arabic, persian).
+ Word/Token-based edit-distance of single tokens identified by Word or String elements or whitespaces, depending on the data format.
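The edit-distance similarity described above can be sketched in pure Python. This is a minimal illustration of the metric's idea, not digital-eval's actual implementation (the project depends on `rapidfuzz` for fast edit distances); the function names are illustrative:

```python
def levenshtein(a: str, b: str) -> int:
    """Classic dynamic-programming edit distance between two strings."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

def char_accuracy(reference: str, candidate: str) -> float:
    """Similarity ratio in [0, 1]; 1.0 means candidate matches reference."""
    if not reference:
        return 1.0 if not candidate else 0.0
    dist = levenshtein(reference, candidate)
    return max(0.0, 1.0 - dist / len(reference))
```

A letter-based variant (`Ls`) would first strip whitespace, punctuation and digits from both strings before computing the same ratio.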

#### Set based

@@ -141,8 +140,8 @@ digital-eval --help

Contributions, suggestions and proposals welcome!

- ## Licence
+ ## License

Under terms of the [MIT license](https://opensource.org/licenses/MIT).

- **NOTE**: This software depends on other packages that _may_ be licensed under different open source licenses.
+ **NOTE**: This software depends on packages that _may_ be licensed under different terms.
33 changes: 33 additions & 0 deletions pyproject.toml
@@ -1,3 +1,36 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
version = {attr = "digital_eval.__version__"}

[tool.setuptools.packages.find]
where = ["src"]

[project]
name = "digital-eval"
dynamic = ["version"]
description = "Evaluate Digitalization Data"
readme = "README.md"
requires-python = ">=3.8"
authors = [{name = "Universitäts- und Landesbibliothek Sachsen-Anhalt",email = "[email protected]"}]
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License"
]
dependencies = [
"rapidfuzz>3",
"nltk",
"requests",
"docker",
"numpy",
"digital-object==0.2.0",
]

[project.urls]
Homepage = "https://github.com/ulb-sachsen-anhalt/digital-eval"

[project.scripts]
digital-eval = "digital_eval.cli:start"
ocr-util = "ocr_util.cli:start"
6 changes: 0 additions & 6 deletions requirements.txt

This file was deleted.

36 changes: 0 additions & 36 deletions setup.cfg

This file was deleted.

1 change: 0 additions & 1 deletion src/digital_eval/VERSION

This file was deleted.

6 changes: 2 additions & 4 deletions src/digital_eval/__init__.py
@@ -1,6 +1,5 @@
- #
- # provided API exports
- #
+ """digital eval main API"""
+ __version__ = '1.6.0'
from .evaluation import (
Evaluator,
find_groundtruth,
@@ -18,4 +17,3 @@
MetricIRRec,
MetricIRFM,
)
