From 9699e10dc1c805b0c36df9a6417acc27aa1049cc Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Fri, 17 Nov 2023 12:10:14 +0100 Subject: [PATCH 01/11] models:reference_files - add tuples for action and replacement template --- acacore/models/reference_files.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/acacore/models/reference_files.py b/acacore/models/reference_files.py index 6c08040..33b4706 100644 --- a/acacore/models/reference_files.py +++ b/acacore/models/reference_files.py @@ -1,11 +1,16 @@ """Data models for the data on saved to different .json files on the `reference_files` repo.""" from typing import Literal from typing import Optional +from typing import get_args as get_type_args from pydantic import BaseModel from pydantic import Field TActionType = Literal["convert", "extract", "replace", "manual", "rename", "ignore", "reidentify"] +TReplaceTemplate = Literal["text", "empty", "password-protected", "corrupted", "not-preservable", "not-convertable"] + +ActionTypeEnum: tuple[TActionType, ...] = get_type_args(TActionType) +ReplaceTemplateEnum: tuple[TReplaceTemplate, ...] = get_type_args(TReplaceTemplate) class CustomSignature(BaseModel): @@ -58,7 +63,7 @@ class ReplaceAction(BaseModel): if template is set to "text". """ - template: Literal["text", "empty", "password-protected", "corrupted", "not-preservable", "not-convertable"] + template: TReplaceTemplate template_text: Optional[str] = None @@ -120,7 +125,7 @@ class ReIdentifyAction(BaseModel): """ reasoning: str - onfail: Optional[Literal["convert", "extract", "replace", "manual", "rename", "ignore"]] = None + onfail: Optional[TActionType] = None class RenameAction(BaseModel): From 5afe3db45b1d2d7f1f31b88afe9c57d828071374 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Fri, 17 Nov 2023 12:10:57 +0100 Subject: [PATCH 02/11] models:reference_files - format types for better VCS --- acacore/models/reference_files.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/acacore/models/reference_files.py b/acacore/models/reference_files.py index 33b4706..6f68fec 100644 --- a/acacore/models/reference_files.py +++ b/acacore/models/reference_files.py @@ -6,8 +6,23 @@ from pydantic import BaseModel from pydantic import Field -TActionType = Literal["convert", "extract", "replace", "manual", "rename", "ignore", "reidentify"] -TReplaceTemplate = Literal["text", "empty", "password-protected", "corrupted", "not-preservable", "not-convertable"] +TActionType = Literal[ + "convert", + "extract", + "replace", + "manual", + "rename", + "ignore", + "reidentify", +] +TReplaceTemplate = Literal[ + "text", + "empty", + "password-protected", + "corrupted", + "not-preservable", + "not-convertable", +] ActionTypeEnum: tuple[TActionType, ...] = get_type_args(TActionType) ReplaceTemplateEnum: tuple[TReplaceTemplate, ...] = get_type_args(TReplaceTemplate) From 6303a02dcd45c8a6e7bfec04f9922e15d54ed162 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Fri, 17 Nov 2023 12:12:34 +0100 Subject: [PATCH 03/11] models:reference_files - add duplicate to replace template --- acacore/models/reference_files.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/acacore/models/reference_files.py b/acacore/models/reference_files.py index 6f68fec..26231c1 100644 --- a/acacore/models/reference_files.py +++ b/acacore/models/reference_files.py @@ -1,7 +1,7 @@ """Data models for the data on saved to different .json files on the `reference_files` repo.""" +from typing import get_args as get_type_args from typing import Literal from typing import Optional -from typing import get_args as get_type_args from pydantic import BaseModel from pydantic import Field @@ -20,6 +20,7 @@ "empty", "password-protected", "corrupted", + "duplicate", "not-preservable", "not-convertable", ] From f8351d0689d107bb032072c41e99a61997614272 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Fri, 17 Nov 2023 16:56:26 +0100 Subject: [PATCH 04/11] siegfried - use a literal type for match classes Classes come from Pronom. --- acacore/siegfried/siegfried.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/acacore/siegfried/siegfried.py b/acacore/siegfried/siegfried.py index b64d737..61f464e 100644 --- a/acacore/siegfried/siegfried.py +++ b/acacore/siegfried/siegfried.py @@ -21,6 +21,26 @@ _byte_match_regexp_multi = re_compile(r"^byte match at \[\[(\d+) +(\d+)]( \[\d+ +\d+])*]( \([^)]*\))?$") _extension_match = re_compile(r"^extension match (.+)$") TSignature = Literal["pronom", "loc", "tika", "freedesktop", "pronom-tika-loc", "deluxe", "archivematica"] +TSiegfriedClass = Literal[ + "aggregate", + "audio", + "database", + "dataset", + "email", + "font", + "gis", + "image (raster)", + "image (vector)", + "model", + "page description", + "presentation", + "spreadsheet", + "text (mark-up)", + "text (structured)", + "text (unstructured)", + "video", + "word processor", +] def _check_process(process: CompletedProcess) -> CompletedProcess: @@ -71,7 +91,7 @@ class SiegfriedMatch(BaseModel): format: str # noqa: A003 version: Optional[str] = None mime: str - match_class: Optional[str] = Field(None, alias="class") + match_class: Optional[list[TSiegfriedClass]] = Field(None, alias="class") basis: list[str] warning: list[str] URI: Optional[AnyUrl] = None @@ -148,6 +168,7 @@ def unknown_id(cls, data: object): "id": None if data["id"].lower().strip() == "unknown" else data["id"].strip() or None, "basis": filter(bool, map(str.strip, data["basis"].strip().split(";"))), "warning": filter(bool, map(str.strip, data["warning"].strip().split(";"))), + "class": [c for c in map(str.strip, data.get("class", "").lower().split(",")) if c], } return data From fa11e37a2a070adf2818243252a092566f983e4e Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Fri, 17 Nov 2023 17:02:29 +0100 Subject: [PATCH 05/11] tests:files - update match_class properties to fit the new format --- tests/files/files.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/files/files.json b/tests/files/files.json index 8ac6a50..1a05b3c 100644 --- a/tests/files/files.json +++ b/tests/files/files.json @@ -11,7 +11,7 @@ "format": "Portable Network Graphics", "version": "1.0", "mime": "image/png", - "match_class": "Image (Raster)", + "match_class": ["image (raster)"], "basis": ["byte match at [[0 16] [519 12]]"], "warning": ["extension mismatch"], "URI": null, @@ -29,7 +29,7 @@ "format": "JSON Data Interchange Format", "version": "", "mime": "application/json", - "match_class": "", + "match_class": [], "basis": ["extension match json"], "warning": ["match on extension only"], "URI": null, @@ -47,7 +47,7 @@ "format": "Lotus WordPro Document", "version": "96", "mime": "application/lwp", - "match_class": "Word Processor", + "match_class": ["word processor"], "basis": ["extension match lwp", "byte match at 0, 32"], "warning": [], "URI": null, @@ -65,7 +65,7 @@ "format": "MPEG 1/2 Audio Layer 3", "version": "", "mime": "audio/mpeg", - "match_class": "Audio", + "match_class": ["audio"], "basis": ["extension match mp3", "byte match at 0, 1521 (signature 5/9)"], "warning": [], "URI": null, @@ -83,7 +83,7 @@ "format": "MPEG-4 Media File", "version": "", "mime": "application/mp4", - "match_class": "Audio, Video", + "match_class": ["audio", "video"], "basis": ["extension match mp4", "byte match at [[4 8] [135072 4]]"], "warning": [], "URI": null, @@ -101,7 +101,7 @@ "format": "Acrobat PDF 1.7 - Portable Document Format", "version": "1.7", "mime": "application/pdf", - "match_class": "Page Description", + "match_class": ["page description"], "basis": ["extension match pdf", "byte match at [[0 8] [38463 5]]"], "warning": [], "URI": null, @@ -119,7 +119,7 @@ "format": "Plain Text File", "version": "", "mime": "text/plain", - "match_class": "", + "match_class": [], "basis": ["extension match txt", "text match UTF-8 Unicode"], "warning": [], "URI": null, From aa5c04a59cb2a89facec7271b6735c651c3bc418 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Mon, 20 Nov 2023 11:16:43 +0100 Subject: [PATCH 06/11] models:reference_files - rename reasoning to reason --- acacore/models/reference_files.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/acacore/models/reference_files.py b/acacore/models/reference_files.py index 26231c1..262db1e 100644 --- a/acacore/models/reference_files.py +++ b/acacore/models/reference_files.py @@ -88,11 +88,11 @@ class ManualAction(BaseModel): Class representing a manual action in a workflow. Attributes: - reasoning (str): The reasoning behind the manual action. + reason (str): The reason behind the manual action. process (str): The process for performing the manual action. """ - reasoning: str + reason: str process: str @@ -108,7 +108,7 @@ class IgnoreIfAction(BaseModel): pixel_height (Optional[int]): Height for images. size (Optional[int]): Size for all files. binary_size (Optional[int]): Size for binary files. - reason (Optional[int]): A reasoning for the specific condition. + reason (Optional[int]): A reason for the specific condition. """ pixel_total: Optional[int] = Field(None, gt=0) @@ -121,26 +121,26 @@ class IgnoreIfAction(BaseModel): class IgnoreAction(BaseModel): """ - Class representing an action to ignore a specific file based on the given reasoning. + Class representing an action to ignore a specific file based on the given reason. Attributes: - reasoning (str): The reasoning for ignoring the file. + reason (str): The reason for ignoring the file. ignore_if (list[IgnoreIfAction]): An optional list of ignore conditions. """ - reasoning: Optional[str] = None + reason: Optional[str] = None ignore_if: list[IgnoreIfAction] = Field(default_factory=list) class ReIdentifyAction(BaseModel): """ - Class representing an action to ignore a specific file based on the given reasoning. + Class representing an action to ignore a specific file based on the given reason. Attributes: - reasoning (str): The reasoning for ignoring the file. + reason (str): The reason for ignoring the file. """ - reasoning: str + reason: str onfail: Optional[TActionType] = None From df92e5fcc417d2088fc4304c475cfc2c1f42e891 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Mon, 20 Nov 2023 12:29:56 +0100 Subject: [PATCH 07/11] models:file - use match classes as identifiers for actions --- acacore/models/file.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/acacore/models/file.py b/acacore/models/file.py index 352f09a..6bca4de 100644 --- a/acacore/models/file.py +++ b/acacore/models/file.py @@ -8,6 +8,7 @@ from acacore.siegfried.siegfried import Siegfried from acacore.siegfried.siegfried import SiegfriedFile +from acacore.siegfried.siegfried import TSiegfriedClass from acacore.utils.functions import file_checksum from acacore.utils.functions import get_bof from acacore.utils.functions import get_eof @@ -101,15 +102,17 @@ def from_file( root=root, processed=processed, ) + match_classes: list[TSiegfriedClass] = [] if siegfried: - file.identify(siegfried, set_match=True) + siegfried_match = file.identify(siegfried, set_match=True).best_match() + match_classes.extend(siegfried_match.match_class if siegfried_match else []) if custom_signatures and not file.puid: file.identify_custom(custom_signatures, set_match=True) if actions: - file.get_action(actions) + file.get_action(actions, match_classes) if custom_signatures and file.action == "reidentify": custom_match = file.identify_custom(custom_signatures) @@ -120,7 +123,7 @@ def from_file( if custom_match.extension and file.suffix != custom_match.extension: file.warning.append("extension mismatch") file.warning = file.warning or None - file.get_action(actions) + file.get_action(actions, match_classes) elif file.action_data.reidentify and file.action_data.reidentify.onfail: file.action = file.action_data.reidentify.onfail else: @@ -216,8 +219,29 @@ def identify_custom( return signature - def get_action(self, actions: dict[str, Action]) -> Optional[Action]: - action: Optional[Action] = actions.get(self.puid) + def get_action( + self, + actions: dict[str, Action], + match_classes: Optional[list[TSiegfriedClass]] = None, + ) -> Optional[Action]: + action: Optional[Action] = None + + identifiers: list[str] = [ + self.puid, + *(match_classes or []), + ] + if self.suffix: + identifiers.append(f"!ext={''.join(self.get_absolute_path().suffixes)}") + if self.is_binary: + identifiers.append("!binary") + if not self.size: + identifiers.append("!empty") + + for identifier in identifiers: + action = actions.get(identifier) + if action: + break + self.action, self.action_data = action.action if action else None, action.action_data if action else None return action From 6834102bd0a474607ab4525a54c407435f9482bf Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Mon, 20 Nov 2023 12:53:14 +0100 Subject: [PATCH 08/11] tests:files - add empty file --- tests/files/empty.empty | 0 tests/files/files.json | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 tests/files/empty.empty diff --git a/tests/files/empty.empty b/tests/files/empty.empty new file mode 100644 index 0000000..e69de29 diff --git a/tests/files/files.json b/tests/files/files.json index 1a05b3c..9fb0d42 100644 --- a/tests/files/files.json +++ b/tests/files/files.json @@ -1,4 +1,22 @@ { + "empty.empty": { + "filesize": 0, + "checksum": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "binary": false, + "errors": "", + "matches": { + "ns": "pronom", + "id": null, + "format": "", + "version": "", + "mime": "", + "match_class": [], + "basis": [], + "warning": ["no match"], + "URI": null, + "permalink": null + } + }, "ico.ico": { "filesize": 531, "image_size": [16, 16], From 80cdfaa79e6d436f38ef90cc2f96628bb423d3eb Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Mon, 20 Nov 2023 12:53:39 +0100 Subject: [PATCH 09/11] tests:siegfried - update to handle files with not matches --- tests/test_siegfried.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_siegfried.py b/tests/test_siegfried.py index c56c227..ff955c0 100644 --- a/tests/test_siegfried.py +++ b/tests/test_siegfried.py @@ -65,7 +65,9 @@ def test_identify(siegfried: Siegfried, test_files: Path, test_files_data: dict[ assert result.filesize == filedata["filesize"] assert result.matches assert result.matches[0].model_dump() == filedata["matches"] - assert result.best_match().model_dump() == filedata["matches"] + assert ( + result.best_match() is None and filedata["matches"]["id"] is None + ) or result.best_match().model_dump() == filedata["matches"] def test_identify_many(siegfried: Siegfried, test_files: Path, test_files_data: dict[str, dict]): @@ -74,5 +76,6 @@ def test_identify_many(siegfried: Siegfried, test_files: Path, test_files_data: assert result.filename == str(test_files / filename) assert result.filesize == filedata["filesize"] assert result.matches - assert result.matches[0].model_dump() == filedata["matches"] - assert result.best_match().model_dump() == filedata["matches"] + assert ( + result.best_match() is None and filedata["matches"]["id"] is None + ) or result.best_match().model_dump() == filedata["matches"] From ca7de801a7497ea617574c376b16723d0e0b6030 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Mon, 20 Nov 2023 13:05:06 +0100 Subject: [PATCH 10/11] version - minor 1.0.2 > 1.1.0 --- acacore/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acacore/__version__.py b/acacore/__version__.py index 7863915..6849410 100644 --- a/acacore/__version__.py +++ b/acacore/__version__.py @@ -1 +1 @@ -__version__ = "1.0.2" +__version__ = "1.1.0" From ed0218341c615c934820f50ab1d0319735f3bcb2 Mon Sep 17 00:00:00 2001 From: Matteo Campinoti Date: Mon, 20 Nov 2023 13:05:06 +0100 Subject: [PATCH 11/11] poetry - version minor 1.0.2 > 1.1.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index aad5540..d4d3877 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "acacore" -version = "1.0.2" +version = "1.1.0" description = "" authors = ["Matteo Campinoti "] license = "GPL-3.0"