Skip to content

Commit

Permalink
Merge pull request #19 from aarhusstadsarkiv/dev-matca
Browse files Browse the repository at this point in the history
Version 1.1.0
  • Loading branch information
MatteoCampinoti94 authored Nov 20, 2023
2 parents 28b6e21 + ed02183 commit 8674d8e
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 30 deletions.
2 changes: 1 addition & 1 deletion acacore/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0.2"
__version__ = "1.1.0"
34 changes: 29 additions & 5 deletions acacore/models/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from acacore.siegfried.siegfried import Siegfried
from acacore.siegfried.siegfried import SiegfriedFile
from acacore.siegfried.siegfried import TSiegfriedClass
from acacore.utils.functions import file_checksum
from acacore.utils.functions import get_bof
from acacore.utils.functions import get_eof
Expand Down Expand Up @@ -101,15 +102,17 @@ def from_file(
root=root,
processed=processed,
)
match_classes: list[TSiegfriedClass] = []

if siegfried:
file.identify(siegfried, set_match=True)
siegfried_match = file.identify(siegfried, set_match=True).best_match()
match_classes.extend(siegfried_match.match_class if siegfried_match else [])

if custom_signatures and not file.puid:
file.identify_custom(custom_signatures, set_match=True)

if actions:
file.get_action(actions)
file.get_action(actions, match_classes)

if custom_signatures and file.action == "reidentify":
custom_match = file.identify_custom(custom_signatures)
Expand All @@ -120,7 +123,7 @@ def from_file(
if custom_match.extension and file.suffix != custom_match.extension:
file.warning.append("extension mismatch")
file.warning = file.warning or None
file.get_action(actions)
file.get_action(actions, match_classes)
elif file.action_data.reidentify and file.action_data.reidentify.onfail:
file.action = file.action_data.reidentify.onfail
else:
Expand Down Expand Up @@ -216,8 +219,29 @@ def identify_custom(

return signature

def get_action(self, actions: dict[str, Action]) -> Optional[Action]:
action: Optional[Action] = actions.get(self.puid)
def get_action(
    self,
    actions: dict[str, Action],
    match_classes: Optional[list[TSiegfriedClass]] = None,
) -> Optional[Action]:
    """
    Find the first action that applies to this file and store it on the file.

    Candidate keys are looked up in ``actions`` in this order, stopping at the first hit:

    1. the file's PUID;
    2. each entry of ``match_classes``, in the order given;
    3. ``!ext=<suffixes>`` (all suffixes of the absolute path joined together), only if the file has a suffix;
    4. ``!binary``, only if the file is binary;
    5. ``!empty``, only if the file size is zero or unset.

    Args:
        actions (dict[str, Action]): Available actions, keyed by identifier.
        match_classes (Optional[list[TSiegfriedClass]]): Format classes of the file's match, if any.

    Returns:
        Optional[Action]: The matched action, or None when no candidate key is present in ``actions``.
        ``self.action`` and ``self.action_data`` are updated accordingly (both None on no match).
    """
    # Build every candidate key up front so the lookup below is a plain ordered scan.
    candidates: list[str] = [self.puid]
    if match_classes:
        candidates.extend(match_classes)
    if self.suffix:
        full_extension = "".join(self.get_absolute_path().suffixes)
        candidates.append(f"!ext={full_extension}")
    if self.is_binary:
        candidates.append("!binary")
    if not self.size:
        candidates.append("!empty")

    matched: Optional[Action] = None
    for key in candidates:
        matched = actions.get(key)
        if matched:
            break

    if matched:
        self.action = matched.action
        self.action_data = matched.action_data
    else:
        self.action = None
        self.action_data = None
    return matched

Expand Down
45 changes: 33 additions & 12 deletions acacore/models/reference_files.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,32 @@
"""Data models for the data saved to the different .json files in the `reference_files` repo."""
from typing import get_args as get_type_args
from typing import Literal
from typing import Optional

from pydantic import BaseModel
from pydantic import Field

TActionType = Literal["convert", "extract", "replace", "manual", "rename", "ignore", "reidentify"]
TActionType = Literal[
"convert",
"extract",
"replace",
"manual",
"rename",
"ignore",
"reidentify",
]
TReplaceTemplate = Literal[
"text",
"empty",
"password-protected",
"corrupted",
"duplicate",
"not-preservable",
"not-convertable",
]

ActionTypeEnum: tuple[TActionType, ...] = get_type_args(TActionType)
ReplaceTemplateEnum: tuple[TReplaceTemplate, ...] = get_type_args(TReplaceTemplate)


class CustomSignature(BaseModel):
Expand Down Expand Up @@ -58,7 +79,7 @@ class ReplaceAction(BaseModel):
if template is set to "text".
"""

template: Literal["text", "empty", "password-protected", "corrupted", "not-preservable", "not-convertable"]
template: TReplaceTemplate
template_text: Optional[str] = None


Expand All @@ -67,11 +88,11 @@ class ManualAction(BaseModel):
Class representing a manual action in a workflow.
Attributes:
reasoning (str): The reasoning behind the manual action.
reason (str): The reason behind the manual action.
process (str): The process for performing the manual action.
"""

reasoning: str
reason: str
process: str


Expand All @@ -87,7 +108,7 @@ class IgnoreIfAction(BaseModel):
pixel_height (Optional[int]): Height for images.
size (Optional[int]): Size for all files.
binary_size (Optional[int]): Size for binary files.
reason (Optional[int]): A reasoning for the specific condition.
reason (Optional[int]): A reason for the specific condition.
"""

pixel_total: Optional[int] = Field(None, gt=0)
Expand All @@ -100,27 +121,27 @@ class IgnoreIfAction(BaseModel):

class IgnoreAction(BaseModel):
"""
Class representing an action to ignore a specific file based on the given reasoning.
Class representing an action to ignore a specific file based on the given reason.
Attributes:
reasoning (str): The reasoning for ignoring the file.
reason (Optional[str]): The reason for ignoring the file.
ignore_if (list[IgnoreIfAction]): An optional list of ignore conditions.
"""

reasoning: Optional[str] = None
reason: Optional[str] = None
ignore_if: list[IgnoreIfAction] = Field(default_factory=list)


class ReIdentifyAction(BaseModel):
"""
Class representing an action to ignore a specific file based on the given reasoning.
Class representing an action to re-identify a specific file, with the given reason.
Attributes:
reasoning (str): The reasoning for ignoring the file.
reason (str): The reason for re-identifying the file.
"""

reasoning: str
onfail: Optional[Literal["convert", "extract", "replace", "manual", "rename", "ignore"]] = None
reason: str
onfail: Optional[TActionType] = None


class RenameAction(BaseModel):
Expand Down
23 changes: 22 additions & 1 deletion acacore/siegfried/siegfried.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,26 @@
_byte_match_regexp_multi = re_compile(r"^byte match at \[\[(\d+) +(\d+)]( \[\d+ +\d+])*]( \([^)]*\))?$")
_extension_match = re_compile(r"^extension match (.+)$")
TSignature = Literal["pronom", "loc", "tika", "freedesktop", "pronom-tika-loc", "deluxe", "archivematica"]
TSiegfriedClass = Literal[
"aggregate",
"audio",
"database",
"dataset",
"email",
"font",
"gis",
"image (raster)",
"image (vector)",
"model",
"page description",
"presentation",
"spreadsheet",
"text (mark-up)",
"text (structured)",
"text (unstructured)",
"video",
"word processor",
]


def _check_process(process: CompletedProcess) -> CompletedProcess:
Expand Down Expand Up @@ -71,7 +91,7 @@ class SiegfriedMatch(BaseModel):
format: str # noqa: A003
version: Optional[str] = None
mime: str
match_class: Optional[str] = Field(None, alias="class")
match_class: Optional[list[TSiegfriedClass]] = Field(None, alias="class")
basis: list[str]
warning: list[str]
URI: Optional[AnyUrl] = None
Expand Down Expand Up @@ -148,6 +168,7 @@ def unknown_id(cls, data: object):
"id": None if data["id"].lower().strip() == "unknown" else data["id"].strip() or None,
"basis": filter(bool, map(str.strip, data["basis"].strip().split(";"))),
"warning": filter(bool, map(str.strip, data["warning"].strip().split(";"))),
"class": [c for c in map(str.strip, data.get("class", "").lower().split(",")) if c],
}
return data

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "acacore"
version = "1.0.2"
version = "1.1.0"
description = ""
authors = ["Matteo Campinoti <[email protected]>"]
license = "GPL-3.0"
Expand Down
Empty file added tests/files/empty.empty
Empty file.
32 changes: 25 additions & 7 deletions tests/files/files.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
{
"empty.empty": {
"filesize": 0,
"checksum": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"binary": false,
"errors": "",
"matches": {
"ns": "pronom",
"id": null,
"format": "",
"version": "",
"mime": "",
"match_class": [],
"basis": [],
"warning": ["no match"],
"URI": null,
"permalink": null
}
},
"ico.ico": {
"filesize": 531,
"image_size": [16, 16],
Expand All @@ -11,7 +29,7 @@
"format": "Portable Network Graphics",
"version": "1.0",
"mime": "image/png",
"match_class": "Image (Raster)",
"match_class": ["image (raster)"],
"basis": ["byte match at [[0 16] [519 12]]"],
"warning": ["extension mismatch"],
"URI": null,
Expand All @@ -29,7 +47,7 @@
"format": "JSON Data Interchange Format",
"version": "",
"mime": "application/json",
"match_class": "",
"match_class": [],
"basis": ["extension match json"],
"warning": ["match on extension only"],
"URI": null,
Expand All @@ -47,7 +65,7 @@
"format": "Lotus WordPro Document",
"version": "96",
"mime": "application/lwp",
"match_class": "Word Processor",
"match_class": ["word processor"],
"basis": ["extension match lwp", "byte match at 0, 32"],
"warning": [],
"URI": null,
Expand All @@ -65,7 +83,7 @@
"format": "MPEG 1/2 Audio Layer 3",
"version": "",
"mime": "audio/mpeg",
"match_class": "Audio",
"match_class": ["audio"],
"basis": ["extension match mp3", "byte match at 0, 1521 (signature 5/9)"],
"warning": [],
"URI": null,
Expand All @@ -83,7 +101,7 @@
"format": "MPEG-4 Media File",
"version": "",
"mime": "application/mp4",
"match_class": "Audio, Video",
"match_class": ["audio", "video"],
"basis": ["extension match mp4", "byte match at [[4 8] [135072 4]]"],
"warning": [],
"URI": null,
Expand All @@ -101,7 +119,7 @@
"format": "Acrobat PDF 1.7 - Portable Document Format",
"version": "1.7",
"mime": "application/pdf",
"match_class": "Page Description",
"match_class": ["page description"],
"basis": ["extension match pdf", "byte match at [[0 8] [38463 5]]"],
"warning": [],
"URI": null,
Expand All @@ -119,7 +137,7 @@
"format": "Plain Text File",
"version": "",
"mime": "text/plain",
"match_class": "",
"match_class": [],
"basis": ["extension match txt", "text match UTF-8 Unicode"],
"warning": [],
"URI": null,
Expand Down
9 changes: 6 additions & 3 deletions tests/test_siegfried.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ def test_identify(siegfried: Siegfried, test_files: Path, test_files_data: dict[
assert result.filesize == filedata["filesize"]
assert result.matches
assert result.matches[0].model_dump() == filedata["matches"]
assert result.best_match().model_dump() == filedata["matches"]
assert (
result.best_match() is None and filedata["matches"]["id"] is None
) or result.best_match().model_dump() == filedata["matches"]


def test_identify_many(siegfried: Siegfried, test_files: Path, test_files_data: dict[str, dict]):
Expand All @@ -74,5 +76,6 @@ def test_identify_many(siegfried: Siegfried, test_files: Path, test_files_data:
assert result.filename == str(test_files / filename)
assert result.filesize == filedata["filesize"]
assert result.matches
assert result.matches[0].model_dump() == filedata["matches"]
assert result.best_match().model_dump() == filedata["matches"]
assert (
result.best_match() is None and filedata["matches"]["id"] is None
) or result.best_match().model_dump() == filedata["matches"]

0 comments on commit 8674d8e

Please sign in to comment.