Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Version 1.1.0 #19

Merged
merged 11 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion acacore/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0.2"
__version__ = "1.1.0"
34 changes: 29 additions & 5 deletions acacore/models/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from acacore.siegfried.siegfried import Siegfried
from acacore.siegfried.siegfried import SiegfriedFile
from acacore.siegfried.siegfried import TSiegfriedClass
from acacore.utils.functions import file_checksum
from acacore.utils.functions import get_bof
from acacore.utils.functions import get_eof
Expand Down Expand Up @@ -101,15 +102,17 @@ def from_file(
root=root,
processed=processed,
)
match_classes: list[TSiegfriedClass] = []

if siegfried:
file.identify(siegfried, set_match=True)
siegfried_match = file.identify(siegfried, set_match=True).best_match()
match_classes.extend(siegfried_match.match_class if siegfried_match else [])

if custom_signatures and not file.puid:
file.identify_custom(custom_signatures, set_match=True)

if actions:
file.get_action(actions)
file.get_action(actions, match_classes)

if custom_signatures and file.action == "reidentify":
custom_match = file.identify_custom(custom_signatures)
Expand All @@ -120,7 +123,7 @@ def from_file(
if custom_match.extension and file.suffix != custom_match.extension:
file.warning.append("extension mismatch")
file.warning = file.warning or None
file.get_action(actions)
file.get_action(actions, match_classes)
elif file.action_data.reidentify and file.action_data.reidentify.onfail:
file.action = file.action_data.reidentify.onfail
else:
Expand Down Expand Up @@ -216,8 +219,29 @@ def identify_custom(

return signature

def get_action(self, actions: dict[str, Action]) -> Optional[Action]:
action: Optional[Action] = actions.get(self.puid)
def get_action(
    self,
    actions: dict[str, Action],
    match_classes: Optional[list[TSiegfriedClass]] = None,
) -> Optional[Action]:
    """
    Find the action that applies to this file and store it on the instance.

    Candidate keys are looked up in ``actions`` in priority order, and the first key
    that yields a truthy action wins:

    1. the file's PUID,
    2. each of the given Siegfried match classes, in the order received,
    3. ``!ext=<suffixes>`` when the file has a suffix (all suffixes of the absolute path joined together),
    4. ``!binary`` when the file is binary,
    5. ``!empty`` when the file has no size.

    Args:
        actions: Mapping from identifier to the action to apply.
        match_classes: Optional Siegfried classes of the file's match, used as fallback identifiers.

    Returns:
        The matching action, if any. ``self.action`` and ``self.action_data`` are set from it,
        or both set to None when nothing matched.
    """
    lookup_keys: list[str] = [self.puid]
    lookup_keys += match_classes or []

    if self.suffix:
        full_suffix = "".join(self.get_absolute_path().suffixes)
        lookup_keys.append(f"!ext={full_suffix}")
    if self.is_binary:
        lookup_keys.append("!binary")
    if not self.size:
        lookup_keys.append("!empty")

    found: Optional[Action] = None
    for key in lookup_keys:
        found = actions.get(key)
        if found:
            break

    if found:
        self.action, self.action_data = found.action, found.action_data
    else:
        self.action, self.action_data = None, None
    return found

Expand Down
45 changes: 33 additions & 12 deletions acacore/models/reference_files.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,32 @@
"""Data models for the data saved to different .json files in the `reference_files` repo."""
from typing import get_args as get_type_args
from typing import Literal
from typing import Optional

from pydantic import BaseModel
from pydantic import Field

TActionType = Literal["convert", "extract", "replace", "manual", "rename", "ignore", "reidentify"]
# Names of the supported action types.
TActionType = Literal[
    "convert",
    "extract",
    "replace",
    "manual",
    "rename",
    "ignore",
    "reidentify",
]
# Names of the templates accepted by the "replace" action.
TReplaceTemplate = Literal[
    "text",
    "empty",
    "password-protected",
    "corrupted",
    "duplicate",
    "not-preservable",
    "not-convertable",
]

# Runtime tuples holding the values of the literal types above, in declaration order.
ActionTypeEnum: tuple[TActionType, ...] = get_type_args(TActionType)
ReplaceTemplateEnum: tuple[TReplaceTemplate, ...] = get_type_args(TReplaceTemplate)


class CustomSignature(BaseModel):
Expand Down Expand Up @@ -58,7 +79,7 @@ class ReplaceAction(BaseModel):
if template is set to "text".
"""

template: Literal["text", "empty", "password-protected", "corrupted", "not-preservable", "not-convertable"]
template: TReplaceTemplate
template_text: Optional[str] = None


Expand All @@ -67,11 +88,11 @@ class ManualAction(BaseModel):
Class representing a manual action in a workflow.

Attributes:
reasoning (str): The reasoning behind the manual action.
reason (str): The reason behind the manual action.
process (str): The process for performing the manual action.
"""

reasoning: str
reason: str
process: str


Expand All @@ -87,7 +108,7 @@ class IgnoreIfAction(BaseModel):
pixel_height (Optional[int]): Height for images.
size (Optional[int]): Size for all files.
binary_size (Optional[int]): Size for binary files.
reason (Optional[int]): A reasoning for the specific condition.
reason (Optional[int]): A reason for the specific condition.
"""

pixel_total: Optional[int] = Field(None, gt=0)
Expand All @@ -100,27 +121,27 @@ class IgnoreIfAction(BaseModel):

class IgnoreAction(BaseModel):
"""
Class representing an action to ignore a specific file based on the given reasoning.
Class representing an action to ignore a specific file based on the given reason.

Attributes:
reasoning (str): The reasoning for ignoring the file.
reason (Optional[str]): The reason for ignoring the file.
ignore_if (list[IgnoreIfAction]): An optional list of ignore conditions.
"""

reasoning: Optional[str] = None
reason: Optional[str] = None
ignore_if: list[IgnoreIfAction] = Field(default_factory=list)


class ReIdentifyAction(BaseModel):
"""
Class representing an action to ignore a specific file based on the given reasoning.
Class representing an action to re-identify a specific file, with an optional fallback action if that fails.

Attributes:
reasoning (str): The reasoning for ignoring the file.
reason (str): The reason for re-identifying the file.
onfail (Optional[TActionType]): The action to fall back to when re-identification fails.
"""

reasoning: str
onfail: Optional[Literal["convert", "extract", "replace", "manual", "rename", "ignore"]] = None
reason: str
onfail: Optional[TActionType] = None


class RenameAction(BaseModel):
Expand Down
23 changes: 22 additions & 1 deletion acacore/siegfried/siegfried.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,26 @@
_byte_match_regexp_multi = re_compile(r"^byte match at \[\[(\d+) +(\d+)]( \[\d+ +\d+])*]( \([^)]*\))?$")
_extension_match = re_compile(r"^extension match (.+)$")
TSignature = Literal["pronom", "loc", "tika", "freedesktop", "pronom-tika-loc", "deluxe", "archivematica"]
# Format classes a Siegfried match can carry. Values are lowercase because the raw,
# comma-separated "class" field is lowercased and split before validation.
TSiegfriedClass = Literal[
    "aggregate",
    "audio",
    "database",
    "dataset",
    "email",
    "font",
    "gis",
    "image (raster)",
    "image (vector)",
    "model",
    "page description",
    "presentation",
    "spreadsheet",
    "text (mark-up)",
    "text (structured)",
    "text (unstructured)",
    "video",
    "word processor",
]


def _check_process(process: CompletedProcess) -> CompletedProcess:
Expand Down Expand Up @@ -71,7 +91,7 @@ class SiegfriedMatch(BaseModel):
format: str # noqa: A003
version: Optional[str] = None
mime: str
match_class: Optional[str] = Field(None, alias="class")
match_class: Optional[list[TSiegfriedClass]] = Field(None, alias="class")
basis: list[str]
warning: list[str]
URI: Optional[AnyUrl] = None
Expand Down Expand Up @@ -148,6 +168,7 @@ def unknown_id(cls, data: object):
"id": None if data["id"].lower().strip() == "unknown" else data["id"].strip() or None,
"basis": filter(bool, map(str.strip, data["basis"].strip().split(";"))),
"warning": filter(bool, map(str.strip, data["warning"].strip().split(";"))),
"class": [c for c in map(str.strip, data.get("class", "").lower().split(",")) if c],
}
return data

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "acacore"
version = "1.0.2"
version = "1.1.0"
description = ""
authors = ["Matteo Campinoti <[email protected]>"]
license = "GPL-3.0"
Expand Down
Empty file added tests/files/empty.empty
Empty file.
32 changes: 25 additions & 7 deletions tests/files/files.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
{
"empty.empty": {
"filesize": 0,
"checksum": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"binary": false,
"errors": "",
"matches": {
"ns": "pronom",
"id": null,
"format": "",
"version": "",
"mime": "",
"match_class": [],
"basis": [],
"warning": ["no match"],
"URI": null,
"permalink": null
}
},
"ico.ico": {
"filesize": 531,
"image_size": [16, 16],
Expand All @@ -11,7 +29,7 @@
"format": "Portable Network Graphics",
"version": "1.0",
"mime": "image/png",
"match_class": "Image (Raster)",
"match_class": ["image (raster)"],
"basis": ["byte match at [[0 16] [519 12]]"],
"warning": ["extension mismatch"],
"URI": null,
Expand All @@ -29,7 +47,7 @@
"format": "JSON Data Interchange Format",
"version": "",
"mime": "application/json",
"match_class": "",
"match_class": [],
"basis": ["extension match json"],
"warning": ["match on extension only"],
"URI": null,
Expand All @@ -47,7 +65,7 @@
"format": "Lotus WordPro Document",
"version": "96",
"mime": "application/lwp",
"match_class": "Word Processor",
"match_class": ["word processor"],
"basis": ["extension match lwp", "byte match at 0, 32"],
"warning": [],
"URI": null,
Expand All @@ -65,7 +83,7 @@
"format": "MPEG 1/2 Audio Layer 3",
"version": "",
"mime": "audio/mpeg",
"match_class": "Audio",
"match_class": ["audio"],
"basis": ["extension match mp3", "byte match at 0, 1521 (signature 5/9)"],
"warning": [],
"URI": null,
Expand All @@ -83,7 +101,7 @@
"format": "MPEG-4 Media File",
"version": "",
"mime": "application/mp4",
"match_class": "Audio, Video",
"match_class": ["audio", "video"],
"basis": ["extension match mp4", "byte match at [[4 8] [135072 4]]"],
"warning": [],
"URI": null,
Expand All @@ -101,7 +119,7 @@
"format": "Acrobat PDF 1.7 - Portable Document Format",
"version": "1.7",
"mime": "application/pdf",
"match_class": "Page Description",
"match_class": ["page description"],
"basis": ["extension match pdf", "byte match at [[0 8] [38463 5]]"],
"warning": [],
"URI": null,
Expand All @@ -119,7 +137,7 @@
"format": "Plain Text File",
"version": "",
"mime": "text/plain",
"match_class": "",
"match_class": [],
"basis": ["extension match txt", "text match UTF-8 Unicode"],
"warning": [],
"URI": null,
Expand Down
9 changes: 6 additions & 3 deletions tests/test_siegfried.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ def test_identify(siegfried: Siegfried, test_files: Path, test_files_data: dict[
assert result.filesize == filedata["filesize"]
assert result.matches
assert result.matches[0].model_dump() == filedata["matches"]
assert result.best_match().model_dump() == filedata["matches"]
assert (
result.best_match() is None and filedata["matches"]["id"] is None
) or result.best_match().model_dump() == filedata["matches"]


def test_identify_many(siegfried: Siegfried, test_files: Path, test_files_data: dict[str, dict]):
Expand All @@ -74,5 +76,6 @@ def test_identify_many(siegfried: Siegfried, test_files: Path, test_files_data:
assert result.filename == str(test_files / filename)
assert result.filesize == filedata["filesize"]
assert result.matches
assert result.matches[0].model_dump() == filedata["matches"]
assert result.best_match().model_dump() == filedata["matches"]
assert (
result.best_match() is None and filedata["matches"]["id"] is None
) or result.best_match().model_dump() == filedata["matches"]
Loading