Skip to content

Commit

Permalink
Merge pull request #753
Browse files Browse the repository at this point in the history
v5.0.3
  • Loading branch information
MatteoCampinoti94 authored Jan 8, 2025
2 parents 27e6071 + 605e733 commit 8b7d855
Show file tree
Hide file tree
Showing 7 changed files with 30 additions and 13 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## v5.0.3

### Fixes

* Fix an error caused by files being overwritten when MSG and TNEF files contained more than one attachment with the same
name.

## v5.0.2

### Changes
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ Usage: digiarch edit [OPTIONS] COMMAND [ARGS]...
@like toggles LIKE syntax for the values following it in the same column.
@file toggles file reading for the values following it in the same column: each
value will be considered as a file path and values will be read from the lines
in the given file (@null, @notnull, @true, and @false in files are not supported).
in the given file (@null, @notnull, @true, @false, and @like are not supported when using @file).
Changing to a new @<field> resets like and file toggles. Values for the same
column will be matched with OR logic, while values from different columns will
be matched with AND logic.
Expand Down
2 changes: 1 addition & 1 deletion digiarch/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "5.0.2"
__version__ = "5.0.3"
2 changes: 1 addition & 1 deletion digiarch/commands/edit/edit.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def grp_edit():
@like toggles LIKE syntax for the values following it in the same column.
@file toggles file reading for the values following it in the same column: each
value will be considered as a file path and values will be read from the lines
in the given file (@null, @notnull, @true, and @false in files are not supported).
in the given file (@null, @notnull, @true, @false, and @like are not supported when using @file).
Changing to a new @<field> resets like and file toggles. Values for the same
column will be matched with OR logic, while values from different columns will
be matched with AND logic.
Expand Down
17 changes: 13 additions & 4 deletions digiarch/commands/extract/extractors/extractor_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ def msg_attachments(
return inline_attachments, attachments


def prepare_attachment_name(names: list[str], name: str, n: int) -> tuple[list[str], str, str]:
    """Return a display name and a unique, sanitized file name for an attachment.

    The display name is ``name`` stripped of surrounding whitespace, or ``attachment-<n>`` when nothing is left.
    The file name is the sanitized display name. If that file name (compared lower-cased) has already been handed
    out, it is prefixed with ``<k>_`` using the smallest ``k >= 1`` that yields a file name not yet in ``names``.

    :param names: Lower-cased file names already handed out; the newly chosen file name is appended in place.
    :param name: Original attachment name; may be empty.
    :param n: Index of the attachment, used to build the ``attachment-<n>`` fallback.
    :return: ``(names, name, name_sanitized)`` - the same ``names`` list, the display name, and the unique file name.
    """
    name = (name or "").strip() or f"attachment-{n}"
    base: str = sanitize_filename(name, 20, True).strip("_") or f"attachment-{n}"
    name_sanitized: str = base
    index: int = 0
    # Check the candidate that will actually be written, not just the base name: counting base names alone lets a
    # generated "1_a" collide with an attachment that is literally named "1_a".
    while name_sanitized.lower() in names:
        index += 1
        # NOTE(review): the prefix is added after sanitizing, so the result can be longer than the sanitized base.
        name_sanitized = f"{index}_{base}"
    # Record the name that is really used so later attachments cannot reuse it.
    names.append(name_sanitized.lower())
    return names, name, name_sanitized


class MsgExtractor(ExtractorBase):
tool_names: ClassVar[list[str]] = ["msg"]

Expand All @@ -120,11 +130,11 @@ def extract(self) -> list[tuple[Path, Path]]:
inline_attachments, attachments = msg_attachments(msg, body_html, body_rtf)

with TempDir(self.file.root) as tmp_dir:
names: list[str] = []
for n, attachment in enumerate(inline_attachments + attachments):
if isinstance(attachment, (Message, MessageSigned)):
name: str = (attachment.filename or "").strip() or (attachment.subject or "").strip()
name = name.strip() or f"attachment-{n}"
name_sanitized: str = sanitize_filename(name, 20, True).strip("_") or f"attachment-{n}"
names, name, name_sanitized = prepare_attachment_name(names, name, n)
attachment.export(tmp_dir / name_sanitized)
files.append((name_sanitized, name))
elif isinstance(attachment.data, bytes):
Expand All @@ -133,8 +143,7 @@ def extract(self) -> list[tuple[Path, Path]]:
if isinstance(attachment, Attachment)
else attachment.longFilename or ""
)
name = name.strip() or f"attachment-{n}"
name_sanitized: str = sanitize_filename(name, 20, True).strip("_") or f"attachment-{n}"
names, name, name_sanitized = prepare_attachment_name(names, name, n)
with tmp_dir.joinpath(name_sanitized).open("wb") as fh:
fh.write(attachment.data or b"")
files.append((name_sanitized, name))
Expand Down
11 changes: 6 additions & 5 deletions digiarch/commands/extract/extractors/extractor_tnef.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

from tnefparse import TNEF

from digiarch.common import sanitize_filename
from digiarch.common import TempDir

from .base import ExtractorBase
from .extractor_msg import prepare_attachment_name


class TNEFExtractor(ExtractorBase):
Expand All @@ -23,12 +23,13 @@ def extract(self) -> list[tuple[Path, Path]]:
tnef = TNEF(fh.read())

with TempDir(self.file.root) as tmp_dir:
for attachment in tnef.attachments:
names: list[str] = []
for n, attachment in enumerate(tnef.attachments):
name: str = attachment.long_filename() or attachment.name
path: Path = tmp_dir.joinpath(sanitize_filename(name, 20, True))
with path.open("wb") as oh:
names, name, name_sanitized = prepare_attachment_name(names, name, n)
with tmp_dir.joinpath(name_sanitized).open("wb") as oh:
oh.write(attachment.data)
files.append((path.name, name))
files.append((name_sanitized, name))

if not files:
return []
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "digiarch"
version = "5.0.2"
version = "5.0.3"
description = "Tools for the Digital Archive Project at Aarhus Stadsarkiv"
authors = ["Aarhus Stadsarkiv <[email protected]>"]
license = "GPL-3.0"
Expand Down

0 comments on commit 8b7d855

Please sign in to comment.