From ed0e591eba9725856612b2367a98ce7d95d5737a Mon Sep 17 00:00:00 2001 From: Charly C Date: Tue, 4 Jul 2023 10:03:17 +0200 Subject: [PATCH] add support for different file metadata encodings (#125) --- README.rst | 15 ++++++++ libarchive/entry.py | 90 ++++++++++++++++++++++++++++++++++----------- libarchive/ffi.py | 6 +++ libarchive/read.py | 26 ++++++++----- libarchive/write.py | 22 +++++++---- tests/test_entry.py | 14 +++++++ 6 files changed, 135 insertions(+), 38 deletions(-) diff --git a/README.rst b/README.rst index d8a1b78..64bef11 100644 --- a/README.rst +++ b/README.rst @@ -114,6 +114,21 @@ and the optional third argument is the compression format (called “filter” i libarchive). The acceptable values are listed in ``libarchive.ffi.WRITE_FORMATS`` and ``libarchive.ffi.WRITE_FILTERS``. +File metadata codecs +-------------------- + +By default, UTF-8 is used to read and write file attributes from and to archives. +A different codec can be specified through the ``header_codec`` arguments of the +``*_reader`` and ``*_writer`` functions. Example:: + + with libarchive.file_writer('test.tar', 'ustar', header_codec='cp037') as archive: + ... + with libarchive.file_reader('test.tar', header_codec='cp037') as archive: + ... + +In addition to file paths (``pathname`` and ``linkpath``), the specified codec is +used to encode and decode user and group names (``uname`` and ``gname``). 
+ License ======= diff --git a/libarchive/entry.py b/libarchive/entry.py index 727288e..20f1adb 100644 --- a/libarchive/entry.py +++ b/libarchive/entry.py @@ -1,5 +1,5 @@ from contextlib import contextmanager -from ctypes import c_char_p, create_string_buffer +from ctypes import create_string_buffer from enum import IntEnum import math @@ -34,15 +34,19 @@ def format_time(seconds, nanos): class ArchiveEntry: - __slots__ = ('_archive_p', '_entry_p') + __slots__ = ('_archive_p', '_entry_p', 'header_codec') - def __init__(self, archive_p=None, **attributes): + def __init__(self, archive_p=None, header_codec='utf-8', **attributes): """Allocate memory for an `archive_entry` struct. - The attributes are passed to the `modify` method. + The `header_codec` is used to decode and encode file paths and other + attributes. + + The `**attributes` are passed to the `modify` method. """ self._archive_p = archive_p self._entry_p = ffi.entry_new() + self.header_codec = header_codec if attributes: self.modify(**attributes) @@ -54,7 +58,7 @@ def __str__(self): """Returns the file's path""" return self.pathname - def modify(self, **attributes): + def modify(self, header_codec=None, **attributes): """Convenience method to modify the entry's attributes. 
Args: @@ -83,6 +87,8 @@ def modify(self, **attributes): rdevmajor (int): major part of the device number rdevminor (int): minor part of the device number """ + if header_codec: + self.header_codec = header_codec for name, value in attributes.items(): setattr(self, name, value) @@ -112,23 +118,45 @@ def gid(self, gid): @property def uname(self): - return ffi.entry_uname_w(self._entry_p) + uname = ffi.entry_uname_w(self._entry_p) + if not uname: + uname = ffi.entry_uname(self._entry_p) + if uname is not None: + try: + uname = uname.decode(self.header_codec) + except UnicodeError: + pass + return uname @uname.setter def uname(self, value): if not isinstance(value, bytes): - value = value.encode('utf8') - ffi.entry_update_uname_utf8(self._entry_p, value) + value = value.encode(self.header_codec) + if self.header_codec == 'utf-8': + ffi.entry_update_uname_utf8(self._entry_p, value) + else: + ffi.entry_copy_uname(self._entry_p, value) @property def gname(self): - return ffi.entry_gname_w(self._entry_p) + gname = ffi.entry_gname_w(self._entry_p) + if not gname: + gname = ffi.entry_gname(self._entry_p) + if gname is not None: + try: + gname = gname.decode(self.header_codec) + except UnicodeError: + pass + return gname @gname.setter def gname(self, value): if not isinstance(value, bytes): - value = value.encode('utf8') - ffi.entry_update_gname_utf8(self._entry_p, value) + value = value.encode(self.header_codec) + if self.header_codec == 'utf-8': + ffi.entry_update_gname_utf8(self._entry_p, value) + else: + ffi.entry_copy_gname(self._entry_p, value) def get_blocks(self, block_size=ffi.page_size): """Read the file's content, keeping only one chunk in memory at a time. 
@@ -294,28 +322,48 @@ def pathname(self): path = ffi.entry_pathname_w(self._entry_p) if not path: path = ffi.entry_pathname(self._entry_p) - try: - path = path.decode() - except UnicodeError: - pass + if path is not None: + try: + path = path.decode(self.header_codec) + except UnicodeError: + pass return path @pathname.setter def pathname(self, value): if not isinstance(value, bytes): - value = value.encode('utf8') - ffi.entry_update_pathname_utf8(self._entry_p, c_char_p(value)) + value = value.encode(self.header_codec) + if self.header_codec == 'utf-8': + ffi.entry_update_pathname_utf8(self._entry_p, value) + else: + ffi.entry_copy_pathname(self._entry_p, value) @property def linkpath(self): - return (ffi.entry_symlink_w(self._entry_p) or + path = ( + ( + ffi.entry_symlink_w(self._entry_p) or + ffi.entry_symlink(self._entry_p) + ) if self.issym else ( ffi.entry_hardlink_w(self._entry_p) or - ffi.entry_symlink(self._entry_p) or - ffi.entry_hardlink(self._entry_p)) + ffi.entry_hardlink(self._entry_p) + ) + ) + if isinstance(path, bytes): + try: + path = path.decode(self.header_codec) + except UnicodeError: + pass + return path @linkpath.setter def linkpath(self, value): - ffi.entry_update_link_utf8(self._entry_p, value) + if not isinstance(value, bytes): + value = value.encode(self.header_codec) + if self.header_codec == 'utf-8': + ffi.entry_update_link_utf8(self._entry_p, value) + else: + ffi.entry_copy_link(self._entry_p, value) # aliases for compatibility with the standard `tarfile` module path = property(pathname.fget, pathname.fset, doc="alias of pathname") diff --git a/libarchive/ffi.py b/libarchive/ffi.py index 2b85794..1fc321a 100644 --- a/libarchive/ffi.py +++ b/libarchive/ffi.py @@ -200,6 +200,8 @@ def get_write_filter_function(filter_name): ffi('entry_rdevminor', [c_archive_entry_p], c_uint) ffi('entry_uid', [c_archive_entry_p], c_longlong) ffi('entry_gid', [c_archive_entry_p], c_longlong) +ffi('entry_uname', [c_archive_entry_p], c_char_p) 
+ffi('entry_gname', [c_archive_entry_p], c_char_p) ffi('entry_uname_w', [c_archive_entry_p], c_wchar_p) ffi('entry_gname_w', [c_archive_entry_p], c_wchar_p) @@ -222,9 +224,13 @@ def get_write_filter_function(filter_name): ffi('entry_unset_ctime', [c_archive_entry_p], None) ffi('entry_unset_birthtime', [c_archive_entry_p], None) +ffi('entry_copy_pathname', [c_archive_entry_p, c_char_p], None) ffi('entry_update_pathname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int) +ffi('entry_copy_link', [c_archive_entry_p, c_char_p], None) ffi('entry_update_link_utf8', [c_archive_entry_p, c_char_p], c_int, check_int) +ffi('entry_copy_uname', [c_archive_entry_p, c_char_p], None) ffi('entry_update_uname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int) +ffi('entry_copy_gname', [c_archive_entry_p, c_char_p], None) ffi('entry_update_gname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int) ffi('entry_clear', [c_archive_entry_p], c_archive_entry_p) diff --git a/libarchive/read.py b/libarchive/read.py index 3e2dbcc..fd18667 100644 --- a/libarchive/read.py +++ b/libarchive/read.py @@ -12,16 +12,18 @@ class ArchiveRead: - def __init__(self, archive_p): + def __init__(self, archive_p, header_codec='utf-8'): self._pointer = archive_p + self.header_codec = header_codec def __iter__(self): """Iterates through an archive's entries. """ archive_p = self._pointer + header_codec = self.header_codec read_next_header2 = ffi.read_next_header2 while 1: - entry = ArchiveEntry(archive_p) + entry = ArchiveEntry(archive_p, header_codec) r = read_next_header2(archive_p, entry._entry_p) if r == ARCHIVE_EOF: return @@ -68,6 +70,7 @@ def custom_reader( read_func, format_name='all', filter_name='all', open_func=None, seek_func=None, close_func=None, block_size=page_size, archive_read_class=ArchiveRead, passphrase=None, + header_codec='utf-8', ): """Read an archive using a custom function. 
""" @@ -79,12 +82,13 @@ def custom_reader( if seek_func: ffi.read_set_seek_callback(archive_p, seek_cb) ffi.read_open(archive_p, None, open_cb, read_cb, close_cb) - yield archive_read_class(archive_p) + yield archive_read_class(archive_p, header_codec) @contextmanager def fd_reader( fd, format_name='all', filter_name='all', block_size=4096, passphrase=None, + header_codec='utf-8', ): """Read an archive from a file descriptor. """ @@ -94,12 +98,13 @@ def fd_reader( except (OSError, AttributeError): # pragma: no cover pass ffi.read_open_fd(archive_p, fd, block_size) - yield ArchiveRead(archive_p) + yield ArchiveRead(archive_p, header_codec) @contextmanager def file_reader( path, format_name='all', filter_name='all', block_size=4096, passphrase=None, + header_codec='utf-8', ): """Read an archive from a file. """ @@ -109,22 +114,25 @@ def file_reader( except (OSError, AttributeError): # pragma: no cover pass ffi.read_open_filename_w(archive_p, path, block_size) - yield ArchiveRead(archive_p) + yield ArchiveRead(archive_p, header_codec) @contextmanager -def memory_reader(buf, format_name='all', filter_name='all', passphrase=None): +def memory_reader( + buf, format_name='all', filter_name='all', passphrase=None, + header_codec='utf-8', +): """Read an archive from memory. """ with new_archive_read(format_name, filter_name, passphrase) as archive_p: ffi.read_open_memory(archive_p, cast(buf, c_void_p), len(buf)) - yield ArchiveRead(archive_p) + yield ArchiveRead(archive_p, header_codec) @contextmanager def stream_reader( stream, format_name='all', filter_name='all', block_size=page_size, - passphrase=None, + passphrase=None, header_codec='utf-8', ): """Read an archive from a stream. 
@@ -158,7 +166,7 @@ def seek_func(archive_p, context, offset, whence): if stream.seekable(): ffi.read_set_seek_callback(archive_p, seek_cb) ffi.read_open(archive_p, None, open_cb, read_cb, close_cb) - yield ArchiveRead(archive_p) + yield ArchiveRead(archive_p, header_codec) seekable_stream_reader = stream_reader diff --git a/libarchive/write.py b/libarchive/write.py index f03b565..7ba191d 100644 --- a/libarchive/write.py +++ b/libarchive/write.py @@ -30,8 +30,9 @@ def new_archive_read_disk(path, flags=0, lookup=False): class ArchiveWrite: - def __init__(self, archive_p): + def __init__(self, archive_p, header_codec='utf-8'): self._pointer = archive_p + self.header_codec = header_codec def add_entries(self, entries): """Add the given entries to the archive. @@ -74,7 +75,7 @@ def add_files( if block_size <= 0: block_size = 10240 # pragma: no cover - entry = ArchiveEntry() + entry = ArchiveEntry(header_codec=self.header_codec) entry_p = entry._entry_p destination_path = attributes.pop('pathname', None) for path in paths: @@ -122,7 +123,7 @@ def add_file_from_memory( """"Add file from memory to archive. Args: - entry_path (str): the file's path + entry_path (str | bytes): the file's path entry_size (int): the file's size, in bytes entry_data (bytes | Iterable[bytes]): the file's content filetype (int): see `libarchive.entry.ArchiveEntry.modify()` @@ -140,7 +141,8 @@ def add_file_from_memory( entry = ArchiveEntry( pathname=entry_path, size=entry_size, filetype=filetype, - perm=permission, **other_attributes + perm=permission, header_codec=self.header_codec, + **other_attributes ) write_header(archive_pointer, entry._entry_p) @@ -200,6 +202,7 @@ def custom_writer( write_func, format_name, filter_name=None, open_func=None, close_func=None, block_size=page_size, archive_write_class=ArchiveWrite, options='', passphrase=None, + header_codec='utf-8', ): """Create an archive and send it in chunks to the `write_func` function. 
@@ -220,13 +223,14 @@ def write_cb_internal(archive_p, context, buffer_, length): ffi.write_set_bytes_in_last_block(archive_p, 1) ffi.write_set_bytes_per_block(archive_p, block_size) ffi.write_open(archive_p, None, open_cb, write_cb, close_cb) - yield archive_write_class(archive_p) + yield archive_write_class(archive_p, header_codec) @contextmanager def fd_writer( fd, format_name, filter_name=None, archive_write_class=ArchiveWrite, options='', passphrase=None, + header_codec='utf-8', ): """Create an archive and write it into a file descriptor. @@ -236,13 +240,14 @@ def fd_writer( with new_archive_write(format_name, filter_name, options, passphrase) as archive_p: ffi.write_open_fd(archive_p, fd) - yield archive_write_class(archive_p) + yield archive_write_class(archive_p, header_codec) @contextmanager def file_writer( filepath, format_name, filter_name=None, archive_write_class=ArchiveWrite, options='', passphrase=None, + header_codec='utf-8', ): """Create an archive and write it into a file. @@ -252,13 +257,14 @@ def file_writer( with new_archive_write(format_name, filter_name, options, passphrase) as archive_p: ffi.write_open_filename_w(archive_p, filepath) - yield archive_write_class(archive_p) + yield archive_write_class(archive_p, header_codec) @contextmanager def memory_writer( buf, format_name, filter_name=None, archive_write_class=ArchiveWrite, options='', passphrase=None, + header_codec='utf-8', ): """Create an archive and write it into a buffer. 
@@ -270,4 +276,4 @@ def memory_writer( used = byref(c_size_t()) buf_p = cast(buf, c_void_p) ffi.write_open_memory(archive_p, buf_p, len(buf), used) - yield archive_write_class(archive_p) + yield archive_write_class(archive_p, header_codec) diff --git a/tests/test_entry.py b/tests/test_entry.py index 58e2483..419cecb 100644 --- a/tests/test_entry.py +++ b/tests/test_entry.py @@ -135,3 +135,17 @@ def test_the_life_cycle_of_archive_entries(): assert type(entry3) is ArchiveEntry assert type(entry2) is PassedArchiveEntry assert type(entry1) is PassedArchiveEntry + + +def test_non_ASCII_encoding_of_file_metadata(): + buf = bytes(bytearray(100_000)) + file_name = 'README.rst' + encoded_file_name = 'README.rst'.encode('cp037') + with memory_writer(buf, 'ustar', header_codec='cp037') as archive: + archive.add_file(file_name) + with memory_reader(buf) as archive: + entry = next(iter(archive)) + assert entry.pathname == encoded_file_name + with memory_reader(buf, header_codec='cp037') as archive: + entry = next(iter(archive)) + assert entry.pathname == file_name