add support for different file metadata encodings (#125)
Changaco authored Jul 4, 2023
1 parent fbf0362 commit ed0e591
Showing 6 changed files with 135 additions and 38 deletions.
15 changes: 15 additions & 0 deletions README.rst
@@ -114,6 +114,21 @@ and the optional third argument is the compression format (called “filter” in
libarchive). The acceptable values are listed in ``libarchive.ffi.WRITE_FORMATS``
and ``libarchive.ffi.WRITE_FILTERS``.

File metadata codecs
--------------------

By default, UTF-8 is used to read and write file attributes from and to archives.
A different codec can be specified through the ``header_codec`` arguments of the
``*_reader`` and ``*_writer`` functions. Example::

    with libarchive.file_writer('test.tar', 'ustar', header_codec='cp037') as archive:
        ...
    with libarchive.file_reader('test.tar', header_codec='cp037') as archive:
        ...

In addition to file paths (``pathname`` and ``linkpath``), the specified codec is
used to encode and decode user and group names (``uname`` and ``gname``).
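
For example, a minimal sketch that reads back the archive written above and prints
the decoded attributes::

    with libarchive.file_reader('test.tar', header_codec='cp037') as archive:
        for entry in archive:
            print(entry.pathname, entry.uname, entry.gname)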

License
=======

90 changes: 69 additions & 21 deletions libarchive/entry.py
@@ -1,5 +1,5 @@
from contextlib import contextmanager
from ctypes import c_char_p, create_string_buffer
from ctypes import create_string_buffer
from enum import IntEnum
import math

@@ -34,15 +34,19 @@ def format_time(seconds, nanos):

class ArchiveEntry:

__slots__ = ('_archive_p', '_entry_p')
__slots__ = ('_archive_p', '_entry_p', 'header_codec')

def __init__(self, archive_p=None, **attributes):
def __init__(self, archive_p=None, header_codec='utf-8', **attributes):
"""Allocate memory for an `archive_entry` struct.
The attributes are passed to the `modify` method.
The `header_codec` is used to decode and encode file paths and other
attributes.
The `**attributes` are passed to the `modify` method.
"""
self._archive_p = archive_p
self._entry_p = ffi.entry_new()
self.header_codec = header_codec
if attributes:
self.modify(**attributes)

@@ -54,7 +58,7 @@ def __str__(self):
"""Returns the file's path"""
return self.pathname

def modify(self, **attributes):
def modify(self, header_codec=None, **attributes):
"""Convenience method to modify the entry's attributes.
Args:
@@ -83,6 +87,8 @@ def modify(self, **attributes):
rdevmajor (int): major part of the device number
rdevminor (int): minor part of the device number
"""
if header_codec:
self.header_codec = header_codec
for name, value in attributes.items():
setattr(self, name, value)

@@ -112,23 +118,45 @@ def gid(self, gid):

@property
def uname(self):
return ffi.entry_uname_w(self._entry_p)
uname = ffi.entry_uname_w(self._entry_p)
if not uname:
uname = ffi.entry_uname(self._entry_p)
if uname is not None:
try:
uname = uname.decode(self.header_codec)
except UnicodeError:
pass
return uname

@uname.setter
def uname(self, value):
if not isinstance(value, bytes):
value = value.encode('utf8')
ffi.entry_update_uname_utf8(self._entry_p, value)
value = value.encode(self.header_codec)
if self.header_codec == 'utf-8':
ffi.entry_update_uname_utf8(self._entry_p, value)
else:
ffi.entry_copy_uname(self._entry_p, value)

@property
def gname(self):
return ffi.entry_gname_w(self._entry_p)
gname = ffi.entry_gname_w(self._entry_p)
if not gname:
gname = ffi.entry_gname(self._entry_p)
if gname is not None:
try:
gname = gname.decode(self.header_codec)
except UnicodeError:
pass
return gname

@gname.setter
def gname(self, value):
if not isinstance(value, bytes):
value = value.encode('utf8')
ffi.entry_update_gname_utf8(self._entry_p, value)
value = value.encode(self.header_codec)
if self.header_codec == 'utf-8':
ffi.entry_update_gname_utf8(self._entry_p, value)
else:
ffi.entry_copy_gname(self._entry_p, value)

def get_blocks(self, block_size=ffi.page_size):
"""Read the file's content, keeping only one chunk in memory at a time.
@@ -294,28 +322,48 @@ def pathname(self):
path = ffi.entry_pathname_w(self._entry_p)
if not path:
path = ffi.entry_pathname(self._entry_p)
try:
path = path.decode()
except UnicodeError:
pass
if path is not None:
try:
path = path.decode(self.header_codec)
except UnicodeError:
pass
return path

@pathname.setter
def pathname(self, value):
if not isinstance(value, bytes):
value = value.encode('utf8')
ffi.entry_update_pathname_utf8(self._entry_p, c_char_p(value))
value = value.encode(self.header_codec)
if self.header_codec == 'utf-8':
ffi.entry_update_pathname_utf8(self._entry_p, value)
else:
ffi.entry_copy_pathname(self._entry_p, value)

@property
def linkpath(self):
return (ffi.entry_symlink_w(self._entry_p) or
path = (
(
ffi.entry_symlink_w(self._entry_p) or
ffi.entry_symlink(self._entry_p)
) if self.issym else (
ffi.entry_hardlink_w(self._entry_p) or
ffi.entry_symlink(self._entry_p) or
ffi.entry_hardlink(self._entry_p))
ffi.entry_hardlink(self._entry_p)
)
)
if isinstance(path, bytes):
try:
path = path.decode(self.header_codec)
except UnicodeError:
pass
return path

@linkpath.setter
def linkpath(self, value):
ffi.entry_update_link_utf8(self._entry_p, value)
if not isinstance(value, bytes):
value = value.encode(self.header_codec)
if self.header_codec == 'utf-8':
ffi.entry_update_link_utf8(self._entry_p, value)
else:
ffi.entry_copy_link(self._entry_p, value)

# aliases for compatibility with the standard `tarfile` module
path = property(pathname.fget, pathname.fset, doc="alias of pathname")
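
For illustration, a hedged sketch of how the codec-aware accessors above might be
exercised; constructing an ArchiveEntry directly, outside of any reader or writer,
is assumed here purely for demonstration:

    from libarchive.entry import ArchiveEntry

    # Standalone entry whose textual attributes are encoded with EBCDIC
    # (cp037) instead of the default UTF-8.
    entry = ArchiveEntry(header_codec='cp037', pathname='data.txt')
    entry.modify(uname='jdoe', gname='staff')

    # The setters encoded the values with cp037 (via the entry_copy_* calls);
    # the getters decode them back with the same codec.
    print(entry.pathname, entry.uname, entry.gname)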
6 changes: 6 additions & 0 deletions libarchive/ffi.py
@@ -200,6 +200,8 @@ def get_write_filter_function(filter_name):
ffi('entry_rdevminor', [c_archive_entry_p], c_uint)
ffi('entry_uid', [c_archive_entry_p], c_longlong)
ffi('entry_gid', [c_archive_entry_p], c_longlong)
ffi('entry_uname', [c_archive_entry_p], c_char_p)
ffi('entry_gname', [c_archive_entry_p], c_char_p)
ffi('entry_uname_w', [c_archive_entry_p], c_wchar_p)
ffi('entry_gname_w', [c_archive_entry_p], c_wchar_p)

@@ -222,9 +224,13 @@ def get_write_filter_function(filter_name):
ffi('entry_unset_ctime', [c_archive_entry_p], None)
ffi('entry_unset_birthtime', [c_archive_entry_p], None)

ffi('entry_copy_pathname', [c_archive_entry_p, c_char_p], None)
ffi('entry_update_pathname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
ffi('entry_copy_link', [c_archive_entry_p, c_char_p], None)
ffi('entry_update_link_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
ffi('entry_copy_uname', [c_archive_entry_p, c_char_p], None)
ffi('entry_update_uname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
ffi('entry_copy_gname', [c_archive_entry_p, c_char_p], None)
ffi('entry_update_gname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)

ffi('entry_clear', [c_archive_entry_p], c_archive_entry_p)
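
The raw entry_uname/entry_gname accessors declared above return bytes (or None),
and the entry_copy_* setters take pre-encoded bytes, while the existing *_utf8 and
*_w bindings remain the UTF-8 and wide-string paths. A hedged sketch of the
resulting pattern, mirroring what entry.py does:

    from libarchive import ffi

    entry_p = ffi.entry_new()  # raw archive_entry pointer

    # Writing: pass bytes already encoded with the chosen header codec.
    ffi.entry_copy_uname(entry_p, 'jdoe'.encode('cp037'))

    # Reading: prefer the wide-string accessor, fall back to the raw bytes
    # and decode them with the same codec.
    uname = ffi.entry_uname_w(entry_p) or ffi.entry_uname(entry_p)
    if isinstance(uname, bytes):
        uname = uname.decode('cp037')
    print(uname)

    ffi.entry_free(entry_p)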
26 changes: 17 additions & 9 deletions libarchive/read.py
@@ -12,16 +12,18 @@

class ArchiveRead:

def __init__(self, archive_p):
def __init__(self, archive_p, header_codec='utf-8'):
self._pointer = archive_p
self.header_codec = header_codec

def __iter__(self):
"""Iterates through an archive's entries.
"""
archive_p = self._pointer
header_codec = self.header_codec
read_next_header2 = ffi.read_next_header2
while 1:
entry = ArchiveEntry(archive_p)
entry = ArchiveEntry(archive_p, header_codec)
r = read_next_header2(archive_p, entry._entry_p)
if r == ARCHIVE_EOF:
return
@@ -68,6 +70,7 @@ def custom_reader(
read_func, format_name='all', filter_name='all',
open_func=None, seek_func=None, close_func=None,
block_size=page_size, archive_read_class=ArchiveRead, passphrase=None,
header_codec='utf-8',
):
"""Read an archive using a custom function.
"""
@@ -79,12 +82,13 @@
if seek_func:
ffi.read_set_seek_callback(archive_p, seek_cb)
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
yield archive_read_class(archive_p)
yield archive_read_class(archive_p, header_codec)


@contextmanager
def fd_reader(
fd, format_name='all', filter_name='all', block_size=4096, passphrase=None,
header_codec='utf-8',
):
"""Read an archive from a file descriptor.
"""
@@ -94,12 +98,13 @@ def fd_reader(
except (OSError, AttributeError): # pragma: no cover
pass
ffi.read_open_fd(archive_p, fd, block_size)
yield ArchiveRead(archive_p)
yield ArchiveRead(archive_p, header_codec)


@contextmanager
def file_reader(
path, format_name='all', filter_name='all', block_size=4096, passphrase=None,
header_codec='utf-8',
):
"""Read an archive from a file.
"""
@@ -109,22 +114,25 @@ def file_reader(
except (OSError, AttributeError): # pragma: no cover
pass
ffi.read_open_filename_w(archive_p, path, block_size)
yield ArchiveRead(archive_p)
yield ArchiveRead(archive_p, header_codec)


@contextmanager
def memory_reader(buf, format_name='all', filter_name='all', passphrase=None):
def memory_reader(
buf, format_name='all', filter_name='all', passphrase=None,
header_codec='utf-8',
):
"""Read an archive from memory.
"""
with new_archive_read(format_name, filter_name, passphrase) as archive_p:
ffi.read_open_memory(archive_p, cast(buf, c_void_p), len(buf))
yield ArchiveRead(archive_p)
yield ArchiveRead(archive_p, header_codec)


@contextmanager
def stream_reader(
stream, format_name='all', filter_name='all', block_size=page_size,
passphrase=None,
passphrase=None, header_codec='utf-8',
):
"""Read an archive from a stream.
@@ -158,7 +166,7 @@ def seek_func(archive_p, context, offset, whence):
if stream.seekable():
ffi.read_set_seek_callback(archive_p, seek_cb)
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
yield ArchiveRead(archive_p)
yield ArchiveRead(archive_p, header_codec)


seekable_stream_reader = stream_reader
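
To tie the reader changes together, a hedged usage sketch (the archive name is
hypothetical); every *_reader helper now forwards header_codec to the ArchiveEntry
objects it yields:

    import libarchive

    # stream_reader (like fd_reader, memory_reader and custom_reader) accepts
    # the same header_codec keyword and passes it on to every entry, so
    # pathname, linkpath, uname and gname are decoded with that codec.
    with open('test.tar', 'rb') as stream:
        with libarchive.stream_reader(stream, header_codec='cp037') as archive:
            for entry in archive:
                print(entry.pathname, entry.linkpath)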
(Diffs for the remaining 2 changed files are not shown.)
