Fix automatic size hint on uploads
When uploading without a size hint (via headers or `--size-hint`), an `x-archive-size-hint` header is added automatically. However, prior to this commit, its value was the individual file size on each file's `PUT` request. This effectively made the size hint useless: it never provided the total item size to the S3 backend at item creation time.

Notes on the detailed changes made to implement this:

* Rename `internetarchive.utils.recursive_file_count` to `recursive_file_count_and_size`, adding a wrapper for backwards compatibility (a usage sketch follows these notes)
* Add support for paths (rather than only file-like objects) to `internetarchive.utils.get_file_size`
* Add an `internetarchive.utils.is_filelike_obj` helper function
* Fix a bug introduced by 62c8513 where `total_files` would never be `None`, so `recursive_file_count` was never called, possibly leading to incorrect derive queueing
* Add tests for the fixed behaviour
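
For orientation, a minimal sketch of how these pieces fit together (illustrative only; the file paths are made up):

    from internetarchive.utils import recursive_file_count_and_size

    # One pass over all files and directories yields both the file count
    # (used for derive queueing) and the total byte size (used for the hint).
    total_files, total_size = recursive_file_count_and_size(
        ['foo.txt', 'data/'], item=None, checksum=False)

    # The total item size, not each file's own size, is what every PUT
    # request now carries as the hint.
    headers = {'x-archive-size-hint': str(total_size)}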
JustAnotherArchivist committed Feb 13, 2024
1 parent 6e23216 commit 1c4e9c5
Showing 4 changed files with 102 additions and 25 deletions.
internetarchive/item.py (8 additions, 6 deletions)

@@ -57,7 +57,7 @@
     iter_directory,
     json,
     norm_filepath,
-    recursive_file_count,
+    recursive_file_count_and_size,
     validate_s3_identifier,
 )
@@ -1194,11 +1194,13 @@ def upload(self, files,
 
         responses = []
         file_index = 0
-        if queue_derive and total_files is None:
-            if checksum:
-                total_files = recursive_file_count(files, item=self, checksum=True)
-            else:
-                total_files = recursive_file_count(files, item=self, checksum=False)
+        headers = headers or {}
+        if (queue_derive or not headers.get('x-archive-size-hint')) and total_files == 0:
+            total_files, total_size = recursive_file_count_and_size(files,
+                                                                    item=self,
+                                                                    checksum=checksum)
+            if not headers.get('x-archive-size-hint'):
+                headers['x-archive-size-hint'] = str(total_size)
         file_metadata = None
         for f in files:
             if isinstance(f, dict):
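
The practical effect on the `upload()` API, as a hedged sketch (the identifier, paths, and keys are placeholders; an explicitly supplied hint always wins):

    from internetarchive import get_item

    item = get_item('nasa')

    # No x-archive-size-hint given: the total size of foo.txt plus
    # everything under data/ is computed once and sent on each PUT.
    item.upload(['foo.txt', 'data/'], access_key='a', secret_key='b')

    # A caller-supplied hint is passed through unchanged.
    item.upload(['foo.txt'],
                headers={'x-archive-size-hint': '12345'},
                access_key='a', secret_key='b')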
internetarchive/utils.py (39 additions, 19 deletions)

@@ -216,15 +216,19 @@ def _get_tag_text(tag_name, xml_obj):
 
 
 def get_file_size(file_obj) -> int | None:
-    try:
-        file_obj.seek(0, os.SEEK_END)
-        size = file_obj.tell()
-        # Avoid OverflowError.
-        if size > sys.maxsize:
+    if is_filelike_obj(file_obj):
+        try:
+            file_obj.seek(0, os.SEEK_END)
+            size = file_obj.tell()
+            # Avoid OverflowError.
+            if size > sys.maxsize:
+                size = None
+            file_obj.seek(0, os.SEEK_SET)
+        except OSError:
             size = None
-        file_obj.seek(0, os.SEEK_SET)
-    except OSError:
-        size = None
+    else:
+        st = os.stat(file_obj)
+        size = st.st_size
     return size
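
A sketch of the extended `get_file_size` contract for both input kinds (the path is hypothetical):

    import io
    from internetarchive.utils import get_file_size

    # File-like object: size is found by seeking to the end; the read
    # position is restored afterwards.
    buf = io.BytesIO(b'dddddddd')
    assert get_file_size(buf) == 8
    assert buf.tell() == 0

    # Path: previously this raised AttributeError (no seek()/tell()
    # methods on a str); now it is measured with os.stat().
    size = get_file_size('data/foo.txt')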


@@ -237,11 +241,14 @@ def iter_directory(directory: str):
             yield (filepath, key)
 
 
-def recursive_file_count(files, item=None, checksum=False):
-    """Given a filepath or list of filepaths, return the total number of files."""
+def recursive_file_count_and_size(files, item=None, checksum=False):
+    """Given a filepath or list of filepaths, return the total number and size of files.
+    If `checksum` is `True`, skip over files whose MD5 hash matches any file in the `item`.
+    """
     if not isinstance(files, (list, set)):
         files = [files]
     total_files = 0
+    total_size = 0
     if checksum is True:
         md5s = [f.get('md5') for f in item.files]
     else:
@@ -264,24 +271,27 @@
         except (AttributeError, TypeError):
             is_dir = False
         if is_dir:
-            for x, _ in iter_directory(f):
-                if checksum is True:
-                    with open(x, 'rb') as fh:
-                        lmd5 = get_md5(fh)
-                    if lmd5 in md5s:
-                        continue
-                total_files += 1
+            it = iter_directory(f)
         else:
+            it = [(f, None)]
+        for x, _ in it:
             if checksum is True:
                 try:
-                    with open(f, 'rb') as fh:
+                    with open(x, 'rb') as fh:
                         lmd5 = get_md5(fh)
                 except TypeError:
                     # Support file-like objects.
-                    lmd5 = get_md5(f)
+                    lmd5 = get_md5(x)
                 if lmd5 in md5s:
                     continue
+            total_size += get_file_size(x)
             total_files += 1
-    return total_files
+    return total_files, total_size
+
+
+def recursive_file_count(*args, **kwargs):
+    """Like `recursive_file_count_and_size`, but returns only the file count."""
+    total_files, _ = recursive_file_count_and_size(*args, **kwargs)
+    return total_files
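
The wrapper keeps the old name and return shape, so existing callers need no changes; for instance (the path is again a placeholder):

    from internetarchive.utils import (recursive_file_count,
                                       recursive_file_count_and_size)

    count, size = recursive_file_count_and_size(['foo.txt'])  # new API
    assert recursive_file_count(['foo.txt']) == count         # old API, count only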


@@ -294,6 +304,16 @@ def is_dir(obj) -> bool:
         return False
 
 
+def is_filelike_obj(obj) -> bool:
+    """Distinguish file-like from path-like objects"""
+    try:
+        os.fspath(obj)
+    except TypeError:
+        return True
+    else:
+        return False
+
+
 def reraise_modify(
     caught_exc: Exception,
     append_msg: str,
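
The helper's trick is that `os.fspath()` succeeds exactly for path-like objects (`str`, `bytes`, or anything implementing `__fspath__`) and raises `TypeError` for everything else:

    import io
    import pathlib
    from internetarchive.utils import is_filelike_obj

    assert is_filelike_obj(io.BytesIO(b'x'))        # no __fspath__: file-like
    assert not is_filelike_obj('foo.txt')           # str is path-like
    assert not is_filelike_obj(pathlib.Path('.'))   # implements __fspath__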
tests/cli/test_ia_upload.py (21 additions, 0 deletions)

@@ -124,6 +124,27 @@ def test_ia_upload_size_hint(capsys, tmpdir_ch, nasa_mocker):
     assert 'Accept-Encoding:gzip, deflate' in err
 
 
+def test_ia_upload_automatic_size_hint_files(capsys, tmpdir_ch, nasa_mocker):
+    with open('foo', 'w') as fh:
+        fh.write('foo')
+    with open('bar', 'w') as fh:
+        fh.write('bar')
+
+    ia_call(['ia', 'upload', '--debug', 'nasa', 'foo', 'bar'])
+    out, err = capsys.readouterr()
+    assert 'x-archive-size-hint:6' in err
+
+def test_ia_upload_automatic_size_hint_dir(capsys, tmpdir_ch, nasa_mocker):
+    with open('foo', 'w') as fh:
+        fh.write('foo')
+    with open('bar', 'w') as fh:
+        fh.write('bar')
+
+    ia_call(['ia', 'upload', '--debug', 'nasa', '.'])
+    out, err = capsys.readouterr()
+    assert 'x-archive-size-hint:6' in err
+
+
 def test_ia_upload_unicode(tmpdir_ch, caplog):
     with open('தமிழ் - baz ∆.txt', 'w') as fh:
         fh.write('unicode foo')
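
Both tests write two 3-byte files, so the automatic hint is 6 whether the files are listed individually or discovered by walking the current directory. Assuming the project's usual pytest setup, they should be runnable in isolation with something like:

    pytest tests/cli/test_ia_upload.py -k automatic_size_hint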
tests/test_item.py (34 additions, 0 deletions)

@@ -580,6 +580,40 @@ def test_upload_checksum(tmpdir, nasa_item):
         assert r.status_code is None
 
 
+def test_upload_automatic_size_hint(tmpdir, nasa_item):
+    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
+        _expected_headers = deepcopy(EXPECTED_S3_HEADERS)
+        del _expected_headers['x-archive-size-hint']
+        _expected_headers['x-archive-size-hint'] = '15'
+        rsps.add(responses.PUT, S3_URL_RE,
+                 adding_headers=_expected_headers)
+
+        files = []
+        with open(os.path.join(tmpdir, 'file'), 'w') as fh:
+            fh.write('a')
+        files.append(os.path.join(tmpdir, 'file'))
+
+        os.mkdir(os.path.join(tmpdir, 'dir'))
+        with open(os.path.join(tmpdir, 'dir', 'file0'), 'w') as fh:
+            fh.write('bb')
+        with open(os.path.join(tmpdir, 'dir', 'file1'), 'w') as fh:
+            fh.write('cccc')
+        files.append(os.path.join(tmpdir, 'dir'))
+
+        with open(os.path.join(tmpdir, 'obj'), 'wb') as fh:
+            fh.write(b'dddddddd')
+            fh.seek(0, os.SEEK_SET)
+            files.append(fh)
+
+            _responses = nasa_item.upload(files,
+                                          access_key='a',
+                                          secret_key='b')
+        for r in _responses:
+            headers = {k.lower(): str(v) for k, v in r.headers.items()}
+            del headers['content-type']
+            assert headers == _expected_headers
+
+
 def test_modify_metadata(nasa_item, nasa_metadata):
     with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
         rsps.add(responses.POST, f'{PROTOCOL}//archive.org/metadata/nasa')
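
The expected hint of 15 is the byte total of everything queued: a 1-byte file, a directory holding 2-byte and 4-byte files, and an 8-byte file-like object (whose size is measured by seeking to its end and rewinding). The loop then checks that every `PUT` carries that same total rather than a per-file size.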
