From 1c4e9c53a530ae68539db61551d5476fa1eecbde Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Tue, 13 Feb 2024 19:45:37 +0000 Subject: [PATCH] Fix automatic size hint on uploads When uploading without a size hint (via headers or `--size-hint`), an x-archive-size-hint header is added automatically. However, prior to this commit, the value was the individual file size on each file's `PUT` request. This effectively made the size hint useless because it does not, in fact, provide a size hint for the total item size to the S3 backend at item creation time. Notes on detailed changes to implement this: * Rename `internetarchive.utils.recursive_file_count` to `recursive_file_count_and_size`, adding a wrapper for backwards compatibility * Add support for paths (rather than only file-like objects) to `internetarchive.utils.get_file_size` * Add an `internetarchive.utils.is_filelike_obj` helper function * Fix a bug introduced by 62c85133090e21659f09964fedd4c18ae4a27483 where `total_files` would never be `None` and so `recursive_file_count` was never called, possibly leading to incorrect derive queueing. 
* Add tests for the fixed behaviour --- internetarchive/item.py | 14 +++++---- internetarchive/utils.py | 58 +++++++++++++++++++++++++------------ tests/cli/test_ia_upload.py | 21 ++++++++++++++ tests/test_item.py | 34 ++++++++++++++++++++++ 4 files changed, 102 insertions(+), 25 deletions(-) diff --git a/internetarchive/item.py b/internetarchive/item.py index 1cd3cfe0..929805db 100644 --- a/internetarchive/item.py +++ b/internetarchive/item.py @@ -57,7 +57,7 @@ iter_directory, json, norm_filepath, - recursive_file_count, + recursive_file_count_and_size, validate_s3_identifier, ) @@ -1194,11 +1194,13 @@ def upload(self, files, responses = [] file_index = 0 - if queue_derive and total_files is None: - if checksum: - total_files = recursive_file_count(files, item=self, checksum=True) - else: - total_files = recursive_file_count(files, item=self, checksum=False) + headers = headers or {} + if (queue_derive or not headers.get('x-archive-size-hint')) and total_files == 0: + total_files, total_size = recursive_file_count_and_size(files, + item=self, + checksum=checksum) + if not headers.get('x-archive-size-hint'): + headers['x-archive-size-hint'] = str(total_size) file_metadata = None for f in files: if isinstance(f, dict): diff --git a/internetarchive/utils.py b/internetarchive/utils.py index 38b09546..e9d17206 100644 --- a/internetarchive/utils.py +++ b/internetarchive/utils.py @@ -216,15 +216,19 @@ def _get_tag_text(tag_name, xml_obj): def get_file_size(file_obj) -> int | None: - try: - file_obj.seek(0, os.SEEK_END) - size = file_obj.tell() - # Avoid OverflowError. - if size > sys.maxsize: + if is_filelike_obj(file_obj): + try: + file_obj.seek(0, os.SEEK_END) + size = file_obj.tell() + # Avoid OverflowError. 
+ if size > sys.maxsize: + size = None + file_obj.seek(0, os.SEEK_SET) + except OSError: size = None - file_obj.seek(0, os.SEEK_SET) - except OSError: - size = None + else: + st = os.stat(file_obj) + size = st.st_size return size @@ -237,11 +241,14 @@ def iter_directory(directory: str): yield (filepath, key) -def recursive_file_count(files, item=None, checksum=False): - """Given a filepath or list of filepaths, return the total number of files.""" +def recursive_file_count_and_size(files, item=None, checksum=False): + """Given a filepath or list of filepaths, return the total number and size of files. + If `checksum` is `True`, skip over files whose MD5 hash matches any file in the `item`. + """ if not isinstance(files, (list, set)): files = [files] total_files = 0 + total_size = 0 if checksum is True: md5s = [f.get('md5') for f in item.files] else: @@ -264,24 +271,27 @@ def recursive_file_count(files, item=None, checksum=False): except (AttributeError, TypeError): is_dir = False if is_dir: - for x, _ in iter_directory(f): - if checksum is True: - with open(x, 'rb') as fh: - lmd5 = get_md5(fh) - if lmd5 in md5s: - continue - total_files += 1 + it = iter_directory(f) else: + it = [(f, None)] + for x, _ in it: if checksum is True: try: - with open(f, 'rb') as fh: + with open(x, 'rb') as fh: lmd5 = get_md5(fh) except TypeError: # Support file-like objects. 
- lmd5 = get_md5(f) + lmd5 = get_md5(x) if lmd5 in md5s: continue + total_size += get_file_size(x) total_files += 1 + return total_files, total_size + + +def recursive_file_count(*args, **kwargs): + """Like `recursive_file_count_and_size`, but returns only the file count.""" + total_files, _ = recursive_file_count_and_size(*args, **kwargs) return total_files @@ -294,6 +304,16 @@ def is_dir(obj) -> bool: return False +def is_filelike_obj(obj) -> bool: + """Distinguish file-like from path-like objects""" + try: + os.fspath(obj) + except TypeError: + return True + else: + return False + + def reraise_modify( caught_exc: Exception, append_msg: str, diff --git a/tests/cli/test_ia_upload.py b/tests/cli/test_ia_upload.py index 3255d981..3d876085 100644 --- a/tests/cli/test_ia_upload.py +++ b/tests/cli/test_ia_upload.py @@ -124,6 +124,27 @@ def test_ia_upload_size_hint(capsys, tmpdir_ch, nasa_mocker): assert 'Accept-Encoding:gzip, deflate' in err +def test_ia_upload_automatic_size_hint_files(capsys, tmpdir_ch, nasa_mocker): + with open('foo', 'w') as fh: + fh.write('foo') + with open('bar', 'w') as fh: + fh.write('bar') + + ia_call(['ia', 'upload', '--debug', 'nasa', 'foo', 'bar']) + out, err = capsys.readouterr() + assert 'x-archive-size-hint:6' in err + +def test_ia_upload_automatic_size_hint_dir(capsys, tmpdir_ch, nasa_mocker): + with open('foo', 'w') as fh: + fh.write('foo') + with open('bar', 'w') as fh: + fh.write('bar') + + ia_call(['ia', 'upload', '--debug', 'nasa', '.']) + out, err = capsys.readouterr() + assert 'x-archive-size-hint:6' in err + + def test_ia_upload_unicode(tmpdir_ch, caplog): with open('தமிழ் - baz ∆.txt', 'w') as fh: fh.write('unicode foo') diff --git a/tests/test_item.py b/tests/test_item.py index 9ae7d7c7..adaf0bcd 100644 --- a/tests/test_item.py +++ b/tests/test_item.py @@ -580,6 +580,40 @@ def test_upload_checksum(tmpdir, nasa_item): assert r.status_code is None +def test_upload_automatic_size_hint(tmpdir, nasa_item): + with 
IaRequestsMock(assert_all_requests_are_fired=False) as rsps: + _expected_headers = deepcopy(EXPECTED_S3_HEADERS) + del _expected_headers['x-archive-size-hint'] + _expected_headers['x-archive-size-hint'] = '15' + rsps.add(responses.PUT, S3_URL_RE, + adding_headers=_expected_headers) + + files = [] + with open(os.path.join(tmpdir, 'file'), 'w') as fh: + fh.write('a') + files.append(os.path.join(tmpdir, 'file')) + + os.mkdir(os.path.join(tmpdir, 'dir')) + with open(os.path.join(tmpdir, 'dir', 'file0'), 'w') as fh: + fh.write('bb') + with open(os.path.join(tmpdir, 'dir', 'file1'), 'w') as fh: + fh.write('cccc') + files.append(os.path.join(tmpdir, 'dir')) + + with open(os.path.join(tmpdir, 'obj'), 'wb') as fh: + fh.write(b'dddddddd') + fh.seek(0, os.SEEK_SET) + files.append(fh) + + _responses = nasa_item.upload(files, + access_key='a', + secret_key='b') + for r in _responses: + headers = {k.lower(): str(v) for k, v in r.headers.items()} + del headers['content-type'] + assert headers == _expected_headers + + def test_modify_metadata(nasa_item, nasa_metadata): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.POST, f'{PROTOCOL}//archive.org/metadata/nasa')