From c5a18f3d2beb95963db62461a0757f244b8ca6b3 Mon Sep 17 00:00:00 2001 From: Dobatymo Date: Thu, 28 May 2020 11:59:52 +0800 Subject: [PATCH] WIP fix derive logic and redundant hash calc removed all counting whatsoever and just queue the derive in the end --- internetarchive/item.py | 40 ++++++++++----------------------- internetarchive/utils.py | 48 ---------------------------------------- 2 files changed, 12 insertions(+), 76 deletions(-) diff --git a/internetarchive/item.py b/internetarchive/item.py index 154a5116..104f5dd4 100644 --- a/internetarchive/item.py +++ b/internetarchive/item.py @@ -49,7 +49,7 @@ from requests.exceptions import HTTPError from internetarchive.utils import IdentifierListAsItems, get_md5, chunk_generator, \ - IterableToFileAdapter, iter_directory, recursive_file_count, norm_filepath + IterableToFileAdapter, iter_directory, norm_filepath from internetarchive.files import File from internetarchive.iarequest import MetadataRequest, S3Request from internetarchive.auth import S3Auth @@ -1159,45 +1159,32 @@ def upload(self, files, """ queue_derive = True if queue_derive is None else queue_derive remote_dir_name = None - total_files = None + if isinstance(files, dict): if files.get('name'): files = [files] - total_files = 1 else: files = list(files.items()) if not isinstance(files, (list, tuple)): files = [files] - if all(isinstance(f, dict) and f.get('name') for f in files): - total_files = len(files) responses = [] file_index = 0 - if queue_derive and total_files is None: - if checksum: - total_files = recursive_file_count(files, item=self, checksum=True) - else: - total_files = recursive_file_count(files, item=self, checksum=False) file_metadata = None for f in files: + if isinstance(f, dict): if f.get('name'): file_metadata = f.copy() del file_metadata['name'] f = f['name'] + if (isinstance(f, string_types) and is_dir(f)) \ or (isinstance(f, tuple) and is_dir(f[-1])): if isinstance(f, tuple): remote_dir_name = f[0].strip('/') f = f[-1] for filepath, key in iter_directory(f): - file_index += 1 - # Set derive header if queue_derive is True, - # and this is the last request being made. - if queue_derive is True and file_index >= total_files: - _queue_derive = True - else: - _queue_derive = False if not f.endswith('/'): if remote_dir_name: key = '{0}{1}/{2}'.format(remote_dir_name, f, key) @@ -1213,7 +1200,7 @@ def upload(self, files, headers=headers, access_key=access_key, secret_key=secret_key, - queue_derive=_queue_derive, + queue_derive=False, verbose=verbose, verify=verify, checksum=checksum, @@ -1225,15 +1212,6 @@ def upload(self, files, request_kwargs=request_kwargs) responses.append(resp) else: - file_index += 1 - # Set derive header if queue_derive is True, - # and this is the last request being made. - # if queue_derive is True and file_index >= len(files): - if queue_derive is True and file_index >= total_files: - _queue_derive = True - else: - _queue_derive = False - if not isinstance(f, (list, tuple)): key, body = (None, f) else: @@ -1247,7 +1225,7 @@ def upload(self, files, headers=headers, access_key=access_key, secret_key=secret_key, - queue_derive=_queue_derive, + queue_derive=False, verbose=verbose, verify=verify, checksum=checksum, @@ -1258,6 +1236,12 @@ def upload(self, files, validate_identifier=validate_identifier, request_kwargs=request_kwargs) responses.append(resp) + + if queue_derive: + # Came this far without any exceptions raised, so all uploads + # probably completed successfully. Derive now. + self.derive() + return responses diff --git a/internetarchive/utils.py b/internetarchive/utils.py index a11bf1a6..70f1ebbd 100644 --- a/internetarchive/utils.py +++ b/internetarchive/utils.py @@ -222,54 +222,6 @@ def iter_directory(directory): yield (filepath, key) -def recursive_file_count(files, item=None, checksum=False): - """Given a filepath or list of filepaths, return the total number of files.""" - if not isinstance(files, (list, set)): - files = [files] - total_files = 0 - if checksum is True: - md5s = [f.get('md5') for f in item.files] - else: - md5s = list() - if isinstance(files, dict): - # make sure to use local filenames. - _files = files.values() - else: - if isinstance(files[0], tuple): - _files = dict(files).values() - else: - _files = files - for f in _files: - try: - is_dir = os.path.isdir(f) - except TypeError: - try: - f = f[0] - is_dir = os.path.isdir(f) - except (AttributeError, TypeError): - is_dir = False - if is_dir: - for x, _ in iter_directory(f): - if checksum is True: - with open(x, 'rb') as fh: - lmd5 = get_md5(fh) - if lmd5 in md5s: - continue - total_files += 1 - else: - if checksum is True: - try: - with open(f, 'rb') as fh: - lmd5 = get_md5(fh) - except TypeError: - # Support file-like objects. - lmd5 = get_md5(f) - if lmd5 in md5s: - continue - total_files += 1 - return total_files - - def is_dir(obj): """Special is_dir function to handle file-like object cases that cannot be stat'd"""