From d31e5c3418b284c5fcc3ef84ac62a9833bd8e3a6 Mon Sep 17 00:00:00 2001 From: Chlod Alejandro Date: Sun, 26 May 2024 15:39:08 +0800 Subject: [PATCH] Skip early when downloading existing file #614 moved the skip checks to after the response headers have been received, which drastically slows down the download process if the file already exists or the file has an equal checksum. Since the file name and checksum are already known prior to download, these checks should remain at the start to avoid having to make a request which would eventually be discarded anyway. --- internetarchive/files.py | 51 ++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/internetarchive/files.py b/internetarchive/files.py index 020b9fa9..7b43b9f3 100644 --- a/internetarchive/files.py +++ b/internetarchive/files.py @@ -226,6 +226,25 @@ def download(# noqa: max-complexity=38 parent_dir = os.path.dirname(file_path) + # Check if we should skip... + if not return_responses and os.path.exists(file_path.encode('utf-8')): + if ignore_existing: + msg = f'skipping {file_path}, file already exists.' + log.info(msg) + if verbose: + print(f' {msg}', file=sys.stderr) + return + elif checksum: + with open(file_path, 'rb') as fp: + md5_sum = utils.get_md5(fp) + + if md5_sum == self.md5: + msg = f'skipping {file_path}, file already exists based on checksum.' + log.info(msg) + if verbose: + print(f' {msg}', file=sys.stderr) + return + # Retry loop while True: try: @@ -256,35 +275,17 @@ def download(# noqa: max-complexity=38 response.raise_for_status() - # Check if we should skip... - if not return_responses and os.path.exists(file_path.encode('utf-8')): - if ignore_existing: - msg = f'skipping {file_path}, file already exists.' 
- log.info(msg) - if verbose: - print(f' {msg}', file=sys.stderr) - return - elif checksum: - with open(file_path, 'rb') as fp: - md5_sum = utils.get_md5(fp) - - if md5_sum == self.md5: - msg = f'skipping {file_path}, file already exists based on checksum.' + # Check if we should skip based on last modified time... + if not fileobj and not return_responses and os.path.exists(file_path.encode('utf-8')): + st = os.stat(file_path.encode('utf-8')) + if st.st_mtime == last_mod_mtime: + if self.name == f'{self.identifier}_files.xml' or (st.st_size == self.size): + msg = (f'skipping {file_path}, file already exists based on ' + 'length and date.') log.info(msg) if verbose: print(f' {msg}', file=sys.stderr) return - elif not fileobj: - st = os.stat(file_path.encode('utf-8')) - if st.st_mtime == last_mod_mtime: - if self.name == f'{self.identifier}_files.xml' \ - or (st.st_size == self.size): - msg = (f'skipping {file_path}, file already exists based on ' - 'length and date.') - log.info(msg) - if verbose: - print(f' {msg}', file=sys.stderr) - return elif return_responses: return response