From ea9023a0a6f5a5ea04e5a22a7b8b5974a000cae2 Mon Sep 17 00:00:00 2001 From: Benoit Boulanger Date: Wed, 19 Jul 2023 21:56:44 -0400 Subject: [PATCH 1/7] Add checksum archive file (issue #600) --- internetarchive/api.py | 6 +++++ internetarchive/cli/ia_download.py | 2 ++ internetarchive/files.py | 35 +++++++++++++++++++++++++----- internetarchive/item.py | 10 +++++++-- 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/internetarchive/api.py b/internetarchive/api.py index 8ebd295c..77703b54 100644 --- a/internetarchive/api.py +++ b/internetarchive/api.py @@ -304,6 +304,7 @@ def download( verbose: bool = False, ignore_existing: bool = False, checksum: bool = False, + checksum_archive: bool = False, destdir: str | None = None, no_directory: bool = False, retries: int | None = None, @@ -334,6 +335,10 @@ def download( :param checksum: Skip downloading file based on checksum. + :param checksum_archive: Skip downloading file based on checksum, and skip + checksum validation if it already succeeded + (will create and use _checksum_archive.txt). + :param destdir: The directory to download files to. :param no_directory: Download files to current working @@ -367,6 +372,7 @@ def download( verbose=verbose, ignore_existing=ignore_existing, checksum=checksum, + checksum_archive=checksum_archive, destdir=destdir, no_directory=no_directory, retries=retries, diff --git a/internetarchive/cli/ia_download.py b/internetarchive/cli/ia_download.py index 335bb89b..6ace3676 100644 --- a/internetarchive/cli/ia_download.py +++ b/internetarchive/cli/ia_download.py @@ -48,6 +48,7 @@ ia metadata --formats + --checksum-archive Skip files based on _checksum_archive.txt [default: False]. --on-the-fly Download on-the-fly files, as well as other matching files. on-the-fly files include derivative EPUB, MOBI and DAISY files [default: False]. 
@@ -198,6 +199,7 @@ def main(argv, session: ArchiveSession) -> None: verbose=not args['--quiet'], ignore_existing=args['--ignore-existing'], checksum=args['--checksum'], + checksum_archive=args['--checksum-archive'], destdir=args['--destdir'], no_directory=args['--no-directories'], retries=retries, diff --git a/internetarchive/files.py b/internetarchive/files.py index c8029469..51b6d239 100644 --- a/internetarchive/files.py +++ b/internetarchive/files.py @@ -137,10 +137,10 @@ def __repr__(self): f'format={self.format!r})') def download(self, file_path=None, verbose=None, ignore_existing=None, - checksum=None, destdir=None, retries=None, ignore_errors=None, - fileobj=None, return_responses=None, no_change_timestamp=None, - params=None, chunk_size=None, stdout=None, ors=None, - timeout=None): + checksum=None, checksum_archive=None, destdir=None, retries=None, + ignore_errors=None, fileobj=None, return_responses=None, + no_change_timestamp=None, params=None, chunk_size=None, stdout=None, + ors=None, timeout=None): """Download the file into the current working directory. :type file_path: str @@ -156,6 +156,11 @@ def download(self, file_path=None, verbose=None, ignore_existing=None, :type checksum: bool :param checksum: (optional) Skip downloading file based on checksum. + :type checksum_archive: bool + :param checksum_archive: (optional) Skip downloading file based on checksum, and + skip checksum validation if it already succeeded + (will create and use _checksum_archive.txt). + :type destdir: str :param destdir: (optional) The directory to download files to. 
@@ -198,6 +203,7 @@ def download(self, file_path=None, verbose=None, ignore_existing=None, verbose = False if verbose is None else verbose ignore_existing = False if ignore_existing is None else ignore_existing checksum = False if checksum is None else checksum + checksum_archive = False if checksum_archive is None else checksum_archive retries = retries or 2 ignore_errors = ignore_errors or False return_responses = return_responses or False @@ -209,6 +215,7 @@ def download(self, file_path=None, verbose=None, ignore_existing=None, file_path = file_path or self.name if destdir: + print(f"destdir: {destdir}") if return_responses is not True: try: os.mkdir(destdir) @@ -218,6 +225,20 @@ def download(self, file_path=None, verbose=None, ignore_existing=None, raise OSError(f'{destdir} is not a directory!') file_path = os.path.join(destdir, file_path) + if checksum_archive: + checksum_archive_filename = '_checksum_archive.txt' + if not os.path.exists(checksum_archive_filename): + with open(checksum_archive_filename, 'wt', encoding='utf-8') as f: + pass + with open(checksum_archive_filename, 'rt', encoding='utf-8') as f: + checksum_archive_data = f.read().splitlines() + if file_path in checksum_archive_data: + msg = f'skipping {file_path}, file already exists based on checksum_archive.' + log.info(msg) + if verbose: + print(f' {msg}', file=sys.stderr) + return + if not return_responses and os.path.exists(file_path.encode('utf-8')): if ignore_existing: msg = f'skipping {file_path}, file already exists.' 
@@ -225,7 +246,7 @@ def download(self, file_path=None, verbose=None, ignore_existing=None, if verbose: print(f' {msg}', file=sys.stderr) return - elif checksum: + elif checksum or checksum_archive: with open(file_path, 'rb') as fp: md5_sum = utils.get_md5(fp) @@ -234,6 +255,10 @@ def download(self, file_path=None, verbose=None, ignore_existing=None, log.info(msg) if verbose: print(f' {msg}', file=sys.stderr) + if checksum_archive: + # add file to checksum_archive to skip it next time + with open(checksum_archive_filename, 'a', encoding='utf-8') as f: + f.write(f'{file_path}\n') return elif not fileobj: st = os.stat(file_path.encode('utf-8')) diff --git a/internetarchive/item.py b/internetarchive/item.py index 18a2d072..a0c61a16 100644 --- a/internetarchive/item.py +++ b/internetarchive/item.py @@ -589,6 +589,7 @@ def download(self, verbose: bool = False, ignore_existing: bool = False, checksum: bool = False, + checksum_archive: bool = False, destdir: str | None = None, no_directory: bool = False, retries: int | None = None, @@ -627,6 +628,10 @@ def download(self, :param checksum: Skip downloading file based on checksum. + :param checksum_archive: Skip downloading file based on checksum, and skip + checksum validation if it already succeeded + (will create and use _checksum_archive.txt). + :param destdir: The directory to download files to. 
:param no_directory: Download files to current working @@ -670,6 +675,7 @@ def download(self, ignore_existing = bool(ignore_existing) ignore_errors = bool(ignore_errors) checksum = bool(checksum) + checksum_archive = bool(checksum_archive) no_directory = bool(no_directory) return_responses = bool(return_responses) no_change_timestamp = bool(no_change_timestamp) @@ -746,8 +752,8 @@ def download(self, ors = True else: ors = False - r = f.download(path, verbose, ignore_existing, checksum, destdir, - retries, ignore_errors, fileobj, return_responses, + r = f.download(path, verbose, ignore_existing, checksum, checksum_archive, + destdir, retries, ignore_errors, fileobj, return_responses, no_change_timestamp, params, None, stdout, ors, timeout) if return_responses: responses.append(r) From 656bf7ba529cc1c5fe7e80bc90cbdd4b6239ef68 Mon Sep 17 00:00:00 2001 From: Benoit Boulanger Date: Wed, 19 Jul 2023 22:48:53 -0400 Subject: [PATCH 2/7] Add doc for checksum archive file (issue #600) --- docs/source/quickstart.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index c9df9194..7d4b629d 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -179,6 +179,20 @@ Alternatively, you can skip files based on md5 checksums. This is will take long skipping nasa/nasa_meta.xml, file already exists based on checksum. skipping nasa/nasa_reviews.xml, file already exists based on checksum. +Furthermore, you can skip files based on md5 checksums and use a checksum_archive file. This will be faster than checksum alone because checksums will only need to be calculated once for every file already downloaded. Once calculated successfully, the item/file will be written to the checksum_archive file and succeeding runs will skip the checksum validation:: + + >>> download('nasa', verbose=True, checksum_archive=True) + nasa: + skipping nasa/__ia_thumb.jpg, file already exists based on checksum_archive. 
+ skipping nasa/globe_west_540.jpg, file already exists based on checksum_archive. + skipping nasa/globe_west_540_thumb.jpg, file already exists based on checksum_archive. + skipping nasa/nasa_archive.torrent, file already exists based on checksum_archive. + skipping nasa_files.xml: 2.56kiB [00:00, 5.76MiB/s] + skipping nasa/nasa_itemimage.jpg, file already exists based on checksum_archive. + skipping nasa/nasa_meta.sqlite, file already exists based on checksum. + skipping nasa/nasa_meta.xml, file already exists based on checksum. + skipping nasa/nasa_reviews.xml, file already exists based on checksum. + By default, the :func:`download ` function will download all of the files in an item. However, there are a couple parameters that can be used to download only specific files. Files can be filtered using the ``glob_pattern`` parameter:: >>> download('nasa', verbose=True, glob_pattern='*xml') From 5cf93beb653031dcd97654b16bbf0357bd24d139 Mon Sep 17 00:00:00 2001 From: Benoit Boulanger Date: Wed, 24 Apr 2024 00:54:40 -0400 Subject: [PATCH 3/7] Fixed logic for checksum archive validation --- internetarchive/files.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/internetarchive/files.py b/internetarchive/files.py index e0c421ea..695fe80e 100644 --- a/internetarchive/files.py +++ b/internetarchive/files.py @@ -281,14 +281,8 @@ def download( # noqa: max-complexity=38 # Check if we should skip... if not return_responses and os.path.exists(file_path.encode('utf-8')): - if ignore_existing: - msg = f'skipping {file_path}, file already exists.' 
- log.info(msg) - if verbose: - print(f' {msg}', file=sys.stderr) - return - elif checksum_archive: - checksum_archive_filename = '_checksum_archive.txt' # TODO Define this at a better place + if checksum_archive: + checksum_archive_filename = '_checksum_archive.txt' if not os.path.exists(checksum_archive_filename): with open(checksum_archive_filename, 'wt', encoding='utf-8') as f: pass @@ -300,6 +294,12 @@ def download( # noqa: max-complexity=38 if verbose: print(f' {msg}', file=sys.stderr) return + if ignore_existing: + msg = f'skipping {file_path}, file already exists.' + log.info(msg) + if verbose: + print(f' {msg}', file=sys.stderr) + return elif checksum or checksum_archive: with open(file_path, 'rb') as fp: md5_sum = utils.get_md5(fp) From 5b85676ebe6f224c9616bc1b11f4acb784f50b17 Mon Sep 17 00:00:00 2001 From: Benoit Boulanger Date: Wed, 24 Apr 2024 01:27:40 -0400 Subject: [PATCH 4/7] resolve minor linter issues --- internetarchive/api.py | 2 +- internetarchive/cli/ia_download.py | 3 ++- internetarchive/files.py | 9 ++++++--- internetarchive/item.py | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/internetarchive/api.py b/internetarchive/api.py index a70eca42..9c0e34ec 100644 --- a/internetarchive/api.py +++ b/internetarchive/api.py @@ -336,7 +336,7 @@ def download( :param checksum: Skip downloading file based on checksum. - :param checksum_archive: Skip downloading file based on checksum, and skip + :param checksum_archive: Skip downloading file based on checksum, and skip checksum validation if it already succeeded (will create and use _checksum_archive.txt). diff --git a/internetarchive/cli/ia_download.py b/internetarchive/cli/ia_download.py index 6ace3676..eb992c57 100644 --- a/internetarchive/cli/ia_download.py +++ b/internetarchive/cli/ia_download.py @@ -48,7 +48,8 @@ ia metadata --formats - --checksum-archive Skip files based on _checksum_archive.txt [default: False]. 
+ --checksum-archive Skip files based on _checksum_archive.txt + [default: False]. --on-the-fly Download on-the-fly files, as well as other matching files. on-the-fly files include derivative EPUB, MOBI and DAISY files [default: False]. diff --git a/internetarchive/files.py b/internetarchive/files.py index 695fe80e..35593292 100644 --- a/internetarchive/files.py +++ b/internetarchive/files.py @@ -284,12 +284,15 @@ def download( # noqa: max-complexity=38 if checksum_archive: checksum_archive_filename = '_checksum_archive.txt' if not os.path.exists(checksum_archive_filename): - with open(checksum_archive_filename, 'wt', encoding='utf-8') as f: + with open(checksum_archive_filename, 'w', encoding='utf-8') as f: pass - with open(checksum_archive_filename, 'rt', encoding='utf-8') as f: + with open(checksum_archive_filename, encoding='utf-8') as f: checksum_archive_data = f.read().splitlines() if file_path in checksum_archive_data: - msg = f'skipping {file_path}, file already exists based on checksum_archive.' + msg = ( + f'skipping {file_path}, ' + f'file already exists based on checksum_archive.' + ) log.info(msg) if verbose: print(f' {msg}', file=sys.stderr) diff --git a/internetarchive/item.py b/internetarchive/item.py index acc975e2..35f2d2e8 100644 --- a/internetarchive/item.py +++ b/internetarchive/item.py @@ -752,7 +752,7 @@ def download(self, ors = True else: ors = False - r = f.download(path, verbose, ignore_existing, checksum, checksum_archive, + r = f.download(path, verbose, ignore_existing, checksum, checksum_archive, destdir, retries, ignore_errors, fileobj, return_responses, no_change_timestamp, params, None, stdout, ors, timeout) if return_responses: From 41d261540b7e5b30a61bd2db251132f562f2c465 Mon Sep 17 00:00:00 2001 From: Benoit Boulanger Date: Thu, 25 Apr 2024 16:42:21 -0400 Subject: [PATCH 5/7] add tests for checksum_archive. 
make test_clobber, test_checksum and test_checksum_archive platform-independent --- tests/cli/test_ia_download.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/tests/cli/test_ia_download.py b/tests/cli/test_ia_download.py index 87ddcb78..a90db594 100644 --- a/tests/cli/test_ia_download.py +++ b/tests/cli/test_ia_download.py @@ -73,8 +73,9 @@ def test_clobber(tmpdir_ch): stdout, stderr = call_cmd(cmd) assert files_downloaded('nasa') == {'nasa_meta.xml'} - expected_stderr = ('nasa:\n' - ' skipping nasa/nasa_meta.xml, file already exists based on length and date.') + prefix = 'nasa:\n'.replace('\n', os.linesep) + filepath = os.path.join('nasa', 'nasa_meta.xml') + expected_stderr = f'{prefix} skipping {filepath}, file already exists based on length and date.' assert expected_stderr == stderr @@ -84,7 +85,31 @@ def test_checksum(tmpdir_ch): stdout, stderr = call_cmd('ia --insecure download --checksum nasa nasa_meta.xml') assert files_downloaded('nasa') == {'nasa_meta.xml'} - assert 'nasa:\n skipping nasa/nasa_meta.xml, file already exists based on checksum.' == stderr + prefix = 'nasa:\n'.replace('\n', os.linesep) + filepath = os.path.join('nasa', 'nasa_meta.xml') + assert f'{prefix} skipping {filepath}, file already exists based on checksum.' == stderr + + +def test_checksum_archive(tmpdir_ch): + call_cmd('ia --insecure download nasa nasa_meta.xml') + assert files_downloaded('nasa') == {'nasa_meta.xml'} + + stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml') + assert files_downloaded('nasa') == {'nasa_meta.xml'} + prefix = 'nasa:\n'.replace('\n', os.linesep) + filepath = os.path.join('nasa', 'nasa_meta.xml') + assert f'{prefix} skipping {filepath}, file already exists based on checksum.' 
== stderr + + assert '_checksum_archive.txt' in files_downloaded('.') + with open(os.path.join('.', '_checksum_archive.txt'), encoding='utf-8') as f: + filepath = os.path.join('nasa', 'nasa_meta.xml') + assert f.read() == f'{filepath}\n' + + stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml') + assert files_downloaded('nasa') == {'nasa_meta.xml'} + prefix = 'nasa:\n'.replace('\n', os.linesep) + filepath = os.path.join('nasa', 'nasa_meta.xml') + assert f'{prefix} skipping {filepath}, file already exists based on checksum_archive.' == stderr def test_no_directories(tmpdir_ch): From cb8694115b2cce24f34ee8847268bd0d6652685a Mon Sep 17 00:00:00 2001 From: Benoit Boulanger Date: Thu, 25 Apr 2024 16:59:33 -0400 Subject: [PATCH 6/7] increase max-args to 24, to accommodate the extra argument I had to add in item.py -> Item -> download for checksum_archive --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2ed3e1d8..827d9792 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ line-length = 102 max-complexity = 33 [tool.ruff.pylint] -max-args = 23 +max-args = 24 max-branches = 33 max-statements = 124 From c1d0867840017a22cbb57818bd5315cee540a548 Mon Sep 17 00:00:00 2001 From: jake Date: Fri, 14 Jun 2024 11:57:44 -0700 Subject: [PATCH 7/7] Don't print destdir in verbose mode --- internetarchive/files.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/internetarchive/files.py b/internetarchive/files.py index 5a28d7e5..8db330c6 100644 --- a/internetarchive/files.py +++ b/internetarchive/files.py @@ -234,8 +234,6 @@ def download( # noqa: max-complexity=38 file_path = file_path or self.name if destdir: - if verbose: - print(f"destdir: {destdir}") if return_responses is not True: try: os.mkdir(destdir)