From ea9023a0a6f5a5ea04e5a22a7b8b5974a000cae2 Mon Sep 17 00:00:00 2001
From: Benoit Boulanger <benbou@gmail.com>
Date: Wed, 19 Jul 2023 21:56:44 -0400
Subject: [PATCH 1/7] Add checksum archive file (issue #600)

---
 internetarchive/api.py             |  6 +++++
 internetarchive/cli/ia_download.py |  2 ++
 internetarchive/files.py           | 35 +++++++++++++++++++++++++-----
 internetarchive/item.py            | 10 +++++++--
 4 files changed, 46 insertions(+), 7 deletions(-)
diff --git a/internetarchive/api.py b/internetarchive/api.py
index 8ebd295c..77703b54 100644
--- a/internetarchive/api.py
+++ b/internetarchive/api.py
@@ -304,6 +304,7 @@ def download(
     verbose: bool = False,
     ignore_existing: bool = False,
     checksum: bool = False,
+    checksum_archive: bool = False,
     destdir: str | None = None,
     no_directory: bool = False,
     retries: int | None = None,
@@ -334,6 +335,10 @@ def download(
 
     :param checksum: Skip downloading file based on checksum.
 
+    :param checksum_archive: Skip downloading file based on checksum, and skip 
+                             checksum validation if it already succeeded
+                             (will create and use _checksum_archive.txt).
+
     :param destdir: The directory to download files to.
 
     :param no_directory: Download files to current working
@@ -367,6 +372,7 @@ def download(
         verbose=verbose,
         ignore_existing=ignore_existing,
         checksum=checksum,
+        checksum_archive=checksum_archive,
         destdir=destdir,
         no_directory=no_directory,
         retries=retries,
diff --git a/internetarchive/cli/ia_download.py b/internetarchive/cli/ia_download.py
index 335bb89b..6ace3676 100644
--- a/internetarchive/cli/ia_download.py
+++ b/internetarchive/cli/ia_download.py
@@ -48,6 +48,7 @@
 
                                                  ia metadata --formats <identifier>
 
+    --checksum-archive                       Skip files based on _checksum_archive.txt [default: False].
     --on-the-fly                             Download on-the-fly files, as well as other matching
                                              files. on-the-fly files include derivative EPUB, MOBI
                                              and DAISY files [default: False].
@@ -198,6 +199,7 @@ def main(argv, session: ArchiveSession) -> None:
             verbose=not args['--quiet'],
             ignore_existing=args['--ignore-existing'],
             checksum=args['--checksum'],
+            checksum_archive=args['--checksum-archive'],
             destdir=args['--destdir'],
             no_directory=args['--no-directories'],
             retries=retries,
diff --git a/internetarchive/files.py b/internetarchive/files.py
index c8029469..51b6d239 100644
--- a/internetarchive/files.py
+++ b/internetarchive/files.py
@@ -137,10 +137,10 @@ def __repr__(self):
                 f'format={self.format!r})')
 
     def download(self, file_path=None, verbose=None, ignore_existing=None,
-                 checksum=None, destdir=None, retries=None, ignore_errors=None,
-                 fileobj=None, return_responses=None, no_change_timestamp=None,
-                 params=None, chunk_size=None, stdout=None, ors=None,
-                 timeout=None):
+                 checksum=None, checksum_archive=None, destdir=None, retries=None,
+                 ignore_errors=None, fileobj=None, return_responses=None,
+                 no_change_timestamp=None, params=None, chunk_size=None, stdout=None,
+                 ors=None, timeout=None):
         """Download the file into the current working directory.
 
         :type file_path: str
@@ -156,6 +156,11 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
         :type checksum: bool
         :param checksum: (optional) Skip downloading file based on checksum.
 
+        :type checksum_archive: bool
+        :param checksum_archive: (optional) Skip downloading file based on checksum, and
+                                 skip checksum validation if it already succeeded
+                                 (will create and use _checksum_archive.txt).
+
         :type destdir: str
         :param destdir: (optional) The directory to download files to.
 
@@ -198,6 +203,7 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
         verbose = False if verbose is None else verbose
         ignore_existing = False if ignore_existing is None else ignore_existing
         checksum = False if checksum is None else checksum
+        checksum_archive = False if checksum_archive is None else checksum_archive
         retries = retries or 2
         ignore_errors = ignore_errors or False
         return_responses = return_responses or False
@@ -209,6 +215,7 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
         file_path = file_path or self.name
 
         if destdir:
+            print(f"destdir: {destdir}")
             if return_responses is not True:
                 try:
                     os.mkdir(destdir)
@@ -218,6 +225,20 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
                 raise OSError(f'{destdir} is not a directory!')
             file_path = os.path.join(destdir, file_path)
 
+        if checksum_archive:
+            checksum_archive_filename = '_checksum_archive.txt'
+            if not os.path.exists(checksum_archive_filename):
+                with open(checksum_archive_filename, 'wt', encoding='utf-8') as f:
+                    pass
+            with open(checksum_archive_filename, 'rt', encoding='utf-8') as f:
+                checksum_archive_data = f.read().splitlines()
+            if file_path in checksum_archive_data:
+                msg = f'skipping {file_path}, file already exists based on checksum_archive.'
+                log.info(msg)
+                if verbose:
+                    print(f' {msg}', file=sys.stderr)
+                return
+
         if not return_responses and os.path.exists(file_path.encode('utf-8')):
             if ignore_existing:
                 msg = f'skipping {file_path}, file already exists.'
@@ -225,7 +246,7 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
                 if verbose:
                     print(f' {msg}', file=sys.stderr)
                 return
-            elif checksum:
+            elif checksum or checksum_archive:
                 with open(file_path, 'rb') as fp:
                     md5_sum = utils.get_md5(fp)
 
@@ -234,6 +255,10 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
                     log.info(msg)
                     if verbose:
                         print(f' {msg}', file=sys.stderr)
+                    if checksum_archive:
+                        # add file to checksum_archive to skip it next time
+                        with open(checksum_archive_filename, 'a', encoding='utf-8') as f:
+                            f.write(f'{file_path}\n')
                     return
             elif not fileobj:
                 st = os.stat(file_path.encode('utf-8'))
diff --git a/internetarchive/item.py b/internetarchive/item.py
index 18a2d072..a0c61a16 100644
--- a/internetarchive/item.py
+++ b/internetarchive/item.py
@@ -589,6 +589,7 @@ def download(self,
                  verbose: bool = False,
                  ignore_existing: bool = False,
                  checksum: bool = False,
+                 checksum_archive: bool = False,
                  destdir: str | None = None,
                  no_directory: bool = False,
                  retries: int | None = None,
@@ -627,6 +628,10 @@ def download(self,
 
         :param checksum: Skip downloading file based on checksum.
 
+        :param checksum_archive: Skip downloading file based on checksum, and skip
+                                 checksum validation if it already succeeded
+                                 (will create and use _checksum_archive.txt).
+
         :param destdir: The directory to download files to.
 
         :param no_directory: Download files to current working
@@ -670,6 +675,7 @@ def download(self,
         ignore_existing = bool(ignore_existing)
         ignore_errors = bool(ignore_errors)
         checksum = bool(checksum)
+        checksum_archive = bool(checksum_archive)
         no_directory = bool(no_directory)
         return_responses = bool(return_responses)
         no_change_timestamp = bool(no_change_timestamp)
@@ -746,8 +752,8 @@ def download(self,
                 ors = True
             else:
                 ors = False
-            r = f.download(path, verbose, ignore_existing, checksum, destdir,
-                           retries, ignore_errors, fileobj, return_responses,
+            r = f.download(path, verbose, ignore_existing, checksum, checksum_archive, 
+                           destdir, retries, ignore_errors, fileobj, return_responses,
                            no_change_timestamp, params, None, stdout, ors, timeout)
             if return_responses:
                 responses.append(r)

From 656bf7ba529cc1c5fe7e80bc90cbdd4b6239ef68 Mon Sep 17 00:00:00 2001
From: Benoit Boulanger <benbou@gmail.com>
Date: Wed, 19 Jul 2023 22:48:53 -0400
Subject: [PATCH 2/7] Add doc for checksum archive file (issue #600)

---
 docs/source/quickstart.rst | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index c9df9194..7d4b629d 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -179,6 +179,20 @@ Alternatively, you can skip files based on md5 checksums. This is will take long
      skipping nasa/nasa_meta.xml, file already exists based on checksum.
      skipping nasa/nasa_reviews.xml, file already exists based on checksum.
 
+Furthermore, you can skip files based on md5 checksums and use a checksum_archive file. This will be faster than checksum alone because checksums will only need to be calculated once for every file already downloaded. Once calculated successfully, the item/file will be written to the checksum_archive file and succeeding runs will skip the checksum validation::
+
+    >>> download('nasa', verbose=True, checksum_archive=True)
+    nasa:
+     skipping nasa/__ia_thumb.jpg, file already exists based on checksum_archive.
+     skipping nasa/globe_west_540.jpg, file already exists based on checksum_archive.
+     skipping nasa/globe_west_540_thumb.jpg, file already exists based on checksum_archive.
+     skipping nasa/nasa_archive.torrent, file already exists based on checksum_archive.
+     downloading nasa_files.xml: 2.56kiB [00:00, 5.76MiB/s]
+     skipping nasa/nasa_itemimage.jpg, file already exists based on checksum_archive.
+     skipping nasa/nasa_meta.sqlite, file already exists based on checksum.
+     skipping nasa/nasa_meta.xml, file already exists based on checksum.
+     skipping nasa/nasa_reviews.xml, file already exists based on checksum.
+
 By default, the :func:`download <internetarchive.download>` function will download all of the files in an item. However, there are a couple parameters that can be used to download only specific files. Files can be filtered using the ``glob_pattern`` parameter::
 
     >>> download('nasa', verbose=True, glob_pattern='*xml')

From 5cf93beb653031dcd97654b16bbf0357bd24d139 Mon Sep 17 00:00:00 2001
From: Benoit Boulanger <benbou@gmail.com>
Date: Wed, 24 Apr 2024 00:54:40 -0400
Subject: [PATCH 3/7] Fixed logic for checksum archive validation

---
 internetarchive/files.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/internetarchive/files.py b/internetarchive/files.py
index e0c421ea..695fe80e 100644
--- a/internetarchive/files.py
+++ b/internetarchive/files.py
@@ -281,14 +281,8 @@ def download(  # noqa: max-complexity=38
 
                 # Check if we should skip...
                 if not return_responses and os.path.exists(file_path.encode('utf-8')):
-                    if ignore_existing:
-                        msg = f'skipping {file_path}, file already exists.'
-                        log.info(msg)
-                        if verbose:
-                            print(f' {msg}', file=sys.stderr)
-                        return
-                    elif checksum_archive:
-                        checksum_archive_filename = '_checksum_archive.txt'  # TODO Define this at a better place
+                    if checksum_archive:
+                        checksum_archive_filename = '_checksum_archive.txt'
                         if not os.path.exists(checksum_archive_filename):
                             with open(checksum_archive_filename, 'wt', encoding='utf-8') as f:
                                 pass
@@ -300,6 +294,12 @@ def download(  # noqa: max-complexity=38
                             if verbose:
                                 print(f' {msg}', file=sys.stderr)
                             return
+                    if ignore_existing:
+                        msg = f'skipping {file_path}, file already exists.'
+                        log.info(msg)
+                        if verbose:
+                            print(f' {msg}', file=sys.stderr)
+                        return
                     elif checksum or checksum_archive:
                         with open(file_path, 'rb') as fp:
                             md5_sum = utils.get_md5(fp)

From 5b85676ebe6f224c9616bc1b11f4acb784f50b17 Mon Sep 17 00:00:00 2001
From: Benoit Boulanger <benbou@gmail.com>
Date: Wed, 24 Apr 2024 01:27:40 -0400
Subject: [PATCH 4/7] resolve minor linter issues

---
 internetarchive/api.py             | 2 +-
 internetarchive/cli/ia_download.py | 3 ++-
 internetarchive/files.py           | 9 ++++++---
 internetarchive/item.py            | 2 +-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/internetarchive/api.py b/internetarchive/api.py
index a70eca42..9c0e34ec 100644
--- a/internetarchive/api.py
+++ b/internetarchive/api.py
@@ -336,7 +336,7 @@ def download(
 
     :param checksum: Skip downloading file based on checksum.
 
-    :param checksum_archive: Skip downloading file based on checksum, and skip 
+    :param checksum_archive: Skip downloading file based on checksum, and skip
                              checksum validation if it already succeeded
                              (will create and use _checksum_archive.txt).
 
diff --git a/internetarchive/cli/ia_download.py b/internetarchive/cli/ia_download.py
index 6ace3676..eb992c57 100644
--- a/internetarchive/cli/ia_download.py
+++ b/internetarchive/cli/ia_download.py
@@ -48,7 +48,8 @@
 
                                                  ia metadata --formats <identifier>
 
-    --checksum-archive                       Skip files based on _checksum_archive.txt [default: False].
+    --checksum-archive                       Skip files based on _checksum_archive.txt
+                                             [default: False].
     --on-the-fly                             Download on-the-fly files, as well as other matching
                                              files. on-the-fly files include derivative EPUB, MOBI
                                              and DAISY files [default: False].
diff --git a/internetarchive/files.py b/internetarchive/files.py
index 695fe80e..35593292 100644
--- a/internetarchive/files.py
+++ b/internetarchive/files.py
@@ -284,12 +284,15 @@ def download(  # noqa: max-complexity=38
                     if checksum_archive:
                         checksum_archive_filename = '_checksum_archive.txt'
                         if not os.path.exists(checksum_archive_filename):
-                            with open(checksum_archive_filename, 'wt', encoding='utf-8') as f:
+                            with open(checksum_archive_filename, 'w', encoding='utf-8') as f:
                                 pass
-                        with open(checksum_archive_filename, 'rt', encoding='utf-8') as f:
+                        with open(checksum_archive_filename, encoding='utf-8') as f:
                             checksum_archive_data = f.read().splitlines()
                         if file_path in checksum_archive_data:
-                            msg = f'skipping {file_path}, file already exists based on checksum_archive.'
+                            msg = (
+                                f'skipping {file_path}, '
+                                f'file already exists based on checksum_archive.'
+                            )
                             log.info(msg)
                             if verbose:
                                 print(f' {msg}', file=sys.stderr)
diff --git a/internetarchive/item.py b/internetarchive/item.py
index acc975e2..35f2d2e8 100644
--- a/internetarchive/item.py
+++ b/internetarchive/item.py
@@ -752,7 +752,7 @@ def download(self,
                 ors = True
             else:
                 ors = False
-            r = f.download(path, verbose, ignore_existing, checksum, checksum_archive, 
+            r = f.download(path, verbose, ignore_existing, checksum, checksum_archive,
                            destdir, retries, ignore_errors, fileobj, return_responses,
                            no_change_timestamp, params, None, stdout, ors, timeout)
             if return_responses:

From 41d261540b7e5b30a61bd2db251132f562f2c465 Mon Sep 17 00:00:00 2001
From: Benoit Boulanger <benbou@gmail.com>
Date: Thu, 25 Apr 2024 16:42:21 -0400
Subject: [PATCH 5/7] add tests for checksum_archive. make test_clobber,
 test_checksum and test_checksum_archive platform-independent

---
 tests/cli/test_ia_download.py | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/tests/cli/test_ia_download.py b/tests/cli/test_ia_download.py
index 87ddcb78..a90db594 100644
--- a/tests/cli/test_ia_download.py
+++ b/tests/cli/test_ia_download.py
@@ -73,8 +73,9 @@ def test_clobber(tmpdir_ch):
 
     stdout, stderr = call_cmd(cmd)
     assert files_downloaded('nasa') == {'nasa_meta.xml'}
-    expected_stderr = ('nasa:\n'
-                       ' skipping nasa/nasa_meta.xml, file already exists based on length and date.')
+    prefix = 'nasa:\n'.replace('\n', os.linesep)
+    filepath = os.path.join('nasa', 'nasa_meta.xml')
+    expected_stderr = f'{prefix} skipping {filepath}, file already exists based on length and date.'
     assert expected_stderr == stderr
 
 
@@ -84,7 +85,31 @@ def test_checksum(tmpdir_ch):
 
     stdout, stderr = call_cmd('ia --insecure download --checksum nasa nasa_meta.xml')
     assert files_downloaded('nasa') == {'nasa_meta.xml'}
-    assert 'nasa:\n skipping nasa/nasa_meta.xml, file already exists based on checksum.' == stderr
+    prefix = 'nasa:\n'.replace('\n', os.linesep)
+    filepath = os.path.join('nasa', 'nasa_meta.xml')
+    assert f'{prefix} skipping {filepath}, file already exists based on checksum.' == stderr
+
+
+def test_checksum_archive(tmpdir_ch):
+    call_cmd('ia --insecure download nasa nasa_meta.xml')
+    assert files_downloaded('nasa') == {'nasa_meta.xml'}
+
+    stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml')
+    assert files_downloaded('nasa') == {'nasa_meta.xml'}
+    prefix = 'nasa:\n'.replace('\n', os.linesep)
+    filepath = os.path.join('nasa', 'nasa_meta.xml')
+    assert f'{prefix} skipping {filepath}, file already exists based on checksum.' == stderr
+
+    assert '_checksum_archive.txt' in files_downloaded('.')
+    with open(os.path.join('.', '_checksum_archive.txt'), encoding='utf-8') as f:
+        filepath = os.path.join('nasa', 'nasa_meta.xml')
+        assert f.read() == f'{filepath}\n'
+
+    stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml')
+    assert files_downloaded('nasa') == {'nasa_meta.xml'}
+    prefix = 'nasa:\n'.replace('\n', os.linesep)
+    filepath = os.path.join('nasa', 'nasa_meta.xml')
+    assert f'{prefix} skipping {filepath}, file already exists based on checksum_archive.' == stderr
 
 
 def test_no_directories(tmpdir_ch):

From cb8694115b2cce24f34ee8847268bd0d6652685a Mon Sep 17 00:00:00 2001
From: Benoit Boulanger <benbou@gmail.com>
Date: Thu, 25 Apr 2024 16:59:33 -0400
Subject: [PATCH 6/7] increase max-args to 24, to accommodate the extra argument
 I had to add in item.py -> Item -> download for checksum_archive

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2ed3e1d8..827d9792 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,7 +74,7 @@ line-length = 102
 max-complexity = 33
 
 [tool.ruff.pylint]
-max-args = 23
+max-args = 24
 max-branches = 33
 max-statements = 124
 

From c1d0867840017a22cbb57818bd5315cee540a548 Mon Sep 17 00:00:00 2001
From: jake <jake@jakes-MacBook-Pro-2.local>
Date: Fri, 14 Jun 2024 11:57:44 -0700
Subject: [PATCH 7/7] Don't print destdir in verbose mode

---
 internetarchive/files.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/internetarchive/files.py b/internetarchive/files.py
index 5a28d7e5..8db330c6 100644
--- a/internetarchive/files.py
+++ b/internetarchive/files.py
@@ -234,8 +234,6 @@ def download(  # noqa: max-complexity=38
         file_path = file_path or self.name
 
         if destdir:
-            if verbose:
-                print(f"destdir: {destdir}")
             if return_responses is not True:
                 try:
                     os.mkdir(destdir)