diff --git a/internetarchive/files.py b/internetarchive/files.py index c8029469..f161b716 100644 --- a/internetarchive/files.py +++ b/internetarchive/files.py @@ -28,6 +28,7 @@ import socket import sys from contextlib import nullcontext, suppress +from email.utils import parsedate_to_datetime from urllib.parse import quote from requests.exceptions import ( @@ -218,33 +219,6 @@ def download(self, file_path=None, verbose=None, ignore_existing=None, raise OSError(f'{destdir} is not a directory!') file_path = os.path.join(destdir, file_path) - if not return_responses and os.path.exists(file_path.encode('utf-8')): - if ignore_existing: - msg = f'skipping {file_path}, file already exists.' - log.info(msg) - if verbose: - print(f' {msg}', file=sys.stderr) - return - elif checksum: - with open(file_path, 'rb') as fp: - md5_sum = utils.get_md5(fp) - - if md5_sum == self.md5: - msg = f'skipping {file_path}, file already exists based on checksum.' - log.info(msg) - if verbose: - print(f' {msg}', file=sys.stderr) - return - elif not fileobj: - st = os.stat(file_path.encode('utf-8')) - if (st.st_mtime == self.mtime) and (st.st_size == self.size) \ - or self.name.endswith('_files.xml') and st.st_size != 0: - msg = f'skipping {file_path}, file already exists based on length and date.' - log.info(msg) - if verbose: - print(f' {msg}', file=sys.stderr) - return - parent_dir = os.path.dirname(file_path) try: if parent_dir != '' and return_responses is not True: @@ -255,8 +229,44 @@ def download(self, file_path=None, verbose=None, ignore_existing=None, timeout=timeout, auth=self.auth, params=params) + + # Get timestamp from Last-Modified header + dt = parsedate_to_datetime(response.headers['Last-Modified']) + last_mod_mtime = dt.timestamp() + response.raise_for_status() - if return_responses: + + # Check if we should skip... + if not return_responses and os.path.exists(file_path.encode('utf-8')): + if ignore_existing: + msg = f'skipping {file_path}, file already exists.' + log.info(msg) + if verbose: + print(f' {msg}', file=sys.stderr) + return + elif checksum: + with open(file_path, 'rb') as fp: + md5_sum = utils.get_md5(fp) + + if md5_sum == self.md5: + msg = f'skipping {file_path}, file already exists based on checksum.' + log.info(msg) + if verbose: + print(f' {msg}', file=sys.stderr) + return + elif not fileobj: + st = os.stat(file_path.encode('utf-8')) + if st.st_mtime == last_mod_mtime: + if self.name == f'{self.identifier}_files.xml' \ + or (st.st_size == self.size): + msg = (f'skipping {file_path}, file already exists based on ' + 'length and date.') + log.info(msg) + if verbose: + print(f' {msg}', file=sys.stderr) + return + + elif return_responses: return response if verbose: @@ -298,11 +308,11 @@ def download(self, file_path=None, verbose=None, ignore_existing=None, else: raise exc - # Set mtime with mtime from files.xml. + # Set mtime with timestamp from Last-Modified header if not no_change_timestamp: # If we want to set the timestamp to that of the original archive... with suppress(OSError): # Probably file-like object, e.g. sys.stdout. - os.utime(file_path.encode('utf-8'), (0, self.mtime)) + os.utime(file_path.encode('utf-8'), (0,last_mod_mtime)) msg = f'downloaded {self.identifier}/{self.name} to {file_path}' log.info(msg) diff --git a/tests/test_api.py b/tests/test_api.py index 0f81a27c..6ef181f9 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -266,10 +266,12 @@ def test_upload_validate_identifier(): def test_download(tmpdir): tmpdir.chdir() + last_mod_header = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"} with IaRequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//archive.org/download/nasa/nasa_meta.xml', - body='test content') + body='test content', + adding_headers=last_mod_header) rsps.add_metadata_mock('nasa') download('nasa', 'nasa_meta.xml') p = os.path.join(str(tmpdir), 'nasa') diff --git a/tests/test_item.py b/tests/test_item.py index adaf0bcd..4d22f5b7 100644 --- a/tests/test_item.py +++ b/tests/test_item.py @@ -24,6 +24,7 @@ DOWNLOAD_URL_RE = re.compile(f'{PROTOCOL}//archive.org/download/.*') S3_URL_RE = re.compile(r'.*s3.us.archive.org/.*') +EXPECTED_LAST_MOD_HEADER = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"} EXPECTED_S3_HEADERS = { 'content-length': '7557', 'x-archive-queue-derive': '1', @@ -145,11 +146,15 @@ def test_get_files_no_matches(nasa_item): def test_download(tmpdir, nasa_item): tmpdir.chdir() with IaRequestsMock() as rsps: - rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content') + rsps.add(responses.GET, DOWNLOAD_URL_RE, + body='test content', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') assert len(tmpdir.listdir()) == 1 with IaRequestsMock() as rsps: - rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content') + rsps.add(responses.GET, DOWNLOAD_URL_RE, + body='new test content', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') with open('nasa/nasa_meta.xml') as fh: assert fh.read() == 'new test content' @@ -158,7 +163,9 @@ def test_download(tmpdir, nasa_item): def test_download_io_error(tmpdir, nasa_item): tmpdir.chdir() with IaRequestsMock() as rsps: - rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content') + rsps.add(responses.GET, DOWNLOAD_URL_RE, + body='test content', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') rsps.reset() with pytest.raises(ConnectionError): @@ -167,7 +174,9 @@ def test_download_io_error(tmpdir, nasa_item): def test_download_ignore_errors(tmpdir, nasa_item): with IaRequestsMock() as rsps: - rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content') + rsps.add(responses.GET, DOWNLOAD_URL_RE, + body='test content', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') nasa_item.download(files='nasa_meta.xml', ignore_errors=True) @@ -177,11 +186,13 @@ def test_download_ignore_existing(tmpdir, nasa_item): with IaRequestsMock( assert_all_requests_are_fired=False) as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, - body='test content') + body='test content', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml', ignore_existing=True) rsps.add(responses.GET, DOWNLOAD_URL_RE, - body='new test content') + body='new test content', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml', ignore_existing=True) with open('nasa/nasa_meta.xml') as fh: assert fh.read() == 'test content' @@ -190,11 +201,15 @@ def test_download_ignore_existing(tmpdir, nasa_item): def test_download_clobber(tmpdir, nasa_item): tmpdir.chdir() with IaRequestsMock() as rsps: - rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content') + rsps.add(responses.GET, DOWNLOAD_URL_RE, + body='test content', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') rsps.reset() - rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content') + rsps.add(responses.GET, DOWNLOAD_URL_RE, + body='new test content', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') assert load_file('nasa/nasa_meta.xml') == 'new test content' @@ -205,8 +220,12 @@ def test_download_checksum(tmpdir, caplog): # test overwrite based on checksum. with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') - rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content') - rsps.add(responses.GET, DOWNLOAD_URL_RE, body='overwrite based on md5') + rsps.add(responses.GET, DOWNLOAD_URL_RE, + body='test content', + adding_headers=EXPECTED_LAST_MOD_HEADER) + rsps.add(responses.GET, DOWNLOAD_URL_RE, + body='overwrite based on md5', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item = get_item('nasa') nasa_item.download(files='nasa_meta.xml') @@ -218,7 +237,8 @@ def test_download_checksum(tmpdir, caplog): with caplog.at_level(logging.DEBUG): rsps.reset() rsps.add(responses.GET, DOWNLOAD_URL_RE, - body=load_test_data_file('nasa_meta.xml')) + body=load_test_data_file('nasa_meta.xml'), + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True) nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True) @@ -229,7 +249,9 @@ def test_download_checksum(tmpdir, caplog): def test_download_destdir(tmpdir, nasa_item): tmpdir.chdir() with IaRequestsMock() as rsps: - rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new destdir') + rsps.add(responses.GET, DOWNLOAD_URL_RE, + body='new destdir', + adding_headers=EXPECTED_LAST_MOD_HEADER) dest = os.path.join(str(tmpdir), 'new destdir') nasa_item.download(files='nasa_meta.xml', destdir=dest) assert 'nasa' in os.listdir(dest) @@ -241,7 +263,9 @@ def test_download_no_directory(tmpdir, nasa_item): url_re = re.compile(f'{PROTOCOL}//archive.org/download/.*') tmpdir.chdir() with IaRequestsMock() as rsps: - rsps.add(responses.GET, url_re, body='no dest dir') + rsps.add(responses.GET, url_re, + body='no dest dir', + adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml', no_directory=True) with open(os.path.join(str(tmpdir), 'nasa_meta.xml')) as fh: assert fh.read() == 'no dest dir' @@ -278,9 +302,11 @@ def test_download_dry_run_on_the_fly_formats(tmpdir, capsys, nasa_item): def test_download_verbose(tmpdir, capsys, nasa_item): tmpdir.chdir() with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: + headers = {'content-length': '11'} + headers.update(EXPECTED_LAST_MOD_HEADER) rsps.add(responses.GET, DOWNLOAD_URL_RE, body='no dest dir', - adding_headers={'content-length': '11'}) + adding_headers=headers) nasa_item.download(files='nasa_meta.xml', verbose=True) out, err = capsys.readouterr() assert 'downloading nasa_meta.xml' in err