Skip to content

Commit

Permalink
Merging last-modified (mtime) changes with resume-download changes
Browse files Browse the repository at this point in the history
  • Loading branch information
jake authored and jake committed Apr 3, 2024
1 parent 8453441 commit a068b4d
Showing 1 changed file with 127 additions and 81 deletions.
208 changes: 127 additions & 81 deletions internetarchive/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import sys
from contextlib import nullcontext, suppress
from email.utils import parsedate_to_datetime
from time import sleep
from urllib.parse import quote

from requests.exceptions import (
Expand All @@ -40,7 +41,7 @@
)
from tqdm import tqdm

from internetarchive import auth, iarequest, utils
from internetarchive import auth, exceptions, iarequest, utils

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -137,7 +138,8 @@ def __repr__(self):
f'size={self.size!r}, '
f'format={self.format!r})')

def download(self, file_path=None, verbose=None, ignore_existing=None,
def download(# noqa: max-complexity=38
self, file_path=None, verbose=None, ignore_existing=None,
checksum=None, destdir=None, retries=None, ignore_errors=None,
fileobj=None, return_responses=None, no_change_timestamp=None,
params=None, chunk_size=None, stdout=None, ors=None,
Expand Down Expand Up @@ -205,6 +207,9 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
no_change_timestamp = no_change_timestamp or False
params = params or None
timeout = 12 if not timeout else timeout
headers = {}
retries_sleep = 3 # TODO: exponential sleep
retrying = False # for retry loop

self.item.session.mount_http_adapter(max_retries=retries)
file_path = file_path or self.name
Expand All @@ -220,93 +225,134 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
file_path = os.path.join(destdir, file_path)

parent_dir = os.path.dirname(file_path)
try:
if parent_dir != '' and return_responses is not True:
os.makedirs(parent_dir, exist_ok=True)

response = self.item.session.get(self.url,
stream=True,
timeout=timeout,
auth=self.auth,
params=params)

# Get timestamp from Last-Modified header
dt = parsedate_to_datetime(response.headers['Last-Modified'])
last_mod_mtime = dt.timestamp()

response.raise_for_status()

# Check if we should skip...
if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'

# Retry loop
while True:
try:
if parent_dir != '' and return_responses is not True:
os.makedirs(parent_dir, exist_ok=True)

if not return_responses \
and not ignore_existing \
and os.path.exists(file_path.encode('utf-8')):
st = os.stat(file_path.encode('utf-8'))
if st.st_size != self.size and not ignore_existing:
headers = {"Range": f"bytes={st.st_size}-{self.size}"}

response = self.item.session.get(self.url,
stream=True,
timeout=timeout,
auth=self.auth,
params=params,
headers=headers)

# Get timestamp from Last-Modified header
last_mod_header = response.headers.get('Last-Modified')
if last_mod_header:
dt = parsedate_to_datetime(last_mod_header)
last_mod_mtime = dt.timestamp()
else:
last_mod_mtime = 0

response.raise_for_status()

# Check if we should skip...
if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif not fileobj:
st = os.stat(file_path.encode('utf-8'))
if st.st_mtime == last_mod_mtime:
if self.name == f'{self.identifier}_files.xml' \
or (st.st_size == self.size):
msg = (f'skipping {file_path}, file already exists based on '
'length and date.')
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

elif return_responses:
return response

if verbose:
total = int(response.headers.get('content-length', 0)) or None
progress_bar = tqdm(desc=f' downloading {self.name}',
total=total,
unit='iB',
unit_scale=True,
unit_divisor=1024)
else:
progress_bar = nullcontext()

if not chunk_size:
chunk_size = 1048576
if stdout:
fileobj = os.fdopen(sys.stdout.fileno(), "wb", closefd=False)
if not fileobj:
fileobj = open(file_path.encode('utf-8'), 'wb')

with fileobj, progress_bar as bar:
for chunk in response.iter_content(chunk_size=chunk_size):
if chunk:
size = fileobj.write(chunk)
if bar is not None:
bar.update(size)
if ors:
fileobj.write(os.environ.get("ORS", "\n").encode("utf-8"))
except (RetryError, HTTPError, ConnectTimeout, OSError, ReadTimeout) as exc:
msg = f'error downloading file {file_path}, exception raised: {exc}'
log.error(msg)
try:
os.remove(file_path)
except OSError:
pass
if verbose:
print(f' {msg}', file=sys.stderr)
if ignore_errors:
return False
else:
raise exc
elif not fileobj:
if st.st_mtime == last_mod_mtime:
if self.name == f'{self.identifier}_files.xml' \
or (st.st_size == self.size):
msg = (f'skipping {file_path}, file already exists based on '
'length and date.')
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

elif return_responses:
return response



if verbose:
total = int(response.headers.get('content-length', 0)) or None
progress_bar = tqdm(desc=f' downloading {self.name}',
total=total,
unit='iB',
unit_scale=True,
unit_divisor=1024)
else:
progress_bar = nullcontext()

if not chunk_size:
chunk_size = 1048576
if stdout:
fileobj = os.fdopen(sys.stdout.fileno(), 'wb', closefd=False)
if not fileobj or retrying:
if 'Range' in headers:
fileobj = open(file_path.encode('utf-8'), 'ab')
else:
fileobj = open(file_path.encode('utf-8'), 'wb')

with fileobj, progress_bar as bar:
for chunk in response.iter_content(chunk_size=chunk_size):
if chunk:
size = fileobj.write(chunk)
if bar is not None:
bar.update(size)
if ors:
fileobj.write(os.environ.get("ORS", "\n").encode("utf-8"))

if 'Range' in headers:
with open(file_path, 'rb') as fh:
local_checksum = utils.get_md5(fh)
try:
assert local_checksum == self.md5
except AssertionError:
msg = (f"\"{file_path}\" corrupt, "
"checksums do not match. "
"Remote file may have been modified, retry download.")
raise exceptions.InvalidChecksumError(msg)
break
except (RetryError, HTTPError, ConnectTimeout, OSError, ReadTimeout,
exceptions.InvalidChecksumError) as exc:
if retries > 0:
retrying = True
retries -= 1
msg = ('download failed, sleeping for '
f'{retries_sleep} seconds and retrying. '
f'{retries} retries left.')
log.warning(msg)
sleep(retries_sleep)
continue
msg = f'error downloading file {file_path}, exception raised: {exc}'
log.error(msg)
try:
os.remove(file_path)
except OSError:
pass
if verbose:
print(f' {msg}', file=sys.stderr)
if ignore_errors:
return False
else:
raise exc

# Set mtime with timestamp from Last-Modified header
if not no_change_timestamp:
Expand Down

0 comments on commit a068b4d

Please sign in to comment.