From e7747a63896892c799934d1a0121d381272fca3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Max=20Zettlmei=C3=9Fl?= Date: Tue, 27 Jul 2021 15:42:59 +0200 Subject: [PATCH] Output a more helpful error message on malformed bulk upload CSV data. Malformed metadata keys in a bulk upload CSV file previously led to a somewhat obscure ConnectionResetError. Now the offending key is named in an error message which should greatly help users who encounter this problem. The XML parser in the standard library does not properly verify tag names. I avoided the regular expression solution at first and chose lxml because the proper regex to verify XML tag names is rather complex. But as it turns out, the Internet Archive only allows a very limited subset of those characters in their metadata keys (`[A-Za-z][.-0-9A-Za-z_]+`). (Verified by trying to create keys containing all other Unicode characters from 0 to 0x10FFFF against the metadata API.) See: https://archive.org/services/docs/api/metadata-schema/index.html#internet-archive-metadata --- internetarchive/cli/ia_upload.py | 11 +++++++++-- internetarchive/utils.py | 11 +++++++++++ pex-requirements.txt | 10 +++++----- setup.py | 10 +++++----- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/internetarchive/cli/ia_upload.py b/internetarchive/cli/ia_upload.py index 26a47f7f..9eab9686 100644 --- a/internetarchive/cli/ia_upload.py +++ b/internetarchive/cli/ia_upload.py @@ -77,7 +77,8 @@ from internetarchive.cli.argparser import get_args_dict, convert_str_list_to_unicode from internetarchive.session import ArchiveSession -from internetarchive.utils import validate_s3_identifier, get_s3_xml_text, InvalidIdentifierException +from internetarchive.utils import (InvalidIdentifierException, get_s3_xml_text, + is_valid_metadata_key, validate_s3_identifier) # Only import backports.csv for Python2 (in support of FreeBSD port). 
PY2 = sys.version_info[0] == 2 @@ -263,6 +264,11 @@ def main(argv, session): spreadsheet = csv.DictReader(csvfp) prev_identifier = None for row in spreadsheet: + for metadata_key in row: + if not is_valid_metadata_key(metadata_key): + print('error: "%s" is not a valid metadata key.' % metadata_key, + file=sys.stderr) + sys.exit(1) upload_kwargs_copy = deepcopy(upload_kwargs) if row.get('REMOTE_NAME'): local_file = {row['REMOTE_NAME']: row['file']} @@ -271,7 +277,8 @@ def main(argv, session): local_file = row['file'] identifier = row.get('item', row.get('identifier')) if not identifier: - print('error: no identifier column on spreadsheet!') + print('error: no identifier column on spreadsheet.', + file=sys.stderr) sys.exit(1) del row['file'] if 'identifier' in row: diff --git a/internetarchive/utils.py b/internetarchive/utils.py index dc973f02..d0d61deb 100644 --- a/internetarchive/utils.py +++ b/internetarchive/utils.py @@ -358,3 +358,14 @@ def delete_items_from_dict(d, to_delete): for i in d: delete_items_from_dict(i, to_delete) return remove_none(d) + + +def is_valid_metadata_key(name): + # According to the documentation a metadata key + # has to be a valid XML tag name. + # + # The actual allowed tag names (at least as tested with the metadata API), + # are way more restrictive and only allow ".-A-Za-z_", possibly followed + # by an index in square brackets e. g. [0]. + # On the other hand the Archive allows tags starting with the string "xml". 
+ return bool(re.fullmatch('[A-Za-z][.\-0-9A-Za-z_]+(?:\[[0-9]+\])?', name)) diff --git a/pex-requirements.txt b/pex-requirements.txt index 9a6e9089..0679e4c3 100644 --- a/pex-requirements.txt +++ b/pex-requirements.txt @@ -1,8 +1,8 @@ -requests>=2.9.1,<3.0.0 -jsonpatch>=0.4 +backports.csv docopt>=0.6.0,<0.7.0 -tqdm>=4.0.0 -six>=1.13.0,<2.0.0 +jsonpatch>=0.4 +requests>=2.9.1,<3.0.0 schema>=0.4.0 -backports.csv setuptools +six>=1.13.0,<2.0.0 +tqdm>=4.0.0 diff --git a/setup.py b/setup.py index 92a4835c..ddf7f99d 100644 --- a/setup.py +++ b/setup.py @@ -36,13 +36,13 @@ ], }, install_requires=[ - 'requests>=2.9.1,<3.0.0', - 'jsonpatch>=0.4', + 'backports.csv < 1.07;python_version<"3.4"', 'docopt>=0.6.0,<0.7.0', - 'tqdm>=4.0.0', - 'six>=1.13.0,<2.0.0', + 'jsonpatch>=0.4', + 'requests>=2.9.1,<3.0.0', 'schema>=0.4.0', - 'backports.csv < 1.07;python_version<"3.4"', + 'six>=1.13.0,<2.0.0', + 'tqdm>=4.0.0', ], classifiers=[ 'Development Status :: 5 - Production/Stable',