diff --git a/src/digiflow/__init__.py b/src/digiflow/__init__.py index f8dfa52..9f5c158 100644 --- a/src/digiflow/__init__.py +++ b/src/digiflow/__init__.py @@ -47,4 +47,4 @@ validate_tiff, ) -from .common import UNSET_LABEL, XMLNS +from .common import UNSET_LABEL, XMLNS, FallbackLogger diff --git a/src/digiflow/common.py b/src/digiflow/common.py index 7af95d9..20bda85 100644 --- a/src/digiflow/common.py +++ b/src/digiflow/common.py @@ -1,5 +1,7 @@ """common constants""" +import logging +import sys XMLNS = { 'alto': 'http://www.loc.gov/standards/alto/ns-v4#', @@ -23,3 +25,22 @@ UNSET_LABEL = 'n.a.' + +class FallbackLogger: + """Different way to inject logging facilities""" + + def __init__(self, some_logger=None): + self.logger: logging.Logger = some_logger + + def log(self, message: str, *args, level = logging.INFO): + """Encapsulate Loggin""" + if self.logger: + self.logger.log(level, message, *args) + else: + message = message.replace('%s','{}') + if args is not None and len(args) > 0: + message = message.format(*args) + if level >= logging.ERROR: + print(message, file=sys.stderr) + else: + print(message) diff --git a/src/digiflow/record/__init__.py b/src/digiflow/record/__init__.py index 6cfacf4..67ab6ba 100644 --- a/src/digiflow/record/__init__.py +++ b/src/digiflow/record/__init__.py @@ -8,3 +8,4 @@ State, Datetime, ) +from .record_service import Client, HandlerInformation, run_server diff --git a/src/digiflow/record/common.py b/src/digiflow/record/common.py index 4ae75cc..9206f2c 100644 --- a/src/digiflow/record/common.py +++ b/src/digiflow/record/common.py @@ -1,5 +1,7 @@ """Common record attributes""" +import ast +import time import typing import digiflow as df @@ -14,6 +16,8 @@ UNSET_LABEL = 'n.a.' FIELD_IDENTIFIER = 'IDENTIFIER' +FIELD_URN = 'URN' +FIELD_SYSTEM_HANDLE = 'HANDLE' FIELD_SPEC = 'SETSPEC' FIELD_DATESTAMP = 'CREATED' FIELD_INFO = 'INFO' @@ -25,6 +29,17 @@ RECORD_HEADER = [FIELD_IDENTIFIER, FIELD_INFO, FIELD_STATE, FIELD_STATETIME] +DEFAULT_MAPPINGS = { + 'identifier': FIELD_IDENTIFIER, + 'ext_urn': FIELD_URN, + 'system_handle': FIELD_SYSTEM_HANDLE, + 'setspec': FIELD_SPEC, + 'created_time': FIELD_DATESTAMP, + 'info': FIELD_INFO, + 'state': FIELD_STATE, + 'state_time': FIELD_STATETIME, +} + class RecordDataException(Exception): """Mark inconsistent record data, @@ -38,9 +53,9 @@ class RecordDataException(Exception): class Record: """ - OAIRecord based on valid URN-Identifier with optional set specification data + Record based on valid OAI-URN-Identifier with optional setspec data based on http://www.openarchives.org/OAI/2.0/guidelines-oai-identifier.htm - and commonly transported via OAI-PMH API + transported via OAI-PMH API or delivered by RecordService Examples: @@ -56,11 +71,13 @@ class Record: def __init__(self, urn): self.__urn = urn self.__local_ident = None + self.ext_urn = UNSET_LABEL + self.system_handle = UNSET_LABEL self.set = UNSET_LABEL - self.date_stamp = UNSET_LABEL - self.info = UNSET_LABEL - self.state = UNSET_LABEL - self.state_datetime = UNSET_LABEL + self.created_time = UNSET_LABEL + self._info = UNSET_LABEL + self._state = UNSET_LABEL + self.state_time = UNSET_LABEL @property def local_identifier(self): @@ -89,20 +106,86 @@ def __str__(self) -> str: the_str = f"{self.__urn}" if self.set != df.UNSET_LABEL: the_str = f"{the_str}\t{self.set}" - if self.date_stamp != df.UNSET_LABEL: - the_str = f"{the_str}\t{self.date_stamp}" - if self.info != df.UNSET_LABEL: - the_str = f"{the_str}\t{self.info}" - return f"{the_str}\n{self.state}\t{self.state_datetime}" + if self.created_time != df.UNSET_LABEL: + the_str = f"{the_str}\t{self.created_time}" + if self._info != df.UNSET_LABEL: + the_str = f"{the_str}\t{self._info}" + return f"{the_str}\t{self._state}\t{self.state_time}" @staticmethod - def create(input_data): + def parse(input_data): """De-serialize record from different input forms""" record = Record(UNSET_LABEL) if isinstance(input_data, dict): record = row_to_record(input_data) return record + def dict(self, dict_map=None) -> typing.Dict: + """Serialize Record into Python dict + as input for JSON load + """ + as_dict = {} + if dict_map is None: + dict_map = DEFAULT_MAPPINGS + for label, field in dict_map.items(): + if hasattr(self, label): + as_dict[field] = getattr(self, label) + # as_dict[FIELD_IDENTIFIER] = self.identifier + # if self._info != UNSET_LABEL: + # as_dict[FIELD_INFO] = self._info + # if self._state != UNSET_LABEL: + # as_dict[FIELD_STATE] = self._state + # if self.state_time != UNSET_LABEL: + # as_dict[FIELD_STATETIME] = self.state_time + # # legacy + # if self.setspec != UNSET_LABEL: + # as_dict[FIELD_SPEC] = self.setspec + # if self.created_time != UNSET_LABEL: + # as_dict[FIELD_DATESTAMP] = self.created_time + return as_dict + + @property + def state(self): + """Get state""" + return self._state + + @state.setter + def state(self, state_label): + """Set new state and update statetime""" + + self._state = state_label + right_now = time.strftime(STATETIME_FORMAT) + self.state_time = right_now + + @property + def info(self): + """Get Record Information""" + return self._info + + @info.setter + def info(self, any_value): + """Update existing Information lazy. + Assume info consists of at least + a single dict or several dicts, + in which case only the last dict + will be updated""" + + try: + if any_value == UNSET_LABEL: + any_value = {} + if self._info == UNSET_LABEL: + self._info = {} + if isinstance(any_value, str): + any_value = ast.literal_eval(any_value) + elif isinstance(self._info, str): + self._info = ast.literal_eval(self._info) + if isinstance(self._info, dict): + self._info.update(any_value) + elif isinstance(self._info, tuple): + self._info[-1].update(any_value) + except (AttributeError,SyntaxError, ValueError): + self._info = any_value + def row_to_record(row: typing.Dict): """Serialize data row to Record with all @@ -113,16 +196,20 @@ def row_to_record(row: typing.Dict): if FIELD_IDENTIFIER not in row: raise RecordDataException(f"Missing {FIELD_IDENTIFIER} in {row}") record = Record(row[FIELD_IDENTIFIER]) + if FIELD_URN in row and str(row[FIELD_URN]).strip(): + record.ext_urn = row[FIELD_URN] + if FIELD_SYSTEM_HANDLE in row and str(row[FIELD_SYSTEM_HANDLE]).strip(): + record.system_handle = row[FIELD_SYSTEM_HANDLE] if FIELD_SPEC in row and str(row[FIELD_SPEC]).strip(): record.set = str(row[FIELD_SPEC]).strip() if FIELD_DATESTAMP in row and str(row[FIELD_DATESTAMP]).strip(): - record.date_stamp = str(row[FIELD_DATESTAMP]).strip() + record.created_time = str(row[FIELD_DATESTAMP]).strip() if FIELD_INFO in row and str(FIELD_INFO).strip(): record.info = str(row[FIELD_INFO]).strip() if FIELD_STATE not in row: - raise RecordDataException(f"Missing {FIELD_STATE} in {row}") - record.state = row[FIELD_STATE] - if FIELD_STATETIME not in row: - raise RecordDataException(f"Missing {FIELD_STATETIME} in {row}") - record.state_datetime = row[FIELD_STATETIME] + record.state = UNSET_LABEL + else: + record.state = row[FIELD_STATE] + if FIELD_STATETIME in row: + record.state_time = row[FIELD_STATETIME] return record diff --git a/src/digiflow/record/record_handler.py b/src/digiflow/record/record_handler.py index 79aa020..3fabf86 100644 --- a/src/digiflow/record/record_handler.py +++ b/src/digiflow/record/record_handler.py @@ -6,7 +6,6 @@ import time -import digiflow as df import digiflow.record as df_r RECORD_STATE_MASK_FRAME = 'other_load' @@ -67,6 +66,7 @@ def __init__(self, data_path, data_fields=None, @property def total_len(self): + """Number of records""" return len(self.data) def _build_data(self): @@ -118,17 +118,15 @@ def _restore_header(self, first_line): self.header = _header def _validate_header(self, data_fields): - """validate both occurence and order""" + """validate header fields presence and order""" if self.header != data_fields: - msg = "invalid fields: '{}', expect: '{}'".format( - self.header, data_fields) + msg = f"invalid fields: '{self.header}', expect: '{data_fields}'" raise RecordHandlerException(msg) def next_record(self, state=None): """ - Get *NEXT* IRecord _from scratch_ with - given state if any exist, raise Exception - otherwise + Get *NEXT* Record with given state + if any exist, otherwise None """ if not state: @@ -140,6 +138,7 @@ def next_record(self, state=None): if state == row[self.state_field]: self.position = f"{(i+1):04d}/{(self.total_len):04d}" return self.transform_func(row) + return None def get(self, identifier, exact_match=True): """Read data for first Record with diff --git a/src/digiflow/record/record_service.py b/src/digiflow/record/record_service.py new file mode 100644 index 0000000..d3383e6 --- /dev/null +++ b/src/digiflow/record/record_service.py @@ -0,0 +1,292 @@ +"""API for handling records with server/client mechanics""" + +import dataclasses +import functools +import http.server +import json +import logging +import sys + +from pathlib import Path + +import requests + +import digiflow as df +import digiflow.record as df_r + + +DEFAULT_COMMAND_NEXT = 'next' +DEFAULT_COMMAND_UPDATE = 'update' +_MIME_TXT = 'text/plain' +DEFAULT_MARK_BUSY = 'busy' + +X_HEADER_GET_STATE = 'X-GET-STATE' +X_HEADER_SET_STATE = 'X-SET-STATE' + +STATETIME_FORMAT = '%Y-%m-%d_%H:%M:%S' + +DATA_EXHAUSTED_MARK = 'no open records' + + +@dataclasses.dataclass +class HandlerInformation: + """Encapsulate some basic + information needed to do + the handling""" + + data_path: Path + logger: logging.Logger + + def __init__(self, data_path, logger): + """Enforce proper types""" + if isinstance(data_path, str): + self.data_path = Path(data_path) + if not self.data_path.is_absolute(): + self.data_path = self.data_path.resolve() + self.logger = logger + + +class RecordExhaustedException(Exception): + """Mark state when no more records can be + served to clients anymore""" + + +class RecordRequestHandler(http.server.SimpleHTTPRequestHandler, + df.FallbackLogger): + """Simple handler for POST and GET requests + without additional security - use at own risk + """ + + def __init__(self, start_info: HandlerInformation, + *args, + **kwargs): + self.record_list_directory: Path = start_info.data_path + self.logger = start_info.logger + self.command_next = DEFAULT_COMMAND_NEXT + self.command_update = DEFAULT_COMMAND_UPDATE + super(http.server.SimpleHTTPRequestHandler, self).__init__(*args, **kwargs) + + def do_GET(self): + """handle GET request""" + client_name = self.address_string() + if self.path == '/favicon.ico': + return + self.log("request '%s' from client %s", self.path, client_name, level=logging.DEBUG) + get_record_state = self.headers.get(X_HEADER_GET_STATE) + set_record_state = self.headers.get(X_HEADER_SET_STATE) + try: + _, file_name, command = self.path.split('/') + except ValueError: + self.wfile.write( + b'please provide record file name and command ' + b' e.g.: /oai_record_vd18/next') + self.log("missing data: '%s'", self.path, level=logging.WARNING) + return + if command == DEFAULT_COMMAND_NEXT: + state, data = self.get_next_record(file_name, client_name, + get_record_state, set_record_state) + self.log("deliver next record: '%s'", data, level=logging.DEBUG) + if isinstance(data, str): + self._set_headers(state, _MIME_TXT) + self.wfile.write(data.encode('utf-8')) + else: + self._set_headers(state) + self.wfile.write(json.dumps(data, default=df_r.Record.dict).encode('utf-8')) + + def do_POST(self): + """handle POST request""" + data = 'no data available' + client_name = self.address_string() + self.log('url path %s from %s', self.path, client_name) + try: + _, file_name, command = self.path.split('/') + except ValueError as val_err: + self.wfile.write( + b'please provide record file name and command ' + b' e.g.: /records-vd18/next') + self.log('request next record failed %s', val_err.args[0], level=logging.ERROR) + content_length = int(self.headers['Content-Length']) + post_data = self.rfile.read(content_length) + data_dict = json.loads(post_data) + self.log("POST request, Path: %s", self.path, level=logging.DEBUG) + self.log('do_POST: %s', data_dict) + if command == DEFAULT_COMMAND_UPDATE: + ident = data_dict.get(df_r.FIELD_IDENTIFIER) + if ident: + state, data = self.update_record(file_name, data_dict) + if isinstance(data, str): + self._set_headers(state, _MIME_TXT) + self.wfile.write(data.encode('utf-8')) + else: + self._set_headers(state) + self.wfile.write(json.dumps(data, default=data.dict).encode('utf-8')) + else: + self._set_headers(404, _MIME_TXT) + self.wfile.write(f"no entry for {ident} in {file_name}!".encode('utf-8')) + + def _set_headers(self, state=200, mime_type='application/json') -> None: + self.send_response(state) + self.send_header('Content-type', mime_type) + self.end_headers() + + def get_data_file(self, data_file_name: str): + """data_file_name comes with no extension! + so we must search for a valid match- + returns propably None-values if + nothing found. + """ + if isinstance(data_file_name, str): + data_file_name = Path(data_file_name).stem + for a_file in self.record_list_directory.iterdir(): + if data_file_name == Path(a_file).stem: + data_file = self.record_list_directory / a_file.name + return data_file + self.log("found no %s in %s", data_file_name, self.record_list_directory, + level=logging.CRITICAL) + return None + + def get_next_record(self, file_name, client_name, requested_state, set_state) -> tuple: + """Deliver next record data if both + * in watched directory exists record list matching file_name + * inside this record list are open records available + """ + + self.log("look for %s in %s", file_name, self.record_list_directory) + data_file_path = self.get_data_file(file_name) + # no match results in 404 - resources not available after all + if data_file_path is None: + self.log("no '%s' found in '%s'", file_name, self.record_list_directory, + level=logging.WARNING) + return (404, f"no file '{file_name}' in {self.record_list_directory}") + + handler = df_r.RecordHandler(data_file_path, transform_func=df_r.row_to_record) + next_record = handler.next_record(requested_state) + # if no record available, alert no resource + if next_record is None: + the_msg = f'{DATA_EXHAUSTED_MARK}: {data_file_path}' + self.log(the_msg) + return (404, the_msg) + + # store information which client got the package delivered + client_info = {'client': client_name} + next_record.info = client_info + handler.save_record_state( + next_record.identifier, set_state, **{df_r.FIELD_INFO: f'{next_record.info}'}) + return (200, next_record) + + def update_record(self, data_file, in_data) -> tuple: + """write data dict send by client + throws RuntimeError if record to update not found + """ + + data_file_path = self.get_data_file(data_file) + if data_file_path is None: + self.log('do_POST: %s not found', data_file_path, level=logging.ERROR) + return (404, f"data file not found: {data_file_path}") + try: + handler = df_r.RecordHandler(data_file_path) + if isinstance(in_data, dict): + in_data = df_r.Record.parse(in_data) + in_ident = in_data.identifier + prev_record: df_r.Record = handler.get(in_ident) + prev_record.info = in_data.info + info_str = f"{prev_record.info}" + handler.save_record_state(in_ident, + state=in_data.state, **{df_r.FIELD_INFO: info_str}) + _msg = f"update done for {in_ident} in '{data_file_path}" + self.log(_msg) + return (200, _msg) + except RuntimeError as _rer: + _msg = f"update fail for {in_ident} in '{data_file_path}' ({_rer.args[0]})" + self.log(_msg, level=logging.ERROR) + return (500, _msg) + + +class Client(df.FallbackLogger): + """Implementation of OAI Service client with + capabilities to get next OAI Record data + and communicate results (done|fail) + """ + + def __init__(self, oai_record_list_label, host, port, + + notify_callback=None, logger=None): + self.oai_record_list_label = oai_record_list_label + self.record: df_r.Record = None + self.oai_server_url = f'http://{host}:{port}/{oai_record_list_label}' + self.notify_callback = notify_callback + super().__init__(some_logger=logger) + + def get_record(self, get_record_state, set_record_state): + """Request Record from service and de-serialize + json encoded content into record object + """ + try: + the_headers = {X_HEADER_GET_STATE: get_record_state, + X_HEADER_SET_STATE: set_record_state} + response = requests.get(f'{self.oai_server_url}/next', + timeout=300, headers=the_headers) + except requests.exceptions.RequestException as err: + if self.logger is not None: + self.logger.error("OAI server connection fails: %s", err) + if self.notify_callback: + self.notify_callback(f'[OCR-D-ODEM] Failure for {self.oai_server_url}', err) + sys.exit(1) + status = response.status_code + result = response.content + if status == 404: + # probably nothing to do? + if DATA_EXHAUSTED_MARK in str(result): + if self.logger is not None: + self.logger.info(result) + raise RecordExhaustedException(result.decode(encoding='utf-8')) + # otherwise exit anyway + sys.exit(1) + + if status != 200: + if self.logger is not None: + self.logger.error( + "server connection status: %s -> %s", status, result) + sys.exit(1) + + # otherwise response ok + self.record = df_r.Record.parse(response.json()) + return self.record + + def update(self, status, oai_urn, **kwargs): + """Store status update && send message to OAI Service""" + if self.logger is not None: + self.logger.debug("set status '%s' for urn '%s'", status, oai_urn) + self.record = df_r.Record(oai_urn) + self.record.state = status + # if we have to report somethin' new, then append it + if kwargs is not None and len(kwargs) > 0: + try: + self.record.info = kwargs + except AttributeError as attr_err: + self.logger.error("info update failed for %s: %s (prev:%s, in:%s)", + self.record.identifier, + attr_err.args[0], + self.record.info, kwargs) + if self.logger is not None: + self.logger.debug("update record %s url %s", self.record, self.oai_server_url) + return requests.post(f'{self.oai_server_url}/update', json=self.record.dict(), timeout=60) + + +def run_server(host, port, start_data: HandlerInformation): + """start server to process requests + for local file resources""" + + the_logger = start_data.logger + the_logger.info("server starts listen at: %s:%s", host, port) + the_logger.info("deliver record files from: %s", start_data.data_path) + the_logger.info("call next record with: %s:%s//next", host, port) + the_logger.info("post update data with: %s:%s//update", host, port) + the_logger.info("stop server press CTRL+C") + the_handler = functools.partial(RecordRequestHandler, start_data) + with http.server.HTTPServer((host, int(port)), the_handler) as the_server: + try: + the_server.serve_forever(5.0) + except KeyboardInterrupt: + the_server.shutdown() + the_logger.info("shutdown record server (%s:%s)", host, port) diff --git a/tests/test_digiflow_record_handler.py b/tests/test_digiflow_record_handler.py index 65e07fc..44ab4cc 100644 --- a/tests/test_digiflow_record_handler.py +++ b/tests/test_digiflow_record_handler.py @@ -250,30 +250,30 @@ def test_record_datestamp(oai_record_list): """Check if proper datestamp gets picked""" # arrange - _handler = df_r.RecordHandler(oai_record_list) + hndlr = df_r.RecordHandler(oai_record_list) # act - _record = _handler.next_record() + rcrd: df_r.Record = hndlr.next_record() # assert - assert _record.identifier == 'oai:digitale.bibliothek.uni-halle.de/zd:9510508' - assert _record.local_identifier == '9510508' - assert _record.date_stamp == '2015-08-25T20:00:35Z' + assert rcrd.identifier == 'oai:digitale.bibliothek.uni-halle.de/zd:9510508' + assert rcrd.local_identifier == '9510508' + assert rcrd.created_time == '2015-08-25T20:00:35Z' def test_record_get_fullident(oai_record_list): """Check if proper datestamp gets picked""" # arrange - _ident_urn = 'oai:digitale.bibliothek.uni-halle.de/zd:9510508' - _handler = df_r.RecordHandler(oai_record_list) + ident_urn = 'oai:digitale.bibliothek.uni-halle.de/zd:9510508' + hndlr = df_r.RecordHandler(oai_record_list) # act - _record = _handler.get(_ident_urn) + rcrd: df_r.Record = hndlr.get(ident_urn) # assert - assert _record.identifier == _ident_urn - assert _record.date_stamp == '2015-08-25T20:00:35Z' + assert rcrd.identifier == ident_urn + assert rcrd.created_time == '2015-08-25T20:00:35Z' def test_record_get_partialident(oai_record_list): @@ -282,17 +282,17 @@ def test_record_get_partialident(oai_record_list): identifier) has been provided""" # arrange - _ident_urn = 'oai:digitale.bibliothek.uni-halle.de/zd:9510508' - _handler = df_r.RecordHandler(oai_record_list) + ident_urn = 'oai:digitale.bibliothek.uni-halle.de/zd:9510508' + handler = df_r.RecordHandler(oai_record_list) # act - _record_exact = _handler.get('9510508') - _record_fuzzy = _handler.get('9510508', exact_match=False) + record_exact = handler.get('9510508') + record_fuzzy: df_r.Record = handler.get('9510508', exact_match=False) # assert - assert not _record_exact - assert _record_fuzzy.identifier == _ident_urn - assert _record_fuzzy.date_stamp == '2015-08-25T20:00:35Z' + assert not record_exact + assert record_fuzzy.identifier == ident_urn + assert record_fuzzy.created_time == '2015-08-25T20:00:35Z' def test_record_get_non_existent(oai_record_list): diff --git a/tests/test_record_service.py b/tests/test_record_service.py new file mode 100644 index 0000000..ab0c47c --- /dev/null +++ b/tests/test_record_service.py @@ -0,0 +1,125 @@ +"""API record service""" + +import ast +import typing + +import pytest + +import digiflow.record as df_r + + +RECORD_IDENTIFIER = 'IDENTIFIER' +RECORD_INFO = 'INFO' +INFO_N_OCR = 'n_ocr' + + +def test_update_record_info_plain(): + """Behavior when updating record + with single, common info field + """ + + # arrange + the_urn = "oai:opendata.uni-halle.de:1981185920/38841" + rec: typing.Dict = {RECORD_IDENTIFIER: the_urn} + rec[RECORD_INFO] = "{'gvk-ppn': '1764064194', 'pica': 'Aa', 'pages': 9, 'languages': ['ger']}" + new_kwargs = {INFO_N_OCR: 5} + + # act + curr_info = ast.literal_eval(rec[RECORD_INFO]) + curr_info.update(**new_kwargs) + rec[RECORD_INFO] = curr_info + + # assert + assert isinstance(curr_info, dict) + assert INFO_N_OCR in rec[RECORD_INFO] + assert rec[RECORD_INFO][INFO_N_OCR] == 5 + + +def test_explore_update_record_info_with_multiple_info_entries(): + """Behavior when updating record + with single, common info field + """ + + # arrange + the_urn = "oai:opendata.uni-halle.de:1981185920/38841" + rec: typing.Dict = {RECORD_IDENTIFIER: the_urn} + rec[RECORD_INFO] = "{'ppn': '334587093', 'pica': 'Af', 'pages': 575, 'languages': ['ger']},\ + {'client': '141.48.10.246'},\ + {'ppn': '334587093', 'pica': 'Af', 'pages': 575, 'languages': ['ger']}" + new_kwargs = {INFO_N_OCR: 5} + + # act + curr_info = ast.literal_eval(rec[RECORD_INFO]) + + with pytest.raises(AttributeError) as attr_exc: + curr_info.update(**new_kwargs) + + # assert + assert isinstance(curr_info, tuple) + assert "'tuple' object has no attribute 'update'" in attr_exc.value.args[0] + + +def test_fix_update_record_info_with_multiple_info_entries(): + """Behavior when updating record + with single, common info field + """ + + # arrange + the_urn = "oai:opendata.uni-halle.de:1981185920/38841" + rec: typing.Dict = {RECORD_IDENTIFIER: the_urn} + rec[RECORD_INFO] = "{'ppn': '334587093', 'pica': 'Af', 'pages': 575, 'languages': ['ger']},\ + {'client': '141.48.10.246'},\ + {'ppn': '334587093', 'pica': 'Af', 'pages': 575, 'languages': ['ger']}" + new_kwargs = {INFO_N_OCR: 5} + + # act + curr_info = ast.literal_eval(rec[RECORD_INFO]) + if isinstance(curr_info, tuple): + curr_info[-1].update(**new_kwargs) + rec[RECORD_INFO] = f'{curr_info[-1]}' + + # assert + assert isinstance(rec, dict) + assert f"'{INFO_N_OCR}': 5" in rec[RECORD_INFO] + expected = "{'ppn': '334587093', 'pica': 'Af', 'pages': 575, 'languages': ['ger'], 'n_ocr': 5}" + assert expected == rec[RECORD_INFO] + + +def test_record_update_dict_and_string(): + """Common situation, when existing string needs + to be merged with dictionary to prevent + + TypeError: Record.info() takes 1 positional argument but 2 were given + """ + + # arrange + the_urn = "oai:opendata.uni-halle.de:1981185920/38841" + org_info = "{'ppn': '334587093', 'pica': 'Af', 'pages': 575, 'languages': ['ger']}" + record: df_r.Record = df_r.Record(the_urn) + record.info = "{'ppn': '334587093', 'pica': 'Af', 'pages': 575, 'languages': ['ger']}" + to_update = {'client': '127.0.0.1'} + record.info = org_info + + # act + record.info = to_update + + # assert + assert 'ppn' in record.info + assert 'client' in record.info + + +def test_record_update_dealing_invalid_data(): + """Common situation with legacy INFO data + ValueError: malformed node or string + """ + + # arrange + the_urn = "oai:opendata.uni-halle.de:1981185920/38841" + org_info = "ppn#334587093, pica#Af', 'pages': 575, 'languages': ['ger']" + record: df_r.Record = df_r.Record(the_urn) + + # act + record.info = org_info + + # assert + assert 'ppn#3345' in record.info