From 18901d04a8dc87d20c0c4cb364facb18bbc7a4ba Mon Sep 17 00:00:00 2001 From: Peter Weber Date: Tue, 10 Dec 2024 17:01:13 +0100 Subject: [PATCH] apiharvester: cantook harvesting * Adds VS and NJ CANTOOK API harvesting. * Deletes OAI ebooks harvesting. * Closes #3718. Co-Authored-by: Peter Weber --- data/apisources.yml | 17 +- data/oaisources.yml | 24 - pyproject.toml | 2 - rero_ils/config.py | 24 +- rero_ils/modules/apiharvester/api.py | 158 +++++ .../cantook}/__init__.py | 4 +- rero_ils/modules/apiharvester/cantook/api.py | 190 ++++++ .../cantook/dojson/json}/__init__.py | 9 +- .../apiharvester/cantook/dojson/json/model.py | 201 ++++++ rero_ils/modules/apiharvester/cli.py | 127 ++-- rero_ils/modules/apiharvester/errors.py | 44 ++ rero_ils/modules/apiharvester/models.py | 29 +- rero_ils/modules/apiharvester/signals.py | 24 - rero_ils/modules/apiharvester/tasks.py | 61 +- rero_ils/modules/apiharvester/utils.py | 154 ++--- rero_ils/modules/cli/reroils.py | 2 - .../modules/documents/serializers/base.py | 2 +- rero_ils/modules/ebooks/cli.py | 93 --- rero_ils/modules/ebooks/dojson/__init__.py | 18 - .../modules/ebooks/dojson/contrib/__init__.py | 18 - .../ebooks/dojson/contrib/marc21/model.py | 511 --------------- rero_ils/modules/ebooks/receivers.py | 63 -- rero_ils/modules/ebooks/tasks.py | 107 ---- rero_ils/modules/ebooks/utils.py | 192 ------ rero_ils/modules/entities/api.py | 4 +- rero_ils/modules/ext.py | 4 - rero_ils/modules/files/operations.py | 2 +- rero_ils/modules/organisations/api.py | 21 +- rero_ils/modules/stats/api/indicators/base.py | 6 +- scripts/setup | 20 +- setup.py | 3 - tests/api_harvester/cantook | 0 tests/api_harvester/test_cli_api_harcester.py | 113 ++++ tests/data/xml/ebook1.xml | 89 --- tests/data/xml/ebook2.xml | 73 --- tests/fixtures/metadata.py | 16 - tests/ui/documents/test_documents_api.py | 64 -- tests/ui/ebooks/test_ebooks_receivers.py | 116 ---- tests/ui/ebooks/test_ebooks_utils.py | 36 -- .../documents/test_documents_dojson_ebooks.py | 
597 ------------------ 40 files changed, 942 insertions(+), 2296 deletions(-) delete mode 100644 data/oaisources.yml create mode 100644 rero_ils/modules/apiharvester/api.py rename rero_ils/modules/{ebooks => apiharvester/cantook}/__init__.py (92%) create mode 100644 rero_ils/modules/apiharvester/cantook/api.py rename rero_ils/modules/{ebooks/dojson/contrib/marc21 => apiharvester/cantook/dojson/json}/__init__.py (85%) create mode 100644 rero_ils/modules/apiharvester/cantook/dojson/json/model.py create mode 100644 rero_ils/modules/apiharvester/errors.py delete mode 100644 rero_ils/modules/apiharvester/signals.py delete mode 100644 rero_ils/modules/ebooks/cli.py delete mode 100644 rero_ils/modules/ebooks/dojson/__init__.py delete mode 100644 rero_ils/modules/ebooks/dojson/contrib/__init__.py delete mode 100644 rero_ils/modules/ebooks/dojson/contrib/marc21/model.py delete mode 100644 rero_ils/modules/ebooks/receivers.py delete mode 100644 rero_ils/modules/ebooks/tasks.py delete mode 100644 rero_ils/modules/ebooks/utils.py create mode 100644 tests/api_harvester/cantook create mode 100644 tests/api_harvester/test_cli_api_harcester.py delete mode 100644 tests/data/xml/ebook1.xml delete mode 100644 tests/data/xml/ebook2.xml delete mode 100644 tests/ui/ebooks/test_ebooks_receivers.py delete mode 100644 tests/ui/ebooks/test_ebooks_utils.py delete mode 100644 tests/unit/documents/test_documents_dojson_ebooks.py diff --git a/data/apisources.yml b/data/apisources.yml index d898d62cff..2f4d666e92 100644 --- a/data/apisources.yml +++ b/data/apisources.yml @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # RERO ILS -# Copyright (C) 2019 RERO +# Copyright (C) 2024 RERO # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by @@ -16,8 +16,13 @@ # along with this program. If not, see . -# OAI-PMH harvester configuration. 
-mef: - url: http://mef.test.rero.ch/api/mef - comment: 'mef persons' - size: 1000 +# API harvester configuration. +VS-CANTOOK: + url: https://mediatheque-valais.cantookstation.eu + classname: 'rero_ils.modules.apiharvester.cantook.api.ApiCantook' + code: 'mv-cantook' + +NJ-CANTOOK: + url: https://bm.ebibliomedia.ch + classname: 'rero_ils.modules.apiharvester.cantook.api.ApiCantook' + code: 'ebibliomedia' diff --git a/data/oaisources.yml b/data/oaisources.yml deleted file mode 100644 index 1f8727cbea..0000000000 --- a/data/oaisources.yml +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - - -# OAI-PMH harvester configuration. 
-ebooks: - baseurl: https://ebooks.test.rero.ch:8443/oai2d - metadataprefix: marc21 - comment: '' - setspecs: '' diff --git a/pyproject.toml b/pyproject.toml index 77840655ab..5a8101c9e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -154,7 +154,6 @@ reverse = "rero_ils.dojson.cli:reverse" pjson = "rero_ils.dojson.cli:pretty_json_dump" [tool.poetry.plugins."dojson.cli.rule"] -marc21_ebooks_to_json = "rero_ils.modules.ebooks.dojson.contrib.marc21:marc21" marc21_dnb_to_json = "rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_dnb" marc21_kul_to_json = "rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_kul" marc21_loc_to_json = "rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_loc" @@ -227,7 +226,6 @@ apiharvester = "rero_ils.modules.apiharvester.tasks" collections = "rero_ils.modules.collections.tasks" documents = "rero_ils.modules.documents.tasks" remote_entities = "rero_ils.modules.entities.remote_entities.tasks" -ebooks = "rero_ils.modules.ebooks.tasks" holdings = "rero_ils.modules.holdings.tasks" items = "rero_ils.modules.items.tasks" loans = "rero_ils.modules.loans.tasks" diff --git a/rero_ils/config.py b/rero_ils/config.py index 7ce7b5ec4d..136d7ae977 100644 --- a/rero_ils/config.py +++ b/rero_ils/config.py @@ -392,12 +392,6 @@ def _(x): "schedule": timedelta(minutes=60), "enabled": False, }, - "ebooks-harvester": { - "task": "invenio_oaiharvester.tasks.list_records_from_dates", - "schedule": crontab(minute=22, hour=22), - "kwargs": {"name": "ebooks"}, - "enabled": False, - }, "notification-creation": { "task": "rero_ils.modules.notifications.tasks.create_notifications", "schedule": crontab(minute=0, hour=3), # Every day at 05:00 UTC, @@ -526,12 +520,18 @@ def _(x): "kwargs": {"delete": True}, "enabled": False, }, - # "mef-harvester": { - # "task": "rero_ils.modules.apiharvester.tasks.harvest_records", - # "schedule": timedelta(minutes=60), - # "kwargs": {"name": "mef", "enabled": False), - # "enabled": False, - # }, + 
class ApiHarvest:
    """Base class for API harvesters.

    Loads the stored ``ApiHarvestConfig`` with the given name and collects,
    for every organisation harvesting from the configured source code, the
    online locations together with the optional library specific source URL.

    Subclasses implement :meth:`get_request_url` and
    :meth:`create_update_record` and usually override :meth:`harvest_records`.

    config: saved config from ApiHarvester class
    file_name: to save records to file
    process: create harvested records
    harvest_count: how many records to harvest (``None`` or negative: all)
    verbose: print verbose messages
    """

    def __init__(
        self, name, file_name=None, process=False, harvest_count=None, verbose=False
    ):
        """Class init.

        :param name: name of the stored API harvest config
        :param file_name: stored as ``self.file``; :meth:`save_record` calls
            its ``write`` method for every record
        :param process: create/update records if True
        :param harvest_count: maximum number of records to process,
            ``None`` or a negative value means no limit
        :param verbose: print verbose messages
        :raises NameError: if no config with the given name exists
        """
        config = self.get_config(name)
        if not config:
            raise NameError(f"API Config not found: {name}")
        self.config = config
        self.file = file_name
        self.process = process
        # ``None`` used to crash the ``>= 0`` comparison in process_records;
        # normalise it to the "no limit" value the subclasses already use.
        self.harvest_count = -1 if harvest_count is None else harvest_count
        self.verbose = verbose
        self._vendor = None
        self._url = self.config.url
        self._code = self.config.code
        self._count = 0
        self._count_new = 0
        self._count_upd = 0
        self._count_del = 0
        self._info = self._get_online_info()

    def _get_online_info(self):
        """Collect item type and online locations per harvesting organisation.

        :returns: dict keyed by organisation pid with ``item_type_pid`` and
            ``locations`` (location pid -> library source URL or None)
        """
        info = {}
        for organisation in Organisation.get_records_by_online_harvested_source(
            self._code
        ):
            locations = {}
            for location_pid in organisation.get_online_locations():
                locations[location_pid] = None
                location = Location.get_record_by_pid(location_pid)
                # A dangling location pid or a location without library must
                # not abort the whole harvester initialisation.
                library = location.get_library() if location else None
                if not library:
                    continue
                if url := library.get_online_harvested_source_url(source=self._code):
                    locations[location_pid] = url
            info[organisation.pid] = {
                "item_type_pid": organisation.online_circulation_category(),
                "locations": locations,
            }
        return info

    @classmethod
    def get_config(cls, name):
        """Get config.

        :param name: name of config
        :returns: first matching ``ApiHarvestConfig`` or None
        """
        return ApiHarvestConfig.query.filter_by(name=name).first()

    def get_request_url(self, start_date="1990-01-01", page=1):
        """Get request URL.

        Has to be implemented by the subclass.

        :param start_date: date from where records have to be harvested
        :param page: page from where records have to be harvested
        """
        raise NotImplementedError()

    def create_update_record(self, record):
        """Create new record or update record.

        Has to be implemented by the subclass.

        :param record: record to create or update
        :returns: tuple of pid and harvest action status
        """
        raise NotImplementedError()

    def save_record(self, record):
        """Save record to file.

        Does nothing if no file was configured.

        :param record: record to write to file
        """
        if self.file:
            self.file.write(record)

    def msg_text(self, pid, msg):
        """Logging message text.

        :param pid: pid of the processed record
        :param msg: message to log
        :returns: formatted message with counter, vendor and source code
        """
        return f"{self._count}: {self._vendor}:{self._code} {pid} = {msg}"

    def process_records(self, records):
        """Process records.

        Every record is counted and saved; it is created or updated only if
        ``process`` is set. Processing stops as soon as ``harvest_count``
        records were handled.

        :param records: records to process
        """
        limit = self.harvest_count
        # ``None`` and negative values both mean "no limit".
        limited = limit is not None and limit >= 0
        for record in records:
            if limited and self._count >= limit:
                break
            self._count += 1
            self.save_record(record)
            if self.process:
                pid, status = self.create_update_record(record)
                self.verbose_print(self.msg_text(pid=pid, msg=status.value))

    def verbose_print(self, msg):
        """Print verbose message.

        :param msg: message to print if verbose
        """
        if self.verbose:
            click.echo(msg)

    def harvest_records(self, from_date):
        """Harvest records from servers.

        The base implementation harvests nothing; subclasses override it.

        :param from_date: records changed after this date to harvest
        :returns: number of processed records
        """
        self.process_records([])
        return self._count

    @property
    def count(self):
        """Get count."""
        return self._count

    @property
    def count_new(self):
        """Get new count."""
        return self._count_new

    @property
    def count_upd(self):
        """Get updated count."""
        return self._count_upd

    @property
    def count_del(self):
        """Get deleted count."""
        return self._count_del
class ApiCantook(ApiHarvest):
    """ApiCantook class.

    Class for harvesting ebooks from cantook API resources.
    """

    # Seconds to wait for the cantook server; without a timeout a stalled
    # connection would block the harvesting task forever.
    REQUEST_TIMEOUT = 60

    def __init__(
        self, name, file_name=None, process=False, harvest_count=-1, verbose=False
    ):
        """Class init.

        :param name: name of the stored API harvest config
        :param file_name: name of the JSON file to save harvested records to
        :param process: create/update records if True
        :param harvest_count: maximum number of records to harvest,
            a negative value means no limit
        :param verbose: print verbose messages
        """
        super().__init__(
            name=name,
            process=process,
            harvest_count=harvest_count,
            verbose=verbose,
        )
        if file_name:
            self.file = JsonWriter(file_name)
        self._vendor = "CANTOOK"

    def get_request_url(self, start_date="1990-01-01", page=1):
        """Get request URL.

        :param start_date: date from where records have to be harvested
        :param page: page from where records have to be harvested
        :returns: URL of the cantook resources endpoint
        """
        params = f"start_at={start_date}&page={page}"
        return f"{self._url}/v1/resources.json?{params}"

    def delete_holdings(self, document_pid):
        """Delete the holdings of a document created for this source.

        Only holdings having an electronic location with the source code of
        this harvester are deleted.

        :param document_pid: document pid
        """
        for hold_pid in list(Holding.get_holdings_pid_by_document_pid(document_pid)):
            holding = Holding.get_record_by_pid(hold_pid)
            if not holding:
                continue
            # Holdings without electronic locations belong to somebody else
            # and must not raise a KeyError.
            if any(
                electronic_location.get("source") == self._code
                for electronic_location in holding.get("electronic_location", [])
            ):
                holding.delete(dbcommit=True, delindex=True)

    def create_holdings(self, document_pid, link):
        """Create the missing electronic holdings for a document.

        One holding is created for every online location of every harvesting
        organisation, unless such a holding already exists.

        :param document_pid: document pid
        :param link: link to cantook document
        """
        if not link:
            # Without a link there is no URI for the electronic location.
            self.verbose_print(f"No link for document: {document_pid}")
            return
        # Path part of the cantook link (everything after scheme and host).
        link_path = link.split("/")[3:]
        holdings = []
        for info in self._info.values():
            item_type_pid = info["item_type_pid"]
            for location_pid, url in info["locations"].items():
                # Always derive the location link from the original link, so
                # the URL of one library never leaks into the next location.
                location_link = link
                if url:
                    location_link = "/".join([url.rstrip("/"), *link_path])
                # See if the holding already exist
                query = (
                    HoldingsSearch()
                    .filter("term", document__pid=document_pid)
                    .filter("term", location__pid=location_pid)
                    .filter("term", holdings_type="electronic")
                    .filter("term", electronic_location__source=self._code)
                )
                if query.count() == 0:
                    holding = create_holding(
                        document_pid=document_pid,
                        location_pid=location_pid,
                        item_type_pid=item_type_pid,
                        electronic_location={
                            "source": self._code,
                            "uri": location_link,
                        },
                        holdings_type="electronic",
                    )
                    holdings.append(holding)
        db.session.commit()
        for holding in holdings:
            holding.reindex()

    def create_update_record(self, data):
        """Create, update or delete record.

        :param data: data for record operation
        :returns: tuple of document pid (or harvested id) and
            ``HarvestActionType`` status
        """
        status = HarvestActionType.NOTSET
        record_data = cantook_json.do(data)
        if record_data.pop("deleted", None):
            status = HarvestActionType.DELETED
        link = record_data.pop("link", None)
        # See if we have this document already
        harvested_id = record_data.pop("pid")
        query = (
            DocumentsSearch()
            .filter("term", identifiedBy__value__raw=harvested_id)
            .source(includes=["pid"])
        )
        hit = next(iter(query.scan()), None)
        pid = hit.pid if hit else None
        if pid:
            if doc := Document.get_record_by_pid(pid):
                record_data["pid"] = doc["pid"]
                # TODO: Do we have always to replace the document?
                record = doc.replace(data=record_data, dbcommit=True, reindex=True)
                if status == HarvestActionType.DELETED:
                    self._count_del += 1
                    self.delete_holdings(document_pid=pid)
                    # Try to delete document (we have to delete `harvested`
                    # for this)
                    doc.pop("harvested", None)
                    # NOTE(review): `reasons_not_to_delete` is used as an
                    # attribute like in the original code; if it is a method
                    # it has to be called - confirm against Document.
                    if not doc.reasons_not_to_delete:
                        doc.delete(dbcommit=True, delindex=True)
                        pid = harvested_id
                else:
                    # Only mark as updated when the record is not deleted,
                    # otherwise the deletion above is never reached.
                    status = HarvestActionType.UPDATED
                    self._count_upd += 1
                    # TODO: do we have to delete and recreate holdings ?
                    self.create_holdings(document_pid=record.pid, link=link)
        elif status == HarvestActionType.NOTSET:
            self._count_new += 1
            status = HarvestActionType.CREATED
            record = Document.create(data=record_data, dbcommit=True, reindex=True)
            self.create_holdings(document_pid=record.pid, link=link)
        return pid or harvested_id, status

    def harvest_records(self, from_date):
        """Harvest cantook records.

        Requests the resource pages one after the other until the last page
        is processed, ``harvest_count`` records were handled or the server
        answers with an error status.

        :param from_date: record changed after this date to get
        :returns: tuple of processed record count and total items reported
            by the server
        """
        self._count = 0
        limit = self.harvest_count
        unlimited = limit is None or limit < 0
        # Count the pages ourselves: relying on the `X-Current-Page` header
        # of the server could loop forever if the header does not advance.
        page = 1
        url = self.get_request_url(start_date=from_date, page=page)
        request = requests_get(url, timeout=self.REQUEST_TIMEOUT)
        total_pages = int(request.headers.get("X-Total-Pages", 0))
        total_items = int(request.headers.get("X-Total-Items", 0))
        while page <= total_pages and (unlimited or self._count < limit):
            if request.status_code != requests_codes.ok:
                self.verbose_print(
                    f"API page: {page} url: {url} status: {request.status_code}"
                )
                break
            self.verbose_print(f"API page: {page} url: {url}")
            self.process_records(request.json().get("resources", []))
            page += 1
            if page > total_pages:
                break
            url = self.get_request_url(start_date=from_date, page=page)
            request = requests_get(url, timeout=self.REQUEST_TIMEOUT)
        return self._count, total_items
"""Cantook JSON data conversion."""

from .model import cantook_json

# ``__all__`` has to be a sequence of names; a plain string would be iterated
# character by character by ``from ... import *``.
__all__ = ("cantook_json",)
class Transformation(object):
    """Transformation CANTOOK Json to RERO-ILS Json.

    Every method whose name starts with ``trans_`` converts one part of the
    cantook resource in ``self.data`` and stores the result in
    ``self.json_dict``. The methods are called in alphabetical order.
    """

    def __init__(self, data=None, logger=None, verbose=False, transform=True):
        """Constructor.

        :param data: cantook json data to transform
        :param logger: logger, only stored on the instance
        :param verbose: verbose flag, only stored on the instance
        :param transform: transform ``data`` directly if True
        """
        self.data = data
        self.logger = logger
        self.verbose = verbose
        self.json_dict = {}
        if data and transform:
            self._transform()

    def _transform(self):
        """Call the transformation functions.

        The names come sorted from ``dir``: ``trans_constants`` therefore
        runs before ``trans_identified_by``, which may replace the default
        document type.
        """
        for name in dir(self):
            if name.startswith("trans_"):
                getattr(self, name)()

    def do(self, data):
        """Do the transformation.

        :param data: json data to transform
        :returns: rero-ils document data
        """
        self.data = data
        # Start with a fresh result: the module level instance is reused for
        # every record and optional fields of a previous record (summary,
        # subjects, extent, ...) must not leak into the next one.
        self.json_dict = {}
        self._transform()
        return self.json_dict

    @property
    def json(self):
        """Json data, None if nothing was transformed."""
        return self.json_dict or None

    def trans_constants(self):
        """Add constants."""
        self.json_dict["$schema"] = get_schema_for_resource("doc")
        self.json_dict["harvested"] = True
        self.json_dict["issuance"] = {
            "main_type": "rdami:1001",
            "subtype": "materialUnit",
        }
        self.json_dict["adminMetadata"] = {"encodingLevel": "Not applicable"}
        self.json_dict["type"] = [
            {"main_type": "docmaintype_book", "subtype": "docsubtype_e-book"}
        ]

    def trans_pid(self):
        """Transformation pid."""
        self.json_dict["pid"] = f"cantook-{self.data['id']}"

    def trans_identified_by(self):
        """Transformation identifiedBy.

        Adds the local cantook identifier and the ISBNs of the media. An
        audio medium changes the document type to audio book.
        """
        identified_by = [
            {
                "source": "CANTOOK",
                "type": "bf:Local",
                "value": f"cantook-{self.data['id']}",
            }
        ]
        # `media` may be missing or null in the cantook data.
        for media in self.data.get("media") or []:
            nature = media.get("nature")
            if (
                nature in ["paper", "epub", "audio"]
                and media.get("key_type") == "isbn13"
            ):
                identified_by.append(
                    {"type": "bf:Isbn", "value": media.get("key"), "note": nature}
                )
            if nature == "audio":
                self.json_dict["type"] = [
                    {
                        "main_type": "docmaintype_audio",
                        "subtype": "docsubtype_audio_book",
                    }
                ]
        self.json_dict["identifiedBy"] = identified_by

    def trans_titel(self):
        """Transformation title."""
        title = {"type": "bf:Title"}
        if maintitle := self.data.get("title"):
            title["mainTitle"] = [{"value": maintitle}]
        if subtitle := self.data.get("subtitle"):
            title["subtitle"] = [{"value": subtitle}]
        self.json_dict["title"] = [title]

    def trans_provision_activity(self):
        """Transformation provisionActivity.

        The year of ``created_at`` is used as publication date.
        NOTE(review): ``publisher_name`` and a parsable ``created_at`` are
        expected in every resource - confirm against the cantook API.
        """
        publisher_name = self.data["publisher_name"]
        start_date = dateparser.parse(self.data["created_at"])
        self.json_dict["provisionActivity"] = [
            {
                "startDate": start_date.year,
                "statement": [
                    {"label": [{"value": publisher_name}], "type": "bf:Agent"},
                    {"label": [{"value": str(start_date.year)}], "type": "Date"},
                ],
                "type": "bf:Publication",
            }
        ]

    def trans_electronic_locator(self):
        """Transformation electronicLocator (cover image and extract)."""
        electronic_locators = []
        if cover := self.data.get("cover"):
            electronic_locators.append(
                {
                    "content": "coverImage",
                    "type": "relatedResource",
                    "url": cover,
                }
            )
        if flipbook := self.data.get("flipbook"):
            electronic_locators.append(
                {
                    "content": "extract",
                    "type": "relatedResource",
                    "url": flipbook,
                }
            )
        if electronic_locators:
            self.json_dict["electronicLocator"] = electronic_locators

    def trans_fiction(self):
        """Transformation fiction."""
        # The unspecified value is only the fallback; it must not overwrite
        # the fiction value.
        if self.data.get("fiction"):
            statement = DocumentFictionType.Fiction
        else:
            statement = DocumentFictionType.Unspecified
        self.json_dict["fiction_statement"] = statement.value

    def trans_language(self):
        """Transformation language."""
        if languages := [
            {"type": "bf:Language", "value": language}
            for language in self.data.get("languages", [])
        ]:
            self.json_dict["language"] = languages

    def trans_subjects(self):
        """Transformation subjects (french captions of the classifications)."""
        subjects = [
            {
                "entity": {
                    "authorized_access_point": subject,
                    "type": "bf:Topic",
                }
            }
            for classification in self.data.get("classifications", [])
            for caption in classification.get("captions", [])
            if (subject := caption.get("fr"))
        ]
        if subjects:
            self.json_dict["subjects"] = subjects

    def trans_summary(self):
        """Transformation summary."""
        if summary := self.data.get("summary"):
            self.json_dict["summary"] = [{"label": [{"value": summary}]}]

    def trans_extent(self):
        """Transformation extent."""
        if page_count := self.data.get("page_count"):
            self.json_dict["extent"] = f"{page_count} pages"

    # to be used to create holdings
    def trans_links(self):
        """Transformation links."""
        if link := self.data.get("link"):
            self.json_dict["link"] = link

    # to be used for deleted records
    def trans_deleted(self):
        """Transformation deleted."""
        if unavailable_since := self.data.get("unavailable_since"):
            self.json_dict["deleted"] = unavailable_since


cantook_json = Transformation()
-38,48 +39,40 @@ def apiharvester(): """Api harvester commands.""" -@apiharvester.command("source") +@apiharvester.command("add-source") @click.argument("name") @click.option("-U", "--url", default="", help="Url") -@click.option("-m", "--mimetype", default="", help="Mimetype") -@click.option("-s", "--size", default=-1, type=int, help="Size") -@click.option("-c", "--comment", default="", help="Comment") +@click.option("-n", "--classname", default="", help="Class name") +@click.option("-c", "--code", default="", help="Code") @click.option("-u", "--update", is_flag=True, default=False, help="Update config") @with_appcontext -def api_source_config(name, url, mimetype, size, comment, update): +def add_api_source_config(name, url, classname, code, update): """Add or Update ApiHarvestConfig.""" - click.echo(f"ApiHarvesterConfig: {name} ", nl=False) - msg = api_source( - name=name, url=url, mimetype=mimetype, size=size, comment=comment, update=update - ) - click.echo(msg) + msg = api_source(name=name, url=url, classname=classname, code=code, update=update) + click.echo(f"ApiHarvestConfig {name}: {msg}") -@apiharvester.command("sources") +@apiharvester.command("init-config") @click.argument("configfile", type=click.File("rb")) @click.option("-u", "--update", is_flag=True, default=False, help="Update config") @with_appcontext -def api_source_config_from_file(configfile, update): +def init_api_harvest_config(configfile, update): """Add or update ApiHarvestConfigs from file.""" - configs = yaml.load(configfile, Loader=yaml.FullLoader) - for name, values in sorted(configs.items()): - url = values.get("url", "") - mimetype = values.get("mimetype", "") - size = values.get("size", 100) - comment = values.get("comment", "") - click.echo(f"ApiHarvesterConfig: {name} {url} ", nl=False) - msg = api_source( - name=name, - url=url, - mimetype=mimetype, - size=size, - comment=comment, - update=update, - ) - click.echo(msg) + if configs := yaml.load(configfile, Loader=yaml.FullLoader): + 
for name, values in sorted(configs.items()): + url = values.get("url", "") + classname = values.get("classname", "") + code = values.get("code", "") + msg = api_source( + name=name, url=url, classname=classname, code=code, update=update + ) + click.echo(f"ApiHarvestConfig {name}: {msg}") + + else: + click.secho(f"ERROR: no YML config found in: {configfile.name}", fg="red") -@apiharvester.command("harvest") +@apiharvester.command() @click.option( "-n", "--name", default=None, help="Name of persistent configuration to use." ) @@ -89,12 +82,6 @@ def api_source_config_from_file(configfile, update): default=None, help="The lower bound date for the harvesting (optional).", ) -@click.option( - "-u", - "--url", - default=None, - help="The upper bound date for the harvesting (optional).", -) @click.option( "-k", "--enqueue", @@ -102,46 +89,30 @@ def api_source_config_from_file(configfile, update): default=False, help="Enqueue harvesting and return immediately.", ) -@click.option( - "--signals/--no-signals", - default=True, - help="Signals sent with Api harvesting results.", -) -@click.option("-s", "--size", type=int, default=0, help="Size of chunks (optional).") @click.option( "-m", - "--max_results", + "--harvest_count", type=int, - default=0, + default=-1, help="maximum of records to harvest (optional).", ) @click.option("-v", "--verbose", "verbose", is_flag=True, default=False) @with_appcontext -def harvest(name, from_date, url, enqueue, signals, size, max_results, verbose): - """Harvest api.""" +def harvest(name, from_date, enqueue, harvest_count, verbose): + """Harvest records from an API repository.""" if name: click.secho(f"Harvest api: {name}", fg="green") - elif url: - click.secho(f"Harvest api: {url}", fg="green") + if from_date: + from_date = dateparser.parse(from_date).isoformat() if enqueue: - harvest_records.delay( - url=url, - name=name, - from_date=from_date, - signals=signals, - size=size, - max_results=max_results, - verbose=verbose, + async_id = 
api_harvest_records.delay( + name=name, from_date=from_date, harvest_count=harvest_count, verbose=verbose ) + if verbose: + click.echo(f"AsyncResult {async_id}") else: - harvest_records( - url=url, - name=name, - from_date=from_date, - signals=signals, - size=size, - max_results=max_results, - verbose=verbose, + api_harvest_records( + name=name, from_date=from_date, harvest_count=harvest_count, verbose=verbose ) @@ -149,11 +120,23 @@ def harvest(name, from_date, url, enqueue, signals, size, max_results, verbose): @with_appcontext def info(): """List infos for tasks.""" - apis = ApiHarvestConfig.query.all() + apis = ApiHarvestConfig.query.order_by(ApiHarvestConfig.name.asc()).all() for api in apis: click.echo(api.name) - click.echo(f"\tlastrun : {api.lastrun}") - click.echo(f"\turl : {api.url}") - click.echo(f"\tmimetype : {api.mimetype}") - click.echo(f"\tsize : {api.size}") - click.echo(f"\tcomment : {api.comment}") + click.echo(f"\tlastrun : {api.lastrun}") + click.echo(f"\turl : {api.url}") + click.echo(f"\tclassname : {api.classname}") + click.echo(f"\tcode : {api.code}") + + +@apiharvester.command() +@click.argument("name") +@click.option("-d", "--date", default=None, help="Set last run (default: now).") +@with_appcontext +def set_last_run(name, date): + """Set last run.""" + if config := get_apiharvest_object(name=name): + new_date = config.update_lastrun(new_date=date) + click.secho(f"Set last run {name}: {new_date}", fg="green") + else: + click.secho(f"No config found: {name}", fg="red") diff --git a/rero_ils/modules/apiharvester/errors.py b/rero_ils/modules/apiharvester/errors.py new file mode 100644 index 0000000000..f4b5c6303b --- /dev/null +++ b/rero_ils/modules/apiharvester/errors.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# +# RERO ILS +# Copyright (C) 2024 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, 
# --- rero_ils/modules/apiharvester/errors.py ---


class ApiHarvesterError(Exception):
    """Base exception for apiharvester.

    Every other exception of this module derives from it, so callers can
    catch this single type to handle any apiharvester failure.
    """


class ApiRequestError(ApiHarvesterError):
    """Error with the Api request."""


class NameOrUrlMissing(ApiHarvesterError):
    """Name or url for harvesting missing."""


class WrongDateCombination(ApiHarvesterError):
    """'Until' date is larger than 'from' date."""

    # NOTE(review): no raise site is visible in this change; confirm which
    # ordering of the 'from'/'until' dates is meant to trigger this error.


class IdentifiersOrDates(ApiHarvesterError):
    """Identifiers cannot be used in combination with dates."""


class ApiHarvesterConfigNotFound(ApiHarvesterError):
    """No ApiHarvesterConfig was found."""


# --- rero_ils/modules/apiharvester/models.py ---


class HarvestActionType(Enum):
    """Harvest action types.

    Each member's value is the member's own name as a string.
    """

    DELETED = "DELETED"
    UPDATED = "UPDATED"
    CREATED = "CREATED"
    NOTSET = "NOTSET"
# --- rero_ils/modules/apiharvester/tasks.py ---


@shared_task(ignore_result=True, soft_time_limit=3600)
def api_harvest_records(name, from_date=None, harvest_count=-1, verbose=False):
    """Harvest records for one stored API harvest configuration.

    :param name: name of the ApiHarvestConfig to use.
    :param from_date: lower bound date for the harvest; when empty the
        ISO formatted ``lastrun`` of the configuration is used.
    :param harvest_count: forwarded unchanged to the harvest class.
    :param verbose: also echo the summary line on the console.
    :return: ``harvest.count`` after the run, or ``-1`` when no
        configuration named ``name`` exists.
    """
    # Local import: the exception type is only needed by this task.
    from .errors import ApiHarvesterConfigNotFound

    # get_apiharvest_object raises when nothing is found instead of
    # returning None; without this guard the error log below was dead code.
    try:
        config = get_apiharvest_object(name=name)
    except ApiHarvesterConfigNotFound:
        config = None
    if not config:
        current_app.logger.error(f"No config found: {name}")
        return -1

    if not from_date:
        from_date = config.lastrun.isoformat()
    # NOTE(review): lastrun is advanced before the harvest runs; if the
    # harvest then fails, the next run starts from this newer date.
    # Confirm this is intended.
    config.update_lastrun()
    msg = f"API harvest {name} class name: {config.classname} "
    msg += f"from date: {from_date} url: {config.url}"
    current_app.logger.info(msg)

    # The configured classname is resolved to the class doing the harvest.
    HarvestClass = obj_or_import_string(config.classname)
    harvest = HarvestClass(
        name=name, verbose=verbose, harvest_count=harvest_count, process=True
    )
    count, total = harvest.harvest_records(from_date=from_date)
    msg = (
        f"API harvest {name} items={total} |"
        f" got={count} new={harvest.count_new}"
        f" updated={harvest.count_upd} deleted={harvest.count_del}"
    )
    if verbose:
        click.echo(msg)
    current_app.logger.info(msg)
    return harvest.count


# --- rero_ils/modules/apiharvester/utils.py ---


def add_set(spec, name, pattern, description="..."):
    """Add OAI set.

    :param spec: set identifier
    :param name: human readable name of the set
    :param pattern: search pattern to get records
    :param description: human readable description
    :return: a status message; any failure is reported as "OAIset exist"
        followed by the error text.
    """
    try:
        oaiset = OAISet(
            spec=spec, name=name, description=description, system_created=False
        )
        oaiset.search_pattern = pattern
        db.session.add(oaiset)
        db.session.commit()
        msg = f"OAIset added: {name}"
    except Exception as err:
        # Best effort: undo the failed insert and report instead of raising.
        db.session.rollback()
        msg = f"OAIset exist: {name} {err}"
    return msg


def api_source(name, url="", classname=None, code="", update=False):
    """Add an ApiHarvestConfig to the DB or update an existing one.

    :param name: name for the configuration
    :param url: harvesting url
    :param classname: class responsible for harvesting the records
    :param code: code added to electronic_location['nonpublic_note']
    :param update: update the configuration if it already exists
    :return: "Add", "Update <changed fields>" or "No Update".
    """
    with current_app.app_context():
        msg = "No Update"
        source = ApiHarvestConfig.query.filter_by(name=name).first()
        if not source:
            source = ApiHarvestConfig(
                name=name, url=url, classname=classname, code=code
            )
            source.save()
            db.session.commit()
            msg = "Add"
        elif update:
            source.name = name
            changes = []
            if url != "":
                source.url = url
                changes.append(f"url:{url}")
            # Only overwrite when a class name was supplied: the column is
            # NOT NULL and the default argument is None, so an unconditional
            # assignment could wipe a valid stored value.
            if classname:
                source.classname = classname
                changes.append(f"classname:{classname}")
            if code != "":
                source.code = code
                changes.append(f"code:{code}")
            db.session.commit()
            msg = f'Update {", ".join(changes)}'
        return msg


def get_apiharvest_object(name):
    """Query and return an ApiHarvestConfig object based on its name.

    The query is attempted up to five times when the database raises an
    ``OperationalError``.

    :param name: The name of the ApiHarvestConfig object.
    :return: The ApiHarvestConfig object.
    :raises ApiHarvesterConfigNotFound: when no configuration with this
        name exists or every query attempt failed (the last database
        error is chained as the cause).
    """
    max_attempts = 5
    obj = None  # stays None when every attempt fails
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            obj = ApiHarvestConfig.query.filter_by(name=name).first()
            last_error = None
            break
        except OperationalError as err:
            last_error = err
            # Give the next attempt a clean transaction to work with.
            db.session.rollback()
            current_app.logger.error(
                "ApiHarvestConfig OperationalError: " f"{attempt} {name}"
            )

    if not obj:
        raise ApiHarvesterConfigNotFound(
            f"Unable to find ApiHarvesterConfig obj with name {name}."
        ) from last_error

    return obj
can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -"""Click command-line interface for ebook record management.""" - -from __future__ import absolute_import, print_function - -import click -import yaml -from flask.cli import with_appcontext -from invenio_oaiharvester.cli import oaiharvester -from invenio_oaiharvester.models import OAIHarvestConfig - -from .utils import add_oai_source - - -@oaiharvester.command("addsource") -@click.argument("name") -@click.argument("baseurl") -@click.option( - "-m", "--metadataprefix", default="marc21", help="The prefix for the metadata" -) -@click.option( - "-s", "--setspecs", default="", help="The ‘set’ criteria for the harvesting" -) -@click.option("-c", "--comment", default="", help="Comment") -@click.option("-u", "--update", is_flag=True, default=False, help="Update config") -@with_appcontext -def add_oai_source_config(name, baseurl, metadataprefix, setspecs, comment, update): - """Add OAIHarvestConfig.""" - click.echo(f"Add OAIHarvestConfig: {name} ", nl=False) - msg = add_oai_source( - name=name, - baseurl=baseurl, - metadataprefix=metadataprefix, - setspecs=setspecs, - comment=comment, - update=update, - ) - click.echo(msg) - - -@oaiharvester.command("initconfig") -@click.argument("configfile", type=click.File("rb")) -@click.option("-u", "--update", is_flag=True, default=False, help="Update config") -@with_appcontext -def init_oai_harvest_config(configfile, update): - """Init OAIHarvestConfig.""" - configs = yaml.load(configfile, 
Loader=yaml.FullLoader) - for name, values in sorted(configs.items()): - baseurl = values["baseurl"] - metadataprefix = values.get("metadataprefix", "marc21") - setspecs = values.get("setspecs", "") - comment = values.get("comment", "") - click.echo(f"Add OAIHarvestConfig: {name} {baseurl} ", nl=False) - msg = add_oai_source( - name=name, - baseurl=baseurl, - metadataprefix=metadataprefix, - setspecs=setspecs, - comment=comment, - update=update, - ) - click.echo(msg) - - -@oaiharvester.command("info") -@with_appcontext -def info(): - """List infos for tasks.""" - oais = OAIHarvestConfig.query.all() - for oai in oais: - click.echo(oai.name) - click.echo("\tlastrun : ", nl=False) - click.echo(oai.lastrun) - click.echo("\tbaseurl : " + oai.baseurl) - click.echo("\tmetadataprefix: " + oai.metadataprefix) - click.echo("\tcomment : " + oai.comment) - click.echo("\tsetspecs : " + oai.setspecs) diff --git a/rero_ils/modules/ebooks/dojson/__init__.py b/rero_ils/modules/ebooks/dojson/__init__.py deleted file mode 100644 index 453d190002..0000000000 --- a/rero_ils/modules/ebooks/dojson/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019-2022 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -"""DOJSON transformations.""" diff --git a/rero_ils/modules/ebooks/dojson/contrib/__init__.py b/rero_ils/modules/ebooks/dojson/contrib/__init__.py deleted file mode 100644 index 28b47606ef..0000000000 --- a/rero_ils/modules/ebooks/dojson/contrib/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019-2022 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -"""DOJSON contrib for rero-ils.""" diff --git a/rero_ils/modules/ebooks/dojson/contrib/marc21/model.py b/rero_ils/modules/ebooks/dojson/contrib/marc21/model.py deleted file mode 100644 index 73c4144dc7..0000000000 --- a/rero_ils/modules/ebooks/dojson/contrib/marc21/model.py +++ /dev/null @@ -1,511 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019-2022 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -"""rero-ils MARC21 model definition.""" - - -import contextlib -import re - -from dojson import utils -from isbnlib import EAN13 - -from rero_ils.dojson.utils import ( - ReroIlsMarc21Overdo, - TitlePartList, - add_note, - extract_subtitle_and_parallel_titles_from_field_245_b, - get_field_items, - get_field_link_data, - make_year, - remove_trailing_punctuation, -) -from rero_ils.modules.documents.dojson.contrib.marc21tojson.utils import do_language -from rero_ils.modules.documents.models import DocumentFictionType -from rero_ils.modules.documents.utils import create_authorized_access_point -from rero_ils.modules.entities.models import EntityType - -marc21 = ReroIlsMarc21Overdo() - - -@marc21.over("issuance", "leader") -@utils.ignore_value -def marc21_to_issuance(self, key, value): - """Set the mode of issuance.""" - self["issuance"] = dict(main_type="rdami:1001", subtype="materialUnit") - if marc21.admin_meta_data: - self["adminMetadata"] = marc21.admin_meta_data - self["fiction_statement"] = DocumentFictionType.Unspecified.value - - -@marc21.over("language", "^008") -@utils.ignore_value -def marc21_to_language_from_008(self, key, value): - """Get languages. - - languages: 008 and 041 [$a, repetitive] - """ - return do_language(self, marc21) - - -@marc21.over("language", "^041") -@utils.ignore_value -def marc21_to_language_from_041(self, key, value): - """Get languages. - - languages: 008 and 041 [$a, repetitive] - """ - # if we dont have languages from 008 try to set it with 041 - return do_language(self, marc21) - - -@marc21.over("identifiedBy", "^020..") -@utils.ignore_value -def marc21_to_identifier_isbn(self, key, value): - """Get identifier isbn. 
- - identifiers_isbn: 020 $a - """ - if isbn13 := EAN13(value.get("a")): - identifiers = self.get("identifiedBy", []) - identifier = {"type": "bf:Isbn", "value": isbn13} - identifiers.append(identifier) - return identifiers - return None - - -@marc21.over("type", "^0248.$") -def marc21_to_type(self, key, value): - """Get document type.""" - if value.get("a").find("cantook") > -1: - return [{"main_type": "docmaintype_book", "subtype": "docsubtype_e-book"}] - return None - - -@marc21.over("identifiedBy", "^035..") -@utils.ignore_value -def marc21_to_identifier_rero_id(self, key, value): - """Get identifier reroId. - - identifiers:reroID: 035$a - """ - identifiers = self.get("identifiedBy", []) - identifier = {"type": "bf:Local", "value": value.get("a")} - identifiers.append(identifier) - return identifiers - - -@marc21.over("contribution", "(^100|^700|^710|^711)..") -@utils.for_each_value -@utils.ignore_value -def marc21_to_contribution(self, key, value): - """Get contribution.""" - if key[4] == "2" or key[:3] not in ["100", "700", "710", "711"]: - return None - agent_data = {"type": "bf:Person"} - if value.get("a"): - name = utils.force_list(value.get("a"))[0] - agent_data["preferred_name"] = remove_trailing_punctuation(name) - - # 100|700 Person - if key[:3] in ["100", "700"]: - if value.get("b"): - numeration = utils.force_list(value.get("b"))[0] - agent_data["numeration"] = remove_trailing_punctuation(numeration) - if value.get("c"): - qualifier = utils.force_list(value.get("c"))[0] - agent_data["qualifier"] = remove_trailing_punctuation(qualifier) - if value.get("d"): - date = utils.force_list(value.get("d"))[0] - date = date.rstrip(",") - dates = remove_trailing_punctuation(date).split("-") - with contextlib.suppress(Exception): - if date_of_birth := dates[0].strip(): - agent_data["date_of_birth"] = date_of_birth - with contextlib.suppress(Exception): - if date_of_death := dates[1].strip(): - agent_data["date_of_death"] = date_of_death - if value.get("q"): - 
fuller_form_of_name = utils.force_list(value.get("q"))[0] - agent_data["fuller_form_of_name"] = ( - remove_trailing_punctuation(fuller_form_of_name).lstrip("(").rstrip(")") - ) - - elif key[:3] in ["710", "711"]: - agent_data["type"] = "bf:Organisation" - agent_data["conference"] = key[:3] == "711" - if value.get("e"): - subordinate_units = [ - subordinate_unit.rstrip(".") - for subordinate_unit in utils.force_list(value.get("e")) - ] - - agent_data["subordinate_unit"] = subordinate_units - if value.get("n"): - numbering = utils.force_list(value.get("n"))[0] - agent_data["numbering"] = ( - remove_trailing_punctuation(numbering).lstrip("(").rstrip(")") - ) - if value.get("d"): - conference_date = utils.force_list(value.get("d"))[0] - if ( - conference_date := remove_trailing_punctuation(conference_date) - .lstrip("(") - .rstrip(")") - ): - agent_data["conference_date"] = conference_date - if value.get("c"): - place = utils.force_list(value.get("c"))[0] - if place := remove_trailing_punctuation(place).lstrip("(").rstrip(")"): - agent_data["place"] = place - agent = { - "type": agent_data["type"], - "authorized_access_point": create_authorized_access_point(agent_data), - } - if agent_data.get("identifiedBy"): - agent["identifiedBy"] = agent_data["identifiedBy"] - roles = ["aut"] - if value.get("4"): - roles = list(utils.force_list(value.get("4"))) - elif key[:3] == "100": - roles = ["cre"] - elif key[:3] == "711": - roles = ["aut"] - else: - roles = ["ctb"] - return {"entity": agent, "role": roles} - - -@marc21.over("title", "^245..") -@utils.ignore_value -def marc21_to_title(self, key, value): - """Get title data. 
- - field 245: - $a : non repetitive - $b : non repetitive - $c : non repetitive - $n : repetitive - $p : repetitive - $6 : non repetitive - field 246: - $a : non repetitive - $n : repetitive - $p : repetitive - $6 : non repetitive - """ - subfield_245_a = "" - subfield_245_b = "" - if fields_245 := marc21.get_fields("245"): - subfields_245_a = marc21.get_subfields(fields_245[0], "a") - subfields_245_b = marc21.get_subfields(fields_245[0], "b") - if subfields_245_a: - subfield_245_a = subfields_245_a[0] - if subfields_245_b: - subfield_245_b = subfields_245_b[0] - field_245_a_end_with_equal = re.search(r"\s*=\s*$", subfield_245_a) - field_245_a_end_with_colon = re.search(r"\s*:\s*$", subfield_245_a) - field_245_a_end_with_semicolon = re.search(r"\s*;\s*$", subfield_245_a) - field_245_b_contains_equal = re.search(r"=", subfield_245_b) - - fields_246 = marc21.get_fields("246") - subfield_246_a = "" - if fields_246: - if subfields_246_a := marc21.get_subfields(fields_246[0], "a"): - subfield_246_a = subfields_246_a[0] - - tag_link, link = get_field_link_data(value) - items = get_field_items(value) - index = 1 - title_list = [] - title_data = {} - part_list = TitlePartList(part_number_code="n", part_name_code="p") - parallel_titles = [] - pararalel_title_data_list = [] - pararalel_title_string_set = set() - responsibility = {} - - subfield_selection = {"a", "b", "c", "n", "p"} - for blob_key, blob_value in items: - if blob_key in subfield_selection: - value_data = marc21.build_value_with_alternate_graphic( - "245", blob_key, blob_value, index, link, ",.", ":;/-=" - ) - if blob_key in {"a", "b", "c"}: - subfield_selection.remove(blob_key) - if blob_key == "a": - title_data["mainTitle"] = value_data - elif blob_key == "b": - if subfield_246_a: - subtitle, parallel_titles, pararalel_title_string_set = ( - extract_subtitle_and_parallel_titles_from_field_245_b( - value_data, field_245_a_end_with_equal - ) - ) - if subtitle: - title_data["subtitle"] = subtitle - elif 
value_data: - title_data["subtitle"] = value_data - elif blob_key == "c": - responsibility = marc21.build_responsibility_data(value_data) - elif blob_key in ["n", "p"]: - part_list.update_part(value_data, blob_key, blob_value) - if blob_key != "__order__": - index += 1 - title_data["type"] = "bf:Title" - if the_part_list := part_list.get_part_list(): - title_data["part"] = the_part_list - if title_data: - title_list.append(title_data) - variant_title_list = marc21.build_variant_title_data(pararalel_title_string_set) - - title_list.extend(iter(parallel_titles)) - title_list.extend(iter(variant_title_list)) - if responsibility: - self["responsibilityStatement"] = responsibility - return title_list or None - - -@marc21.over("editionStatement", "^250..") -@utils.for_each_value -@utils.ignore_value -def marc21_to_edition_statement(self, key, value): - """Get edition statement data. - - editionDesignation: 250 [$a non repetitive] (without trailing ponctuation) - responsibility: 250 [$b non repetitive] - """ - edition_data = {} - if subfields_a := utils.force_list(value.get("a")): - subfield_a = remove_trailing_punctuation(subfields_a[0]) - edition_data["editionDesignation"] = [{"value": subfield_a}] - if subfields_b := utils.force_list(value.get("b")): - subfields_b = subfields_b[0] - edition_data["responsibility"] = [{"value": subfields_b}] - return edition_data or None - - -@marc21.over("copyrightDate", "^264.4") -@utils.ignore_value -def marc21_to_copyright_date(self, key, value): - """Get Copyright Date.""" - copyright_dates = self.get("copyrightDate", []) - copyright_date = value.get("c") - if copyright_date: - if match := re.search(r"^([©℗])+\s*(\d{4}.*)", copyright_date): - copyright_date = " ".join((match.group(1), match.group(2))) - else: - raise ValueError("Bad format of copyright date") - copyright_dates.append(copyright_date) - return copyright_dates or None - - -@marc21.over("provisionActivity", "^(260..|264.[_0-3])") -@utils.for_each_value 
-@utils.ignore_value -def marc21_to_provision_activity(self, key, value): - """Get publisher data. - - publisher.name: 264 [$b repetitive] - publisher.place: 264 [$a repetitive] - publicationDate: 264 [$c repetitive] (but take only the first one) - """ - - def build_statement(field_value, ind2): - - def build_place_or_agent_data(code, label): - type_per_code = {"a": EntityType.PLACE, "b": EntityType.AGENT} - return ( - {"type": type_per_code[code], "label": [{"value": value}]} - if (value := remove_trailing_punctuation(label)) - else None - ) - - # function build_statement start here - statement = [] - items = get_field_items(field_value) - for blob_key, blob_value in items: - if blob_key in ("a", "b"): - place_or_agent_data = build_place_or_agent_data(blob_key, blob_value) - if place_or_agent_data: - statement.append(place_or_agent_data) - return statement or None - - def build_place(marc21): - place = {} - if marc21.country: - place["country"] = marc21.country - if place: - place["type"] = EntityType.PLACE - return place - - # the function marc21_to_provision_activity start here - ind2 = key[4] - type_per_ind2 = { - " ": "bf:Publication", - "_": "bf:Publication", - "0": "bf:Production", - "1": "bf:Publication", - "2": "bf:Distribution", - "3": "bf:Manufacture", - } - if key[:3] == "260": - ind2 = "1" # to force type to bf:Publication for field 260 - publication = { - "type": type_per_ind2[ind2], - "statement": [], - } - - publication["statement"] = build_statement(value, ind2) - - subfields_c = utils.force_list(value.get("c")) - if subfields_c: - subfield_c = subfields_c[0] - publication["statement"].append( - {"label": [{"value": subfield_c}], "type": "Date"} - ) - if ind2 in (" ", "1"): - dates = subfield_c.replace("[", "").replace("]", "").split("-") - try: - start_date = make_year(dates[0]) - if start_date: - publication["startDate"] = start_date - except Exception: - pass - try: - end_date = make_year(dates[1]) - if end_date: - publication["endDate"] = 
end_date - except Exception: - pass - place = build_place(marc21) - if place and place.get("country") != "xx": - publication["place"] = [place] - - return publication or None - - -@marc21.over("extent", "^300..") -@utils.ignore_value -def marc21_to_description(self, key, value): - """Get extent. - - extent: 300$a (the first one if many) - """ - if value.get("a") and not self.get("extent", None): - self["extent"] = remove_trailing_punctuation( - utils.force_list(value.get("a"))[0] - ) - return None - - -@marc21.over("note", "^500..") -@utils.for_each_value -@utils.ignore_value -def marc21_to_notes(self, key, value): - """Get notes. - - note: [500$a repetitive] - """ - add_note(dict(noteType="general", label=value.get("a", "")), self) - - return None - - -@marc21.over("summary", "^520..") -@utils.for_each_value -@utils.ignore_value -def marc21_to_summary(self, key, value): - """Get summary from repetitive field 520.""" - key_per_code = {"a": "label", "c": "source"} - # parse field 520 subfields for extracting: - # summary and source parts - tag_link, link = get_field_link_data(value) - items = get_field_items(value) - index = 1 - summary = {} - subfield_selection = {"a", "c"} - for blob_key, blob_value in items: - if blob_key in subfield_selection: - subfield_selection.remove(blob_key) - if blob_key == "a": - summary_data = marc21.build_value_with_alternate_graphic( - "520", blob_key, blob_value, index, link, ",.", ":;/-=" - ) - else: - summary_data = blob_value - if summary_data: - summary[key_per_code[blob_key]] = summary_data - if blob_key != "__order__": - index += 1 - return summary or None - - -@marc21.over("subjects", "^6....") -@utils.for_each_value -@utils.ignore_value -@utils.ignore_value -def marc21_to_subjects(self, key, value): - """Get subjects. 
- - subjects: 6xx [duplicates could exist between several vocabularies, - if possible deduplicate] - """ - seen = {} - for subject in utils.force_list(value.get("a")): - subject = {"type": EntityType.TOPIC, "authorized_access_point": subject} - str_subject = str(subject) - if str_subject not in seen: - seen[str_subject] = 1 - self.setdefault("subjects", []).append(dict(entity=subject)) - return None - - -@marc21.over("electronicLocator", "^8564.") -@utils.for_each_value -@utils.ignore_value -def marc21_electronicLocator(self, key, value): - """Get electronic locator.""" - indicator2 = key[4] - electronic_locator = {} - url = utils.force_list(value.get("u"))[0].strip() - subfield_3 = value.get("3") # materials_specified - if subfield_3: - subfield_3 = utils.force_list(subfield_3)[0] - if indicator2 == "2": - if subfield_3 and subfield_3 == "Image de couverture": - electronic_locator = { - "url": url, - "type": "relatedResource", - "content": "coverImage", - } - elif indicator2 == "0": - if subfield_x := value.get("x"): # nonpublic_note - electronic_locator = { - "url": url, - "type": "resource", - "source": utils.force_list(subfield_x)[0], - } - if subfield_q := value.get("q"): # electronic_format_type - if subfield_q == "audio": - self["type"] = [ - { - "main_type": "docmaintype_audio", - "subtype": "docsubtype_audio_book", - } - ] - return electronic_locator or None diff --git a/rero_ils/modules/ebooks/receivers.py b/rero_ils/modules/ebooks/receivers.py deleted file mode 100644 index b8fc9442ba..0000000000 --- a/rero_ils/modules/ebooks/receivers.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019-2022 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -"""Signals connections for ebooks document.""" - -from dojson.contrib.marc21.utils import create_record -from flask import current_app - -from ..utils import set_timestamp -from .dojson.contrib.marc21 import marc21 -from .tasks import create_records, delete_records - - -def publish_harvested_records( - sender=None, records=None, max_results=None, *args, **kwargs -): - """Create, index the harvested records.""" - # name = kwargs['name'] - records = records or [] - if max_results: - records = list(records)[: int(max_results)] - converted_records = [] - deleted_records = [] - for record in records: - rec = create_record(record.xml) - rec = marc21.do(rec) - rec.setdefault("harvested", True) - - identifiers = rec.get("identifiedBy", []) - identifiers.append( - {"type": "bf:Local", "source": "cantook", "value": record.header.identifier} - ) - rec["identifiedBy"] = identifiers - if record.deleted: - deleted_records.append(rec) - else: - converted_records.append(rec) - if converted_records: - current_app.logger.info( - f"publish_harvester: received {len(converted_records)} " "records to create" - ) - create_records(converted_records) - if deleted_records: - current_app.logger.info( - f"publish_harvester: received {len(deleted_records)} " "records to delete" - ) - delete_records(deleted_records) - msg = f"deleted: {len(deleted_records)}, created: {len(converted_records)}" - set_timestamp("ebooks-harvester", msg=msg) diff --git a/rero_ils/modules/ebooks/tasks.py b/rero_ils/modules/ebooks/tasks.py deleted file mode 100644 index ed62abe807..0000000000 --- a/rero_ils/modules/ebooks/tasks.py +++ /dev/null 
@@ -1,107 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019-2022 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -"""Celery tasks to create records.""" - -from __future__ import absolute_import, print_function - -from celery import shared_task -from flask import current_app - -from ..documents.api import Document, DocumentsSearch -from ..utils import do_bulk_index, get_schema_for_resource, set_timestamp -from .utils import create_document_holding, update_document_holding - - -@shared_task(ignore_result=True) -def create_records(records): - """Records creation and indexing.""" - n_updated = 0 - n_created = 0 - uuids = [] - for record in records: - # add document type - if "type" not in record: - record["type"] = [ - {"main_type": "docmaintype_book", "subtype": "docsubtype_e-book"} - ] - # check if already harvested - pid = None - for identifier in record.get("identifiedBy"): - if identifier.get("source") == "cantook": - harvested_id = identifier.get("value") - query = ( - DocumentsSearch() - .filter("term", identifiedBy__value__raw=harvested_id) - .source(includes=["pid"]) - ) - try: - pid = next(query.scan()).pid - except StopIteration: - pid = None - try: - # add documents schema - pid_type = Document.provider.pid_type - record["$schema"] = get_schema_for_resource(pid_type) - if pid: - # update the record - record["pid"] = pid - existing_record = update_document_holding(record, pid) - n_updated += 1 - 
uuids.append(existing_record.id) - elif new_record := create_document_holding(record): - n_created += 1 - uuids.append(new_record.id) - except Exception as err: - current_app.logger.error(f"EBOOKS CREATE RECORDS: {err} {record}") - do_bulk_index(uuids, doc_type="doc", process=True) - - current_app.logger.info(f"create_records: {n_updated} updated, {n_created} new") - set_timestamp("ebooks_create_records", created=n_created, updated=n_updated) - return n_created, n_updated - - -@shared_task(ignore_result=True) -def delete_records(records): - """Records deleting.""" - count = 0 - for record in records: - # check if exist - pid = None - for identifier in record.get("identifiedBy"): - if identifier.get("source") == "cantook": - harvested_id = identifier.get("value") - query = ( - DocumentsSearch() - .filter("term", identifiedBy__value__raw=harvested_id) - .source(includes=["pid"]) - ) - try: - pid = [r.pid for r in query.scan()].pop() - except IndexError: - pid = None - try: - if pid: - # update the record - existing_record = Document.get_record_by_pid(pid) - # TODO: delete record and linked references - count += 1 - except Exception as err: - current_app.logger.error(f"EBOOKS DELETE RECORDS: {err} {record}") - current_app.logger.info(f"delete_records: {count}") - set_timestamp("ebooks_delete_records", deleted=count) - return count diff --git a/rero_ils/modules/ebooks/utils.py b/rero_ils/modules/ebooks/utils.py deleted file mode 100644 index e64a7c13aa..0000000000 --- a/rero_ils/modules/ebooks/utils.py +++ /dev/null @@ -1,192 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019-2022 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -"""Utilities.""" - -from flask import current_app -from invenio_db import db -from invenio_oaiharvester.models import OAIHarvestConfig - -from rero_ils.modules.locations.api import Location - -from ..documents.api import Document -from ..holdings.api import ( - Holding, - HoldingsSearch, - create_holding, - get_holding_pid_by_doc_location_item_type, -) -from ..organisations.api import Organisation - - -def add_oai_source( - name, baseurl, metadataprefix="marc21", setspecs="", comment="", update=False -): - """Add OAIHarvestConfig.""" - with current_app.app_context(): - source = OAIHarvestConfig.query.filter_by(name=name).first() - if not source: - source = OAIHarvestConfig( - name=name, - baseurl=baseurl, - metadataprefix=metadataprefix, - setspecs=setspecs, - comment=comment, - ) - source.save() - db.session.commit() - return "Added" - elif update: - source.name = name - source.baseurl = baseurl - source.metadataprefix = metadataprefix - if setspecs != "": - source.setspecs = setspecs - if comment != "": - source.comment = comment - db.session.commit() - return "Updated" - return "Not Updated" - - -def get_harvested_sources(record): - """Get the harvested sources from electronicLocator.""" - harvested_sources = [] - new_electronic_locators = [] - electronic_locators = record.get("electronicLocator", []) - for electronic_locator in electronic_locators: - if source := electronic_locator.get("source"): - harvested_sources.append( - {"source": source, "uri": electronic_locator.get("url")} - ) - else: - new_electronic_locators.append(electronic_locator) - if new_electronic_locators: - 
record["electronicLocator"] = new_electronic_locators - return harvested_sources - - -def create_document_holding(record): - """Create a document and a holding for a harvested ebook.""" - harvested_sources = get_harvested_sources(record) - new_record = None - holdings = [] - for harvested_source in harvested_sources: - if org := Organisation.get_record_by_online_harvested_source( - source=harvested_source["source"] - ): - if not new_record: - new_record = Document.create(data=record, dbcommit=False, reindex=False) - if new_record: - item_type_pid = org.online_circulation_category() - location_pids = org.get_online_locations() - for location_pid in location_pids: - location = Location.get_record_by_pid(location_pid) - library = location.get_library() - if url := library.get_online_harvested_source_url( - source=harvested_source["source"] - ): - uri_split = harvested_source["uri"].split("/")[3:] - uri_split.insert(0, url.rstrip("/")) - harvested_source["uri"] = "/".join(uri_split) - hold = create_holding( - document_pid=new_record.pid, - location_pid=location_pid, - item_type_pid=item_type_pid, - electronic_location=harvested_source, - holdings_type="electronic", - ) - holdings.append(hold) - else: - current_app.logger.warning( - f"create document holding no org: {harvested_source['source']}" - ) - db.session.commit() - for hold in holdings: - hold.reindex() - # the document has been reindexed by the holdings - if not holdings and new_record: - new_record.reindex() - return new_record - - -def update_document_holding(record, pid): - """Update a document and a holding for a harvested ebook.""" - harvested_sources = get_harvested_sources(record) - new_record = None - existing_record = Document.get_record_by_pid(pid) - new_record = existing_record.replace(data=record, dbcommit=False, reindex=False) - # Save all source uris to find holdings we can delete later - source_uris = [] - holdings = [] - for harvested_source in harvested_sources: - if org := 
Organisation.get_record_by_online_harvested_source( - source=harvested_source["source"] - ): - # add the organisation source uri - source_uris.append(harvested_source["uri"]) - item_type_pid = org.online_circulation_category() - for location_pid in org.get_online_locations(): - location = Location.get_record_by_pid(location_pid) - library = location.get_library() - # replace "https://some.uri" from ebooks with library uri - if url := library.get_online_harvested_source_url( - source=harvested_source["source"] - ): - uri_split = harvested_source["uri"].split("/")[3:] - uri_split.insert(0, url.rstrip("/")) - new_uri = "/".join(uri_split) - harvested_source["uri"] = new_uri - # add the library source uri - source_uris.append(new_uri) - if not get_holding_pid_by_doc_location_item_type( - new_record.pid, location_pid, item_type_pid, "electronic" - ): - hold = create_holding( - document_pid=new_record.pid, - location_pid=location_pid, - item_type_pid=item_type_pid, - electronic_location=harvested_source, - holdings_type="electronic", - ) - holdings.append(hold) - db.session.commit() - for hold in holdings: - hold.reindex() - # the document has been reindexed by the holdings - if not holdings and new_record: - new_record.reindex() - HoldingsSearch.flush_and_refresh() - # delete all double holdings and holdings without valid source uri - seen_uris = [] - for holding_pid in Holding.get_holdings_pid_by_document_pid(pid): - holding = Holding.get_record_by_pid(holding_pid) - to_delete = True - for electronic_location in holding.get("electronic_location", []): - uri = electronic_location.get("uri") - if electronic_location.get("source") and uri not in seen_uris: - seen_uris.append(uri) - if uri in source_uris: - to_delete = False - if to_delete: - current_app.logger.info( - "Delete harvested holding | " - f"document: {pid} " - f'holding: {holding.pid} {holding.get("electronic_location")}' - ) - holding.delete(force=False, dbcommit=True, delindex=True) - return new_record diff 
--git a/rero_ils/modules/entities/api.py b/rero_ils/modules/entities/api.py index 74d0a60ce6..6c29ceab66 100644 --- a/rero_ils/modules/entities/api.py +++ b/rero_ils/modules/entities/api.py @@ -63,7 +63,7 @@ def get_authorized_access_point(self, language): :param language: language for authorized access point. :returns: authorized access point in given language. """ - raise NotImplementedError + raise NotImplementedError() @abstractmethod def get_links_to_me(self, get_pids=False): @@ -90,7 +90,7 @@ def reasons_not_to_delete(self): @abstractmethod def resource_type(self): """Get the entity type.""" - raise NotImplementedError + raise NotImplementedError() @property def organisation_pids(self): diff --git a/rero_ils/modules/ext.py b/rero_ils/modules/ext.py index 92dccb3e45..595e670bc9 100644 --- a/rero_ils/modules/ext.py +++ b/rero_ils/modules/ext.py @@ -31,7 +31,6 @@ from invenio_base.utils import obj_or_import_string from invenio_circulation.signals import loan_state_changed from invenio_indexer.signals import before_record_index -from invenio_oaiharvester.signals import oaiharvest_finished from invenio_records.signals import ( after_record_insert, after_record_update, @@ -64,7 +63,6 @@ from rero_ils.modules.acquisition.acq_receipts.listener import enrich_acq_receipt_data from rero_ils.modules.acquisition.budgets.listener import budget_is_active_changed from rero_ils.modules.collections.listener import enrich_collection_data -from rero_ils.modules.ebooks.receivers import publish_harvested_records from rero_ils.modules.holdings.listener import ( enrich_holding_data, update_items_locations_and_types, @@ -346,8 +344,6 @@ def register_signals(self, app): loan_state_changed.connect(listener_loan_state_changed, weak=False) - oaiharvest_finished.connect(publish_harvested_records, weak=False) - # store the username in the session user_logged_in.connect(set_user_name) user_logged_out.connect(remove_user_name) diff --git a/rero_ils/modules/files/operations.py 
b/rero_ils/modules/files/operations.py index 52f538a7a6..bfdcc33f43 100644 --- a/rero_ils/modules/files/operations.py +++ b/rero_ils/modules/files/operations.py @@ -45,7 +45,7 @@ def on_post_commit(self, uow): :param uow: obj - UnitOfWork instance. """ - raise NotImplementedError + raise NotImplementedError() class ReindexDoc(ReindexOperationBase): diff --git a/rero_ils/modules/organisations/api.py b/rero_ils/modules/organisations/api.py index ff3d433d56..f915b85ae3 100644 --- a/rero_ils/modules/organisations/api.py +++ b/rero_ils/modules/organisations/api.py @@ -106,13 +106,20 @@ def get_record_by_online_harvested_source(cls, source): :param source: the record source :return: Organisation record or None. """ - results = ( - OrganisationsSearch().filter("term", online_harvested_source=source).scan() - ) - try: - return Organisation.get_record_by_pid(next(results).pid) - except StopIteration: - return None + for org in cls.get_records_by_online_harvested_source(source): + return org + + @classmethod + def get_records_by_online_harvested_source(cls, source): + """Get record by online harvested source. + + :param source: the record source + :return: Organisation record or None. 
+ """ + query = OrganisationsSearch().filter("term", online_harvested_source=source) + org_pids = [hit.pid for hit in query.source("pid").scan()] + for org_pid in org_pids: + yield Organisation.get_record_by_pid(org_pid) @property def organisation_pid(self): diff --git a/rero_ils/modules/stats/api/indicators/base.py b/rero_ils/modules/stats/api/indicators/base.py index 9f5669b9d2..d230b4c9a1 100644 --- a/rero_ils/modules/stats/api/indicators/base.py +++ b/rero_ils/modules/stats/api/indicators/base.py @@ -39,7 +39,7 @@ def query(self): :returns: an elasticsearch query object """ - raise NotImplementedError + raise NotImplementedError() @property @abstractmethod @@ -49,7 +49,7 @@ def aggregation(self, distribution): :param distrubtion: str - report distrubtion name :returns: an elasticsearch aggregation object """ - raise NotImplementedError + raise NotImplementedError() @abstractmethod def label(self, distribution, bucket): @@ -60,4 +60,4 @@ def label(self, distribution, bucket): :returns: the label :rtype: str """ - raise NotImplementedError + raise NotImplementedError() diff --git a/scripts/setup b/scripts/setup index 6ae0474808..9b51b6ed2a 100755 --- a/scripts/setup +++ b/scripts/setup @@ -599,21 +599,25 @@ create_token organisation_scotland_token reroilstest+irma@gmail.com ${INVENIO_R create_token organisation_fictive_token reroilstest+imagination@gmail.com ${INVENIO_RERO_ACCESS_TOKEN_FICTIVE_LIBRARIAN} # # OAI configuration -info_msg "OAI configuration: ${DATA_PATH}/oaisources.yml" -eval ${PREFIX} invenio reroils oaiharvester initconfig ${DATA_PATH}/oaisources.yml +info_msg "API configuration: ${DATA_PATH}/apisources.yml" +eval ${PREFIX} invenio reroils apiharvester init-config ${DATA_PATH}/apisources.yml eval ${PREFIX} invenio reroils scheduler enable_tasks -a -v -# disable ebook harvesting -eval ${PREFIX} invenio reroils scheduler enable_tasks -n ebooks-harvester -d +info_msg disable VS/NJ CANTOOK harvesting +eval ${PREFIX} invenio reroils scheduler 
enable_tasks -n harvest-vs-cantook -d +eval ${PREFIX} invenio reroils scheduler enable_tasks -n harvest-nj-cantook -d if ${DEPLOYMENT} then # start oai harvesting asynchrone: beats must be running - info_msg "Start OAI harvesting asynchrone" - eval ${PREFIX} invenio reroils oaiharvester harvest -n ebooks -a max_results=150 -q -k + info_msg "Start VS-CANTOOK harvesting asynchrone" + eval ${PREFIX} invenio reroils apiharvester harvest -n VS-CANTOOK -m 150 -k + info_msg "Start NJ-CANTOOK harvesting asynchrone" + eval ${PREFIX} invenio reroils apiharvester harvest -n NJ-CANTOOK -m 150 -k else - info_msg "For ebooks harvesting run:" - msg "\tinvenio reroils oaiharvester harvest -n ebooks -a max_results=100 -q" + info_msg "For VS/NJ CANTOOK harvesting run:" + msg "\tinvenio reroils apiharvester harvest -n VS-CANTOOK -m 100 -v" + msg "\tinvenio reroils apiharvester harvest -n NJ-CANTOOK -m 100 -v" fi if ${ES_MAPPING} diff --git a/setup.py b/setup.py index 6f230d1983..5dec3eda2a 100644 --- a/setup.py +++ b/setup.py @@ -147,7 +147,6 @@ def run(self): 'marc21tojson_loc = rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_loc', 'marc21tojson_slsp = rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_slsp', 'marc21tojson_ugent = rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_ugent', - 'marc21toebooks = rero_ils.modules.ebooks.dojson.contrib.marc21:marc21', 'unimarctojson = rero_ils.modules.documents.dojson.contrib.unimarctojson:unimarc', ], 'flask.commands': [ @@ -156,7 +155,6 @@ def run(self): 'migration = rero_ils.modules.migration.cli:migration', 'monitoring = rero_ils.modules.monitoring.cli:monitoring', 'notifications = rero_ils.modules.notifications.cli:notifications', - 'oaiharvester = rero_ils.modules.ebooks.cli:oaiharvester', 'reroils = rero_ils.modules.cli.reroils:reroils', 'scheduler = rero_ils.schedulers:scheduler', 'stats = rero_ils.modules.stats.cli:stats', @@ -330,7 +328,6 @@ def run(self): 'apiharvester = 
rero_ils.modules.apiharvester.tasks', 'collections = rero_ils.modules.collections.tasks', 'documents = rero_ils.modules.documents.tasks', - 'ebooks = rero_ils.modules.ebooks.tasks', 'holdings = rero_ils.modules.holdings.tasks', 'items = rero_ils.modules.items.tasks', 'loans = rero_ils.modules.loans.tasks', diff --git a/tests/api_harvester/cantook b/tests/api_harvester/cantook new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/api_harvester/test_cli_api_harcester.py b/tests/api_harvester/test_cli_api_harcester.py new file mode 100644 index 0000000000..5af57b1591 --- /dev/null +++ b/tests/api_harvester/test_cli_api_harcester.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- +# +# RERO ILS +# Copyright (C) 2019 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +"""Test api harvester cli.""" + +from os.path import dirname, join + +from click.testing import CliRunner + +from rero_ils.modules.apiharvester.cli import ( + add_api_source_config, + harvest, + info, + init_api_harvest_config, + set_last_run, +) + + +def test_cli(app): + """Test count cli.""" + config_file = join(dirname(__file__), "../data/apisources.yml") + runner = CliRunner() + result = runner.invoke(init_api_harvest_config, [config_file]) + assert result.exit_code == 0 + output = result.output.strip().split("\n") + assert output[0] == "ApiHarvestConfig NJ-CANTOOK: Add" + assert output[1] == "ApiHarvestConfig VS-CANTOOK: Add" + + runner = CliRunner() + result = runner.invoke(info) + assert result.exit_code == 0 + output = result.output.strip().split("\n") + assert output == [ + "NJ-CANTOOK", + "\tlastrun : 1900-01-01 00:00:00", + "\turl : https://bm.ebibliomedia.ch", + "\tclassname : rero_ils.modules.apiharvester.cantook.api.ApiCantook", + "\tcode : ebibliomedia", + "VS-CANTOOK", + "\tlastrun : 1900-01-01 00:00:00", + "\turl : https://mediatheque-valais.cantookstation.eu", + "\tclassname : rero_ils.modules.apiharvester.cantook.api.ApiCantook", + "\tcode : mv-cantook", + ] + + runner = CliRunner() + result = runner.invoke(set_last_run, ["NJ-CANTOOK", "-d", "2002-02-02"]) + assert result.exit_code == 0 + output = result.output.strip().split("\n") + assert output == ["Set last run NJ-CANTOOK: 2002-02-02"] + + runner = CliRunner() + result = runner.invoke(info) + assert result.exit_code == 0 + output = result.output.strip().split("\n") + assert output == [ + "NJ-CANTOOK", + "\tlastrun : 2002-02-02 00:00:00", + "\turl : https://bm.ebibliomedia.ch", + "\tclassname : rero_ils.modules.apiharvester.cantook.api.ApiCantook", + "\tcode : ebibliomedia", + "VS-CANTOOK", + "\tlastrun : 1900-01-01 00:00:00", + "\turl : https://mediatheque-valais.cantookstation.eu", + "\tclassname : rero_ils.modules.apiharvester.cantook.api.ApiCantook", + "\tcode : mv-cantook", + ] + + 
runner = CliRunner() + result = runner.invoke( + add_api_source_config, ["NJ-CANTOOK", "-c", "ebibliomedia-test", "-u"] + ) + assert result.exit_code == 0 + output = result.output.strip().split("\n") + assert output == ["ApiHarvestConfig NJ-CANTOOK: Update code:ebibliomedia-test"] + + runner = CliRunner() + result = runner.invoke(info) + assert result.exit_code == 0 + output = result.output.strip().split("\n") + assert output == [ + "NJ-CANTOOK", + "\tlastrun : 2002-02-02 00:00:00", + "\turl : https://bm.ebibliomedia.ch", + "\tclassname : rero_ils.modules.apiharvester.cantook.api.ApiCantook", + "\tcode : ebibliomedia-test", + "VS-CANTOOK", + "\tlastrun : 1900-01-01 00:00:00", + "\turl : https://mediatheque-valais.cantookstation.eu", + "\tclassname : rero_ils.modules.apiharvester.cantook.api.ApiCantook", + "\tcode : mv-cantook", + ] + + runner = CliRunner() + result = runner.invoke(harvest, ["-n", "VS-CANTOOK", "-v", "-m", "0"]) + assert result.exit_code == 0 + output = result.output.strip().split("\n") + print(output) + # assert output == ["ApiHarvestConfig NJ-CANTOOK: Update code:ebibliomedia-test"] diff --git a/tests/data/xml/ebook1.xml b/tests/data/xml/ebook1.xml deleted file mode 100644 index ed008271b5..0000000000 --- a/tests/data/xml/ebook1.xml +++ /dev/null @@ -1,89 +0,0 @@ - - 00000cam a2200000zu 4500 - - 9782075118842 - - - cantook/EDEN502344 - - - cantook-EDEN502344 - - - fre - - - eng - - - masked - - - Killer Game - - - 2019 - Gallimard Jeunesse - - - 400 pages - - - - Osborne est une petite ville du Nebraska où tout le monde se - connaît, pas vraiment le cadre rêvé pour une adolescente! - Mais avec ses amis, Alex la cynique et le très protecteur Darby, - Makani s'y plaît. Sans parler d'Ollie, le garçon solitaire dont - elle aimerait beaucoup se rapprocher... Tout bascule lorsque les - élèves de son lycée se font assassiner les uns après les autres. - Pour éviter de devenir une proie, Makani va devoir afronter un - terrible secret. 
Qui a dit qu'il ne se passait jamais rien à - Osborne? - - - - Jeunesse - albums et romans - Juvenile Fiction - - - Jeunesse - Youth - - - Policiers & Thrillers - Detective & thrillers - - - Fiction - Fiction - - - Fiction jeunesse: généralités - - Children's / Teenage fiction: General fiction - - - - Perkins, Stephanie - aut - - - Polanco, Emmanuel - - - Troin, Isabelle - trl - - - https://www.edenlivres.fr/p/502344 - Extrait - - - epub - - https://test1/resources/5ccd26d523579476a9ac9f3 - - Texte intégral - mv-cantook - - diff --git a/tests/data/xml/ebook2.xml b/tests/data/xml/ebook2.xml deleted file mode 100644 index 79113cd6a4..0000000000 --- a/tests/data/xml/ebook2.xml +++ /dev/null @@ -1,73 +0,0 @@ - - 00000cam a2200000zu 4500 - - 9782811234157 - - - cantook/immateriel.frO1006810 - - - cantook-immateriel.frO1006810 - - - fre - - - masked - - - La Vie à portée de main - - - 2019 - Milady - - - 384 pages - - - - « Chère Libby, je me rends compte que ça fait deux longues années – - bon sang ! – que tes enfants et toi vivez chez ta mère. Je t'écris - pour savoir si tu veux que je vienne à ton secours. - Depuis la mort de son mari, Libby vit chez sa mère, une femme - autoritaire qui passe son temps à critiquer tout ce qui l'entoure. 
- - - - Romans sentimentaux - Romance - - - Contemporain - Contemporary - - - - Roman sentimental pour adulte et roman sentimental contemporain - - Adult & contemporary romance - - - Guillaume, Nathalie - trl - - - Center, Katherine - aut - - - - http://images.immateriel.fr/covers/7GCMWJ4.png - - Image de couverture - - - audio - - https://test2/resources/5d7c7e462357947ad94991f6 - - Texte intégral - ebibliomedia - - diff --git a/tests/fixtures/metadata.py b/tests/fixtures/metadata.py index 10b022bc5e..241cfe0ae0 100644 --- a/tests/fixtures/metadata.py +++ b/tests/fixtures/metadata.py @@ -1098,22 +1098,6 @@ def pattern_bimonthly_every_two_months_two_levels_data(holdings): return deepcopy(holdings.get("pattern10")) -@pytest.fixture(scope="module") -def ebooks_1_xml(): - """Load ebook1 xml file.""" - filepath = join(dirname(__file__), "..", "data", "xml", "ebook1.xml") - with open(filepath) as fh: - return fh.read() - - -@pytest.fixture(scope="module") -def ebooks_2_xml(): - """Load ebook2 xml file.""" - filepath = join(dirname(__file__), "..", "data", "xml", "ebook2.xml") - with open(filepath) as fh: - return fh.read() - - @pytest.fixture(scope="module") def babel_filehandle(): """Load ebook2 xml file.""" diff --git a/tests/ui/documents/test_documents_api.py b/tests/ui/documents/test_documents_api.py index 85608329bf..8e51ab67fd 100644 --- a/tests/ui/documents/test_documents_api.py +++ b/tests/ui/documents/test_documents_api.py @@ -35,7 +35,6 @@ ) from rero_ils.modules.documents.models import DocumentIdentifier from rero_ils.modules.documents.tasks import delete_drafts, delete_orphan_harvested -from rero_ils.modules.ebooks.tasks import create_records from rero_ils.modules.entities.models import EntityType from rero_ils.modules.entities.remote_entities.api import ( RemoteEntitiesSearch, @@ -229,69 +228,6 @@ def test_document_can_delete(app, document_data_tmp): assert reasons == {} -def test_document_create_records( - app, - org_martigny, - org_sion, - ebook_1_data, - 
ebook_2_data, - item_type_online_martigny, - loc_online_martigny, - item_type_online_sion, - loc_online_sion, -): - """Test can create harvested records.""" - ebook_1_data["electronicLocator"] = [ - { - "source": "ebibliomedia", - "url": "https://www.site1.org/ebook", - "type": "resource", - } - ] - ebook_2_data["electronicLocator"] = [ - { - "source": "ebibliomedia", - "url": "https://www.site2.org/ebook", - "type": "resource", - } - ] - n_created, n_updated = create_records([ebook_1_data]) - assert n_created == 1 - assert n_updated == 0 - - ebook_1_data["electronicLocator"] = [ - { - "source": "ebibliomedia", - "url": "https://www.site2.org/ebook", - "type": "resource", - }, - { - "source": "mv-cantook", - "url": "https://www.site3.org/ebook", - "type": "resource", - }, - ] - n_created, n_updated = create_records([ebook_1_data, ebook_2_data]) - assert n_created == 1 - assert n_updated == 1 - - ebook_1_data["electronicLocator"] = [ - { - "source": "mv-cantook", - "url": "https://www.site3.org/ebook", - "type": "resource", - } - ] - n_created, n_updated = create_records([ebook_1_data, ebook_2_data]) - assert n_created == 0 - assert n_updated == 2 - - # TODO: find a way to execute celery worker tasks in travis tests - # n_created, n_updated = create_records.delay([ebook_1_data]) - # assert n_created == 0 - # assert n_updated == 1 - - def test_document_can_delete_harvested(app, ebook_1_data): """Test can delete for harvested records.""" document = Document.create(ebook_1_data, delete_pid=True) diff --git a/tests/ui/ebooks/test_ebooks_receivers.py b/tests/ui/ebooks/test_ebooks_receivers.py deleted file mode 100644 index efd1f6ef01..0000000000 --- a/tests/ui/ebooks/test_ebooks_receivers.py +++ /dev/null @@ -1,116 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software 
Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -"""Test ebooks receivers.""" - -from collections import namedtuple - -from rero_ils.modules.documents.api import Document, DocumentsSearch -from rero_ils.modules.ebooks.receivers import publish_harvested_records -from rero_ils.modules.ebooks.tasks import create_records, delete_records -from rero_ils.modules.holdings.api import Holding, HoldingsSearch - - -def test_publish_harvested_records( - app, - ebooks_1_xml, - ebooks_2_xml, - org_martigny, - loc_online_martigny, - item_type_online_martigny, - org_sion, - loc_online_sion, - item_type_online_sion, -): - """Test publish harvested records.""" - Identifier = namedtuple("Identifier", "identifier") - Record = namedtuple("Record", "xml deleted header") - records = [ - Record(xml=ebooks_1_xml, deleted=False, header=Identifier(identifier="record1")) - ] - records.append( - Record(xml=ebooks_2_xml, deleted=False, header=Identifier(identifier="record2")) - ) - records.append( - Record(xml=ebooks_2_xml, deleted=True, header=Identifier(identifier="record3")) - ) - - kwargs = {"max": 100} - publish_harvested_records(sender=None, records=records, kwargs=kwargs) - DocumentsSearch.flush_and_refresh() - HoldingsSearch.flush_and_refresh() - - assert Document.count() == 2 - doc1 = Document.get_record_by_pid("1") - assert doc1.get("$schema") is not None - assert doc1.get("identifiedBy") == [ - {"type": "bf:Isbn", "value": "9782075118842"}, - {"type": "bf:Local", "value": "cantook-EDEN502344"}, - {"type": "bf:Local", "source": "cantook", "value": "record1"}, - ] - assert doc1.get("type") == [ - {"main_type": 
"docmaintype_book", "subtype": "docsubtype_e-book"} - ] - - assert len(list(Holding.get_holdings_pid_by_document_pid(doc1.pid))) == 1 - doc2 = Document.get_record_by_pid("2") - assert doc2.get("$schema") is not None - assert doc2.get("identifiedBy") == [ - {"type": "bf:Isbn", "value": "9782811234157"}, - {"type": "bf:Local", "value": "cantook-immateriel.frO1006810"}, - {"type": "bf:Local", "source": "cantook", "value": "record2"}, - ] - assert doc2.get("type") == [ - {"main_type": "docmaintype_audio", "subtype": "docsubtype_audio_book"} - ] - assert len(list(Holding.get_holdings_pid_by_document_pid(doc2.pid))) == 1 - - # test update - # cretae a double holding - hold_pid = next(Holding.get_holdings_pid_by_document_pid(doc1.pid)) - hold = Holding.get_record_by_pid(hold_pid) - Holding.create(data=hold, dbcommit=True, reindex=True, delete_pid=True) - # create a holding without valid source uri - hold["electronic_location"][0]["uri"] = "https://invalid.uri/XXXXXX" - Holding.create(data=hold, dbcommit=True, reindex=True, delete_pid=True) - HoldingsSearch.flush_and_refresh() - publish_harvested_records(sender=None, records=records) - DocumentsSearch.flush_and_refresh() - HoldingsSearch.flush_and_refresh() - assert len(list(Holding.get_holdings_pid_by_document_pid(doc1.pid))) == 1 - assert len(list(Holding.get_holdings_pid_by_document_pid(doc2.pid))) == 1 - - # test delete - records = [] - del doc1["electronicLocator"] - records.append(doc1) - doc2["electronicLocator"] = [ - { - "content": "coverImage", - "type": "relatedResource", - "url": "http://images.immateriel.fr/covers/DEQ2C5A.png", - } - ] - records.append(doc2) - - create_records(records=records) - DocumentsSearch.flush_and_refresh() - HoldingsSearch.flush_and_refresh() - assert not list(Holding.get_holdings_pid_by_document_pid(doc1.pid)) - assert not list(Holding.get_holdings_pid_by_document_pid(doc2.pid)) - - assert 2 == delete_records(records=records) diff --git a/tests/ui/ebooks/test_ebooks_utils.py 
b/tests/ui/ebooks/test_ebooks_utils.py deleted file mode 100644 index af34e865fa..0000000000 --- a/tests/ui/ebooks/test_ebooks_utils.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -"""Test ebook utils.""" - -from rero_ils.modules.ebooks.utils import add_oai_source - - -def test_add_oai_source(app): - """Test add oai source.""" - msg = add_oai_source(name="test", baseurl="http://test.com") - assert msg == "Added" - msg = add_oai_source(name="test", baseurl="http://test.com") - assert msg == "Not Updated" - msg = add_oai_source( - name="test", - baseurl="http://test.com", - setspecs="specs", - comment="comment", - update=True, - ) - assert msg == "Updated" diff --git a/tests/unit/documents/test_documents_dojson_ebooks.py b/tests/unit/documents/test_documents_dojson_ebooks.py deleted file mode 100644 index 6e9d6b79b0..0000000000 --- a/tests/unit/documents/test_documents_dojson_ebooks.py +++ /dev/null @@ -1,597 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RERO ILS -# Copyright (C) 2019 RERO -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, version 3 of the License. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -"""DOJSON transiformation for ebooks module tests.""" - -from __future__ import absolute_import, print_function - -from dojson.contrib.marc21.utils import create_record - -from rero_ils.modules.ebooks.dojson.contrib.marc21 import marc21 - - -def test_marc21_to_isbn_ebooks(): - """Test dojson isbn transformation.""" - marc21xml = """ - - - 9782812933868 - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("identifiedBy") == [{"type": "bf:Isbn", "value": "9782812933868"}] - - marc21xml = """ - - - 9782812 - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("identifiedBy") is None - - marc21xml = """ - - - feedhttps-www-feedbooks-com-book-414-epub - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert not data.get("identifiedBy") - - -def test_marc21_to_languages_ebooks_from_008(): - """Test languages from field 008.""" - marc21xml = """ - - 00501naa a2200133 a 4500 - 160315s2015 cc ||| | ||||00| |fre d - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("language") == [{"type": "bf:Language", "value": "fre"}] - - -def test_marc21_to_languages_ebooks(): - """Test languages transformation. - - Test languages in multiples fields 041. 
- """ - marc21xml = """ - - - fre - - - eng - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("language") == [{"type": "bf:Language", "value": "fre"}] - - -def test_marc21_to_type_ebooks(): - """Test Other Standard Identifier transformation.""" - marc21xml = """ - - - http://cantookstation.com/resources/1 - - - - cantook-EDEN496624 - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - identifiers = data.get("identifiedBy", []) - assert identifiers[0] == {"type": "bf:Local", "value": "cantook-EDEN496624"} - - -def test_marc21_to_title(): - """Test title transformation.""" - marc21xml = """ - - - Elena et les joueuses - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("title") == [ - {"mainTitle": [{"value": "Elena et les joueuses"}], "type": "bf:Title"} - ] - - -def test_marc21_to_extent(): - """Test extent transformation. - - Transformation of nb pages, volumes... field 300 $a. - """ - marc21xml = """ - - - 1234 - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("extent") == "1234" - - -def test_marc21_to_description(): - """Test description transformation. - - 300 [$a repetitive]: extent, duration: - 300 [$a non repetitive]: colorContent, productionMethod, - illustrativeContent, note of type otherPhysicalDetails - 300 [$c rep - """ - marc21xml = """ - - - 116 p. - ill. - 22 cm - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("extent") == "116 p." - - marc21xml = """ - - - 116 p. - ill. - 22 cm - 12 x 15 - - - 200 p. - ill. - 19 cm - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("extent") == "116 p." - - marc21xml = """ - - - 116 p. - ill. - 22 cm - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("extent") == "116 p." 
- - -def test_marc21_to_notes(): - """Test notes transformation. - - Transformation notes field 500 $a. - """ - - marc21xml = """ - - - note 1 - - - note 2 - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("note") == [ - {"noteType": "general", "label": "note 1"}, - {"noteType": "general", "label": "note 2"}, - ] - - -def test_marc21_to_edition_statement_one_field_250(): - """Test dojson edition statement. - - 1 edition designation and 1 responsibility from field 250 - """ - marc21xml = """ - - - 2e ed. - avec un avant-propos par Jean Faret - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("editionStatement") == [ - { - "editionDesignation": [{"value": "2e ed."}], - "responsibility": [{"value": "avec un avant-propos par Jean Faret"}], - } - ] - - -def test_marc21_to_provision_activity_ebooks_from_field_260(): - """Test provision activity Place and Date from field 260 transformation.""" - marc21xml = """ - - - Lausanne : - - [2006] - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("provisionActivity") == [ - { - "type": "bf:Publication", - "statement": [ - {"label": [{"value": "Lausanne"}], "type": "bf:Place"}, - {"label": [{"value": "[2006]"}], "type": "Date"}, - ], - "startDate": 2006, - } - ] - - -# Copyright Date: [264 _4 $c non repetitive] -def test_marc21copyrightdate_ebooks_from_field_264_04(): - """Test dojson Copyright Date.""" - - marc21xml = """ - - - © 1971 - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("copyrightDate") == ["© 1971"] - - marc21xml = """ - - - © 1971 [extra 1973] - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("copyrightDate") == ["© 1971 [extra 1973]"] - - -def test_marc21_to_provision_activity_ebooks_from_field_264_1(): - """Test provision activity Place and Date from field 264_1 
transform.""" - marc21xml = """ - - - Lausanne : - Payot, - [2006-2010] - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("provisionActivity") == [ - { - "type": "bf:Publication", - "statement": [ - {"label": [{"value": "Lausanne"}], "type": "bf:Place"}, - {"label": [{"value": "Payot"}], "type": "bf:Agent"}, - {"label": [{"value": "[2006-2010]"}], "type": "Date"}, - ], - "startDate": 2006, - "endDate": 2010, - } - ] - - -def test_marc21_to_provision_activity_ebooks_from_field_264_2(): - """Test provision activity Place and Date from field 264_2 transform.""" - marc21xml = """ - - - Lausanne : - Payot, - [2006-2010] - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("provisionActivity") == [ - { - "type": "bf:Distribution", - "statement": [ - {"label": [{"value": "Lausanne"}], "type": "bf:Place"}, - {"label": [{"value": "Payot"}], "type": "bf:Agent"}, - {"label": [{"value": "[2006-2010]"}], "type": "Date"}, - ], - } - ] - - -def test_marc21_to_subjects(): - """Test subjects transformation. - - Test subjects in field 653. - Checks applied: - - duplicates subjects removal - - generation of a list of all subjects. 
- """ - marc21xml = """ - - - Croissance personnelle - Self-Help - - - Santé - Health - - - Développement Personnel - Self-Help - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("subjects") == [ - { - "entity": { - "authorized_access_point": "Croissance personnelle", - "type": "bf:Topic", - } - }, - {"entity": {"authorized_access_point": "Self-Help", "type": "bf:Topic"}}, - {"entity": {"authorized_access_point": "Santé", "type": "bf:Topic"}}, - {"entity": {"authorized_access_point": "Health", "type": "bf:Topic"}}, - { - "entity": { - "authorized_access_point": "Développement Personnel", - "type": "bf:Topic", - } - }, - {"entity": {"authorized_access_point": "Self-Help", "type": "bf:Topic"}}, - ] - - -def test_marc21_to_contribution(): - """Test contribution transformation. - - Test author in field 700 with first indicator = 0 - for Forename (name without comma separator). - """ - marc21xml = """ - - - Collectif - aut - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("contribution") == [ - { - "entity": {"type": "bf:Person", "authorized_access_point": "Collectif"}, - "role": ["aut"], - } - ] - - marc21xml = """ - - - Jean-Paul - II - Pape - 1954- - aut - - - Dumont, Jean - Historien - 1921-2014 - edt - - - RERO - - - Biennale de céramique contemporaine - (17 : - 2003 : - Châteauroux) - - - """ - - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - contribution = data.get("contribution") - assert contribution == [ - { - "entity": { - "type": "bf:Person", - "authorized_access_point": "Jean-Paul II, Pape, 1954", - }, - "role": ["aut"], - }, - { - "entity": { - "authorized_access_point": "Dumont, Jean, 1921-2014, Historien", - "type": "bf:Person", - }, - "role": ["edt"], - }, - { - "entity": {"type": "bf:Organisation", "authorized_access_point": "RERO"}, - "role": ["ctb"], - }, - { - "entity": { - "type": "bf:Organisation", - "authorized_access_point": 
"Biennale de céramique contemporaine (17 : 2003 : " - "Châteauroux)", - }, - "role": ["aut"], - }, - ] - - -def test_marc21_to_contribution_and_translator(): - """Test contribution and translator transformation. - - Test author and translator in fields 700 with first indicator = 1 - for Surname (name with comma separator). - """ - marc21xml = """ - - - Peeters, Hagar - aut - - - Maufroy, Sandrine - trl - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("contribution") == [ - { - "entity": { - "type": "bf:Person", - "authorized_access_point": "Peeters, Hagar", - }, - "role": ["aut"], - }, - { - "entity": { - "type": "bf:Person", - "authorized_access_point": "Maufroy, Sandrine", - }, - "role": ["trl"], - }, - ] - - -def test_marc21_electronicLocator_ebooks(): - """Harvested_resources tests.""" - marc21xml = """ - - - http://site1.org/resources/1 - ebibliomedia - - - http://site5.org/resources/1 - mv-cantook - - - Image de couverture - http://site2.org/resources/2 - - - Extrait - https://www.edenlivres.fr/p/172480 - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("electronicLocator") == [ - { - "url": "http://site1.org/resources/1", - "type": "resource", - "source": "ebibliomedia", - }, - { - "url": "http://site5.org/resources/1", - "type": "resource", - "source": "mv-cantook", - }, - { - "url": "http://site2.org/resources/2", - "type": "relatedResource", - "content": "coverImage", - }, - ] - - -def test_marc21_cover_art_ebooks(): - """Cover art tests.""" - marc21xml = """ - - - Image de couverture - http://site2.org/resources/2 - - - test - http://site3.org/resources/2 - - - """ - marc21json = create_record(marc21xml) - data = marc21.do(marc21json) - assert data.get("electronicLocator") == [ - { - "url": "http://site2.org/resources/2", - "type": "relatedResource", - "content": "coverImage", - } - ]