diff --git a/data/apisources.yml b/data/apisources.yml
index d898d62cff..2f4d666e92 100644
--- a/data/apisources.yml
+++ b/data/apisources.yml
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
-# Copyright (C) 2019 RERO
+# Copyright (C) 2024 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -16,8 +16,13 @@
# along with this program. If not, see .
-# OAI-PMH harvester configuration.
-mef:
- url: http://mef.test.rero.ch/api/mef
- comment: 'mef persons'
- size: 1000
+# API harvester configuration.
+VS-CANTOOK:
+ url: https://mediatheque-valais.cantookstation.eu
+ classname: 'rero_ils.modules.apiharvester.cantook.api.ApiCantook'
+ code: 'mv-cantook'
+
+NJ-CANTOOK:
+ url: https://bm.ebibliomedia.ch
+ classname: 'rero_ils.modules.apiharvester.cantook.api.ApiCantook'
+ code: 'ebibliomedia'
diff --git a/data/oaisources.yml b/data/oaisources.yml
deleted file mode 100644
index 1f8727cbea..0000000000
--- a/data/oaisources.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# RERO ILS
-# Copyright (C) 2019 RERO
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see .
-
-
-# OAI-PMH harvester configuration.
-ebooks:
- baseurl: https://ebooks.test.rero.ch:8443/oai2d
- metadataprefix: marc21
- comment: ''
- setspecs: ''
diff --git a/pyproject.toml b/pyproject.toml
index 77840655ab..5a8101c9e4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -154,7 +154,6 @@ reverse = "rero_ils.dojson.cli:reverse"
pjson = "rero_ils.dojson.cli:pretty_json_dump"
[tool.poetry.plugins."dojson.cli.rule"]
-marc21_ebooks_to_json = "rero_ils.modules.ebooks.dojson.contrib.marc21:marc21"
marc21_dnb_to_json = "rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_dnb"
marc21_kul_to_json = "rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_kul"
marc21_loc_to_json = "rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_loc"
@@ -227,7 +226,6 @@ apiharvester = "rero_ils.modules.apiharvester.tasks"
collections = "rero_ils.modules.collections.tasks"
documents = "rero_ils.modules.documents.tasks"
remote_entities = "rero_ils.modules.entities.remote_entities.tasks"
-ebooks = "rero_ils.modules.ebooks.tasks"
holdings = "rero_ils.modules.holdings.tasks"
items = "rero_ils.modules.items.tasks"
loans = "rero_ils.modules.loans.tasks"
diff --git a/rero_ils/config.py b/rero_ils/config.py
index 7ce7b5ec4d..136d7ae977 100644
--- a/rero_ils/config.py
+++ b/rero_ils/config.py
@@ -392,12 +392,6 @@ def _(x):
"schedule": timedelta(minutes=60),
"enabled": False,
},
- "ebooks-harvester": {
- "task": "invenio_oaiharvester.tasks.list_records_from_dates",
- "schedule": crontab(minute=22, hour=22),
- "kwargs": {"name": "ebooks"},
- "enabled": False,
- },
"notification-creation": {
"task": "rero_ils.modules.notifications.tasks.create_notifications",
"schedule": crontab(minute=0, hour=3), # Every day at 05:00 UTC,
@@ -526,12 +520,18 @@ def _(x):
"kwargs": {"delete": True},
"enabled": False,
},
- # "mef-harvester": {
- # "task": "rero_ils.modules.apiharvester.tasks.harvest_records",
- # "schedule": timedelta(minutes=60),
- # "kwargs": {"name": "mef", "enabled": False),
- # "enabled": False,
- # },
+ "harvest-vs-cantook": {
+ "task": "rero_ils.modules.apiharvester.tasks.harvest_records",
+ "schedule": crontab(minute=33, hour=3), # Every day at 03:33 UTC,
+ "kwargs": {"name": "VS-CANTOOK"},
+ "enabled": False,
+ },
+ "harvest-nj-cantook": {
+ "task": "rero_ils.modules.apiharvester.tasks.harvest_records",
+ "schedule": crontab(minute=44, hour=4), # Every day at 04:44 UTC,
+ "kwargs": {"name": "NJ-CANTOOK"},
+ "enabled": False,
+ },
}
CELERY_BROKER_HEARTBEAT = 0
diff --git a/rero_ils/modules/apiharvester/api.py b/rero_ils/modules/apiharvester/api.py
new file mode 100644
index 0000000000..2dc08a7143
--- /dev/null
+++ b/rero_ils/modules/apiharvester/api.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+#
+# RERO ILS
+# Copyright (C) 2024 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+"""Base API for harvesting records from external API sources."""
+
+from __future__ import absolute_import, print_function
+
+from copy import deepcopy
+
+import click
+
+from rero_ils.modules.apiharvester.models import ApiHarvestConfig
+from rero_ils.modules.locations.api import Location
+from rero_ils.modules.organisations.api import Organisation
+
+from .models import HarvestActionType
+
+
+class ApiHarvest:
+    """Base class for API harvesters.
+
+    Holds the persisted configuration and the counters shared by all
+    harvesters. Vendor specific subclasses have to implement
+    ``get_request_url`` and ``create_update_record`` and usually override
+    ``harvest_records``.
+
+    config: saved config from ApiHarvester class
+    file_name: to save records to file
+    process: create harvested records
+    harvest_count: how many records to harvest
+        (``None`` or a negative value: no limit)
+    verbose: print verbose messages
+    """
+
+    def __init__(
+        self, name, file_name=None, process=False, harvest_count=None, verbose=False
+    ):
+        """Class init.
+
+        :param name: name of the persisted ``ApiHarvestConfig`` to use
+        :param file_name: stored unchanged as ``self.file``; ``save_record``
+            calls its ``write`` method, so it has to be a writer like object
+        :param process: create or update records while harvesting
+        :param harvest_count: maximum number of records to handle
+        :param verbose: print verbose messages
+        :raises NameError: if no configuration exists for ``name``
+        """
+        config = self.get_config(name)
+        if not config:
+            raise NameError(f"API Config not found: {name}")
+        self.config = config
+        self.file = file_name
+        self.process = process
+        self.harvest_count = harvest_count
+        self.verbose = verbose
+        self._vendor = None
+        self._url = self.config.url
+        self._code = self.config.code
+        self._count = 0
+        self._count_new = 0
+        self._count_upd = 0
+        self._count_del = 0
+        # For every organisation harvesting this source keep the value of
+        # ``online_circulation_category()`` and, per online location, the
+        # source URL defined by the library of the location (``None`` if the
+        # library does not define one).
+        info = {}
+        for organisation in Organisation.get_records_by_online_harvested_source(
+            self._code
+        ):
+            locations = {}
+            for location_pid in organisation.get_online_locations():
+                locations[location_pid] = None
+                location = Location.get_record_by_pid(location_pid)
+                library = location.get_library()
+                if url := library.get_online_harvested_source_url(source=self._code):
+                    locations[location_pid] = url
+            info[organisation.pid] = {
+                "item_type_pid": organisation.online_circulation_category(),
+                "locations": locations,
+            }
+        self._info = info
+
+    @classmethod
+    def get_config(cls, name):
+        """Get config.
+
+        :param name: name of config
+        :returns: first ``ApiHarvestConfig`` with this name or ``None``
+        """
+        return ApiHarvestConfig.query.filter_by(name=name).first()
+
+    def get_request_url(self, start_date="1990-01-01", page=1):
+        """Get request URL.
+
+        Has to be implemented by the vendor specific subclass.
+
+        :param start_date: date from where records have to be harvested
+        :param page: page from where records have to be harvested
+        :raises NotImplementedError: always in the base class
+        """
+        raise NotImplementedError()
+
+    def create_update_record(self, record):
+        """Create new record or update record.
+
+        Has to be implemented by the vendor specific subclass.
+        ``process_records`` expects a tuple ``(pid, status)`` where
+        ``status`` has a ``value`` attribute.
+
+        :param record: record to create or update
+        :raises NotImplementedError: always in the base class
+        """
+        raise NotImplementedError()
+
+    def save_record(self, record):
+        """Save record to file.
+
+        Does nothing if no file was configured.
+
+        :param record: record to write to file
+        """
+        if self.file:
+            self.file.write(record)
+
+    def msg_text(self, pid, msg):
+        """Logging message text.
+
+        :param pid: pid of the handled record
+        :param msg: message to report for this record
+        :returns: text prefixed with the record counter, vendor and code
+        """
+        return f"{self._count}: {self._vendor}:{self._code} {pid} = {msg}"
+
+    def process_records(self, records):
+        """Process records.
+
+        Every record is counted and saved to the file; it is created or
+        updated only if ``self.process`` is set. Processing stops as soon as
+        ``harvest_count`` records have been handled.
+
+        :param records: records to process
+        """
+        limit = self.harvest_count
+        # ``None`` (the default of this class) and negative values both mean
+        # "no limit"; comparing ``None`` with an int would raise a TypeError.
+        limited = limit is not None and limit >= 0
+        for record in records:
+            if limited and self._count >= limit:
+                break
+            self._count += 1
+            self.save_record(record)
+            if self.process:
+                pid, status = self.create_update_record(record)
+                self.verbose_print(self.msg_text(pid=pid, msg=status.value))
+
+    def verbose_print(self, msg):
+        """Print verbose message.
+
+        :param msg: message to print if verbose
+        """
+        if self.verbose:
+            click.echo(msg)
+
+    def harvest_records(self, from_date):
+        """Harvest records from servers.
+
+        The base implementation processes no records; subclasses override it
+        with the vendor specific requests.
+
+        :param from_date: records changed after this date to harvest
+        :returns: number of handled records
+        """
+        self.process_records([])
+        return self._count
+
+    @property
+    def count(self):
+        """Get count."""
+        return self._count
+
+    @property
+    def count_new(self):
+        """Get new count."""
+        return self._count_new
+
+    @property
+    def count_upd(self):
+        """Get updated count."""
+        return self._count_upd
+
+    @property
+    def count_del(self):
+        """Get deleted count."""
+        return self._count_del
diff --git a/rero_ils/modules/ebooks/__init__.py b/rero_ils/modules/apiharvester/cantook/__init__.py
similarity index 92%
rename from rero_ils/modules/ebooks/__init__.py
rename to rero_ils/modules/apiharvester/cantook/__init__.py
index c955f3eee2..dc9e72752f 100644
--- a/rero_ils/modules/ebooks/__init__.py
+++ b/rero_ils/modules/apiharvester/cantook/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
-# Copyright (C) 2019-2022 RERO
+# Copyright (C) 2024 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -15,4 +15,4 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
-"""JSON schemas."""
+"""ApiCantook."""
diff --git a/rero_ils/modules/apiharvester/cantook/api.py b/rero_ils/modules/apiharvester/cantook/api.py
new file mode 100644
index 0000000000..16b0911d5d
--- /dev/null
+++ b/rero_ils/modules/apiharvester/cantook/api.py
@@ -0,0 +1,188 @@
+# -*- coding: utf-8 -*-
+#
+# RERO ILS
+# Copyright (C) 2024 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+"""API for cantook records."""
+
+from __future__ import absolute_import, print_function
+
+from invenio_db import db
+from requests import codes as requests_codes
+from requests import get as requests_get
+
+from rero_ils.modules.documents.api import Document, DocumentsSearch
+from rero_ils.modules.holdings.api import Holding, HoldingsSearch, create_holding
+from rero_ils.modules.utils import JsonWriter
+
+from ..api import ApiHarvest
+from ..models import HarvestActionType
+from .dojson.json import cantook_json
+
+
+class ApiCantook(ApiHarvest):
+    """ApiCantook class.
+
+    Class for harvesting ebooks from cantook API resources.
+    """
+
+    def __init__(
+        self, name, file_name=None, process=False, harvest_count=-1, verbose=False
+    ):
+        """Class init.
+
+        :param name: name of the persisted API harvest configuration
+        :param file_name: if given, harvested records are written to this
+            file with a ``JsonWriter``
+        :param process: create, update or delete documents while harvesting
+        :param harvest_count: maximum number of records to handle
+            (negative: no limit)
+        :param verbose: print verbose messages
+        """
+        super().__init__(
+            name=name,
+            process=process,
+            harvest_count=harvest_count,
+            verbose=verbose,
+        )
+        # ``file_name`` is deliberately not passed to the base class: it
+        # stores the value unchanged while ``save_record`` needs a writer.
+        if file_name:
+            self.file = JsonWriter(file_name)
+        self._vendor = "CANTOOK"
+
+    def get_request_url(self, start_date="1990-01-01", page=1):
+        """Get request URL.
+
+        :param start_date: date from where records have to be harvested
+        :param page: page from where records have to be harvested
+        :returns: URL of the ``resources.json`` endpoint for this page
+        """
+        params = f"start_at={start_date}&page={page}"
+        return f"{self._url}/v1/resources.json?{params}"
+
+    def delete_holdings(self, document_pid):
+        """Delete the holdings of a document created for this source.
+
+        Only holdings having an electronic location whose ``source`` is the
+        code of this harvester are deleted.
+
+        :param document_pid: document pid
+        """
+        # Materialize the pids first: holdings are deleted while looping.
+        for hold_pid in list(Holding.get_holdings_pid_by_document_pid(document_pid)):
+            if holding := Holding.get_record_by_pid(hold_pid):
+                # Holdings without any electronic location must be skipped
+                # instead of raising a KeyError.
+                for electronic_location in holding.get("electronic_location", []):
+                    if electronic_location["source"] == self._code:
+                        holding.delete(dbcommit=True, delindex=True)
+                        break
+
+    def create_holdings(self, document_pid, link):
+        """Create the missing electronic holdings of a document.
+
+        One holding is created for every online location of every
+        organisation harvesting this source, unless an electronic holding of
+        this source already exists for the document and location.
+
+        :param document_pid: document pid
+        :param link: link to cantook document
+        """
+        holdings = []
+        for info in self._info.values():
+            item_type_pid = info["item_type_pid"]
+            for location_pid, url in info["locations"].items():
+                # Always start from the harvested link. Reusing the link
+                # rewritten for a previous location would leak the URL of
+                # that library into locations without an own URL.
+                location_link = link
+                if url:
+                    # Replace "<scheme>://<host>" of the harvested link with
+                    # the URL configured for the library.
+                    uri_split = link.split("/")[3:]
+                    uri_split.insert(0, url.rstrip("/"))
+                    location_link = "/".join(uri_split)
+                # See if the holding already exist
+                query = (
+                    HoldingsSearch()
+                    .filter("term", document__pid=document_pid)
+                    .filter("term", location__pid=location_pid)
+                    .filter("term", holdings_type="electronic")
+                    .filter("term", electronic_location__source=self._code)
+                )
+                if query.count() == 0:
+                    holding = create_holding(
+                        document_pid=document_pid,
+                        location_pid=location_pid,
+                        item_type_pid=item_type_pid,
+                        electronic_location={
+                            "source": self._code,
+                            "uri": location_link,
+                        },
+                        holdings_type="electronic",
+                    )
+                    holdings.append(holding)
+        # Commit once for all new holdings, then index them.
+        db.session.commit()
+        for holding in holdings:
+            holding.reindex()
+
+    def create_update_record(self, data):
+        """Create, update or delete record.
+
+        :param data: data for record operation
+        :returns: tuple (pid, status): the pid of the document if it is
+            known, else the harvested identifier, and the
+            ``HarvestActionType`` of the operation
+        """
+        record_data = cantook_json.do(data)
+        # "deleted" and "link" are helper values of the transformation and
+        # not part of the document itself.
+        deleted = bool(record_data.pop("deleted", None))
+        link = record_data.pop("link", None)
+        status = HarvestActionType.DELETED if deleted else HarvestActionType.NOTSET
+        # See if we have this document already
+        harvested_id = record_data.pop("pid")
+        query = (
+            DocumentsSearch()
+            .filter("term", identifiedBy__value__raw=harvested_id)
+            .source(includes=["pid"])
+        )
+        try:
+            pid = next(query.scan()).pid
+        except StopIteration:
+            pid = None
+        if pid:
+            if doc := Document.get_record_by_pid(pid):
+                # The deletion has to be tested before the document is
+                # touched; setting the status to UPDATED first would make
+                # the deletion unreachable.
+                if deleted:
+                    self._count_del += 1
+                    self.delete_holdings(document_pid=pid)
+                    # NOTE(review): ``reasons_not_to_delete`` is read as an
+                    # attribute; if it is a method it has to be called.
+                    if not doc.reasons_not_to_delete:
+                        doc.delete(dbcommit=True, delindex=True)
+                    pid = harvested_id
+                else:
+                    status = HarvestActionType.UPDATED
+                    self._count_upd += 1
+                    record_data["pid"] = doc["pid"]
+                    # TODO: Do we have always to replace the document?
+                    record = doc.replace(
+                        data=record_data, dbcommit=True, reindex=True
+                    )
+                    # TODO: do we have to delete and recreate holdings ?
+                    # self.delete_holdings()
+                    self.create_holdings(document_pid=record.pid, link=link)
+        elif not deleted:
+            self._count_new += 1
+            status = HarvestActionType.CREATED
+            record = Document.create(data=record_data, dbcommit=True, reindex=True)
+            self.create_holdings(document_pid=record.pid, link=link)
+        return pid or harvested_id, status
+
+    def harvest_records(self, from_date):
+        """Harvest cantook records page by page.
+
+        :param from_date: only records changed after this date are requested
+        :returns: tuple (number of handled records, total number of items
+            reported by the ``X-Total-Items`` header of the first response)
+        :raises StopIteration: if more than 10 pages were requested
+        """
+        self._count = 0
+        url = self.get_request_url(start_date=from_date, page=1)
+        request = requests_get(url)
+        total_pages = int(request.headers.get("X-Total-Pages", 0))
+        total_items = int(request.headers.get("X-Total-Items", 0))
+        current_page = int(request.headers.get("X-Current-Page", 0))
+        count = 0
+        while (
+            request.status_code == requests_codes.ok
+            and current_page <= total_pages
+            and (self.harvest_count < 0 or self._count < self.harvest_count)
+        ):
+            count += 1
+            self.verbose_print(f"API page: {current_page} url: {url}")
+            self.process_records(request.json().get("resources", []))
+            # get next page and update current_page
+            url = self.get_request_url(start_date=from_date, page=current_page + 1)
+            request = requests_get(url)
+            current_page = int(request.headers.get("X-Current-Page", 0))
+            # NOTE(review): hard stop after 11 pages; presumably a safety net
+            # against endless paging - confirm the limit fits real harvests.
+            if count > 10:
+                raise StopIteration(f"Count: {count}")
+
+        return self._count, total_items
diff --git a/rero_ils/modules/ebooks/dojson/contrib/marc21/__init__.py b/rero_ils/modules/apiharvester/cantook/dojson/json/__init__.py
similarity index 85%
rename from rero_ils/modules/ebooks/dojson/contrib/marc21/__init__.py
rename to rero_ils/modules/apiharvester/cantook/dojson/json/__init__.py
index 9547a74462..1419e89d2f 100644
--- a/rero_ils/modules/ebooks/dojson/contrib/marc21/__init__.py
+++ b/rero_ils/modules/apiharvester/cantook/dojson/json/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
-# Copyright (C) 2019-2022 RERO
+# Copyright (C) 2024 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -15,8 +15,9 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
-"""MARC21 RERO to JSON."""
+"""Cantook JSON data conversion."""
-from .model import marc21
-__all__ = "marc21"
+from .model import cantook_json
+
+# ``__all__`` has to be a sequence of names; a bare string would be read
+# character by character by ``from ... import *``.
+__all__ = ("cantook_json",)
diff --git a/rero_ils/modules/apiharvester/cantook/dojson/json/model.py b/rero_ils/modules/apiharvester/cantook/dojson/json/model.py
new file mode 100644
index 0000000000..436fcfa1b4
--- /dev/null
+++ b/rero_ils/modules/apiharvester/cantook/dojson/json/model.py
@@ -0,0 +1,199 @@
+# -*- coding: utf-8 -*-
+#
+# RERO ILS
+# Copyright (C) 2024 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+"""Cantook json record transformation."""
+
+import dateparser
+
+from rero_ils.modules.documents.models import DocumentFictionType
+from rero_ils.modules.utils import get_schema_for_resource
+
+
+class Transformation(object):
+    """Transformation CANTOOK Json to RERO-ILS Json."""
+
+    def __init__(self, data=None, logger=None, verbose=False, transform=True):
+        """Constructor.
+
+        :param data: CANTOOK json data of one resource
+        :param logger: stored as ``self.logger``
+        :param verbose: stored as ``self.verbose``
+        :param transform: transform ``data`` directly if data is given
+        """
+        self.data = data
+        self.logger = logger
+        self.verbose = verbose
+        self.json_dict = {}
+        if data and transform:
+            self._transform()
+
+    def _transform(self):
+        """Call the transformation functions.
+
+        All attributes whose name starts with ``trans`` are called.
+        ``dir`` returns the names sorted alphabetically, so
+        ``trans_constants`` runs before ``trans_identified_by`` which may
+        replace the default document type.
+        """
+        for name in dir(self):
+            if name.startswith("trans"):
+                getattr(self, name)()
+
+    def do(self, data):
+        """Do the transformation.
+
+        :param data: json data to transform
+        :returns: rero-ils document data
+        """
+        self.data = data
+        # ``cantook_json`` is a module level singleton: start with an empty
+        # dict for every record, otherwise optional fields of the previous
+        # record (subtitle, summary, subjects, ...) leak into this one.
+        self.json_dict = {}
+        self._transform()
+        return self.json_dict
+
+    @property
+    def json(self):
+        """Json data, ``None`` if nothing was transformed."""
+        return self.json_dict or None
+
+    def trans_constants(self):
+        """Add constants."""
+        self.json_dict["$schema"] = get_schema_for_resource("doc")
+        self.json_dict["harvested"] = True
+        self.json_dict["issuance"] = {
+            "main_type": "rdami:1001",
+            "subtype": "materialUnit",
+        }
+        self.json_dict["adminMetadata"] = {"encodingLevel": "Not applicable"}
+        # Default type; replaced by ``trans_identified_by`` for audio media.
+        self.json_dict["type"] = [
+            {"main_type": "docmaintype_book", "subtype": "docsubtype_e-book"}
+        ]
+
+    def trans_pid(self):
+        """Transformation pid."""
+        self.json_dict["pid"] = f"cantook-{self.data['id']}"
+
+    def trans_identified_by(self):
+        """Transformation IdentifiedBy.
+
+        Adds the local CANTOOK identifier and the ISBN 13 of the media. The
+        document type is changed to audio book if an audio medium exists.
+        """
+        identified_by = [
+            {
+                "source": "CANTOOK",
+                "type": "bf:Local",
+                "value": f"cantook-{self.data['id']}",
+            }
+        ]
+        # "media" can be missing or null: iterate over an empty list then.
+        for media in self.data.get("media") or []:
+            nature = media.get("nature")
+            if (
+                nature in ["paper", "epub", "audio"]
+                and media.get("key_type") == "isbn13"
+            ):
+                identified_by.append(
+                    {"type": "bf:Isbn", "value": media.get("key"), "note": nature}
+                )
+            # The document type does not depend on the kind of identifier.
+            if nature == "audio":
+                self.json_dict["type"] = [
+                    {
+                        "main_type": "docmaintype_audio",
+                        "subtype": "docsubtype_audio_book",
+                    }
+                ]
+        self.json_dict["identifiedBy"] = identified_by
+
+    def trans_titel(self):
+        """Transformation Title."""
+        title = {"type": "bf:Title"}
+        if maintitle := self.data.get("title"):
+            title["mainTitle"] = [{"value": maintitle}]
+        if subtitle := self.data.get("subtitle"):
+            title["subtitle"] = [{"value": subtitle}]
+        self.json_dict["title"] = [title]
+
+    def trans_provision_activity(self):
+        """Transform provisionActivity.
+
+        The year of ``created_at`` is used as publication date.
+
+        :raises KeyError: if ``publisher_name`` or ``created_at`` is missing
+        """
+        publisher_name = self.data["publisher_name"]
+        # NOTE(review): ``dateparser.parse`` returns None for a date it can
+        # not parse, which would fail on ``.year`` below - confirm that
+        # ``created_at`` is always a valid date.
+        start_date = dateparser.parse(self.data["created_at"])
+        self.json_dict["provisionActivity"] = [
+            {
+                "startDate": start_date.year,
+                "statement": [
+                    {"label": [{"value": publisher_name}], "type": "bf:Agent"},
+                    {"label": [{"value": str(start_date.year)}], "type": "Date"},
+                ],
+                "type": "bf:Publication",
+            }
+        ]
+
+    def trans_electronic_locator(self):
+        """Transformation electronicLocator.
+
+        The cover is added as cover image, the flipbook as extract.
+        """
+        electronic_locators = []
+        if cover := self.data.get("cover"):
+            electronic_locators.append(
+                {
+                    "content": "coverImage",
+                    "type": "relatedResource",
+                    "url": cover,
+                }
+            )
+        if flipbook := self.data.get("flipbook"):
+            electronic_locators.append(
+                {
+                    "content": "extract",
+                    "type": "relatedResource",
+                    "url": flipbook,
+                }
+            )
+        if electronic_locators:
+            self.json_dict["electronicLocator"] = electronic_locators
+
+    def trans_fiction(self):
+        """Transformation fiction.
+
+        ``Fiction`` if the resource is flagged as fiction, else
+        ``Unspecified``.
+        """
+        # Without the ``else`` the fiction value was always overwritten.
+        if self.data.get("fiction"):
+            fiction = DocumentFictionType.Fiction
+        else:
+            fiction = DocumentFictionType.Unspecified
+        self.json_dict["fiction_statement"] = fiction.value
+
+    def trans_language(self):
+        """Transformation language."""
+        if languages := [
+            {"type": "bf:Language", "value": language}
+            for language in self.data.get("languages", [])
+        ]:
+            self.json_dict["language"] = languages
+
+    def trans_subjects(self):
+        """Transformation Subject.
+
+        Only the french (``fr``) captions of the classifications are used.
+        """
+        subjects = []
+        for classification in self.data.get("classifications", []):
+            for caption in classification.get("captions", []):
+                if subject := caption.get("fr"):
+                    subjects.append(
+                        {
+                            "entity": {
+                                "authorized_access_point": subject,
+                                "type": "bf:Topic",
+                            }
+                        }
+                    )
+        if subjects:
+            self.json_dict["subjects"] = subjects
+
+    def trans_summary(self):
+        """Transformation Summary."""
+        if summary := self.data.get("summary"):
+            self.json_dict["summary"] = [{"label": [{"value": summary}]}]
+
+    def trans_extent(self):
+        """Transformation Extent."""
+        if page_count := self.data.get("page_count"):
+            self.json_dict["extent"] = f"{page_count} pages"
+
+    # to be used to create holdings
+    def trans_links(self):
+        """Transformation links."""
+        if link := self.data.get("link"):
+            self.json_dict["link"] = link
+
+    # to be used for deleted records
+    def trans_deleted(self):
+        """Transformation deleted.
+
+        A resource having ``unavailable_since`` set is marked as deleted.
+        """
+        if unavailable_since := self.data.get("unavailable_since"):
+            self.json_dict["deleted"] = unavailable_since
+
+
+cantook_json = Transformation()
diff --git a/rero_ils/modules/apiharvester/cli.py b/rero_ils/modules/apiharvester/cli.py
index 81a3e4ffdf..b9b17f27bb 100644
--- a/rero_ils/modules/apiharvester/cli.py
+++ b/rero_ils/modules/apiharvester/cli.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
-# Copyright (C) 2019-2022 RERO
+# Copyright (C) 2024 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -20,15 +20,16 @@
from __future__ import absolute_import, print_function
import click
+import dateparser
import yaml
from flask import current_app
from flask.cli import with_appcontext
from werkzeug.local import LocalProxy
-from rero_ils.modules.apiharvester.tasks import harvest_records
+from rero_ils.modules.apiharvester.tasks import api_harvest_records
from .models import ApiHarvestConfig
-from .utils import api_source
+from .utils import api_source, get_apiharvest_object
datastore = LocalProxy(lambda: current_app.extensions["security"].datastore)
@@ -38,48 +39,40 @@ def apiharvester():
"""Api harvester commands."""
-@apiharvester.command("source")
+@apiharvester.command("add-source")
@click.argument("name")
@click.option("-U", "--url", default="", help="Url")
-@click.option("-m", "--mimetype", default="", help="Mimetype")
-@click.option("-s", "--size", default=-1, type=int, help="Size")
-@click.option("-c", "--comment", default="", help="Comment")
+@click.option("-n", "--classname", default="", help="Class name")
+@click.option("-c", "--code", default="", help="Code")
@click.option("-u", "--update", is_flag=True, default=False, help="Update config")
@with_appcontext
-def api_source_config(name, url, mimetype, size, comment, update):
+def add_api_source_config(name, url, classname, code, update):
"""Add or Update ApiHarvestConfig."""
- click.echo(f"ApiHarvesterConfig: {name} ", nl=False)
- msg = api_source(
- name=name, url=url, mimetype=mimetype, size=size, comment=comment, update=update
- )
- click.echo(msg)
+ msg = api_source(name=name, url=url, classname=classname, code=code, update=update)
+ click.echo(f"ApiHarvestConfig {name}: {msg}")
-@apiharvester.command("sources")
+@apiharvester.command("init-config")
@click.argument("configfile", type=click.File("rb"))
@click.option("-u", "--update", is_flag=True, default=False, help="Update config")
@with_appcontext
-def api_source_config_from_file(configfile, update):
+def init_api_harvest_config(configfile, update):
"""Add or update ApiHarvestConfigs from file."""
- configs = yaml.load(configfile, Loader=yaml.FullLoader)
- for name, values in sorted(configs.items()):
- url = values.get("url", "")
- mimetype = values.get("mimetype", "")
- size = values.get("size", 100)
- comment = values.get("comment", "")
- click.echo(f"ApiHarvesterConfig: {name} {url} ", nl=False)
- msg = api_source(
- name=name,
- url=url,
- mimetype=mimetype,
- size=size,
- comment=comment,
- update=update,
- )
- click.echo(msg)
+ if configs := yaml.load(configfile, Loader=yaml.FullLoader):
+ for name, values in sorted(configs.items()):
+ url = values.get("url", "")
+ classname = values.get("classname", "")
+ code = values.get("code", "")
+ msg = api_source(
+ name=name, url=url, classname=classname, code=code, update=update
+ )
+ click.echo(f"ApiHarvestConfig {name}: {msg}")
+
+ else:
+ click.secho(f"ERROR: no YML config found in: {configfile.name}", fg="red")
-@apiharvester.command("harvest")
+@apiharvester.command()
@click.option(
"-n", "--name", default=None, help="Name of persistent configuration to use."
)
@@ -89,12 +82,6 @@ def api_source_config_from_file(configfile, update):
default=None,
help="The lower bound date for the harvesting (optional).",
)
-@click.option(
- "-u",
- "--url",
- default=None,
- help="The upper bound date for the harvesting (optional).",
-)
@click.option(
"-k",
"--enqueue",
@@ -102,46 +89,30 @@ def api_source_config_from_file(configfile, update):
default=False,
help="Enqueue harvesting and return immediately.",
)
-@click.option(
- "--signals/--no-signals",
- default=True,
- help="Signals sent with Api harvesting results.",
-)
-@click.option("-s", "--size", type=int, default=0, help="Size of chunks (optional).")
@click.option(
"-m",
- "--max_results",
+ "--harvest_count",
type=int,
- default=0,
+ default=-1,
help="maximum of records to harvest (optional).",
)
@click.option("-v", "--verbose", "verbose", is_flag=True, default=False)
@with_appcontext
-def harvest(name, from_date, url, enqueue, signals, size, max_results, verbose):
- """Harvest api."""
+def harvest(name, from_date, enqueue, harvest_count, verbose):
+ """Harvest records from an API repository."""
if name:
click.secho(f"Harvest api: {name}", fg="green")
- elif url:
- click.secho(f"Harvest api: {url}", fg="green")
+ if from_date:
+ from_date = dateparser.parse(from_date).isoformat()
if enqueue:
- harvest_records.delay(
- url=url,
- name=name,
- from_date=from_date,
- signals=signals,
- size=size,
- max_results=max_results,
- verbose=verbose,
+ async_id = api_harvest_records.delay(
+ name=name, from_date=from_date, harvest_count=harvest_count, verbose=verbose
)
+ if verbose:
+ click.echo(f"AsyncResult {async_id}")
else:
- harvest_records(
- url=url,
- name=name,
- from_date=from_date,
- signals=signals,
- size=size,
- max_results=max_results,
- verbose=verbose,
+ api_harvest_records(
+ name=name, from_date=from_date, harvest_count=harvest_count, verbose=verbose
)
@@ -152,8 +123,20 @@ def info():
apis = ApiHarvestConfig.query.all()
for api in apis:
click.echo(api.name)
- click.echo(f"\tlastrun : {api.lastrun}")
- click.echo(f"\turl : {api.url}")
- click.echo(f"\tmimetype : {api.mimetype}")
- click.echo(f"\tsize : {api.size}")
- click.echo(f"\tcomment : {api.comment}")
+ click.echo(f"\tlastrun : {api.lastrun}")
+ click.echo(f"\turl : {api.url}")
+ click.echo(f"\tclassname : {api.classname}")
+ click.echo(f"\tcode : {api.code}")
+
+
+@apiharvester.command()
+@click.argument("name")
+@click.option("-d", "--date", default=None, help="Set last run (default: now).")
+@with_appcontext
+def set_last_run(name, date):
+    """Set last run."""
+    new_date = None
+    if date:
+        # The option value is a plain string: convert it to a datetime
+        # before it is stored in the ``lastrun`` DateTime column.
+        new_date = dateparser.parse(date)
+        if new_date is None:
+            # Do not silently fall back to "now" for an unreadable date.
+            click.secho(f"ERROR: can not parse date: {date}", fg="red")
+            return
+    if config := get_apiharvest_object(name=name):
+        new_date = config.update_lastrun(new_date=new_date)
+        click.secho(f"Set last run {name}: {new_date}", fg="green")
+    else:
+        click.secho(f"No config found: {name}", fg="red")
diff --git a/rero_ils/modules/apiharvester/errors.py b/rero_ils/modules/apiharvester/errors.py
new file mode 100644
index 0000000000..f4b5c6303b
--- /dev/null
+++ b/rero_ils/modules/apiharvester/errors.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+#
+# RERO ILS
+# Copyright (C) 2024 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+"""Api harvester errors."""
+
+from __future__ import absolute_import, print_function
+
+
+class ApiHarvesterError(Exception):
+    """Base exception for apiharvester."""
+
+
+class ApiRequestError(ApiHarvesterError):
+    """Error with the Api request."""
+
+
+class NameOrUrlMissing(ApiHarvesterError):
+    """Name or url for harvesting missing."""
+
+
+class WrongDateCombination(ApiHarvesterError):
+    """'Until' date is larger than 'from' date."""
+
+
+class IdentifiersOrDates(ApiHarvesterError):
+    """Identifiers cannot be used in combination with dates."""
+
+
+class ApiHarvesterConfigNotFound(ApiHarvesterError):
+    """No ApiHarvesterConfig was found."""
diff --git a/rero_ils/modules/apiharvester/models.py b/rero_ils/modules/apiharvester/models.py
index c657a127ea..dfd9f657ac 100644
--- a/rero_ils/modules/apiharvester/models.py
+++ b/rero_ils/modules/apiharvester/models.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
-# Copyright (C) 2019-2022 RERO
+# Copyright (C) 2024 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -20,27 +20,32 @@
from __future__ import absolute_import
from datetime import datetime, timezone
+from enum import Enum
-import pytz
from invenio_db import db
-from invenio_pidstore.models import RecordIdentifier
-class ApiHarvestConfig(RecordIdentifier):
- """Sequence generator for Document identifiers."""
+class HarvestActionType(Enum):
+    """Harvest action types.
+
+    Result of handling one harvested record; the value of a member is its
+    own name.
+    """
+
+    # The record was flagged as deleted by the source.
+    DELETED = "DELETED"
+    # An already existing record was replaced.
+    UPDATED = "UPDATED"
+    # A new record was created.
+    CREATED = "CREATED"
+    # Initial state: no action was done (yet).
+    NOTSET = "NOTSET"
+
+
+class ApiHarvestConfig(db.Model):
+ """Represents a ApiHarvestConfig record."""
__tablename__ = "apiharvester_config"
- __mapper_args__ = {"concrete": True}
id = db.Column(db.Integer, primary_key=True)
url = db.Column(db.String(255), nullable=False, server_default="")
name = db.Column(db.String(255), nullable=False)
- mimetype = db.Column(db.String(255), nullable=False)
- size = db.Column(db.Integer, nullable=False)
- comment = db.Column(db.Text, nullable=True)
- default_last_run = datetime.strptime("1900-1-1", "%Y-%m-%d")
+ classname = db.Column(db.String(255), nullable=False)
+ code = db.Column(db.Text, nullable=True)
lastrun = db.Column(
- db.DateTime, default=pytz.utc.localize(default_last_run), nullable=True
+ db.DateTime, default=datetime(year=1900, month=1, day=1), nullable=True
)
def save(self):
@@ -51,3 +56,5 @@ def save(self):
def update_lastrun(self, new_date=None):
"""Update the 'lastrun' attribute of object to now."""
self.lastrun = new_date or datetime.now(timezone.utc)
+ # A truthy new_date wins over "now" (see the assignment above).
+ # Store the change through save() and return the timestamp that was set.
+ self.save()
+ return self.lastrun
diff --git a/rero_ils/modules/apiharvester/signals.py b/rero_ils/modules/apiharvester/signals.py
deleted file mode 100644
index f24e2b621c..0000000000
--- a/rero_ils/modules/apiharvester/signals.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# RERO ILS
-# Copyright (C) 2019-2022 RERO
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see .
-
-"""ApiHarvester signals."""
-
-from blinker import Namespace
-
-_signals = Namespace()
-
-apiharvest_part = _signals.signal("apiharvest_part")
diff --git a/rero_ils/modules/apiharvester/tasks.py b/rero_ils/modules/apiharvester/tasks.py
index fdc6134fba..61a96bf5bb 100644
--- a/rero_ils/modules/apiharvester/tasks.py
+++ b/rero_ils/modules/apiharvester/tasks.py
@@ -19,40 +19,39 @@
from __future__ import absolute_import, print_function
+import click
from celery import shared_task
+from flask import current_app
+from invenio_records_rest.utils import obj_or_import_string
-from .models import ApiHarvestConfig
-from .utils import get_records
+from .utils import get_apiharvest_object
-@shared_task(ignore_result=True)
-def harvest_records(
- url=None,
- name=None,
- from_date=None,
- signals=True,
- size=0,
- max_results=0,
- verbose=False,
-):
+@shared_task(ignore_result=True, soft_time_limit=3600)
+def api_harvest_records(name, from_date=None, harvest_count=-1, verbose=False):
"""Harvest records."""
- config = ApiHarvestConfig.query.filter_by(name=name).first()
- if config:
- if not url:
- url = config.url
+ # name: name of the ApiHarvestConfig to harvest.
+ # from_date: start date; when falsy the config's lastrun is used.
+ # harvest_count, verbose: passed through to the harvester class.
+ # Returns the harvester's count, or -1 when no harvest ran.
+ count = -1
+
+ # NOTE(review): get_apiharvest_object() raises ApiHarvesterConfigNotFound
+ # for an unknown name instead of returning a falsy value, so the else
+ # branch below looks unreachable - confirm.
+ if config := get_apiharvest_object(name=name):
if not from_date:
- from_date = config.lastrun
+ # No explicit start date: resume from the stored last run, as an
+ # ISO 8601 string.
+ from_date = config.lastrun.isoformat()
+ # NOTE(review): lastrun appears to be advanced before the harvest runs,
+ # so a failing harvest would still move the timestamp - confirm.
config.update_lastrun()
- if size == 0:
- size = config.size
-
- for next, records in get_records(
- url=url,
- name=name,
- from_date=from_date,
- size=size,
- max_results=max_results,
- signals=signals,
- verbose=verbose,
- ):
- pass
+ msg = f"API harvest {name} class name: {config.classname} "
+ msg += f"from date: {from_date} url: {config.url}"
+
+ current_app.logger.info(msg)
+ # Resolve the configured import path to the harvester class.
+ HarvestClass = obj_or_import_string(config.classname)
+ harvest = HarvestClass(name=name, verbose=verbose, harvest_count=harvest_count, process=True)
+ count, total = harvest.harvest_records(from_date=from_date)
+ msg = (
+ f"API harvest {name} items={total} |"
+ f" got={count} new={harvest.count_new}"
+ f" updated={harvest.count_upd} deleted={harvest.count_del}"
+ )
+ if verbose:
+ click.echo(msg)
+ current_app.logger.info(msg)
+ # The task returns the harvester's own counter, which replaces the
+ # count unpacked from harvest_records() above.
+ count = harvest.count
+ else:
+ current_app.logger.error(f"No config found: {name}")
+ return count
diff --git a/rero_ils/modules/apiharvester/utils.py b/rero_ils/modules/apiharvester/utils.py
index 3365da4b80..b54659a2a8 100644
--- a/rero_ils/modules/apiharvester/utils.py
+++ b/rero_ils/modules/apiharvester/utils.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
-# Copyright (C) 2019-2022 RERO
+# Copyright (C) 2024 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -19,109 +19,93 @@
from __future__ import absolute_import, print_function
-import click
-import requests
-from dateutil import parser
from flask import current_app
from invenio_db import db
+from invenio_oaiserver.models import OAISet
+from sqlalchemy.exc import OperationalError
+from .errors import ApiHarvesterConfigNotFound
from .models import ApiHarvestConfig
-from .signals import apiharvest_part
-def api_source(name, url="", mimetype="", size=100, comment="", update=False):
- """Add ApiHarvesterConfig."""
+def add_set(spec, name, pattern, description="..."):
+    """Create an OAI set and store it in the database.
+
+    :param spec: set identifier
+    :param name: human readable name of the set
+    :param pattern: search pattern to get records
+    :param description: human readable description
+    :return: a status message; when anything fails the session is rolled
+        back and the message carries the error text instead of raising.
+    """
+    try:
+        new_set = OAISet(
+            spec=spec, name=name, description=description, system_created=False
+        )
+        new_set.search_pattern = pattern
+        db.session.add(new_set)
+        db.session.commit()
+    except Exception as err:
+        # Every failure is reported as text to the caller, never raised.
+        db.session.rollback()
+        return f"OAIset exist: {name} {err}"
+    return f"OAIset added: {name}"
+
+
+def api_source(name, url="", classname=None, code="", update=False):
+    """Add an ApiHarvestConfig to the DB, or update an existing one.
+
+    :param name: name for the configuration
+    :param url: harvesting url
+    :param classname: import path of the harvester class; the harvest task
+        imports and instantiates it
+    :param code: code added to electronic_location['nonpublic_note']
+    :param update: update the configuration if it already exists
+    :return: "Add" when a configuration was created, "Update ..." listing
+        the changed fields when one was updated, otherwise "No Update"
+    """
     with current_app.app_context():
+        msg = "No Update"
         source = ApiHarvestConfig.query.filter_by(name=name).first()
         if not source:
             source = ApiHarvestConfig(
- name=name, url=url, mimetype=mimetype, size=100, comment=comment
+                name=name, url=url, classname=classname, code=code
             )
             source.save()
             db.session.commit()
- return "Added"
+            msg = "Add"
         elif update:
             source.name = name
             msg = []
             if url != "":
                 source.url = url
                 msg.append(f"url:{url}")
- if mimetype != "":
- source.mimetype = mimetype
- msg.append(f"mimetype:{mimetype}")
- if size != -1:
- source.size = size
- msg.append(f"size:{size}")
- if comment != "":
- source.comment = comment
- msg.append(f"comment:{comment}")
+            # Like url and code, only touch classname when one was given:
+            # the default (None) must not overwrite the stored value, the
+            # column is declared nullable=False.
+            if classname:
+                source.classname = classname
+                msg.append(f"classname:{classname}")
+            if code != "":
+                source.code = code
+                msg.append(f"code:{code}")
             db.session.commit()
- return f'Updated: {", ".join(msg)}'
- return "Not Updated"
-
-
-def extract_records(data):
- """Extract a record from REST data."""
- records = []
- hits = data.get("hits", {}).get("hits", {})
- for hit in hits:
- # pid = data.get('id', '')
- # updated = data.get('updated', '')
- # links = data.get('links', {}).get('self', '')
- record = hit.get("metadata", "")
- records.append(record)
- return records
-
-
-def get_records(
- url=None,
- name=None,
- from_date=None,
- max_results=0,
- size=100,
- signals=True,
- verbose=False,
- **kwargs,
-):
- """Harvest multiple records from invenio api."""
- url += f"/?size={size}"
- if from_date:
- if isinstance(from_date, str):
- from_date = parser.parse(from_date)
- from_date = from_date.isoformat()
- # we have to urlencode the : from the time with \:
- from_date = from_date.replace(":", "%5C:")
- url += f"&q=_updated:>{from_date}"
- url += f"&size={size}"
-
- if verbose:
- click.echo(f"Get records from {url}")
-
- try:
- count = 0
- request = requests.get(url)
- data = request.json()
-
- total = data["hits"]["total"]["value"]
- click.echo(f"API records found: {total}")
-
- next_url = data.get("links", {}).get("self", True)
- while next_url and (count < max_results or max_results == 0):
- records = extract_records(data)
- count += len(records)
+            # The list of changed fields is collapsed into the final message.
+            msg = f'Update {", ".join(msg)}'
+        return msg
+
+
+def get_apiharvest_object(name):
+    """Query and return an ApiHarvestConfig object based on its name.
+
+    The query is attempted up to five times when the database raises
+    ``OperationalError``; every failed attempt is logged.
+
+    :param name: The name of the ApiHarvestConfig object.
+    :return: The ApiHarvestConfig object.
+    :raises ApiHarvesterConfigNotFound: if no configuration has this name.
+    :raises OperationalError: if the last attempt fails as well.
+    """
+    max_attempts = 5
+    for attempt in range(1, max_attempts + 1):
+        try:
+            obj = ApiHarvestConfig.query.filter_by(name=name).first()
+            break
+        except OperationalError:
+            current_app.logger.error(
+                f"ApiHarvestConfig OperationalError: {attempt} {name}"
+            )
+            # Out of retries: re-raise the database error rather than
+            # falling through with ``obj`` unbound, which would surface
+            # as an unrelated UnboundLocalError below.
+            if attempt == max_attempts:
+                raise
- if count - max_results > 0 and max_results != 0:
- records = records[:max_results]
+    if not obj:
+        raise ApiHarvesterConfigNotFound(
+            f"Unable to find ApiHarvesterConfig obj with name {name}."
+        )
- request = requests.get(next_url)
- data = request.json()
- if signals:
- apiharvest_part.send(
- records=records, name=name, url=next, verbose=verbose, **kwargs
- )
- else:
- yield next_url, records
- next_url = data.get("links", {}).get("next", None)
- except Exception as error:
- click.secho(f"Harvesting API ConnectionRefusedError: {error}", fg="red")
- yield url, []
+    return obj
diff --git a/rero_ils/modules/cli/reroils.py b/rero_ils/modules/cli/reroils.py
index a2df4a82c2..a67470683d 100644
--- a/rero_ils/modules/cli/reroils.py
+++ b/rero_ils/modules/cli/reroils.py
@@ -24,7 +24,6 @@
from rero_ils.modules.acquisition.cli import acquisition
from rero_ils.modules.apiharvester.cli import apiharvester
-from rero_ils.modules.ebooks.cli import oaiharvester
from rero_ils.modules.entities.remote_entities.cli import entity
from rero_ils.modules.migrations.cli import migrations
from rero_ils.modules.monitoring.cli import monitoring
@@ -50,7 +49,6 @@ def reroils():
reroils.add_command(migrations)
reroils.add_command(monitoring)
reroils.add_command(notifications)
-reroils.add_command(oaiharvester)
reroils.add_command(scheduler)
reroils.add_command(stats)
reroils.add_command(utils)
diff --git a/rero_ils/modules/documents/serializers/base.py b/rero_ils/modules/documents/serializers/base.py
index 28dadc771f..b2b221fb53 100644
--- a/rero_ils/modules/documents/serializers/base.py
+++ b/rero_ils/modules/documents/serializers/base.py
@@ -76,7 +76,7 @@ def __init__(self, record, **kwargs):
@abstractmethod
def format(self):
"""Return formatted record."""
- raise NotImplementedError
+ # Abstract method: this base implementation only raises.
+ raise NotImplementedError()
def _get_document_types(self):
"""Return document types."""
diff --git a/rero_ils/modules/ebooks/cli.py b/rero_ils/modules/ebooks/cli.py
deleted file mode 100644
index 881fe918a6..0000000000
--- a/rero_ils/modules/ebooks/cli.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# RERO ILS
-# Copyright (C) 2019-2022 RERO
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see .
-
-"""Click command-line interface for ebook record management."""
-
-from __future__ import absolute_import, print_function
-
-import click
-import yaml
-from flask.cli import with_appcontext
-from invenio_oaiharvester.cli import oaiharvester
-from invenio_oaiharvester.models import OAIHarvestConfig
-
-from .utils import add_oai_source
-
-
-@oaiharvester.command("addsource")
-@click.argument("name")
-@click.argument("baseurl")
-@click.option(
- "-m", "--metadataprefix", default="marc21", help="The prefix for the metadata"
-)
-@click.option(
- "-s", "--setspecs", default="", help="The ‘set’ criteria for the harvesting"
-)
-@click.option("-c", "--comment", default="", help="Comment")
-@click.option("-u", "--update", is_flag=True, default=False, help="Update config")
-@with_appcontext
-def add_oai_source_config(name, baseurl, metadataprefix, setspecs, comment, update):
- """Add OAIHarvestConfig."""
- click.echo(f"Add OAIHarvestConfig: {name} ", nl=False)
- msg = add_oai_source(
- name=name,
- baseurl=baseurl,
- metadataprefix=metadataprefix,
- setspecs=setspecs,
- comment=comment,
- update=update,
- )
- click.echo(msg)
-
-
-@oaiharvester.command("initconfig")
-@click.argument("configfile", type=click.File("rb"))
-@click.option("-u", "--update", is_flag=True, default=False, help="Update config")
-@with_appcontext
-def init_oai_harvest_config(configfile, update):
- """Init OAIHarvestConfig."""
- configs = yaml.load(configfile, Loader=yaml.FullLoader)
- for name, values in sorted(configs.items()):
- baseurl = values["baseurl"]
- metadataprefix = values.get("metadataprefix", "marc21")
- setspecs = values.get("setspecs", "")
- comment = values.get("comment", "")
- click.echo(f"Add OAIHarvestConfig: {name} {baseurl} ", nl=False)
- msg = add_oai_source(
- name=name,
- baseurl=baseurl,
- metadataprefix=metadataprefix,
- setspecs=setspecs,
- comment=comment,
- update=update,
- )
- click.echo(msg)
-
-
-@oaiharvester.command("info")
-@with_appcontext
-def info():
- """List infos for tasks."""
- oais = OAIHarvestConfig.query.all()
- for oai in oais:
- click.echo(oai.name)
- click.echo("\tlastrun : ", nl=False)
- click.echo(oai.lastrun)
- click.echo("\tbaseurl : " + oai.baseurl)
- click.echo("\tmetadataprefix: " + oai.metadataprefix)
- click.echo("\tcomment : " + oai.comment)
- click.echo("\tsetspecs : " + oai.setspecs)
diff --git a/rero_ils/modules/ebooks/dojson/__init__.py b/rero_ils/modules/ebooks/dojson/__init__.py
deleted file mode 100644
index 453d190002..0000000000
--- a/rero_ils/modules/ebooks/dojson/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# RERO ILS
-# Copyright (C) 2019-2022 RERO
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see .
-
-"""DOJSON transformations."""
diff --git a/rero_ils/modules/ebooks/dojson/contrib/__init__.py b/rero_ils/modules/ebooks/dojson/contrib/__init__.py
deleted file mode 100644
index 28b47606ef..0000000000
--- a/rero_ils/modules/ebooks/dojson/contrib/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# RERO ILS
-# Copyright (C) 2019-2022 RERO
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see .
-
-"""DOJSON contrib for rero-ils."""
diff --git a/rero_ils/modules/ebooks/dojson/contrib/marc21/model.py b/rero_ils/modules/ebooks/dojson/contrib/marc21/model.py
deleted file mode 100644
index 73c4144dc7..0000000000
--- a/rero_ils/modules/ebooks/dojson/contrib/marc21/model.py
+++ /dev/null
@@ -1,511 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# RERO ILS
-# Copyright (C) 2019-2022 RERO
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see .
-
-"""rero-ils MARC21 model definition."""
-
-
-import contextlib
-import re
-
-from dojson import utils
-from isbnlib import EAN13
-
-from rero_ils.dojson.utils import (
- ReroIlsMarc21Overdo,
- TitlePartList,
- add_note,
- extract_subtitle_and_parallel_titles_from_field_245_b,
- get_field_items,
- get_field_link_data,
- make_year,
- remove_trailing_punctuation,
-)
-from rero_ils.modules.documents.dojson.contrib.marc21tojson.utils import do_language
-from rero_ils.modules.documents.models import DocumentFictionType
-from rero_ils.modules.documents.utils import create_authorized_access_point
-from rero_ils.modules.entities.models import EntityType
-
-marc21 = ReroIlsMarc21Overdo()
-
-
-@marc21.over("issuance", "leader")
-@utils.ignore_value
-def marc21_to_issuance(self, key, value):
- """Set the mode of issuance."""
- self["issuance"] = dict(main_type="rdami:1001", subtype="materialUnit")
- if marc21.admin_meta_data:
- self["adminMetadata"] = marc21.admin_meta_data
- self["fiction_statement"] = DocumentFictionType.Unspecified.value
-
-
-@marc21.over("language", "^008")
-@utils.ignore_value
-def marc21_to_language_from_008(self, key, value):
- """Get languages.
-
- languages: 008 and 041 [$a, repetitive]
- """
- return do_language(self, marc21)
-
-
-@marc21.over("language", "^041")
-@utils.ignore_value
-def marc21_to_language_from_041(self, key, value):
- """Get languages.
-
- languages: 008 and 041 [$a, repetitive]
- """
- # if we dont have languages from 008 try to set it with 041
- return do_language(self, marc21)
-
-
-@marc21.over("identifiedBy", "^020..")
-@utils.ignore_value
-def marc21_to_identifier_isbn(self, key, value):
- """Get identifier isbn.
-
- identifiers_isbn: 020 $a
- """
- if isbn13 := EAN13(value.get("a")):
- identifiers = self.get("identifiedBy", [])
- identifier = {"type": "bf:Isbn", "value": isbn13}
- identifiers.append(identifier)
- return identifiers
- return None
-
-
-@marc21.over("type", "^0248.$")
-def marc21_to_type(self, key, value):
- """Get document type."""
- if value.get("a").find("cantook") > -1:
- return [{"main_type": "docmaintype_book", "subtype": "docsubtype_e-book"}]
- return None
-
-
-@marc21.over("identifiedBy", "^035..")
-@utils.ignore_value
-def marc21_to_identifier_rero_id(self, key, value):
- """Get identifier reroId.
-
- identifiers:reroID: 035$a
- """
- identifiers = self.get("identifiedBy", [])
- identifier = {"type": "bf:Local", "value": value.get("a")}
- identifiers.append(identifier)
- return identifiers
-
-
-@marc21.over("contribution", "(^100|^700|^710|^711)..")
-@utils.for_each_value
-@utils.ignore_value
-def marc21_to_contribution(self, key, value):
- """Get contribution."""
- if key[4] == "2" or key[:3] not in ["100", "700", "710", "711"]:
- return None
- agent_data = {"type": "bf:Person"}
- if value.get("a"):
- name = utils.force_list(value.get("a"))[0]
- agent_data["preferred_name"] = remove_trailing_punctuation(name)
-
- # 100|700 Person
- if key[:3] in ["100", "700"]:
- if value.get("b"):
- numeration = utils.force_list(value.get("b"))[0]
- agent_data["numeration"] = remove_trailing_punctuation(numeration)
- if value.get("c"):
- qualifier = utils.force_list(value.get("c"))[0]
- agent_data["qualifier"] = remove_trailing_punctuation(qualifier)
- if value.get("d"):
- date = utils.force_list(value.get("d"))[0]
- date = date.rstrip(",")
- dates = remove_trailing_punctuation(date).split("-")
- with contextlib.suppress(Exception):
- if date_of_birth := dates[0].strip():
- agent_data["date_of_birth"] = date_of_birth
- with contextlib.suppress(Exception):
- if date_of_death := dates[1].strip():
- agent_data["date_of_death"] = date_of_death
- if value.get("q"):
- fuller_form_of_name = utils.force_list(value.get("q"))[0]
- agent_data["fuller_form_of_name"] = (
- remove_trailing_punctuation(fuller_form_of_name).lstrip("(").rstrip(")")
- )
-
- elif key[:3] in ["710", "711"]:
- agent_data["type"] = "bf:Organisation"
- agent_data["conference"] = key[:3] == "711"
- if value.get("e"):
- subordinate_units = [
- subordinate_unit.rstrip(".")
- for subordinate_unit in utils.force_list(value.get("e"))
- ]
-
- agent_data["subordinate_unit"] = subordinate_units
- if value.get("n"):
- numbering = utils.force_list(value.get("n"))[0]
- agent_data["numbering"] = (
- remove_trailing_punctuation(numbering).lstrip("(").rstrip(")")
- )
- if value.get("d"):
- conference_date = utils.force_list(value.get("d"))[0]
- if (
- conference_date := remove_trailing_punctuation(conference_date)
- .lstrip("(")
- .rstrip(")")
- ):
- agent_data["conference_date"] = conference_date
- if value.get("c"):
- place = utils.force_list(value.get("c"))[0]
- if place := remove_trailing_punctuation(place).lstrip("(").rstrip(")"):
- agent_data["place"] = place
- agent = {
- "type": agent_data["type"],
- "authorized_access_point": create_authorized_access_point(agent_data),
- }
- if agent_data.get("identifiedBy"):
- agent["identifiedBy"] = agent_data["identifiedBy"]
- roles = ["aut"]
- if value.get("4"):
- roles = list(utils.force_list(value.get("4")))
- elif key[:3] == "100":
- roles = ["cre"]
- elif key[:3] == "711":
- roles = ["aut"]
- else:
- roles = ["ctb"]
- return {"entity": agent, "role": roles}
-
-
-@marc21.over("title", "^245..")
-@utils.ignore_value
-def marc21_to_title(self, key, value):
- """Get title data.
-
- field 245:
- $a : non repetitive
- $b : non repetitive
- $c : non repetitive
- $n : repetitive
- $p : repetitive
- $6 : non repetitive
- field 246:
- $a : non repetitive
- $n : repetitive
- $p : repetitive
- $6 : non repetitive
- """
- subfield_245_a = ""
- subfield_245_b = ""
- if fields_245 := marc21.get_fields("245"):
- subfields_245_a = marc21.get_subfields(fields_245[0], "a")
- subfields_245_b = marc21.get_subfields(fields_245[0], "b")
- if subfields_245_a:
- subfield_245_a = subfields_245_a[0]
- if subfields_245_b:
- subfield_245_b = subfields_245_b[0]
- field_245_a_end_with_equal = re.search(r"\s*=\s*$", subfield_245_a)
- field_245_a_end_with_colon = re.search(r"\s*:\s*$", subfield_245_a)
- field_245_a_end_with_semicolon = re.search(r"\s*;\s*$", subfield_245_a)
- field_245_b_contains_equal = re.search(r"=", subfield_245_b)
-
- fields_246 = marc21.get_fields("246")
- subfield_246_a = ""
- if fields_246:
- if subfields_246_a := marc21.get_subfields(fields_246[0], "a"):
- subfield_246_a = subfields_246_a[0]
-
- tag_link, link = get_field_link_data(value)
- items = get_field_items(value)
- index = 1
- title_list = []
- title_data = {}
- part_list = TitlePartList(part_number_code="n", part_name_code="p")
- parallel_titles = []
- pararalel_title_data_list = []
- pararalel_title_string_set = set()
- responsibility = {}
-
- subfield_selection = {"a", "b", "c", "n", "p"}
- for blob_key, blob_value in items:
- if blob_key in subfield_selection:
- value_data = marc21.build_value_with_alternate_graphic(
- "245", blob_key, blob_value, index, link, ",.", ":;/-="
- )
- if blob_key in {"a", "b", "c"}:
- subfield_selection.remove(blob_key)
- if blob_key == "a":
- title_data["mainTitle"] = value_data
- elif blob_key == "b":
- if subfield_246_a:
- subtitle, parallel_titles, pararalel_title_string_set = (
- extract_subtitle_and_parallel_titles_from_field_245_b(
- value_data, field_245_a_end_with_equal
- )
- )
- if subtitle:
- title_data["subtitle"] = subtitle
- elif value_data:
- title_data["subtitle"] = value_data
- elif blob_key == "c":
- responsibility = marc21.build_responsibility_data(value_data)
- elif blob_key in ["n", "p"]:
- part_list.update_part(value_data, blob_key, blob_value)
- if blob_key != "__order__":
- index += 1
- title_data["type"] = "bf:Title"
- if the_part_list := part_list.get_part_list():
- title_data["part"] = the_part_list
- if title_data:
- title_list.append(title_data)
- variant_title_list = marc21.build_variant_title_data(pararalel_title_string_set)
-
- title_list.extend(iter(parallel_titles))
- title_list.extend(iter(variant_title_list))
- if responsibility:
- self["responsibilityStatement"] = responsibility
- return title_list or None
-
-
-@marc21.over("editionStatement", "^250..")
-@utils.for_each_value
-@utils.ignore_value
-def marc21_to_edition_statement(self, key, value):
- """Get edition statement data.
-
- editionDesignation: 250 [$a non repetitive] (without trailing ponctuation)
- responsibility: 250 [$b non repetitive]
- """
- edition_data = {}
- if subfields_a := utils.force_list(value.get("a")):
- subfield_a = remove_trailing_punctuation(subfields_a[0])
- edition_data["editionDesignation"] = [{"value": subfield_a}]
- if subfields_b := utils.force_list(value.get("b")):
- subfields_b = subfields_b[0]
- edition_data["responsibility"] = [{"value": subfields_b}]
- return edition_data or None
-
-
-@marc21.over("copyrightDate", "^264.4")
-@utils.ignore_value
-def marc21_to_copyright_date(self, key, value):
- """Get Copyright Date."""
- copyright_dates = self.get("copyrightDate", [])
- copyright_date = value.get("c")
- if copyright_date:
- if match := re.search(r"^([©℗])+\s*(\d{4}.*)", copyright_date):
- copyright_date = " ".join((match.group(1), match.group(2)))
- else:
- raise ValueError("Bad format of copyright date")
- copyright_dates.append(copyright_date)
- return copyright_dates or None
-
-
-@marc21.over("provisionActivity", "^(260..|264.[_0-3])")
-@utils.for_each_value
-@utils.ignore_value
-def marc21_to_provision_activity(self, key, value):
- """Get publisher data.
-
- publisher.name: 264 [$b repetitive]
- publisher.place: 264 [$a repetitive]
- publicationDate: 264 [$c repetitive] (but take only the first one)
- """
-
- def build_statement(field_value, ind2):
-
- def build_place_or_agent_data(code, label):
- type_per_code = {"a": EntityType.PLACE, "b": EntityType.AGENT}
- return (
- {"type": type_per_code[code], "label": [{"value": value}]}
- if (value := remove_trailing_punctuation(label))
- else None
- )
-
- # function build_statement start here
- statement = []
- items = get_field_items(field_value)
- for blob_key, blob_value in items:
- if blob_key in ("a", "b"):
- place_or_agent_data = build_place_or_agent_data(blob_key, blob_value)
- if place_or_agent_data:
- statement.append(place_or_agent_data)
- return statement or None
-
- def build_place(marc21):
- place = {}
- if marc21.country:
- place["country"] = marc21.country
- if place:
- place["type"] = EntityType.PLACE
- return place
-
- # the function marc21_to_provision_activity start here
- ind2 = key[4]
- type_per_ind2 = {
- " ": "bf:Publication",
- "_": "bf:Publication",
- "0": "bf:Production",
- "1": "bf:Publication",
- "2": "bf:Distribution",
- "3": "bf:Manufacture",
- }
- if key[:3] == "260":
- ind2 = "1" # to force type to bf:Publication for field 260
- publication = {
- "type": type_per_ind2[ind2],
- "statement": [],
- }
-
- publication["statement"] = build_statement(value, ind2)
-
- subfields_c = utils.force_list(value.get("c"))
- if subfields_c:
- subfield_c = subfields_c[0]
- publication["statement"].append(
- {"label": [{"value": subfield_c}], "type": "Date"}
- )
- if ind2 in (" ", "1"):
- dates = subfield_c.replace("[", "").replace("]", "").split("-")
- try:
- start_date = make_year(dates[0])
- if start_date:
- publication["startDate"] = start_date
- except Exception:
- pass
- try:
- end_date = make_year(dates[1])
- if end_date:
- publication["endDate"] = end_date
- except Exception:
- pass
- place = build_place(marc21)
- if place and place.get("country") != "xx":
- publication["place"] = [place]
-
- return publication or None
-
-
-@marc21.over("extent", "^300..")
-@utils.ignore_value
-def marc21_to_description(self, key, value):
- """Get extent.
-
- extent: 300$a (the first one if many)
- """
- if value.get("a") and not self.get("extent", None):
- self["extent"] = remove_trailing_punctuation(
- utils.force_list(value.get("a"))[0]
- )
- return None
-
-
-@marc21.over("note", "^500..")
-@utils.for_each_value
-@utils.ignore_value
-def marc21_to_notes(self, key, value):
- """Get notes.
-
- note: [500$a repetitive]
- """
- add_note(dict(noteType="general", label=value.get("a", "")), self)
-
- return None
-
-
-@marc21.over("summary", "^520..")
-@utils.for_each_value
-@utils.ignore_value
-def marc21_to_summary(self, key, value):
- """Get summary from repetitive field 520."""
- key_per_code = {"a": "label", "c": "source"}
- # parse field 520 subfields for extracting:
- # summary and source parts
- tag_link, link = get_field_link_data(value)
- items = get_field_items(value)
- index = 1
- summary = {}
- subfield_selection = {"a", "c"}
- for blob_key, blob_value in items:
- if blob_key in subfield_selection:
- subfield_selection.remove(blob_key)
- if blob_key == "a":
- summary_data = marc21.build_value_with_alternate_graphic(
- "520", blob_key, blob_value, index, link, ",.", ":;/-="
- )
- else:
- summary_data = blob_value
- if summary_data:
- summary[key_per_code[blob_key]] = summary_data
- if blob_key != "__order__":
- index += 1
- return summary or None
-
-
-@marc21.over("subjects", "^6....")
-@utils.for_each_value
-@utils.ignore_value
-@utils.ignore_value
-def marc21_to_subjects(self, key, value):
- """Get subjects.
-
- subjects: 6xx [duplicates could exist between several vocabularies,
- if possible deduplicate]
- """
- seen = {}
- for subject in utils.force_list(value.get("a")):
- subject = {"type": EntityType.TOPIC, "authorized_access_point": subject}
- str_subject = str(subject)
- if str_subject not in seen:
- seen[str_subject] = 1
- self.setdefault("subjects", []).append(dict(entity=subject))
- return None
-
-
-@marc21.over("electronicLocator", "^8564.")
-@utils.for_each_value
-@utils.ignore_value
-def marc21_electronicLocator(self, key, value):
- """Get electronic locator."""
- indicator2 = key[4]
- electronic_locator = {}
- url = utils.force_list(value.get("u"))[0].strip()
- subfield_3 = value.get("3") # materials_specified
- if subfield_3:
- subfield_3 = utils.force_list(subfield_3)[0]
- if indicator2 == "2":
- if subfield_3 and subfield_3 == "Image de couverture":
- electronic_locator = {
- "url": url,
- "type": "relatedResource",
- "content": "coverImage",
- }
- elif indicator2 == "0":
- if subfield_x := value.get("x"): # nonpublic_note
- electronic_locator = {
- "url": url,
- "type": "resource",
- "source": utils.force_list(subfield_x)[0],
- }
- if subfield_q := value.get("q"): # electronic_format_type
- if subfield_q == "audio":
- self["type"] = [
- {
- "main_type": "docmaintype_audio",
- "subtype": "docsubtype_audio_book",
- }
- ]
- return electronic_locator or None
diff --git a/rero_ils/modules/ebooks/receivers.py b/rero_ils/modules/ebooks/receivers.py
deleted file mode 100644
index b8fc9442ba..0000000000
--- a/rero_ils/modules/ebooks/receivers.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# RERO ILS
-# Copyright (C) 2019-2022 RERO
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see .
-
-"""Signals connections for ebooks document."""
-
-from dojson.contrib.marc21.utils import create_record
-from flask import current_app
-
-from ..utils import set_timestamp
-from .dojson.contrib.marc21 import marc21
-from .tasks import create_records, delete_records
-
-
-def publish_harvested_records(
- sender=None, records=None, max_results=None, *args, **kwargs
-):
- """Create, index the harvested records."""
- # name = kwargs['name']
- records = records or []
- if max_results:
- records = list(records)[: int(max_results)]
- converted_records = []
- deleted_records = []
- for record in records:
- rec = create_record(record.xml)
- rec = marc21.do(rec)
- rec.setdefault("harvested", True)
-
- identifiers = rec.get("identifiedBy", [])
- identifiers.append(
- {"type": "bf:Local", "source": "cantook", "value": record.header.identifier}
- )
- rec["identifiedBy"] = identifiers
- if record.deleted:
- deleted_records.append(rec)
- else:
- converted_records.append(rec)
- if converted_records:
- current_app.logger.info(
- f"publish_harvester: received {len(converted_records)} " "records to create"
- )
- create_records(converted_records)
- if deleted_records:
- current_app.logger.info(
- f"publish_harvester: received {len(deleted_records)} " "records to delete"
- )
- delete_records(deleted_records)
- msg = f"deleted: {len(deleted_records)}, created: {len(converted_records)}"
- set_timestamp("ebooks-harvester", msg=msg)
diff --git a/rero_ils/modules/ebooks/tasks.py b/rero_ils/modules/ebooks/tasks.py
deleted file mode 100644
index ed62abe807..0000000000
--- a/rero_ils/modules/ebooks/tasks.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# RERO ILS
-# Copyright (C) 2019-2022 RERO
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see .
-
-"""Celery tasks to create records."""
-
-from __future__ import absolute_import, print_function
-
-from celery import shared_task
-from flask import current_app
-
-from ..documents.api import Document, DocumentsSearch
-from ..utils import do_bulk_index, get_schema_for_resource, set_timestamp
-from .utils import create_document_holding, update_document_holding
-
-
-@shared_task(ignore_result=True)
-def create_records(records):
- """Records creation and indexing."""
- n_updated = 0
- n_created = 0
- uuids = []
- for record in records:
- # add document type
- if "type" not in record:
- record["type"] = [
- {"main_type": "docmaintype_book", "subtype": "docsubtype_e-book"}
- ]
- # check if already harvested
- pid = None
- for identifier in record.get("identifiedBy"):
- if identifier.get("source") == "cantook":
- harvested_id = identifier.get("value")
- query = (
- DocumentsSearch()
- .filter("term", identifiedBy__value__raw=harvested_id)
- .source(includes=["pid"])
- )
- try:
- pid = next(query.scan()).pid
- except StopIteration:
- pid = None
- try:
- # add documents schema
- pid_type = Document.provider.pid_type
- record["$schema"] = get_schema_for_resource(pid_type)
- if pid:
- # update the record
- record["pid"] = pid
- existing_record = update_document_holding(record, pid)
- n_updated += 1
- uuids.append(existing_record.id)
- elif new_record := create_document_holding(record):
- n_created += 1
- uuids.append(new_record.id)
- except Exception as err:
- current_app.logger.error(f"EBOOKS CREATE RECORDS: {err} {record}")
- do_bulk_index(uuids, doc_type="doc", process=True)
-
- current_app.logger.info(f"create_records: {n_updated} updated, {n_created} new")
- set_timestamp("ebooks_create_records", created=n_created, updated=n_updated)
- return n_created, n_updated
-
-
-@shared_task(ignore_result=True)
-def delete_records(records):
- """Records deleting."""
- count = 0
- for record in records:
- # check if exist
- pid = None
- for identifier in record.get("identifiedBy"):
- if identifier.get("source") == "cantook":
- harvested_id = identifier.get("value")
- query = (
- DocumentsSearch()
- .filter("term", identifiedBy__value__raw=harvested_id)
- .source(includes=["pid"])
- )
- try:
- pid = [r.pid for r in query.scan()].pop()
- except IndexError:
- pid = None
- try:
- if pid:
- # update the record
- existing_record = Document.get_record_by_pid(pid)
- # TODO: delete record and linked references
- count += 1
- except Exception as err:
- current_app.logger.error(f"EBOOKS DELETE RECORDS: {err} {record}")
- current_app.logger.info(f"delete_records: {count}")
- set_timestamp("ebooks_delete_records", deleted=count)
- return count
diff --git a/rero_ils/modules/ebooks/utils.py b/rero_ils/modules/ebooks/utils.py
deleted file mode 100644
index e64a7c13aa..0000000000
--- a/rero_ils/modules/ebooks/utils.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# RERO ILS
-# Copyright (C) 2019-2022 RERO
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see .
-
-"""Utilities."""
-
-from flask import current_app
-from invenio_db import db
-from invenio_oaiharvester.models import OAIHarvestConfig
-
-from rero_ils.modules.locations.api import Location
-
-from ..documents.api import Document
-from ..holdings.api import (
- Holding,
- HoldingsSearch,
- create_holding,
- get_holding_pid_by_doc_location_item_type,
-)
-from ..organisations.api import Organisation
-
-
-def add_oai_source(
- name, baseurl, metadataprefix="marc21", setspecs="", comment="", update=False
-):
- """Add OAIHarvestConfig."""
- with current_app.app_context():
- source = OAIHarvestConfig.query.filter_by(name=name).first()
- if not source:
- source = OAIHarvestConfig(
- name=name,
- baseurl=baseurl,
- metadataprefix=metadataprefix,
- setspecs=setspecs,
- comment=comment,
- )
- source.save()
- db.session.commit()
- return "Added"
- elif update:
- source.name = name
- source.baseurl = baseurl
- source.metadataprefix = metadataprefix
- if setspecs != "":
- source.setspecs = setspecs
- if comment != "":
- source.comment = comment
- db.session.commit()
- return "Updated"
- return "Not Updated"
-
-
-def get_harvested_sources(record):
- """Get the harvested sources from electronicLocator."""
- harvested_sources = []
- new_electronic_locators = []
- electronic_locators = record.get("electronicLocator", [])
- for electronic_locator in electronic_locators:
- if source := electronic_locator.get("source"):
- harvested_sources.append(
- {"source": source, "uri": electronic_locator.get("url")}
- )
- else:
- new_electronic_locators.append(electronic_locator)
- if new_electronic_locators:
- record["electronicLocator"] = new_electronic_locators
- return harvested_sources
-
-
-def create_document_holding(record):
- """Create a document and a holding for a harvested ebook."""
- harvested_sources = get_harvested_sources(record)
- new_record = None
- holdings = []
- for harvested_source in harvested_sources:
- if org := Organisation.get_record_by_online_harvested_source(
- source=harvested_source["source"]
- ):
- if not new_record:
- new_record = Document.create(data=record, dbcommit=False, reindex=False)
- if new_record:
- item_type_pid = org.online_circulation_category()
- location_pids = org.get_online_locations()
- for location_pid in location_pids:
- location = Location.get_record_by_pid(location_pid)
- library = location.get_library()
- if url := library.get_online_harvested_source_url(
- source=harvested_source["source"]
- ):
- uri_split = harvested_source["uri"].split("/")[3:]
- uri_split.insert(0, url.rstrip("/"))
- harvested_source["uri"] = "/".join(uri_split)
- hold = create_holding(
- document_pid=new_record.pid,
- location_pid=location_pid,
- item_type_pid=item_type_pid,
- electronic_location=harvested_source,
- holdings_type="electronic",
- )
- holdings.append(hold)
- else:
- current_app.logger.warning(
- f"create document holding no org: {harvested_source['source']}"
- )
- db.session.commit()
- for hold in holdings:
- hold.reindex()
- # the document has been reindexed by the holdings
- if not holdings and new_record:
- new_record.reindex()
- return new_record
-
-
-def update_document_holding(record, pid):
- """Update a document and a holding for a harvested ebook."""
- harvested_sources = get_harvested_sources(record)
- new_record = None
- existing_record = Document.get_record_by_pid(pid)
- new_record = existing_record.replace(data=record, dbcommit=False, reindex=False)
- # Save all source uris to find holdings we can delete later
- source_uris = []
- holdings = []
- for harvested_source in harvested_sources:
- if org := Organisation.get_record_by_online_harvested_source(
- source=harvested_source["source"]
- ):
- # add the organisation source uri
- source_uris.append(harvested_source["uri"])
- item_type_pid = org.online_circulation_category()
- for location_pid in org.get_online_locations():
- location = Location.get_record_by_pid(location_pid)
- library = location.get_library()
- # replace "https://some.uri" from ebooks with library uri
- if url := library.get_online_harvested_source_url(
- source=harvested_source["source"]
- ):
- uri_split = harvested_source["uri"].split("/")[3:]
- uri_split.insert(0, url.rstrip("/"))
- new_uri = "/".join(uri_split)
- harvested_source["uri"] = new_uri
- # add the library source uri
- source_uris.append(new_uri)
- if not get_holding_pid_by_doc_location_item_type(
- new_record.pid, location_pid, item_type_pid, "electronic"
- ):
- hold = create_holding(
- document_pid=new_record.pid,
- location_pid=location_pid,
- item_type_pid=item_type_pid,
- electronic_location=harvested_source,
- holdings_type="electronic",
- )
- holdings.append(hold)
- db.session.commit()
- for hold in holdings:
- hold.reindex()
- # the document has been reindexed by the holdings
- if not holdings and new_record:
- new_record.reindex()
- HoldingsSearch.flush_and_refresh()
- # delete all double holdings and holdings without valid source uri
- seen_uris = []
- for holding_pid in Holding.get_holdings_pid_by_document_pid(pid):
- holding = Holding.get_record_by_pid(holding_pid)
- to_delete = True
- for electronic_location in holding.get("electronic_location", []):
- uri = electronic_location.get("uri")
- if electronic_location.get("source") and uri not in seen_uris:
- seen_uris.append(uri)
- if uri in source_uris:
- to_delete = False
- if to_delete:
- current_app.logger.info(
- "Delete harvested holding | "
- f"document: {pid} "
- f'holding: {holding.pid} {holding.get("electronic_location")}'
- )
- holding.delete(force=False, dbcommit=True, delindex=True)
- return new_record
diff --git a/rero_ils/modules/entities/api.py b/rero_ils/modules/entities/api.py
index 74d0a60ce6..6c29ceab66 100644
--- a/rero_ils/modules/entities/api.py
+++ b/rero_ils/modules/entities/api.py
@@ -63,7 +63,7 @@ def get_authorized_access_point(self, language):
:param language: language for authorized access point.
:returns: authorized access point in given language.
"""
- raise NotImplementedError
+ raise NotImplementedError()
@abstractmethod
def get_links_to_me(self, get_pids=False):
@@ -90,7 +90,7 @@ def reasons_not_to_delete(self):
@abstractmethod
def resource_type(self):
"""Get the entity type."""
- raise NotImplementedError
+ raise NotImplementedError()
@property
def organisation_pids(self):
diff --git a/rero_ils/modules/ext.py b/rero_ils/modules/ext.py
index 92dccb3e45..48eff215d1 100644
--- a/rero_ils/modules/ext.py
+++ b/rero_ils/modules/ext.py
@@ -64,7 +64,6 @@
from rero_ils.modules.acquisition.acq_receipts.listener import enrich_acq_receipt_data
from rero_ils.modules.acquisition.budgets.listener import budget_is_active_changed
from rero_ils.modules.collections.listener import enrich_collection_data
-from rero_ils.modules.ebooks.receivers import publish_harvested_records
from rero_ils.modules.holdings.listener import (
enrich_holding_data,
update_items_locations_and_types,
@@ -346,8 +345,6 @@ def register_signals(self, app):
loan_state_changed.connect(listener_loan_state_changed, weak=False)
- oaiharvest_finished.connect(publish_harvested_records, weak=False)
-
# store the username in the session
user_logged_in.connect(set_user_name)
user_logged_out.connect(remove_user_name)
diff --git a/rero_ils/modules/files/operations.py b/rero_ils/modules/files/operations.py
index 52f538a7a6..bfdcc33f43 100644
--- a/rero_ils/modules/files/operations.py
+++ b/rero_ils/modules/files/operations.py
@@ -45,7 +45,7 @@ def on_post_commit(self, uow):
:param uow: obj - UnitOfWork instance.
"""
- raise NotImplementedError
+ raise NotImplementedError()
class ReindexDoc(ReindexOperationBase):
diff --git a/rero_ils/modules/organisations/api.py b/rero_ils/modules/organisations/api.py
index ff3d433d56..f915b85ae3 100644
--- a/rero_ils/modules/organisations/api.py
+++ b/rero_ils/modules/organisations/api.py
@@ -106,13 +106,20 @@ def get_record_by_online_harvested_source(cls, source):
:param source: the record source
:return: Organisation record or None.
"""
- results = (
- OrganisationsSearch().filter("term", online_harvested_source=source).scan()
- )
- try:
- return Organisation.get_record_by_pid(next(results).pid)
- except StopIteration:
- return None
+ for org in cls.get_records_by_online_harvested_source(source):
+ return org
+
+ @classmethod
+ def get_records_by_online_harvested_source(cls, source):
+ """Yield all organisation records matching an online harvested source.
+
+ :param source: the record source
+ :return: generator of Organisation records (empty if none match).
+ """
+ query = OrganisationsSearch().filter("term", online_harvested_source=source)
+ org_pids = [hit.pid for hit in query.source("pid").scan()]
+ for org_pid in org_pids:
+ yield Organisation.get_record_by_pid(org_pid)
@property
def organisation_pid(self):
diff --git a/rero_ils/modules/stats/api/indicators/base.py b/rero_ils/modules/stats/api/indicators/base.py
index 9f5669b9d2..d230b4c9a1 100644
--- a/rero_ils/modules/stats/api/indicators/base.py
+++ b/rero_ils/modules/stats/api/indicators/base.py
@@ -39,7 +39,7 @@ def query(self):
:returns: an elasticsearch query object
"""
- raise NotImplementedError
+ raise NotImplementedError()
@property
@abstractmethod
@@ -49,7 +49,7 @@ def aggregation(self, distribution):
:param distrubtion: str - report distrubtion name
:returns: an elasticsearch aggregation object
"""
- raise NotImplementedError
+ raise NotImplementedError()
@abstractmethod
def label(self, distribution, bucket):
@@ -60,4 +60,4 @@ def label(self, distribution, bucket):
:returns: the label
:rtype: str
"""
- raise NotImplementedError
+ raise NotImplementedError()
diff --git a/scripts/setup b/scripts/setup
index 6ae0474808..a2c5fbdf39 100755
--- a/scripts/setup
+++ b/scripts/setup
@@ -599,21 +599,25 @@ create_token organisation_scotland_token reroilstest+irma@gmail.com ${INVENIO_R
create_token organisation_fictive_token reroilstest+imagination@gmail.com ${INVENIO_RERO_ACCESS_TOKEN_FICTIVE_LIBRARIAN}
# # OAI configuration
-info_msg "OAI configuration: ${DATA_PATH}/oaisources.yml"
-eval ${PREFIX} invenio reroils oaiharvester initconfig ${DATA_PATH}/oaisources.yml
+info_msg "API configuration: ${DATA_PATH}/apisources.yml"
+eval ${PREFIX} invenio reroils apiharvester init-config ${DATA_PATH}/apisources.yml
eval ${PREFIX} invenio reroils scheduler enable_tasks -a -v
-# disable ebook harvesting
-eval ${PREFIX} invenio reroils scheduler enable_tasks -n ebooks-harvester -d
+# disable VS/NJ CANTOOK harvesting
+eval ${PREFIX} invenio reroils scheduler enable_tasks -n harvest-vs-cantook -d
+eval ${PREFIX} invenio reroils scheduler enable_tasks -n harvest-nj-cantook -d
if ${DEPLOYMENT}
then
# start oai harvesting asynchrone: beats must be running
- info_msg "Start OAI harvesting asynchrone"
- eval ${PREFIX} invenio reroils oaiharvester harvest -n ebooks -a max_results=150 -q -k
+ info_msg "Start VS-CANTOOK harvesting asynchrone"
+ eval ${PREFIX} invenio reroils apiharvester harvest -n VS-CANTOOK -m 150 -k
+ info_msg "Start NJ-CANTOOK harvesting asynchrone"
+ eval ${PREFIX} invenio reroils apiharvester harvest -n NJ-CANTOOK -m 150 -k
else
- info_msg "For ebooks harvesting run:"
- msg "\tinvenio reroils oaiharvester harvest -n ebooks -a max_results=100 -q"
+ info_msg "For VS/NJ CANTOOK harvesting run:"
+ msg "\tinvenio reroils apiharvester harvest -n VS-CANTOOK -m 100 -v"
+ msg "\tinvenio reroils apiharvester harvest -n NJ-CANTOOK -m 100 -v"
fi
if ${ES_MAPPING}
diff --git a/setup.py b/setup.py
index 6f230d1983..5dec3eda2a 100644
--- a/setup.py
+++ b/setup.py
@@ -147,7 +147,6 @@ def run(self):
'marc21tojson_loc = rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_loc',
'marc21tojson_slsp = rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_slsp',
'marc21tojson_ugent = rero_ils.modules.documents.dojson.contrib.marc21tojson:marc21_ugent',
- 'marc21toebooks = rero_ils.modules.ebooks.dojson.contrib.marc21:marc21',
'unimarctojson = rero_ils.modules.documents.dojson.contrib.unimarctojson:unimarc',
],
'flask.commands': [
@@ -156,7 +155,6 @@ def run(self):
'migration = rero_ils.modules.migration.cli:migration',
'monitoring = rero_ils.modules.monitoring.cli:monitoring',
'notifications = rero_ils.modules.notifications.cli:notifications',
- 'oaiharvester = rero_ils.modules.ebooks.cli:oaiharvester',
'reroils = rero_ils.modules.cli.reroils:reroils',
'scheduler = rero_ils.schedulers:scheduler',
'stats = rero_ils.modules.stats.cli:stats',
@@ -330,7 +328,6 @@ def run(self):
'apiharvester = rero_ils.modules.apiharvester.tasks',
'collections = rero_ils.modules.collections.tasks',
'documents = rero_ils.modules.documents.tasks',
- 'ebooks = rero_ils.modules.ebooks.tasks',
'holdings = rero_ils.modules.holdings.tasks',
'items = rero_ils.modules.items.tasks',
'loans = rero_ils.modules.loans.tasks',