Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sets support #206

Merged
merged 19 commits into from
Feb 22, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions invenio_oaiserver/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2015-2018 CERN.
# Copyright (C) 2015-2022 CERN.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
Expand Down Expand Up @@ -182,8 +182,6 @@
(GPLv3 licensed).
"""

from __future__ import absolute_import, print_function

from .ext import InvenioOAIServer
from .proxies import current_oaiserver
from .version import __version__
Expand Down
6 changes: 4 additions & 2 deletions invenio_oaiserver/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# This file is part of Invenio.
# Copyright (C) 2015-2018 CERN.
# Copyright (C) 2021 Graz University of Technology.
# Copyright (C) 2022 Graz University of Technology.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
Expand Down Expand Up @@ -124,9 +124,11 @@
OAISERVER_CREATED_KEY = "_created"
"""Record created key."""

OAISERVER_RECORD_SETS_FETCHER = 'invenio_oaiserver.utils:record_sets_fetcher'
OAISERVER_RECORD_SETS_FETCHER = 'invenio_oaiserver.percolator:find_sets_for_record'
"""Record's OAI sets function."""

OAISERVER_SET_RECORDS_QUERY_FETCHER = 'invenio_oaiserver.fetchers:set_records_query_fetcher'

OAISERVER_RECORD_CLS = 'invenio_records.api:Record'
"""Record retrieval class."""

Expand Down
13 changes: 10 additions & 3 deletions invenio_oaiserver/errors.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2016-2018 CERN.
# Copyright (C) 2016-2022 CERN.
# Copyright (C) 2022 Graz University of Technology.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Error."""

from __future__ import absolute_import, print_function


class OAIBadMetadataFormatError(Exception):
"""Metadata format required doesn't exist."""
Expand All @@ -20,3 +19,11 @@ class OAISetSpecUpdateError(Exception):

The correct way is to delete the set and create a new one.
"""


class OAINoRecordsMatchError(Exception):
"""No records match the query.

The combination of the values of the from, until, and set arguments
results in an empty list.
"""
27 changes: 8 additions & 19 deletions invenio_oaiserver/ext.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2015-2018 CERN.
# Copyright (C) 2021 Graz University of Technology.
# Copyright (C) 2015-2022 CERN.
# Copyright (C) 2021-2022 Graz University of Technology.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Invenio-OAIServer extension implementation."""

from __future__ import absolute_import, print_function

from invenio_base.utils import obj_or_import_string
from invenio_records import signals as records_signals
from sqlalchemy.event import contains, listen, remove

from . import config
Expand Down Expand Up @@ -56,6 +53,11 @@ def record_fetcher(self):
"""Get the record fetcher class for record serialization."""
return obj_or_import_string(self.app.config['OAISERVER_GETRECORD_FETCHER'])

@property
def set_records_query_fetcher(self):
"""Get query to retrieve records based on set."""
return obj_or_import_string(self.app.config['OAISERVER_SET_RECORDS_QUERY_FETCHER'])

@property
def last_update_key(self):
"""Get record update key."""
Expand All @@ -82,14 +84,6 @@ def sets(self, values):

def register_signals(self):
"""Register signals."""
from .receivers import OAIServerUpdater

# Register Record signals to update OAI informations
self.update_function = OAIServerUpdater()
records_signals.before_record_insert.connect(self.update_function,
weak=False)
records_signals.before_record_update.connect(self.update_function,
weak=False)
if self.app.config['OAISERVER_REGISTER_SET_SIGNALS']:
self.register_signals_oaiset()

Expand All @@ -105,11 +99,6 @@ def register_signals_oaiset(self):
def unregister_signals(self):
"""Unregister signals."""
# Unregister Record signals
if hasattr(self, 'update_function'):
records_signals.before_record_insert.disconnect(
self.update_function)
records_signals.before_record_update.disconnect(
self.update_function)
self.unregister_signals_oaiset()

def unregister_signals_oaiset(self):
Expand Down Expand Up @@ -168,7 +157,7 @@ def init_config(self, app):
import warnings

app.config['OAISERVER_ID_PREFIX'] = \
'oai:{0}:recid/'.format(socket.gethostname())
'oai:{0}:'.format(socket.gethostname())
warnings.warn(
"""Please specify the OAISERVER_ID_PREFIX configuration."""
"""default value is: {0}""".format(
Expand Down
19 changes: 16 additions & 3 deletions invenio_oaiserver/fetchers.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2016-2018 CERN.
# Copyright (C) 2016-2022 CERN.
# Copyright (C) 2021 Graz University of Technology.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Persistent identifier fetchers."""

from __future__ import absolute_import, print_function

from elasticsearch_dsl.query import Q
from invenio_pidstore.errors import PersistentIdentifierError
from invenio_pidstore.fetchers import FetchedPID

from .models import OAISet
from .provider import OAIIDProvider
from .query import query_string_parser


def oaiid_fetcher(record_uuid, data):
Expand All @@ -33,3 +34,15 @@ def oaiid_fetcher(record_uuid, data):
pid_type=OAIIDProvider.pid_type,
pid_value=str(pid_value),
)


def set_records_query_fetcher(setSpec):
"""Fetch query to retrieve records based on provided set spec."""
set = OAISet.query.filter(OAISet.spec == setSpec).first()
if set is None:
# raise error that no matches can be found ?
query = Q('match_none')
else:
query = Q(query_string_parser(set.search_pattern))
Copy link

@jma jma Feb 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The application crashes if the search_pattern is empty. Why the default behaviour is not to search in the _oai.set fields?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my opinion, an empty search_pattern does not make sense, if there is no way to manually manage sets.

The idea is to not store the information of the sets a record belongs to within the record itself. Instead, the record is matched against the queries of the sets (ElasticSearch percolate query) and this information is generated dynamically on request.

If the set argument is provided (which is the case if the code above is to be executed), then only records matching the specified set will be returned. Thus it is necessary to lookup the set and get its search pattern. After fetching the records belonging to this set, the above step is performed (i.e. the records are matched against all the sets queries).

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your reply.

Here is some suggestions:

  • If the search_pattern is required, I suggest to add a no null constraint in the db model
  • to speed up the ListIdentifiers, you can use source for the elasticsearch during the get_records
  • the source fields can be a configuration for ListRecords
  • using a search_query as insitution:test raise the following error: TypeError: expected string or bytes-like object, replacing by institution:"test" works (bug)

Thanks

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea is to not store the information of the sets a record belongs to within the record itself. Instead, the record is matched against the queries of the sets (ElasticSearch percolate query) and this information is generated dynamically on request.

Probably I misunderstand something. If during the request we use the search_pattern, is the percolator still useful?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably I misunderstand something. If during the request we use the search_pattern, is the percolator still useful?

The percolate query is used to get all sets a record belongs to (needed for the header of the response).
The additional query above is used, when a setSpec is specified in the request (f.e. https://127.0.0.1:5000/oai2d?verb=ListRecords&metadataPrefix=oai_dc&set=my-set). It is used during record querying so only records belonging to this set are retrieved. Then we take these records and do a percolate query to get all the sets the records belong to.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jma First of all thanks for reviewing!

As @rekt-hard explained, the big change here is that we want to get away from storing OAI sets information inside the record. It's highly very inefficient to do it this way (e.g. define a new set, or modify an existing set and you may need to reindex every single record in the entire repository. The only thing we're loosing is the ability to say that a record was deleted from a set. A harvest can obtain that information by running ListIdentifiers for a set and compare against a local set.

On a side note, can I ask you not to use the "Request changes" when reviewing a PR. Instead, simply prefix your comment with Comment/Question/Minor//Major according (see definitions here). Basically idea is that lack of a green approve, means it's not approved. Just wanted to explain why I'm gonna press the "dismiss review"


return query
7 changes: 1 addition & 6 deletions invenio_oaiserver/minters.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,17 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2016-2018 CERN.
# Copyright (C) 2016-2022 CERN.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Persistent identifier minters."""

from __future__ import absolute_import, print_function

from datetime import datetime

from flask import current_app
from invenio_pidstore import current_pidstore

from .provider import OAIIDProvider
from .utils import datetime_to_datestamp


def oaiid_minter(record_uuid, data):
Expand Down
70 changes: 28 additions & 42 deletions invenio_oaiserver/models.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,20 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2015-2018 CERN.
# Copyright (C) 2015-2022 CERN.
# Copyright (C) 2022 Graz University of Technology.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Models for storing information about OAIServer state."""

from datetime import datetime

from flask_babelex import lazy_gettext as _
from invenio_db import db
from sqlalchemy.event import listen
from sqlalchemy.orm import validates
from sqlalchemy_utils import Timestamp

from .errors import OAISetSpecUpdateError
from .proxies import current_oaiserver
from .utils import datetime_to_datestamp


class OAISet(db.Model, Timestamp):
Expand Down Expand Up @@ -76,52 +72,42 @@ def validate_spec(self, key, value):
raise OAISetSpecUpdateError("Updating spec is not allowed.")
return value

def add_record(self, record):
"""Add a record to the OAISet.

:param record: Record to be added.
:type record: `invenio_records.api.Record` or derivative.
"""
record.setdefault('_oai', {}).setdefault('sets', [])

assert not self.has_record(record)

record['_oai']['sets'].append(self.spec)
# TODO: Add and remove can be implemented but it will require to
# update the `search_pattern`

def remove_record(self, record):
"""Remove a record from the OAISet.
# def add_record(self, record):
ppanero marked this conversation as resolved.
Show resolved Hide resolved
# """Add a record to the OAISet.

:param record: Record to be removed.
:type record: `invenio_records.api.Record` or derivative.
"""
assert self.has_record(record)
# :param record: Record to be added.
# :type record: `invenio_records.api.Record` or derivative.
# """
# record.setdefault('_oai', {}).setdefault('sets', [])

record['_oai']['sets'] = [
s for s in record['_oai']['sets'] if s != self.spec]
# assert not self.has_record(record)

def has_record(self, record):
"""Check if the record blongs to the OAISet.
# record['_oai']['sets'].append(self.spec)

:param record: Record to be checked.
:type record: `invenio_records.api.Record` or derivative.
"""
return self.spec in record.get('_oai', {}).get('sets', [])
# def remove_record(self, record):
# """Remove a record from the OAISet.

# :param record: Record to be removed.
# :type record: `invenio_records.api.Record` or derivative.
# """
# assert self.has_record(record)

def oaiset_removed_or_inserted(mapper, connection, target):
"""Invalidate cache on collection insert or delete."""
current_oaiserver.sets = None
# record['_oai']['sets'] = [
# s for s in record['_oai']['sets'] if s != self.spec]

# TODO: has_record can be implemented but it will require to
# to do a full search.

def oaiset_attribute_changed(target, value, oldvalue, initiator):
"""Invalidate cache if dbquery change."""
if value != oldvalue:
current_oaiserver.sets = None
# def has_record(self, record):
# """Check if the record blongs to the OAISet.

# :param record: Record to be checked.
# :type record: `invenio_records.api.Record` or derivative.
# """
# return self.spec in record.get('_oai', {}).get('sets', [])

# update cache with list of collections
listen(OAISet, 'after_insert', oaiset_removed_or_inserted)
listen(OAISet, 'after_delete', oaiset_removed_or_inserted)
listen(OAISet.search_pattern, 'set', oaiset_attribute_changed)

__all__ = ('OAISet', )
Loading