From a703001db2240c6fb8a567f763bf41f62f5ef2f1 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Wed, 4 Dec 2024 14:05:01 +0100 Subject: [PATCH 1/2] schema: added identifiers to subjects --- invenio_rdm_records/records/api.py | 2 +- .../os-v1/rdmrecords/drafts/draft-v6.0.0.json | 10 ++++++++++ .../rdmrecords/records/record-v7.0.0.json | 10 ++++++++++ .../os-v2/rdmrecords/drafts/draft-v6.0.0.json | 10 ++++++++++ .../rdmrecords/records/record-v7.0.0.json | 19 ++++++++++++------- .../v7/rdmrecords/drafts/draft-v6.0.0.json | 10 ++++++++++ .../v7/rdmrecords/records/record-v6.0.0.json | 10 ++++++++++ 7 files changed, 63 insertions(+), 8 deletions(-) diff --git a/invenio_rdm_records/records/api.py b/invenio_rdm_records/records/api.py index 377ef6c82..83180c770 100644 --- a/invenio_rdm_records/records/api.py +++ b/invenio_rdm_records/records/api.py @@ -184,7 +184,7 @@ class CommonFieldsMixin: ), subjects=PIDListRelation( "metadata.subjects", - keys=["subject", "scheme", "props"], + keys=["subject", "scheme", "props", "identifiers"], pid_field=Subject.pid, cache_key="subjects", ), diff --git a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json index 3c532097e..2e78f22ae 100644 --- a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json @@ -1238,6 +1238,16 @@ "props": { "type": "object", "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } } } }, diff --git a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v7.0.0.json b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v7.0.0.json index 3e3cd6600..ef3b4c32a 100644 --- a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v7.0.0.json +++ 
b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v7.0.0.json @@ -1256,6 +1256,16 @@ "props": { "type": "object", "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } } } }, diff --git a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json index 0d93f5410..a9fedc421 100644 --- a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json @@ -1238,6 +1238,16 @@ "props": { "type": "object", "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } } } }, diff --git a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v7.0.0.json b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v7.0.0.json index de3b0a3de..6af64c11c 100644 --- a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v7.0.0.json +++ b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v7.0.0.json @@ -45,13 +45,8 @@ "accent_analyzer": { "tokenizer": "standard", "type": "custom", - "char_filter": [ - "strip_special_chars" - ], - "filter": [ - "lowercase", - "asciifolding" - ] + "char_filter": ["strip_special_chars"], + "filter": ["lowercase", "asciifolding"] } } } @@ -1248,6 +1243,16 @@ "props": { "type": "object", "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } } } }, diff --git a/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json b/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json index 1a901cbfe..cb454eeeb 100644 --- 
a/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json @@ -1238,6 +1238,16 @@ "props": { "type": "object", "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } } } }, diff --git a/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json b/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json index 941fca4e4..9330c674c 100644 --- a/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json @@ -1195,6 +1195,16 @@ "props": { "type": "object", "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } } } }, From cf58b5d3f64ddd0af49f5016844daef02aa1bce6 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Thu, 5 Dec 2024 09:42:44 +0100 Subject: [PATCH 2/2] serializer: updated subjects and affiliations in dcat --- .../resources/serializers/dcat/__init__.py | 103 ++++++++++++++++++ .../resources/serializers/dcat/schema.py | 43 +++++++- .../serializers/test_dcat_serializer.py | 2 +- 3 files changed, 146 insertions(+), 2 deletions(-) diff --git a/invenio_rdm_records/resources/serializers/dcat/__init__.py b/invenio_rdm_records/resources/serializers/dcat/__init__.py index 25cc8d02f..d56d92711 100644 --- a/invenio_rdm_records/resources/serializers/dcat/__init__.py +++ b/invenio_rdm_records/resources/serializers/dcat/__init__.py @@ -12,6 +12,7 @@ from datacite import schema43 from flask_resources import BaseListSchema, MarshmallowSerializer from flask_resources.serializers import SimpleSerializer +from idutils import detect_identifier_schemes, to_url from lxml import etree as ET from pkg_resources import resource_stream from werkzeug.utils import 
cached_property @@ -93,6 +94,100 @@ def access_url(file): if isinstance(tag_value, dict): el.attrib.update(tag_value) + def add_missing_creatibutor_links(self, rdf_tree): + """Add missing `rdf:about` attributes to <rdf:Description> within <dct:creator> and <dct:contributor> and <foaf:Organization> within <org:memberOf>.""" + namespaces = rdf_tree.nsmap + + # Helper function to add rdf:about based on identifier + def add_rdf_about(element, identifier_elem): + identifier = identifier_elem.text.strip() + schemes = detect_identifier_schemes(identifier) + rdf_about_url = next( + ( + to_url(identifier, scheme=scheme) + for scheme in schemes + if to_url(identifier, scheme) + ), + None, + ) + if rdf_about_url: + element.set( + "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", rdf_about_url + ) + + # Process <dct:creator> and <dct:contributor> + contributors_and_creators = rdf_tree.xpath( + "//dct:creator/rdf:Description | //dct:contributor/rdf:Description", + namespaces=namespaces, + ) + + for description in contributors_and_creators: + # Add rdf:about for creator/contributor if missing + if not description.get( + "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about" + ): + identifier_elem = description.find("dct:identifier", namespaces) + if identifier_elem is not None: + add_rdf_about(description, identifier_elem) + + # Process <foaf:Organization> within <org:memberOf> at any level + organizations = rdf_tree.xpath( + "//org:memberOf//foaf:Organization[not(@rdf:about)]", + namespaces=namespaces, + ) + + for org in organizations: + org_identifier_elem = org.find("dct:identifier", namespaces) + if org_identifier_elem is not None: + add_rdf_about(org, org_identifier_elem) + + return rdf_tree + + def add_subjects_uri(self, rdf_tree, subjects): + """Add valueURI of subjects to the corresponding dct:subject elements in the RDF tree.""" + namespaces = rdf_tree.nsmap + for subject in subjects: + value_uri = subject.get("valueURI") + subject_label = subject.get("subject") + subject_scheme = subject.get("subjectScheme") + subject_props = subject.get("subjectProps", {}) + + if value_uri and subject_label and subject_scheme: + # 
Find the corresponding dct:subject element by prefLabel and subjectScheme + subject_element = rdf_tree.xpath( + f""" + //dct:subject[ + skos:Concept[ + skos:prefLabel[text()='{subject_label}'] + and skos:inScheme/skos:ConceptScheme/dct:title[text()='{subject_scheme}'] + ] + ] + """, + namespaces=namespaces, + )[0] + + if subject_element: + # Add the valueURI to the dct:subject element as rdf:about + subject_element.set( + "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", value_uri + ) + + # Check if + # subject has a definition in its props + definition = subject_props.get("definition") + if definition: + concept_elem = subject_element.find( + ".//skos:Concept", namespaces=namespaces + ) + if concept_elem is not None: + skos_definition = ET.Element( + "{http://www.w3.org/2004/02/skos/core#}definition" + ) + skos_definition.text = definition + concept_elem.append(skos_definition) + + return rdf_tree + def transform_with_xslt(self, dc_record, **kwargs): """Transform record with XSLT.""" dc_etree = schema43.dump_etree(dc_record) @@ -100,6 +195,14 @@ def transform_with_xslt(self, dc_record, **kwargs): dc_etree.tag = "{{{0}}}resource".format(dc_namespace) dcat_etree = self.xslt_transform_func(dc_etree).getroot() + # Add valueURI to subjects + subjects = dc_record.get("subjects", []) + if subjects: + dcat_etree = self.add_subjects_uri(dcat_etree, subjects) + + # Add the identifier links for creators & contributors if missing + dcat_etree = self.add_missing_creatibutor_links(dcat_etree) + # Inject files in results (since the XSLT can't do that by default) files_data = dc_record.get("_files", []) if files_data: diff --git a/invenio_rdm_records/resources/serializers/dcat/schema.py b/invenio_rdm_records/resources/serializers/dcat/schema.py index 0d5060128..1ac4d8c03 100644 --- a/invenio_rdm_records/resources/serializers/dcat/schema.py +++ b/invenio_rdm_records/resources/serializers/dcat/schema.py @@ -9,7 +9,7 @@ import idutils from flask import current_app -from 
marshmallow import fields, missing +from marshmallow import ValidationError, fields, missing, validate from marshmallow_utils.html import sanitize_unicode from invenio_rdm_records.resources.serializers.datacite import DataCite43Schema @@ -49,3 +49,44 @@ def get_files(self, obj): ) return files_list or missing + + def get_subjects(self, obj): + """Get subjects.""" + subjects = obj["metadata"].get("subjects", []) + if not subjects: + return missing + + validator = validate.URL() + serialized_subjects = [] + + for subject in subjects: + entry = {"subject": subject.get("subject")} + + id_ = subject.get("id") + if id_: + entry["subjectScheme"] = subject.get("scheme") + try: + validator(id_) + entry["valueURI"] = id_ + except ValidationError: + pass + + # Get identifiers and assign valueURI if scheme is 'url' and id_ was not a valid url + if "valueURI" not in entry: + entry["valueURI"] = next( + ( + identifier.get("identifier") + for identifier in subject.get("identifiers", []) + if identifier.get("scheme") == "url" + ), + None, + ) + + # Add props if it exists + props = subject.get("props", {}) + if props: + entry["subjectProps"] = props + + serialized_subjects.append(entry) + + return serialized_subjects if serialized_subjects else missing diff --git a/tests/resources/serializers/test_dcat_serializer.py b/tests/resources/serializers/test_dcat_serializer.py index cc6137b06..14455fadb 100644 --- a/tests/resources/serializers/test_dcat_serializer.py +++ b/tests/resources/serializers/test_dcat_serializer.py @@ -71,7 +71,7 @@ def test_dcat_serializer(running_app, full_record_to_dict): " \n" " 2018\n' - " \n" + ' \n' " \n" " Abdominal Injuries\n" " \n"