From 165c3c60c6f0456d757a9264e30231aaaf756f6d Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Wed, 4 Dec 2024 17:01:29 +0100 Subject: [PATCH] serializer: updated subjects and affiliations in dcat --- .../resources/serializers/dcat/__init__.py | 106 ++++++++++++++++++ .../resources/serializers/dcat/schema.py | 43 ++++++- .../serializers/test_dcat_serializer.py | 2 +- 3 files changed, 149 insertions(+), 2 deletions(-) diff --git a/invenio_rdm_records/resources/serializers/dcat/__init__.py b/invenio_rdm_records/resources/serializers/dcat/__init__.py index 25cc8d02f..05eca952d 100644 --- a/invenio_rdm_records/resources/serializers/dcat/__init__.py +++ b/invenio_rdm_records/resources/serializers/dcat/__init__.py @@ -12,6 +12,7 @@ from datacite import schema43 from flask_resources import BaseListSchema, MarshmallowSerializer from flask_resources.serializers import SimpleSerializer +from idutils import detect_identifier_schemes, to_url from lxml import etree as ET from pkg_resources import resource_stream from werkzeug.utils import cached_property @@ -93,6 +94,103 @@ def access_url(file): if isinstance(tag_value, dict): el.attrib.update(tag_value) + def add_missing_creator_and_contributor_links(self, rdf_tree): + """ + Add missing `rdf:about` attributes to within and , + and within . + """ + namespaces = rdf_tree.nsmap + + # Helper function to add rdf:about based on identifier + def add_rdf_about(element, identifier_elem): + identifier = identifier_elem.text.strip() + schemes = detect_identifier_schemes(identifier) + rdf_about_url = next( + ( + to_url(identifier, scheme=scheme) + for scheme in schemes + if to_url(identifier, scheme) + ), + None, + ) + if rdf_about_url: + element.set( + "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", rdf_about_url + ) + + # Process and + contributors_and_creators = rdf_tree.xpath( + "//dct:creator/rdf:Description | //dct:contributor/rdf:Description", + namespaces=namespaces, + ) + + for description in contributors_and_creators: + # Add rdf:about for creator/contributor if missing + if not description.get( + "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about" + ): + identifier_elem = description.find("dct:identifier", namespaces) + if identifier_elem is not None: + add_rdf_about(description, identifier_elem) + + # Process within at any level + organizations = rdf_tree.xpath( + "//org:memberOf//foaf:Organization[not(@rdf:about)]", + namespaces=namespaces, + ) + + for org in organizations: + org_identifier_elem = org.find("dct:identifier", namespaces) + if org_identifier_elem is not None: + add_rdf_about(org, org_identifier_elem) + + return rdf_tree + + def add_subjects_uri(self, rdf_tree, subjects): + """Add valueURI of subjects to the corresponding dct:subject elements in the RDF tree.""" + namespaces = rdf_tree.nsmap + for subject in subjects: + value_uri = subject.get("valueURI") + subject_label = subject.get("subject") + subject_scheme = subject.get("subjectScheme") + subject_props = subject.get("subjectProps", {}) + + if value_uri and subject_label and subject_scheme: + # Find the corresponding dct:subject element by prefLabel and subjectScheme + subject_element = rdf_tree.xpath( + f""" + //dct:subject[ + skos:Concept[ + skos:prefLabel[text()='{subject_label}'] + and skos:inScheme/skos:ConceptScheme/dct:title[text()='{subject_scheme}'] + ] + ] + """, + namespaces=namespaces, + )[0] + + if subject_element: + # Add the valueURI to the dct:subject element as rdf:about + subject_element.set( + "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", value_uri + ) + + # Check if + # subject has a definition in its props + definition = subject_props.get("definition") + if definition: + concept_elem = subject_element.find( + ".//skos:Concept", namespaces=namespaces + ) + if concept_elem is not None: + skos_definition = ET.Element( + "{http://www.w3.org/2004/02/skos/core#}definition" + ) + skos_definition.text = definition + concept_elem.append(skos_definition) + + return rdf_tree + def transform_with_xslt(self, dc_record, **kwargs): """Transform record with XSLT.""" dc_etree = schema43.dump_etree(dc_record) @@ -100,6 +198,14 @@ def transform_with_xslt(self, dc_record, **kwargs): dc_etree.tag = "{{{0}}}resource".format(dc_namespace) dcat_etree = self.xslt_transform_func(dc_etree).getroot() + # Add valueURI to subjects + subjects = dc_record.get("subjects", []) + if subjects: + dcat_etree = self.add_subjects_uri(dcat_etree, subjects) + + # Add the identifier links for creators if missing + dcat_etree = self.add_missing_creator_and_contributor_links(dcat_etree) + # Inject files in results (since the XSLT can't do that by default) files_data = dc_record.get("_files", []) if files_data: diff --git a/invenio_rdm_records/resources/serializers/dcat/schema.py b/invenio_rdm_records/resources/serializers/dcat/schema.py index 0d5060128..1ac4d8c03 100644 --- a/invenio_rdm_records/resources/serializers/dcat/schema.py +++ b/invenio_rdm_records/resources/serializers/dcat/schema.py @@ -9,7 +9,7 @@ import idutils from flask import current_app -from marshmallow import fields, missing +from marshmallow import ValidationError, fields, missing, validate from marshmallow_utils.html import sanitize_unicode from invenio_rdm_records.resources.serializers.datacite import DataCite43Schema @@ -49,3 +49,44 @@ def get_files(self, obj): ) return files_list or missing + + def get_subjects(self, obj): + """Get subjects.""" + subjects = obj["metadata"].get("subjects", []) + if not subjects: + return missing + + validator = validate.URL() + serialized_subjects = [] + + for subject in subjects: + entry = {"subject": subject.get("subject")} + + id_ = subject.get("id") + if id_: + entry["subjectScheme"] = subject.get("scheme") + try: + validator(id_) + entry["valueURI"] = id_ + except ValidationError: + pass + + # Get identifiers and assign valueURI if scheme is 'url' and id_ was not a valid url + if "valueURI" not in entry: + entry["valueURI"] = next( + ( + identifier.get("identifier") + for identifier in subject.get("identifiers", []) + if identifier.get("scheme") == "url" + ), + None, + ) + + # Add props if it exists + props = subject.get("props", {}) + if props: + entry["subjectProps"] = props + + serialized_subjects.append(entry) + + return serialized_subjects if serialized_subjects else missing diff --git a/tests/resources/serializers/test_dcat_serializer.py b/tests/resources/serializers/test_dcat_serializer.py index cc6137b06..14455fadb 100644 --- a/tests/resources/serializers/test_dcat_serializer.py +++ b/tests/resources/serializers/test_dcat_serializer.py @@ -71,7 +71,7 @@ def test_dcat_serializer(running_app, full_record_to_dict): " \n" " 2018\n' - " \n" + ' \n' " \n" " Abdominal Injuries\n" " \n"