From 85127d5785481a0d6a6685ff65f2569dc670282e Mon Sep 17 00:00:00 2001
From: Fatimah Zulfiqar
Date: Wed, 4 Dec 2024 15:16:07 +0100
Subject: [PATCH] serializer: updated subjects and affiliation for dcat

---
 .../resources/serializers/dcat/__init__.py    | 113 ++++++++++++++++++
 .../resources/serializers/dcat/schema.py      |  54 ++++++++-
 2 files changed, 166 insertions(+), 1 deletion(-)

diff --git a/invenio_rdm_records/resources/serializers/dcat/__init__.py b/invenio_rdm_records/resources/serializers/dcat/__init__.py
index 25cc8d02f..2f4da45a6 100644
--- a/invenio_rdm_records/resources/serializers/dcat/__init__.py
+++ b/invenio_rdm_records/resources/serializers/dcat/__init__.py
@@ -12,6 +12,7 @@
 from datacite import schema43
 from flask_resources import BaseListSchema, MarshmallowSerializer
 from flask_resources.serializers import SimpleSerializer
+from idutils import detect_identifier_schemes, to_url
 from lxml import etree as ET
 from pkg_resources import resource_stream
 from werkzeug.utils import cached_property
@@ -93,6 +94,110 @@ def access_url(file):
                     if isinstance(tag_value, dict):
                         el.attrib.update(tag_value)
 
+    @staticmethod
+    def _set_rdf_about_from_identifier(element, namespaces):
+        """Set `rdf:about` on an element from its `dct:identifier` child.
+
+        The identifier text is matched against the schemes known to idutils
+        and the first scheme that yields a URL wins. The element is left
+        untouched when it has no identifier text or no URL can be built.
+        """
+        identifier_elem = element.find("dct:identifier", namespaces)
+        # An empty identifier element has `text` set to None; skip it.
+        if identifier_elem is None or not identifier_elem.text:
+            return
+
+        identifier = identifier_elem.text.strip()
+        for scheme in detect_identifier_schemes(identifier):
+            url = to_url(identifier, scheme)
+            if url:
+                element.set(
+                    "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", url
+                )
+                return
+
+    def add_missing_creator_link(self, rdf_tree):
+        """Add missing `rdf:about` attributes to creators and affiliations.
+
+        Covers `rdf:Description` and `foaf:Organization` elements within
+        `dct:creator` that have a resolvable `dct:identifier`.
+
+        :param rdf_tree: root element of the DCAT tree; modified in place.
+        :returns: the same ``rdf_tree`` element.
+        """
+        namespaces = rdf_tree.nsmap
+        creators = rdf_tree.xpath(
+            "//dct:creator/rdf:Description[not(@rdf:about)]", namespaces=namespaces
+        )
+        for description in creators:
+            self._set_rdf_about_from_identifier(description, namespaces)
+
+        # Queried once for the whole tree: the expression is absolute, so
+        # running it per creator would revisit every organization each time.
+        organizations = rdf_tree.xpath(
+            "//dct:creator//org:memberOf//foaf:Organization[not(@rdf:about)]",
+            namespaces=namespaces,
+        )
+        for org in organizations:
+            self._set_rdf_about_from_identifier(org, namespaces)
+
+        return rdf_tree
+
+    def add_subjects_uri(self, rdf_tree, subjects):
+        """Add the `valueURI` of subjects to their `dct:subject` elements.
+
+        :param rdf_tree: root element of the DCAT tree; modified in place.
+        :param subjects: subject dicts as found in the serialized record.
+        :returns: the same ``rdf_tree`` element.
+        """
+        namespaces = rdf_tree.nsmap
+        for subject in subjects:
+            value_uri = subject.get("valueURI")
+            subject_label = subject.get("subject")
+            subject_scheme = subject.get("subjectScheme")
+            if not (value_uri and subject_label and subject_scheme):
+                continue
+
+            # Label and scheme are bound as XPath variables so that quotes in
+            # the metadata cannot break or alter the expression.
+            matches = rdf_tree.xpath(
+                """
+                //dct:subject[
+                    skos:Concept[
+                        skos:prefLabel[text()=$label]
+                        and skos:inScheme/skos:ConceptScheme/dct:title[text()=$scheme]
+                    ]
+                ]
+                """,
+                namespaces=namespaces,
+                label=subject_label,
+                scheme=subject_scheme,
+            )
+            if not matches:
+                # No element was generated for this subject; nothing to do.
+                continue
+
+            subject_element = matches[0]
+            subject_element.set(
+                "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about", value_uri
+            )
+
+            # Copy the subject's definition, if any, onto its skos:Concept.
+            definition = (subject.get("subjectProps") or {}).get("definition")
+            if not definition:
+                continue
+            concept_elem = subject_element.find(
+                ".//skos:Concept", namespaces=namespaces
+            )
+            if concept_elem is not None:
+                skos_definition = ET.SubElement(
+                    concept_elem,
+                    "{http://www.w3.org/2004/02/skos/core#}definition",
+                )
+                skos_definition.text = definition
+
+        return rdf_tree
+
     def transform_with_xslt(self, dc_record, **kwargs):
         """Transform record with XSLT."""
         dc_etree = schema43.dump_etree(dc_record)
@@ -100,6 +205,14 @@ def transform_with_xslt(self, dc_record, **kwargs):
         dc_etree.tag = "{{{0}}}resource".format(dc_namespace)
         dcat_etree = self.xslt_transform_func(dc_etree).getroot()
 
+        # Add valueURI to subjects
+        subjects = dc_record.get("subjects", [])
+        if subjects:
+            dcat_etree = self.add_subjects_uri(dcat_etree, subjects)
+
+        # Add the identifier links for creators if missing
+        dcat_etree = self.add_missing_creator_link(dcat_etree)
+
         # Inject files in results (since the XSLT can't do that by default)
         files_data = dc_record.get("_files", [])
         if files_data:
diff --git a/invenio_rdm_records/resources/serializers/dcat/schema.py b/invenio_rdm_records/resources/serializers/dcat/schema.py
index 0d5060128..51f7e64d3 100644
--- a/invenio_rdm_records/resources/serializers/dcat/schema.py
+++ b/invenio_rdm_records/resources/serializers/dcat/schema.py
@@ -9,7 +9,7 @@
 
 import idutils
 from flask import current_app
-from marshmallow import fields, missing
+from marshmallow import ValidationError, fields, missing, validate
 from marshmallow_utils.html import sanitize_unicode
 
 from invenio_rdm_records.resources.serializers.datacite import DataCite43Schema
@@ -49,3 +49,55 @@ def get_files(self, obj):
             )
 
         return files_list or missing
+
+    def get_subjects(self, obj):
+        """Get subjects, resolving a `valueURI` for each one when possible.
+
+        The subject id is used as `valueURI` when it is a valid URL;
+        otherwise the first identifier with scheme `url` is used, if any.
+        """
+        subjects = obj["metadata"].get("subjects", [])
+        if not subjects:
+            return missing
+
+        validator = validate.URL()
+        serialized_subjects = []
+
+        for subject in subjects:
+            entry = {"subject": subject.get("subject")}
+
+            id_ = subject.get("id")
+            if id_:
+                entry["subjectScheme"] = subject.get("scheme")
+                try:
+                    validator(id_)
+                except ValidationError:
+                    # Not a URL; fall back to the identifiers below.
+                    pass
+                else:
+                    entry["valueURI"] = id_
+
+            # Fall back to the first identifier with scheme 'url' when the
+            # id was missing or not a valid URL.
+            if "valueURI" not in entry:
+                url_identifier = next(
+                    (
+                        identifier.get("identifier")
+                        for identifier in subject.get("identifiers", [])
+                        if identifier.get("scheme") == "url"
+                    ),
+                    None,
+                )
+                # Only set the key when a URI was found, so that no null
+                # `valueURI` is handed to the DataCite XML dump.
+                if url_identifier:
+                    entry["valueURI"] = url_identifier
+
+            # Add props if it exists
+            props = subject.get("props", {})
+            if props:
+                entry["subjectProps"] = props
+
+            serialized_subjects.append(entry)
+
+        return serialized_subjects or missing