diff --git a/site/setup.cfg b/site/setup.cfg index 58bfa192..e0cec983 100644 --- a/site/setup.cfg +++ b/site/setup.cfg @@ -48,12 +48,14 @@ invenio_base.apps = zenodo_rdm_moderation = zenodo_rdm.moderation.ext:ZenodoModeration invenio_openaire = zenodo_rdm.openaire.ext:OpenAIRE zenodo_rdm_stats = zenodo_rdm.stats.ext:ZenodoStats + zenodo_rdm_curation = zenodo_rdm.curation.ext:ZenodoCuration invenio_base.api_apps = zenodo_rdm_legacy = zenodo_rdm.legacy.ext:ZenodoLegacy profiler = zenodo_rdm.profiler:Profiler zenodo_rdm_metrics = zenodo_rdm.metrics.ext:ZenodoMetrics zenodo_rdm_moderation = zenodo_rdm.moderation.ext:ZenodoModeration invenio_openaire = zenodo_rdm.openaire.ext:OpenAIRE + zenodo_rdm_curation = zenodo_rdm.curation.ext:ZenodoCuration invenio_base.api_blueprints = zenodo_rdm_legacy = zenodo_rdm.legacy.views:blueprint zenodo_rdm_legacy_records = zenodo_rdm.legacy.views:create_legacy_records_bp @@ -69,6 +71,7 @@ invenio_celery.tasks = zenodo_rdm_openaire = zenodo_rdm.openaire.tasks zenodo_rdm_moderation = zenodo_rdm.moderation.tasks zenodo_stats = zenodo_rdm.stats.tasks + zenodo_rdm_curation = zenodo_rdm.curation.tasks invenio_oauth2server.scopes = deposit_write_scope = zenodo_rdm.legacy.scopes:deposit_write_scope deposit_actions_scope = zenodo_rdm.legacy.scopes:deposit_actions_scope diff --git a/site/zenodo_rdm/curation/__init__.py b/site/zenodo_rdm/curation/__init__.py new file mode 100644 index 00000000..5a0c501f --- /dev/null +++ b/site/zenodo_rdm/curation/__init__.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2023 CERN. +# +# Zenodo-RDM is free software; you can redistribute it and/or modify +# it under the terms of the MIT License; see LICENSE file for more details. 
+"""Curation module."""
diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py
new file mode 100644
index 00000000..a9efae57
--- /dev/null
+++ b/site/zenodo_rdm/curation/config.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Curation config."""
+
+from .rules import (
+    award_acronym_in_description,
+    award_acronym_in_title,
+    test_phrases_in_record,
+)
+
+CURATION_EU_RULES = {
+    "award_acronym_in_title": award_acronym_in_title,
+    "award_acronym_in_description": award_acronym_in_description,
+    "test_phrases_in_record": test_phrases_in_record,
+}
+"""Rules to run for EU Curation, keyed by rule name."""
+
+CURATION_SCORES = {
+    "award_acronym_in_title": 5,
+    "award_acronym_in_description": 10,
+    "test_phrases_in_record": False,
+}
+"""Rule scores for EU Curation, keyed by rule name."""
+
+
+CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 10}
+"""Threshold values for curators/rules."""
+
+
+CURATION_TEST_PHRASES = []
+"""Phrases searched for by the ``test_phrases_in_record`` rule (none by default)."""
+
+
+CURATION_ENABLE_EU_CURATOR = False
+"""Enable the EU curator; while falsy, the curation task only does a dry run."""
diff --git a/site/zenodo_rdm/curation/curators.py b/site/zenodo_rdm/curation/curators.py
new file mode 100644
index 00000000..acddef99
--- /dev/null
+++ b/site/zenodo_rdm/curation/curators.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Curators for ZenodoRDM Curation."""
+
+from flask import current_app
+from invenio_access.permissions import system_identity
+from invenio_rdm_records.proxies import current_record_communities_service
+from invenio_records_resources.services.uow import UnitOfWork
+
+from zenodo_rdm.curation.proxies import current_curation
+
+
+class BaseCurator:
+    """Base class: runs a set of rules on a record and evaluates their results."""
+
+    def __init__(self, dry=False):
+        """Constructor; ``dry`` selects a dry run."""
+        self.dry = dry
+
+    def _evaluator(self, results):
+        """Compute the final evaluation from the ``{rule name: result}`` dict."""
+        raise NotImplementedError()
+
+    @property
+    def rules(self):
+        """Get the ``{name: callable}`` mapping of rules to run."""
+        raise NotImplementedError()
+
+    def run(self, record, raise_rule_exc=False):
+        """Run all rules on ``record``; return the evaluation and per-rule results.
+
+        A failing rule yields ``None``, or re-raises if ``raise_rule_exc`` is set.
+        """
+        rule_results = {}
+        for name, rule in self.rules.items():
+            try:
+                rule_results[name] = rule(record)
+            except Exception:
+                if raise_rule_exc:
+                    raise
+                rule_results[name] = None
+
+        evaluation = self._evaluator(rule_results)
+        result = {"evaluation": evaluation, "rules": rule_results}
+        self._post_run(record, result)
+        return result
+
+    def _post_run(self, record, result):
+        """Hook run after the evaluation; does nothing by default."""
+
+
+class EURecordCurator(BaseCurator):
+    """Curator to check records for EC community."""
+
+    def _evaluator(self, results):
+        """Evaluate result for EC curation: a boolean verdict or score >= threshold."""
+        score = 0
+        for rule, result in results.items():
+            rule_score = current_curation.scores.get(rule)
+            if result is None:  # the rule failed to run: ignore it
+                continue
+            if isinstance(rule_score, bool):  # bool is an int subclass: test it first
+                if result:
+                    return rule_score
+            elif isinstance(rule_score, int):
+                score += rule_score if result else 0
+            else:
+                raise ValueError("Unsupported score type configured.")
+        return score >= current_curation.thresholds.get("EU_RECORDS_CURATION")
+
+    @property
+    def rules(self):
+        """Get rules to run from config."""
+        return current_app.config.get("CURATION_EU_RULES", {})
+
+    def _post_run(self, record, result):
+        """Log the result if dry, else add approved records to the EU community."""
+        if self.dry:
+            current_app.logger.error(
+                "Evaluation for EU record curator",
+                extra={"record_id": record.pid.pid_value, "result": result},
+            )
+            return
+        if result["evaluation"]:
+            with UnitOfWork() as uow:
+                current_record_communities_service.bulk_add(
+                    system_identity,
+                    current_app.config.get("EU_COMMUNITY_UUID"),
+                    [record.pid.pid_value],
+                    uow=uow,
+                )
+                uow.commit()
diff --git a/site/zenodo_rdm/curation/ext.py b/site/zenodo_rdm/curation/ext.py
new file mode 100644
index 00000000..2f7422f4
--- /dev/null
+++ b/site/zenodo_rdm/curation/ext.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""ZenodoRDM Curation module."""
+
+from flask import current_app
+from werkzeug.utils import cached_property
+
+from . import config
+
+
+class ZenodoCuration:
+    """Zenodo content curation extension."""
+
+    def __init__(self, app=None):
+        """Extension initialization; also initializes ``app`` when one is given."""
+        if app:
+            self.init_app(app)
+
+    @staticmethod
+    def init_config(app):
+        """Set the default of every ``CURATION_*`` config variable on the app."""
+        for k in dir(config):
+            if k.startswith("CURATION_"):
+                app.config.setdefault(k, getattr(config, k))
+
+    def init_app(self, app):
+        """Flask application initialization."""
+        self.init_config(app)
+        app.extensions["zenodo-curation"] = self
+
+    @cached_property
+    def scores(self):
+        """Return curation scores: module defaults overridden by the app config."""
+        return {
+            **config.CURATION_SCORES,
+            **current_app.config.get("CURATION_SCORES", {}),
+        }
+
+    @cached_property
+    def thresholds(self):
+        """Return curation thresholds: module defaults overridden by the app config."""
+        return {
+            **config.CURATION_THRESHOLDS,
+            **current_app.config.get("CURATION_THRESHOLDS", {}),
+        }
diff --git a/site/zenodo_rdm/curation/proxies.py b/site/zenodo_rdm/curation/proxies.py
new file mode 100644
index 00000000..74caccd2
--- /dev/null
+++ b/site/zenodo_rdm/curation/proxies.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2023 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Proxy objects for easier access to application objects."""
+
+from flask import current_app
+from werkzeug.local import LocalProxy
+
+current_curation = LocalProxy(lambda: current_app.extensions["zenodo-curation"])
+"""Proxy to the curation extension registered on the current application."""
diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py
new file mode 100644
index 00000000..1e1f5c72
--- /dev/null
+++ b/site/zenodo_rdm/curation/rules.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Rules for curation."""
+
+from flask import current_app
+from invenio_records_resources.proxies import current_service_registry
+
+EU_FUNDER_ID = "00k4n6c32"
+"""Funder identifier that the EU curation rules match funding entries on."""
+
+
+def _eu_award_acronym_in(record, text):
+    """Check if the acronym of one of the record's EU awards occurs in ``text``.
+
+    The match is case-insensitive. Funding entries of other funders (or without a
+    funder id), entries without an award id and awards without an acronym are
+    skipped instead of raising.
+    """
+    award_service = current_service_registry.get("awards")
+    text = text.lower()
+
+    for f in record.metadata.get("funding", []):
+        if f.get("funder", {}).get("id") != EU_FUNDER_ID:
+            continue
+        if award_id := f.get("award", {}).get("id"):
+            award = award_service.record_cls.pid.resolve(award_id)
+            acronym = award.get("acronym")
+            if acronym and acronym.lower() in text:
+                return True
+    return False
+
+
+def award_acronym_in_description(record):
+    """Check if EU award name in record description.
+
+    A record without a description never matches.
+    """
+    return _eu_award_acronym_in(record, record.metadata.get("description") or "")
+
+
+def award_acronym_in_title(record):
+    """Check if EU award name in record title."""
+    return _eu_award_acronym_in(record, record.metadata["title"])
+
+
+def test_phrases_in_record(record):
+    """Check if test words in record.
+
+    The phrases come from the ``CURATION_TEST_PHRASES`` config; nothing matches
+    while that config is unset or empty.
+    """
+    test_phrases = current_app.config.get("CURATION_TEST_PHRASES") or []
+    record_data = (
+        record.metadata["title"] + " " + (record.metadata.get("description") or "")
+    ).lower()
+
+    return any(phrase.lower() in record_data for phrase in test_phrases)
diff --git a/site/zenodo_rdm/curation/tasks.py b/site/zenodo_rdm/curation/tasks.py
new file mode 100644
index 00000000..fd23482d
--- /dev/null
+++ b/site/zenodo_rdm/curation/tasks.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+"""Tasks for curation."""
+
+from datetime import datetime, timedelta, timezone
+
+from celery import shared_task
+from flask import current_app
+from invenio_access.permissions import system_identity
+from invenio_rdm_records.proxies import current_rdm_records_service as records_service
+from invenio_search.engine import dsl
+
+from zenodo_rdm.curation.curators import EURecordCurator
+from zenodo_rdm.curation.rules import EU_FUNDER_ID
+
+
+@shared_task
+def run_eu_record_curation(since):
+    """Run the EU record curator on recently updated records of the EU funder.
+
+    Selects the non-deleted records of funder ``EU_FUNDER_ID`` that were created
+    at least 30 days ago, updated since ``since`` and are not in the community
+    configured as ``EU_COMMUNITY_UUID``, then runs ``EURecordCurator`` on each.
+    The curator is run in dry mode unless ``CURATION_ENABLE_EU_CURATOR`` is set.
+
+    :param since: ISO 8601 date/datetime string, lower bound of ``updated``.
+    """
+    ctx = {"processed": 0, "approved": 0, "failed": 0, "since": since}
+    dry_run = not current_app.config.get("CURATION_ENABLE_EU_CURATOR")
+    curator = EURecordCurator(dry=dry_run)
+    created_before = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat()
+    updated_since = datetime.fromisoformat(since).isoformat()
+
+    query = dsl.Q(
+        "bool",
+        must=[
+            dsl.Q("term", **{"metadata.funding.funder.id": EU_FUNDER_ID}),
+            dsl.Q("term", **{"is_deleted": False}),
+            dsl.Q("range", created={"lte": created_before}),
+            dsl.Q("range", updated={"gte": updated_since}),
+        ],
+        must_not=[
+            dsl.Q(
+                "term",
+                **{
+                    "parent.communities.ids": current_app.config.get(
+                        "EU_COMMUNITY_UUID"
+                    )
+                },
+            )
+        ],
+    )
+    search = records_service.create_search(
+        system_identity,
+        records_service.record_cls,
+        records_service.config.search,
+        extra_filter=query,
+    )
+
+    for item in search.scan():
+        try:
+            # Resolving is part of the guarded work: a record that cannot be
+            # loaded is counted as failed instead of aborting the whole run.
+            record = records_service.record_cls.pid.resolve(item["id"])
+            result = curator.run(record=record)
+            ctx["processed"] += 1
+            if result["evaluation"]:
+                ctx["approved"] += 1
+        except Exception:
+            # NOTE Since curator's raise_rule_exc is false by default, rules do
+            # not fail here. This catches failures due to other reasons.
+            ctx["failed"] += 1
+
+    current_app.logger.error(
+        "EU curation processed",
+        extra=ctx,
+    )