Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

curation: init module; add EURecordCurator #1072

Merged
merged 3 commits into from
Nov 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions site/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,14 @@ invenio_base.apps =
zenodo_rdm_moderation = zenodo_rdm.moderation.ext:ZenodoModeration
invenio_openaire = zenodo_rdm.openaire.ext:OpenAIRE
zenodo_rdm_stats = zenodo_rdm.stats.ext:ZenodoStats
zenodo_rdm_curation = zenodo_rdm.curation.ext:ZenodoCuration
invenio_base.api_apps =
zenodo_rdm_legacy = zenodo_rdm.legacy.ext:ZenodoLegacy
profiler = zenodo_rdm.profiler:Profiler
zenodo_rdm_metrics = zenodo_rdm.metrics.ext:ZenodoMetrics
zenodo_rdm_moderation = zenodo_rdm.moderation.ext:ZenodoModeration
invenio_openaire = zenodo_rdm.openaire.ext:OpenAIRE
zenodo_rdm_curation = zenodo_rdm.curation.ext:ZenodoCuration
invenio_base.api_blueprints =
zenodo_rdm_legacy = zenodo_rdm.legacy.views:blueprint
zenodo_rdm_legacy_records = zenodo_rdm.legacy.views:create_legacy_records_bp
Expand All @@ -69,6 +71,7 @@ invenio_celery.tasks =
zenodo_rdm_openaire = zenodo_rdm.openaire.tasks
zenodo_rdm_moderation = zenodo_rdm.moderation.tasks
zenodo_stats = zenodo_rdm.stats.tasks
zenodo_rdm_curation = zenodo_rdm.curation.tasks
invenio_oauth2server.scopes =
deposit_write_scope = zenodo_rdm.legacy.scopes:deposit_write_scope
deposit_actions_scope = zenodo_rdm.legacy.scopes:deposit_actions_scope
Expand Down
7 changes: 7 additions & 0 deletions site/zenodo_rdm/curation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# Zenodo-RDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Curation module."""
36 changes: 36 additions & 0 deletions site/zenodo_rdm/curation/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Curation config."""

from .rules import (
award_acronym_in_description,
award_acronym_in_title,
test_phrases_in_record,
)

CURATION_EU_RULES = {
    "award_acronym_in_title": award_acronym_in_title,
    "award_acronym_in_description": award_acronym_in_description,
    "test_phrases_in_record": test_phrases_in_record,
}
"""Rules to run for EU Curation, keyed by rule name.

Each value is a callable taking a record and returning a truthy/falsy result.
"""

CURATION_SCORES = {
    "award_acronym_in_title": 5,
    "award_acronym_in_description": 10,
    "test_phrases_in_record": False,
}
"""Rule scores for EU Curation, keyed by rule name.

An integer score is added to the record's total when the rule matches. A
boolean score is meant to be used directly as the final evaluation as soon as
the rule matches (e.g. ``False`` to reject records containing test phrases).
"""


CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 10}
"""Threshold values for curators/rules.

``EU_RECORDS_CURATION`` is the minimum total score for a positive evaluation.
"""


CURATION_TEST_PHRASES = []
"""Phrases that mark a record as test content (matched case-insensitively).

Read by the ``test_phrases_in_record`` rule; empty by default so the rule has
something iterable to work with when an instance does not configure phrases.
"""


CURATION_ENABLE_EU_CURATOR = False
"""Enable the EU curator.

When falsy, the curation task runs the curator in dry-run mode.
"""
97 changes: 97 additions & 0 deletions site/zenodo_rdm/curation/curators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Curators for ZenodoRDM Curation."""

from flask import current_app
from invenio_access.permissions import system_identity
from invenio_rdm_records.proxies import current_record_communities_service
from invenio_records_resources.services.uow import UnitOfWork

from zenodo_rdm.curation.proxies import current_curation


class BaseCurator:
    """Common machinery for curators: run every rule, then evaluate.

    Subclasses provide ``rules`` and ``_evaluator`` and may override
    ``_post_run`` to act on the outcome.
    """

    def __init__(self, dry=False):
        """Store whether the curator operates in dry mode."""
        self.dry = dry

    @property
    def rules(self):
        """Mapping of rule name to rule callable; supplied by subclasses."""
        raise NotImplementedError()

    def _evaluator(self, results):
        """Turn the per-rule results dict into a final evaluation."""
        raise NotImplementedError()

    def _post_run(self, record, result):
        """Hook called with the record and the run result; no-op here."""

    def run(self, record, raise_rule_exc=False):
        """Apply all rules to ``record`` and return the evaluated outcome.

        :param record: object handed to every rule callable.
        :param raise_rule_exc: when true, an exception raised by a rule is
            propagated; otherwise that rule's result is recorded as ``None``.
        :returns: dict with the ``evaluation`` and the per-rule ``rules``
            results.
        """
        outcomes = {}
        for rule_name, check in self.rules.items():
            try:
                outcome = check(record)
            except Exception:
                if raise_rule_exc:
                    raise
                # A failing rule is recorded as "no result" instead of
                # aborting the remaining rules.
                outcome = None
            outcomes[rule_name] = outcome

        summary = {"evaluation": self._evaluator(outcomes), "rules": outcomes}
        self._post_run(record, summary)
        return summary


class EURecordCurator(BaseCurator):
    """Curator to check records for EC community."""

    def _evaluator(self, results):
        """Evaluate result for EC curation.

        :param results: mapping of rule name to rule result. A ``None`` result
            (rule did not produce a value) is ignored.
        :returns: the configured boolean score of the first matching rule that
            has a boolean score; otherwise whether the summed integer scores
            of matching rules reach the ``EU_RECORDS_CURATION`` threshold.
        :raises ValueError: if a rule with a result has a configured score
            that is neither ``bool`` nor ``int`` (including a missing score).
        """
        score = 0
        for rule, result in results.items():
            rule_score = current_curation.scores.get(rule)
            if result is None:
                continue
            # ``bool`` must be checked before ``int``: bool is a subclass of
            # int, so the reverse order would treat ``False``/``True`` scores
            # as 0/1 and never short-circuit.
            if isinstance(rule_score, bool):
                if result:
                    return rule_score
            elif isinstance(rule_score, int):
                if result:
                    score += rule_score
            else:
                raise ValueError("Unsupported score type configured.")
        return score >= current_curation.thresholds.get("EU_RECORDS_CURATION")

    @property
    def rules(self):
        """Get rules to run from config."""
        return current_app.config.get("CURATION_EU_RULES", {})

    def _post_run(self, record, result):
        """Actions to take after run.

        In dry mode the result is only logged. Otherwise, when the evaluation
        is truthy, the record is added to the community configured as
        ``EU_COMMUNITY_UUID``.
        """
        if self.dry:
            current_app.logger.error(
                "Evaluation for EU record curator",
                extra={"record_id": record.pid.pid_value, "result": result},
            )
            return
        if result["evaluation"]:
            with UnitOfWork() as uow:
                current_record_communities_service.bulk_add(
                    system_identity,
                    current_app.config.get("EU_COMMUNITY_UUID"),
                    [record.pid.pid_value],
                    uow=uow,
                )
                uow.commit()
50 changes: 50 additions & 0 deletions site/zenodo_rdm/curation/ext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""ZenodoRDM Curation module."""

from flask import current_app
from werkzeug.utils import cached_property

from . import config


class ZenodoCuration:
    """Flask extension exposing the curation configuration."""

    def __init__(self, app=None):
        """Create the extension, initializing it right away if ``app`` is given."""
        if app:
            self.init_app(app)

    @staticmethod
    def init_config(app):
        """Copy every ``CURATION_*`` default into the app config if unset."""
        defaults = {
            name: getattr(config, name)
            for name in dir(config)
            if name.startswith("CURATION_")
        }
        for name, value in defaults.items():
            app.config.setdefault(name, value)

    def init_app(self, app):
        """Load the config defaults and register the extension on ``app``."""
        self.init_config(app)
        app.extensions["zenodo-curation"] = self

    @cached_property
    def scores(self):
        """Rule scores: module defaults overlaid with the app's ``CURATION_SCORES``."""
        merged = dict(config.CURATION_SCORES)
        merged.update(current_app.config.get("CURATION_SCORES", {}))
        return merged

    @cached_property
    def thresholds(self):
        """Thresholds: module defaults overlaid with the app's ``CURATION_THRESHOLDS``."""
        merged = dict(config.CURATION_THRESHOLDS)
        merged.update(current_app.config.get("CURATION_THRESHOLDS", {}))
        return merged
13 changes: 13 additions & 0 deletions site/zenodo_rdm/curation/proxies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Proxy objects for easier access to application objects."""

from flask import current_app
from werkzeug.local import LocalProxy

# Proxy resolving to the extension stored under "zenodo-curation" in the
# current application's extensions.
current_curation = LocalProxy(lambda: current_app.extensions["zenodo-curation"])
58 changes: 58 additions & 0 deletions site/zenodo_rdm/curation/rules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Rules for curation."""

from flask import current_app
from invenio_records_resources.proxies import current_service_registry


# Funder id that the award rules below treat as the EU funder.
_EU_FUNDER_ID = "00k4n6c32"


def _eu_award_acronym_in_text(record, text):
    """Check if the acronym of any EU award of the record occurs in ``text``.

    :param record: record whose ``metadata`` funding entries are inspected.
    :param text: text to search (case-insensitive); empty/``None`` never matches.
    :returns: ``True`` if an award of an EU funding entry has an acronym
        contained in ``text``, ``False`` otherwise.
    """
    if not text:
        return False

    haystack = text.lower()
    award_cls = None
    for entry in record.metadata.get("funding", []):
        # Funding entries are not guaranteed to carry a funder id; skip those
        # instead of failing the whole rule.
        if (entry.get("funder") or {}).get("id") != _EU_FUNDER_ID:
            continue
        award_id = (entry.get("award") or {}).get("id")
        if not award_id:
            continue
        if award_cls is None:
            # Looked up lazily, only once an award actually needs resolving.
            award_cls = current_service_registry.get("awards").record_cls
        acronym = award_cls.pid.resolve(award_id).get("acronym")
        if acronym and acronym.lower() in haystack:
            return True
    return False


def award_acronym_in_description(record):
    """Check if EU award name in record description.

    Records without a description never match.
    """
    return _eu_award_acronym_in_text(record, record.metadata.get("description"))


def award_acronym_in_title(record):
    """Check if EU award name in record title."""
    return _eu_award_acronym_in_text(record, record.metadata["title"])


def test_phrases_in_record(record):
    """Check if test words in record.

    Searches the record title and description (case-insensitive) for any of
    the phrases configured in ``CURATION_TEST_PHRASES``.

    :returns: ``True`` if any phrase occurs, ``False`` otherwise (including
        when no phrases are configured).
    """
    # The setting may be missing or None; treat that as "no phrases".
    test_phrases = current_app.config.get("CURATION_TEST_PHRASES") or []
    title = record.metadata["title"]
    description = record.metadata.get("description") or ""
    # Lowercase once instead of on every iteration.
    record_data = f"{title} {description}".lower()

    return any(phrase.lower() in record_data for phrase in test_phrases)
80 changes: 80 additions & 0 deletions site/zenodo_rdm/curation/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Tasks for curation."""

from datetime import datetime, timedelta, timezone

from celery import shared_task
from flask import current_app
from invenio_access.permissions import system_identity
from invenio_rdm_records.proxies import current_rdm_records_service as records_service
from invenio_search.engine import dsl

from zenodo_rdm.curation.curators import EURecordCurator


@shared_task
def run_eu_record_curation(since):
    """Run EC Curator.

    Scans records funded by funder ``00k4n6c32`` that are not deleted, were
    created at least 30 days ago, were updated at or after ``since`` and are
    not yet in the community configured as ``EU_COMMUNITY_UUID``, and runs
    ``EURecordCurator`` on each of them. A summary is logged at the end.

    :param since: ISO format datetime string accepted by
        ``datetime.fromisoformat``.
    """
    ctx = {"processed": 0, "approved": 0, "failed": 0, "since": since}
    # The curator only acts on records when explicitly enabled in config.
    dry_run = not current_app.config.get("CURATION_ENABLE_EU_CURATOR")
    curator = EURecordCurator(dry=dry_run)

    query = dsl.Q(
        "bool",
        must=[
            dsl.Q("term", **{"metadata.funding.funder.id": "00k4n6c32"}),
            dsl.Q("term", **{"is_deleted": False}),
            dsl.Q(
                "range",
                created={
                    "lte": (
                        datetime.now(timezone.utc) - timedelta(days=30)
                    ).isoformat(),
                },
            ),
            dsl.Q(
                "range",
                updated={
                    "gte": datetime.fromisoformat(since).isoformat(),
                },
            ),
        ],
        must_not=[
            dsl.Q(
                "term",
                **{
                    "parent.communities.ids": current_app.config.get(
                        "EU_COMMUNITY_UUID"
                    )
                },
            )
        ],
    )
    search = records_service.create_search(
        system_identity,
        records_service.record_cls,
        records_service.config.search,
        extra_filter=query,
    )

    for item in search.scan():
        try:
            # Resolving is inside the try so that a single unresolvable hit
            # is counted as failed instead of aborting the whole run.
            record = records_service.record_cls.pid.resolve(item["id"])
            result = curator.run(record=record)
        except Exception:
            # NOTE Since curator's raise_rule_exc is by default false, rules would not fail.
            # This catches failures due to other reasons
            ctx["failed"] += 1
            current_app.logger.exception(
                "EU curation failed for record",
                extra={"record_id": item["id"]},
            )
            continue

        ctx["processed"] += 1
        if result["evaluation"]:
            ctx["approved"] += 1

    current_app.logger.error(
        "EU curation processed",
        extra=ctx,
    )