Skip to content

Commit

Permalink
curation: update config & logging; minor fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
yashlamba committed Nov 28, 2024
1 parent e3e4366 commit 7e4e97a
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 27 deletions.
2 changes: 1 addition & 1 deletion site/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ invenio_celery.tasks =
zenodo_rdm_openaire = zenodo_rdm.openaire.tasks
zenodo_rdm_moderation = zenodo_rdm.moderation.tasks
zenodo_stats = zenodo_rdm.stats.tasks
zenodo_rdm_curations = zenodo_rdm.curation.tasks
zenodo_rdm_curation = zenodo_rdm.curation.tasks
invenio_oauth2server.scopes =
deposit_write_scope = zenodo_rdm.legacy.scopes:deposit_write_scope
deposit_actions_scope = zenodo_rdm.legacy.scopes:deposit_actions_scope
Expand Down
5 changes: 5 additions & 0 deletions site/zenodo_rdm/curation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,10 @@
}
"""Rule scores for EU Curation."""


CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 10}
"""Threshold values for curators/rules."""


CURATION_ENABLE_EU_CURATOR = False
"""Controls whether to dry run EU Curation."""
22 changes: 12 additions & 10 deletions site/zenodo_rdm/curation/curators.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,9 @@
class BaseCurator:
"""Base Curator class."""

def __init__(self, dry=False, raise_exc=False):
def __init__(self, dry=False):
"""Constructor."""
self.dry = dry
self.raise_exc = raise_exc

def _evaluator(self, results):
"""Evaluates the final result based on the results dict."""
Expand All @@ -32,14 +31,14 @@ def rules(self):
"""Get rules to run."""
raise NotImplementedError()

def run(self, record):
def run(self, record, raise_rule_exc=False):
"""Run rules for the curator and evaluate result."""
rule_results = {}
for name, rule in self.rules.items():
try:
rule_results[name] = rule(record)
except Exception as e:
if self.raise_exc:
if raise_rule_exc:
raise e
rule_results[name] = None

Expand All @@ -61,7 +60,9 @@ def _evaluator(self, results):
score = 0
for rule, result in results.items():
rule_score = current_curation.scores.get(rule)
if isinstance(rule_score, int):
if result is None:
continue
elif isinstance(rule_score, int):
score += rule_score if result else 0
elif isinstance(rule_score, bool):
if result:
Expand All @@ -70,7 +71,7 @@ def _evaluator(self, results):
continue
else:
raise ValueError("Unsupported score type configured.")
return score >= current_app.config.get("CURATION_EU_CURATION_THRESHOLD")
return score >= current_curation.thresholds.get("EU_RECORDS_CURATION")

@property
def rules(self):
Expand All @@ -80,15 +81,16 @@ def rules(self):
def _post_run(self, record, result):
"""Actions to take after run."""
if self.dry:
current_app.logger.info(
f"Processed record ID: {record.pid.pid_value}", result
) # TODO use error? Or should we log from the task
current_app.logger.error(
"Evaluation for EU record curator",
extra={"record_id": record.pid.pid_value, "result": result},
)
return
if result["evaluation"]:
with UnitOfWork() as uow:
current_record_communities_service.bulk_add(
system_identity,
current_app.config.get("EU_COMMUNITY_ID"),
current_app.config.get("EU_COMMUNITY_UUID"),
[record.pid.pid_value],
uow=uow,
)
Expand Down
10 changes: 8 additions & 2 deletions site/zenodo_rdm/curation/ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@

"""ZenodoRDM Curation module."""

from types import SimpleNamespace

from flask import current_app
from werkzeug.utils import cached_property

Expand Down Expand Up @@ -42,3 +40,11 @@ def scores(self):
**config.CURATION_SCORES,
**current_app.config.get("CURATION_SCORES", {}),
}

@cached_property
def thresholds(self):
"""Return curation thresholds used for rules/curators."""
return {
**config.CURATION_THRESHOLDS,
**current_app.config.get("CURATION_THRESHOLDS", {}),
}
20 changes: 13 additions & 7 deletions site/zenodo_rdm/curation/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@ def award_acronym_in_description(record):

for f in funding:
if f["funder"]["id"] == "00k4n6c32":
if "award" in f:
award = award_service.record_cls.pid.resolve(f["award"]["id"])
if award["acronym"].lower() in description.lower():
if award_id := f.get("award", {}).get("id"):
award = award_service.record_cls.pid.resolve(award_id)
if award.get("acronym") and (
award.get("acronym").lower() in description.lower()
):
return True
return False

Expand All @@ -34,17 +36,21 @@ def award_acronym_in_title(record):

for f in funding:
if f["funder"]["id"] == "00k4n6c32":
if "award" in f:
award = award_service.record_cls.pid.resolve(f["award"]["id"])
if award["acronym"].lower() in title.lower():
if award_id := f.get("award", {}).get("id"):
award = award_service.record_cls.pid.resolve(award_id)
if award.get("acronym") and (
award.get("acronym").lower() in title.lower()
):
return True
return False


def test_phrases_in_record(record):
"""Check if test words in record."""
test_phrases = current_app.config.get("CURATION_TEST_PHRASES")
record_data = record.metadata["title"] + " " + record.metadata["description"]
record_data = (
record.metadata["title"] + " " + record.metadata.get("description", "")
)

for word in test_phrases:
if word.lower() in record_data.lower():
Expand Down
21 changes: 14 additions & 7 deletions site/zenodo_rdm/curation/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# it under the terms of the MIT License; see LICENSE file for more details.
"""Tasks for curation."""

from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone

from celery import shared_task
from flask import current_app
Expand All @@ -32,7 +32,9 @@ def run_eu_record_curation(since):
dsl.Q(
"range",
created={
"lte": (datetime.now() - timedelta(days=30)).isoformat(),
"lte": (
datetime.now(timezone.utc) - timedelta(days=30)
).isoformat(),
},
),
dsl.Q(
Expand All @@ -45,7 +47,11 @@ def run_eu_record_curation(since):
must_not=[
dsl.Q(
"term",
**{"parent.communities.ids": current_app.config.get("EU_COMMUNITY_ID")},
**{
"parent.communities.ids": current_app.config.get(
"EU_COMMUNITY_UUID"
)
},
)
],
)
Expand All @@ -63,11 +69,12 @@ def run_eu_record_curation(since):
ctx["processed"] += 1
if result["evaluation"]:
ctx["approved"] += 1
except Exception as e:
# NOTE Since curator's raise_exc is by default false, rules would not fail.
# This catches failure due to other reasons
except Exception:
# NOTE Since raise_rule_exc of the curator's run() is False by default, rules would not fail.
# This catches failures due to other reasons
ctx["failed"] += 1

current_app.logger.error(
f"EU curation processed",
"EU curation processed",
extra=ctx,
)

0 comments on commit 7e4e97a

Please sign in to comment.