From 03cad791e962f5434db6d89106c9d9f05134a9b5 Mon Sep 17 00:00:00 2001 From: yashlamba Date: Thu, 28 Nov 2024 11:08:46 +0100 Subject: [PATCH] curation: update config & logging; minor fixes --- site/setup.cfg | 2 +- site/zenodo_rdm/curation/config.py | 5 +++++ site/zenodo_rdm/curation/curators.py | 22 ++++++++++++---------- site/zenodo_rdm/curation/ext.py | 10 ++++++++-- site/zenodo_rdm/curation/rules.py | 20 +++++++++++++------- site/zenodo_rdm/curation/tasks.py | 21 ++++++++++++++------- 6 files changed, 53 insertions(+), 27 deletions(-) diff --git a/site/setup.cfg b/site/setup.cfg index 7e30e2a2..e0cec983 100644 --- a/site/setup.cfg +++ b/site/setup.cfg @@ -71,7 +71,7 @@ invenio_celery.tasks = zenodo_rdm_openaire = zenodo_rdm.openaire.tasks zenodo_rdm_moderation = zenodo_rdm.moderation.tasks zenodo_stats = zenodo_rdm.stats.tasks - zenodo_rdm_curations = zenodo_rdm.curation.tasks + zenodo_rdm_curation = zenodo_rdm.curation.tasks invenio_oauth2server.scopes = deposit_write_scope = zenodo_rdm.legacy.scopes:deposit_write_scope deposit_actions_scope = zenodo_rdm.legacy.scopes:deposit_actions_scope diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py index 9e0f47e2..a9efae57 100644 --- a/site/zenodo_rdm/curation/config.py +++ b/site/zenodo_rdm/curation/config.py @@ -27,5 +27,10 @@ } """Rule scores for EU Curation.""" + +CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 10} +"""Threshold values for curators/rules.""" + + CURATION_ENABLE_EU_CURATOR = False """Controls whether to dry run EU Curation.""" diff --git a/site/zenodo_rdm/curation/curators.py b/site/zenodo_rdm/curation/curators.py index 81698e9d..acddef99 100644 --- a/site/zenodo_rdm/curation/curators.py +++ b/site/zenodo_rdm/curation/curators.py @@ -18,10 +18,9 @@ class BaseCurator: """Base Curator class.""" - def __init__(self, dry=False, raise_exc=False): + def __init__(self, dry=False): """Constructor.""" self.dry = dry - self.raise_exc = raise_exc def _evaluator(self, results): """Evaluates final result for based on results dict.""" @@ -32,14 +31,14 @@ def rules(self): """Get rules to run.""" raise NotImplementedError() - def run(self, record): + def run(self, record, raise_rule_exc=False): """Run rules for the curator and evaluate result.""" rule_results = {} for name, rule in self.rules.items(): try: rule_results[name] = rule(record) except Exception as e: - if self.raise_exc: + if raise_rule_exc: raise e rule_results[name] = None @@ -61,7 +60,9 @@ def _evaluator(self, results): score = 0 for rule, result in results.items(): rule_score = current_curation.scores.get(rule) - if isinstance(rule_score, int): + if result is None: + continue + elif isinstance(rule_score, int): score += rule_score if result else 0 elif isinstance(rule_score, bool): if result: @@ -70,7 +71,7 @@ def _evaluator(self, results): continue else: raise ValueError("Unsupported score type configured.") - return score >= current_app.config.get("CURATION_EU_CURATION_THRESHOLD") + return score >= current_curation.thresholds.get("EU_RECORDS_CURATION") @property def rules(self): @@ -80,15 +81,16 @@ def rules(self): def _post_run(self, record, result): """Actions to take after run.""" if self.dry: - current_app.logger.info( - f"Processed record ID: {record.pid.pid_value}", result - ) # TODO use error? Or should we log from the task + current_app.logger.error( + "Evaluation for EU record curator", + extra={"record_id": record.pid.pid_value, "result": result}, + ) return if result["evaluation"]: with UnitOfWork() as uow: current_record_communities_service.bulk_add( system_identity, - current_app.config.get("EU_COMMUNITY_ID"), + current_app.config.get("EU_COMMUNITY_UUID"), [record.pid.pid_value], uow=uow, ) diff --git a/site/zenodo_rdm/curation/ext.py b/site/zenodo_rdm/curation/ext.py index 3a9dfc68..2f7422f4 100644 --- a/site/zenodo_rdm/curation/ext.py +++ b/site/zenodo_rdm/curation/ext.py @@ -7,8 +7,6 @@ """ZenodoRDM Curation module.""" -from types import SimpleNamespace - from flask import current_app from werkzeug.utils import cached_property @@ -42,3 +40,11 @@ def scores(self): **config.CURATION_SCORES, **current_app.config.get("CURATION_SCORES", {}), } + + @cached_property + def thresholds(self): + """Return curation thresholds used for rules/curators.""" + return { + **config.CURATION_THRESHOLDS, + **current_app.config.get("CURATION_THRESHOLDS", {}), + } diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index eba7b628..1e1f5c72 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -19,9 +19,11 @@ def award_acronym_in_description(record): for f in funding: if f["funder"]["id"] == "00k4n6c32": - if "award" in f: - award = award_service.record_cls.pid.resolve(f["award"]["id"]) - if award["acronym"].lower() in description.lower(): + if award_id := f.get("award", {}).get("id"): + award = award_service.record_cls.pid.resolve(award_id) + if award.get("acronym") and ( + award.get("acronym").lower() in description.lower() + ): return True return False @@ -34,9 +36,11 @@ def award_acronym_in_title(record): for f in funding: if f["funder"]["id"] == "00k4n6c32": - if "award" in f: - award = award_service.record_cls.pid.resolve(f["award"]["id"]) - if award["acronym"].lower() in title.lower(): + if award_id := f.get("award", {}).get("id"): + award = award_service.record_cls.pid.resolve(award_id) + if award.get("acronym") and ( + award.get("acronym").lower() in title.lower() + ): return True return False @@ -44,7 +48,9 @@ def award_acronym_in_title(record): def test_phrases_in_record(record): """Check if test words in record.""" test_phrases = current_app.config.get("CURATION_TEST_PHRASES") - record_data = record.metadata["title"] + " " + record.metadata["description"] + record_data = ( + record.metadata["title"] + " " + record.metadata.get("description", "") + ) for word in test_phrases: if word.lower() in record_data.lower(): diff --git a/site/zenodo_rdm/curation/tasks.py b/site/zenodo_rdm/curation/tasks.py index ee7a3970..fd23482d 100644 --- a/site/zenodo_rdm/curation/tasks.py +++ b/site/zenodo_rdm/curation/tasks.py @@ -6,7 +6,7 @@ # it under the terms of the MIT License; see LICENSE file for more details. """Tasks for curation.""" -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from celery import shared_task from flask import current_app @@ -32,7 +32,9 @@ def run_eu_record_curation(since): dsl.Q( "range", created={ - "lte": (datetime.now() - timedelta(days=30)).isoformat(), + "lte": ( + datetime.now(timezone.utc) - timedelta(days=30) + ).isoformat(), }, ), dsl.Q( @@ -45,7 +47,11 @@ def run_eu_record_curation(since): must_not=[ dsl.Q( "term", - **{"parent.communities.ids": current_app.config.get("EU_COMMUNITY_ID")}, + **{ + "parent.communities.ids": current_app.config.get( + "EU_COMMUNITY_UUID" + ) + }, ) ], ) @@ -63,11 +69,12 @@ def run_eu_record_curation(since): ctx["processed"] += 1 if result["evaluation"]: ctx["approved"] += 1 - except Exception as e: - # NOTE Since curator's raise_exc is by default false, rules would not fail. - # This catches failure due to other reasons + except Exception: + # NOTE Since curator's raise_rules_exc is by default false, rules would not fail. + # This catches failures due to other reasons ctx["failed"] += 1 + current_app.logger.error( - f"EU curation processed", + "EU curation processed", extra=ctx, )