diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py index a9efae57..1b265409 100644 --- a/site/zenodo_rdm/curation/config.py +++ b/site/zenodo_rdm/curation/config.py @@ -10,13 +10,21 @@ from .rules import ( award_acronym_in_description, award_acronym_in_title, + contains_high_conf_keywords, + contains_low_conf_keywords, + published_before_award_start, test_phrases_in_record, + user_verified, ) CURATION_EU_RULES = { "award_acronym_in_title": award_acronym_in_title, "award_acronym_in_description": award_acronym_in_description, "test_phrases_in_record": test_phrases_in_record, + "published_before_award_start": published_before_award_start, + "user_verified": user_verified, + "contains_low_conf_keywords": contains_low_conf_keywords, + "contains_high_conf_keywords": contains_high_conf_keywords, } """Rules to run for EU Curation.""" @@ -24,13 +32,23 @@ "award_acronym_in_title": 5, "award_acronym_in_description": 10, "test_phrases_in_record": False, + "published_before_award_start": False, + "user_verified": 5, + "contains_low_conf_keywords": 5, + "contains_high_conf_keywords": 10, } """Rule scores for EU Curation.""" -CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 10} +CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 15} """Threshold values for curators/rules.""" CURATION_ENABLE_EU_CURATOR = False """Controls whether to dry run EU Curation.""" + +CURATION_LOW_CONF_KEYWORDS_EU = [] +"""Low confidence keywords for EU records.""" + +CURATION_HIGH_CONF_KEYWORDS_EU = [] +"""High confidence keywords for EU records.""" diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index 1e1f5c72..bb7b5db1 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -4,9 +4,9 @@ # # ZenodoRDM is free software; you can redistribute it and/or modify # it under the terms of the MIT License; see LICENSE file for more details. 
def _eu_award_ids(record):
    """Yield the award ids attached to the record's EU funding entries.

    Only funding entries whose funder id is ``00k4n6c32`` (the funder id the
    EU curation rules look for) are considered. Records without funding,
    entries without a funder id and entries without an award id are skipped
    instead of raising ``KeyError``.

    :param record: object exposing a dict-like ``metadata`` attribute.
    :returns: generator of award id values.
    """
    for funding in record.metadata.get("funding") or []:
        funder = funding.get("funder") or {}
        if funder.get("id") != "00k4n6c32":
            continue
        award_id = (funding.get("award") or {}).get("id")
        if award_id:
            yield award_id


def _mentions_any_keyword(record, keywords):
    """Check if the record title or description contains any keyword.

    The match is a case-insensitive substring test against the title and
    description joined by a space. A missing or ``None`` title/description is
    treated as empty text, and empty keywords are ignored because an empty
    string is a substring of every text.

    :param record: object exposing a dict-like ``metadata`` attribute.
    :param keywords: iterable of keyword strings; ``None`` means no keywords.
    :returns: ``True`` if at least one keyword occurs, ``False`` otherwise.
    """
    metadata = record.metadata
    title = metadata.get("title") or ""
    description = metadata.get("description") or ""
    # Lower-case the text once instead of once per keyword.
    record_data = f"{title} {description}".lower()

    # TODO could possibly return a number for higher conf
    return any(word.lower() in record_data for word in (keywords or []) if word)


def published_before_award_start(record):
    """Check if the record was created before one of its EU awards started.

    Every EU award referenced in the record funding is resolved through the
    awards service and its ``start_date`` is compared against
    ``record.created``. Note that the comparison uses the creation timestamp
    of the record, not a publication date from the metadata.

    :param record: record exposing ``metadata`` and ``created``.
    :returns: ``True`` if the record predates the start date of any of its EU
        awards, ``False`` otherwise (including when there is no EU award or
        no award has a start date).
    """
    award_ids = list(_eu_award_ids(record))
    if not award_ids:
        # Nothing to resolve, so the awards service is not needed at all.
        return False

    award_service = current_service_registry.get("awards")
    # NOTE(review): ``record.created`` may be a naive datetime, which cannot
    # be compared with the timezone-aware value arrow produces. ``arrow.get``
    # interprets naive datetimes as UTC, so both sides are normalised here.
    created = arrow.get(record.created)

    for award_id in award_ids:
        award = award_service.record_cls.pid.resolve(award_id)
        start_date = award.get("start_date")
        if start_date and created < arrow.get(start_date):
            return True
    return False


def user_verified(record):
    """Check if user is verified.

    The flag is read from ``record.parent`` when the record has a parent and
    from the record itself otherwise.

    :param record: record (or draft) object.
    :returns: always a ``bool``; a missing ``is_verified`` attribute counts as
        not verified in both cases.
    """
    owner = record.parent if hasattr(record, "parent") else record
    return bool(getattr(owner, "is_verified", False))


def contains_low_conf_keywords(record):
    """Check if record contains low confidence keywords.

    Keywords are read from the ``CURATION_LOW_CONF_KEYWORDS_EU`` application
    config; an unset value is treated as an empty list.
    """
    low_conf_keywords_eu = current_app.config.get("CURATION_LOW_CONF_KEYWORDS_EU")
    return _mentions_any_keyword(record, low_conf_keywords_eu)


def contains_high_conf_keywords(record):
    """Check if record contains high confidence keywords.

    Keywords are read from the ``CURATION_HIGH_CONF_KEYWORDS_EU`` application
    config; an unset value is treated as an empty list.
    """
    high_conf_keywords_eu = current_app.config.get("CURATION_HIGH_CONF_KEYWORDS_EU")
    return _mentions_any_keyword(record, high_conf_keywords_eu)