From 95d7fcd4551f979502bfc0c74e4d96a391018d6c Mon Sep 17 00:00:00 2001
From: yashlamba
Date: Thu, 12 Dec 2024 17:30:46 +0100
Subject: [PATCH] curation: add rules to check additional desc

---
Note: the additional-description rules join descriptions with
`x.get("description") or ""`, so an entry without a description cannot
make str.join raise TypeError.

 site/zenodo_rdm/curation/config.py | 21 ++++++++++----
 site/zenodo_rdm/curation/rules.py  | 44 ++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py
index 1c27d0ee..60056a15 100644
--- a/site/zenodo_rdm/curation/config.py
+++ b/site/zenodo_rdm/curation/config.py
@@ -8,6 +8,9 @@
 """Moderation config."""
 
 from .rules import (
+    additional_desc_contains_high_conf_keywords,
+    additional_desc_contains_low_conf_keywords,
+    award_acronym_in_additional_description,
     award_acronym_in_description,
     award_acronym_in_title,
     contains_high_conf_keywords,
@@ -25,22 +28,28 @@
     "user_verified": user_verified,
     "contains_low_conf_keywords": contains_low_conf_keywords,
     "contains_high_conf_keywords": contains_high_conf_keywords,
+    "additional_desc_contains_low_conf_keywords": additional_desc_contains_low_conf_keywords,
+    "additional_desc_contains_high_conf_keywords": additional_desc_contains_high_conf_keywords,
+    "award_acronym_in_additional_description": award_acronym_in_additional_description,
 }
 """Rules to run for EU Curation."""
 
 
 CURATION_SCORES = {
-    "award_acronym_in_title": 5,
-    "award_acronym_in_description": 10,
+    "award_acronym_in_title": 0,
+    "award_acronym_in_description": 0,
     "test_phrases_in_record": False,
     "published_before_award_start": False,
-    "user_verified": 5,
-    "contains_low_conf_keywords": 5,
-    "contains_high_conf_keywords": 10,
+    "user_verified": 0,
+    "contains_low_conf_keywords": 0,
+    "contains_high_conf_keywords": 0,
+    "additional_desc_contains_low_conf_keywords": 0,
+    "additional_desc_contains_high_conf_keywords": 0,
+    "award_acronym_in_additional_description": 0,
 }
 """Rule scores for EU Curation."""
 
-CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 15}
+CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 100}
 """Threshold values for curators/rules."""
 
 
diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py
index 0524154b..228a1937 100644
--- a/site/zenodo_rdm/curation/rules.py
+++ b/site/zenodo_rdm/curation/rules.py
@@ -113,3 +113,47 @@ def contains_high_conf_keywords(record):
         if word.lower() in record_data.lower():
             return True
     return False
+
+
+def additional_desc_contains_high_conf_keywords(record):
+    """Check if additional description contains high confidence keywords."""
+    high_conf_keywords_eu = current_app.config.get("CURATION_HIGH_CONF_KEYWORDS_EU")
+    additional_descriptions = record.metadata.get("additional_descriptions", [])
+    record_data = " ".join(x.get("description") or "" for x in additional_descriptions)
+
+    for word in high_conf_keywords_eu:
+        # TODO could possibly return a number for higher conf
+        if word.lower() in record_data.lower():
+            return True
+    return False
+
+
+def additional_desc_contains_low_conf_keywords(record):
+    """Check if additional description contains low confidence keywords."""
+    low_conf_keywords_eu = current_app.config.get("CURATION_LOW_CONF_KEYWORDS_EU")
+    additional_descriptions = record.metadata.get("additional_descriptions", [])
+    record_data = " ".join(x.get("description") or "" for x in additional_descriptions)
+
+    for word in low_conf_keywords_eu:
+        # TODO could possibly return a number for higher conf
+        if word.lower() in record_data.lower():
+            return True
+    return False
+
+
+def award_acronym_in_additional_description(record):
+    """Check if EU award name in record additional description."""
+    award_service = current_service_registry.get("awards")
+    additional_descriptions = record.metadata.get("additional_descriptions", [])
+    record_data = " ".join(x.get("description") or "" for x in additional_descriptions)
+
+    funding = record.metadata.get("funding", [])
+    for f in funding:
+        if f["funder"].get("id") == "00k4n6c32":
+            if award_id := f.get("award", {}).get("id"):
+                award = award_service.record_cls.pid.resolve(award_id)
+                if award.get("acronym") and (
+                    award.get("acronym").lower() in record_data.lower()
+                ):
+                    return True
+    return False