diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py index 1c27d0ee..bb846513 100644 --- a/site/zenodo_rdm/curation/config.py +++ b/site/zenodo_rdm/curation/config.py @@ -5,9 +5,12 @@ # ZenodoRDM is free software; you can redistribute it and/or modify # it under the terms of the MIT License; see LICENSE file for more details. -"""Moderation config.""" +"""Curation config.""" from .rules import ( + additional_desc_contains_high_conf_keywords, + additional_desc_contains_low_conf_keywords, + award_acronym_in_additional_description, award_acronym_in_description, award_acronym_in_title, contains_high_conf_keywords, @@ -25,22 +28,28 @@ "user_verified": user_verified, "contains_low_conf_keywords": contains_low_conf_keywords, "contains_high_conf_keywords": contains_high_conf_keywords, + "additional_desc_contains_low_conf_keywords": additional_desc_contains_low_conf_keywords, + "additional_desc_contains_high_conf_keywords": additional_desc_contains_high_conf_keywords, + "award_acronym_in_additional_description": award_acronym_in_additional_description, } """Rules to run for EU Curation.""" CURATION_SCORES = { - "award_acronym_in_title": 5, - "award_acronym_in_description": 10, + "award_acronym_in_title": 0, + "award_acronym_in_description": 0, "test_phrases_in_record": False, "published_before_award_start": False, - "user_verified": 5, - "contains_low_conf_keywords": 5, - "contains_high_conf_keywords": 10, + "user_verified": 0, + "contains_low_conf_keywords": 0, + "contains_high_conf_keywords": 0, + "additional_desc_contains_low_conf_keywords": 0, + "additional_desc_contains_high_conf_keywords": 0, + "award_acronym_in_additional_description": 0, } """Rule scores for EU Curation.""" -CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 15} +CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 100} """Threshold values for curators/rules.""" diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index 0524154b..4e503daf 100644 --- 
def _additional_descriptions_text(record):
    """Return the record's additional descriptions as one lowercased string.

    Reads ``record.metadata["additional_descriptions"]`` and joins the
    ``description`` value of every entry with a single space. A missing or
    ``None`` list, and missing or ``None`` ``description`` values, are treated
    as empty text so that the rules never fail on sparse metadata.
    """
    additional_descriptions = record.metadata.get("additional_descriptions") or []
    return " ".join(
        (entry.get("description") or "") for entry in additional_descriptions
    ).lower()


def _text_contains_any(lowered_text, keywords):
    """Check if ``lowered_text`` contains any keyword, ignoring keyword case.

    ``lowered_text`` must already be lowercased by the caller. ``keywords``
    may be ``None`` (e.g. the config key is not set), which matches nothing.
    """
    return any(word.lower() in lowered_text for word in keywords or ())


def additional_desc_contains_high_conf_keywords(record):
    """Check if additional description contains high confidence keywords.

    The keywords are read from the ``CURATION_HIGH_CONF_KEYWORDS_EU`` app
    config and matched case-insensitively as substrings.

    :param record: Record exposing a ``metadata`` mapping.
    :returns: ``True`` if at least one keyword occurs, ``False`` otherwise
        (including when no keywords are configured).
    """
    high_conf_keywords_eu = current_app.config.get("CURATION_HIGH_CONF_KEYWORDS_EU")
    # TODO could possibly return a number for higher conf
    return _text_contains_any(
        _additional_descriptions_text(record), high_conf_keywords_eu
    )


def additional_desc_contains_low_conf_keywords(record):
    """Check if additional description contains low confidence keywords.

    The keywords are read from the ``CURATION_LOW_CONF_KEYWORDS_EU`` app
    config and matched case-insensitively as substrings.

    :param record: Record exposing a ``metadata`` mapping.
    :returns: ``True`` if at least one keyword occurs, ``False`` otherwise
        (including when no keywords are configured).
    """
    low_conf_keywords_eu = current_app.config.get("CURATION_LOW_CONF_KEYWORDS_EU")
    # TODO could possibly return a number for higher conf
    return _text_contains_any(
        _additional_descriptions_text(record), low_conf_keywords_eu
    )


def award_acronym_in_additional_description(record):
    """Check if EU award name in record additional description.

    For every funding entry whose funder id is ``00k4n6c32`` and that carries
    an award id, the award is resolved through the ``awards`` service and its
    ``acronym`` is searched (case-insensitively, as a substring) in the
    record's additional descriptions.

    :param record: Record exposing a ``metadata`` mapping.
    :returns: ``True`` on the first matching acronym, ``False`` otherwise.
    """
    award_service = current_service_registry.get("awards")
    record_data = _additional_descriptions_text(record)

    for f in record.metadata.get("funding", []):
        # NOTE(review): "00k4n6c32" is presumably the European Commission
        # funder id -- confirm against the funders vocabulary.
        if f["funder"].get("id") != "00k4n6c32":
            continue
        award_id = f.get("award", {}).get("id")
        if not award_id:
            # Custom awards without an id cannot be resolved.
            continue
        award = award_service.record_cls.pid.resolve(award_id)
        acronym = award.get("acronym")
        if acronym and acronym.lower() in record_data:
            return True
    return False
a/site/zenodo_rdm/curation/tasks.py b/site/zenodo_rdm/curation/tasks.py index d65833db..9b461949 100644 --- a/site/zenodo_rdm/curation/tasks.py +++ b/site/zenodo_rdm/curation/tasks.py @@ -29,6 +29,8 @@ def run_eu_record_curation(since): } dry_run = not current_app.config.get("CURATION_ENABLE_EU_CURATOR") curator = EURecordCurator(dry=dry_run) + created_before = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat() + updated_after = (datetime.fromisoformat(since) - timedelta(hours=12)).isoformat() query = dsl.Q( "bool", @@ -38,16 +40,13 @@ def run_eu_record_curation(since): dsl.Q( "range", created={ - "lte": ( - datetime.now(timezone.utc) - timedelta(days=30) - ).isoformat(), + "lte": created_before, }, ), dsl.Q( "range", updated={ - "gte": datetime.fromisoformat(since).isoformat() - - timedelta(hours=12), + "gte": updated_after, }, ), ],