diff --git a/doajtest/fixtures/article.py b/doajtest/fixtures/article.py index fbee1726f..b1c0fa053 100644 --- a/doajtest/fixtures/article.py +++ b/doajtest/fixtures/article.py @@ -160,6 +160,22 @@ def find_dict_in_list(lst, key, value): def make_article_apido_struct(): return deepcopy(ARTICLE_STRUCT) + @staticmethod + def make_article_with_data(title=None, publisher_name=None, abstract=None, country=None, author=None): + source = deepcopy(ARTICLE_SOURCE) + if title: + source["bibjson"]["title"] = title + if publisher_name: + source["bibjson"]["journal"]["publisher"] = publisher_name + if abstract: + source["bibjson"]["abstract"] = abstract + if country: + source["bibjson"]["journal"]["country"] = country + if author: + source["bibjson"]["author"][0]["name"] = author + + return source + ARTICLE_SOURCE = { "id": "abcdefghijk_article", diff --git a/doajtest/fixtures/v2/applications.py b/doajtest/fixtures/v2/applications.py index bda2eca93..8c862f930 100644 --- a/doajtest/fixtures/v2/applications.py +++ b/doajtest/fixtures/v2/applications.py @@ -20,6 +20,17 @@ def make_update_request_source(): @staticmethod def make_application_source(): return deepcopy(APPLICATION_SOURCE) + + @staticmethod + def make_application_with_data(title=None, publisher_name=None, country=None): + application = deepcopy(APPLICATION_SOURCE) + if title: + application["bibjson"]["title"] = title + if publisher_name: + application["bibjson"]["publisher"]["name"] = publisher_name + if country: + application["bibjson"]["publisher"]["country"] = country + return application @staticmethod def make_many_application_sources(count=2, in_doaj=False): diff --git a/doajtest/fixtures/v2/journals.py b/doajtest/fixtures/v2/journals.py index 3b3d1123a..9986789ab 100644 --- a/doajtest/fixtures/v2/journals.py +++ b/doajtest/fixtures/v2/journals.py @@ -44,6 +44,21 @@ def make_journal_form(): def make_journal_form_info(): return deepcopy(JOURNAL_FORM_EXPANDED) + @staticmethod + def make_journal_with_data(**data): + 
journal = deepcopy(JOURNAL_SOURCE) + in_doaj = data['in_doaj'] if'in_doaj' in data else True + journal['admin']['in_doaj'] = in_doaj + if 'title' in data: + journal["bibjson"]["title"] = data['title'] + if 'publisher_name' in data: + journal["bibjson"]["publisher"]["name"] = data['publisher_name'] + if 'country' in data: + journal["bibjson"]["publisher"]["country"] = data['country'] + if 'alternative_title' in data: + journal["bibjson"]["alternative_title"] = data['alternative_title'] + return journal + @staticmethod def make_bulk_edit_data(): return deepcopy(JOURNAL_BULK_EDIT) diff --git a/doajtest/testbook/public_site/public_search.yml b/doajtest/testbook/public_site/public_search.yml index 166b02c25..9f5ae3f79 100644 --- a/doajtest/testbook/public_site/public_search.yml +++ b/doajtest/testbook/public_site/public_search.yml @@ -196,3 +196,16 @@ tests: - step: Click on 'Export RIS' of any article results: - A RIS file is downloaded +- title: 'Test Public Search Ascii Folding: Articles/Journals' + context: + role: anonymous + steps: + - step: Make sure there is a Journal or Article which has non-ASCII characters (example - I can’t really think in English) in one of the following fields + - Title + - Publisher name + - Country name + - step: Go to the DOAJ search page at /search/articles for article search or /search/journals for journal search + - step: Search using the plain ASCII equivalents instead of the non-ASCII characters (example - I can't really think in English) + results: + - The same search results are displayed as when searching with the non-ASCII characters (I can’t really think in English) + diff --git a/doajtest/unit/test_query.py b/doajtest/unit/test_query.py index 0779449fa..984af1902 100644 --- a/doajtest/unit/test_query.py +++ b/doajtest/unit/test_query.py @@ -4,7 +4,7 @@ from portality import models from doajtest.fixtures import AccountFixtureFactory, ArticleFixtureFactory, EditorGroupFixtureFactory, \ - ApplicationFixtureFactory + ApplicationFixtureFactory, 
JournalFixtureFactory from doajtest.helpers import DoajTestCase, deep_sort from portality.bll.services.query import QueryService, Query @@ -18,6 +18,13 @@ "query_filters": ["only_in_doaj"], "result_filters": ["public_result_filter"], "dao": "portality.models.Article" + }, + "journal": { + "auth": False, + "role": None, + "query_filters": ["only_in_doaj"], + "result_filters": ["public_result_filter"], + "dao": "portality.models.Journal" } }, "publisher_query": { @@ -147,6 +154,12 @@ "search_all_meta" : "portality.lib.query_filters.search_all_meta", } +MATCH_ALL_RAW_QUERY = {"query": {"match_all": {}}} + + +def raw_query(query): + return {'query': {'query_string': {'query': query, 'default_operator': 'AND'}}, 'size': 0, 'track_total_hits': True} + def without_keys(d, keys): return {x: d[x] for x in d if x not in keys} @@ -192,8 +205,8 @@ def get_journal_with_notes(self): def test_01_auth(self): with self.app_test.test_client() as t_client: - response = t_client.get('/query/journal') # not in the settings above - assert response.status_code == 403, response.status_code + response = t_client.get('/query/journal') + assert response.status_code == 200, response.status_code # theoretically should be a 404, but the code checks QUERY_ROUTE config first, so auth checks go first response = t_client.get('/query/nonexistent') @@ -243,11 +256,15 @@ def test_02_query_gen(self): q = Query() q.add_include("last_updated") - assert q.as_dict() == {'track_total_hits' : True, "query": {"match_all": {}},"_source": {"includes": ["last_updated"]}}, q.as_dict() + assert q.as_dict() == {'track_total_hits': True, "query": {"match_all": {}}, + "_source": {"includes": ["last_updated"]}}, q.as_dict() q = Query() q.add_include(["last_updated", "id"]) - assert sorted(q.as_dict()) == sorted({'track_total_hits' : True, "query": {"match_all": {}},"_source": {"includes": ["last_updated", "id"]}}) or sorted(q.as_dict()) == sorted({"query": {"match_all": {}},"_source": {"include": ["last_updated", 
"id"]}}), sorted(q.as_dict()) + assert sorted(q.as_dict()) == sorted({'track_total_hits': True, "query": {"match_all": {}}, + "_source": {"includes": ["last_updated", "id"]}}) or sorted( + q.as_dict()) == sorted( + {"query": {"match_all": {}}, "_source": {"include": ["last_updated", "id"]}}), sorted(q.as_dict()) def test_03_query_svc_get_config(self): qsvc = QueryService() @@ -595,6 +612,158 @@ def test_journal_article_query_notes(self): 'size': 0, 'track_total_hits': True}, account=None, additional_parameters={"ref":"fqw"}) assert res['hits']['total']["value"] == 0, res['hits']['total']["value"] + def test_article_query_ascci_folding(self): + self.article12 = models.Article( + **ArticleFixtureFactory.make_article_with_data({"bibjson": {"title": "I can’t really think in English"}})) + self.article12.save(blocking=True) + qsvc = QueryService() + + res = qsvc.search('query', 'article', MATCH_ALL_RAW_QUERY, account=None, + additional_parameters={}) + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + res = qsvc.search('query', 'article', raw_query("I can't really think in English"), + account=None, additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + res = qsvc.search('query', 'article', raw_query("I can’t really think in English"), + account=None, additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + def test_journal_query_ascii_folding(self): + self.journal = models.Journal(**JournalFixtureFactory.make_journal_with_data(title="I can’t really think in English")) + self.journal.save(blocking=True) + qsvc = QueryService() + + res = qsvc.search('query', 'journal', MATCH_ALL_RAW_QUERY, account=None, + additional_parameters={}) + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + res = qsvc.search('query', 'journal', raw_query("I can't really think in English"), + account=None, 
additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + res = qsvc.search('query', 'journal', raw_query("I can’t really think in English"), + account=None, additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + def test_article_query_ascci_folding_data(self): + self.article12 = models.Article( + **ArticleFixtureFactory.make_article_with_data(title="Kadınlarının sağlık", + publisher_name="Ankara Üniversitesi", + abstract="Araştırma grubunu", country="Türkiye", + author="Sultan GÜÇLÜ")) + self.article12.save(blocking=True) + qsvc = QueryService() + + res = qsvc.search('query', 'article', MATCH_ALL_RAW_QUERY, account=None, + additional_parameters={}) + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check for title + res = qsvc.search('query', 'article', raw_query("Kadinlarinin saglik"), account=None, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # echeck for publisher + res = qsvc.search('query', 'article', raw_query("Ankara Universitesi"), account=None, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check for abstract + res = qsvc.search('query', 'article', raw_query("Arastırma grubunu"), account=None, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check for country + res = qsvc.search('query', 'article', raw_query("Turkiye"), account=None, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check for author + res = qsvc.search('query', 'article', raw_query("Sultan GUCLU"), account=None, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + def 
test_journal_query_ascii_folding_data(self): + self.journal = models.Journal(**JournalFixtureFactory + .make_journal_with_data(title="Kadınlarının sağlık", + publisher_name="Ankara Üniversitesi", + country="Türkiye", + alternative_title="Dirasat: Shariía and Law Sciences")) + self.journal.save(blocking=True) + qsvc = QueryService() + + # check if journal exist + res = qsvc.search('query', 'journal', MATCH_ALL_RAW_QUERY, account=None, + additional_parameters={}) + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check for title search + res = qsvc.search('query', 'journal', raw_query("Kadinlarinin saglik"), account=None, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check for publisher name + res = qsvc.search('query', 'journal', raw_query("Ankara Universitesi"), account=None, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check for country + res = qsvc.search('query', 'journal', raw_query("Turkiye"), account=None, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check alternative title + res = qsvc.search('query', 'journal', raw_query("Shariia"), + account=None, additional_parameters={}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + def test_application_query_ascii_folding_data(self): + acc = models.Account(**AccountFixtureFactory.make_managing_editor_source()) + application = models.Application(**ApplicationFixtureFactory + .make_application_with_data(title="Kadınlarının sağlık", + publisher_name="Ankara Üniversitesi", + country="Türkiye", )) + application.save(blocking=True) + qsvc = QueryService() + + # check if journal exist + res = qsvc.search('editor_query', 'suggestion', MATCH_ALL_RAW_QUERY, account=acc, + additional_parameters={}) + assert res['hits']['total']["value"] 
== 1, res['hits']['total']["value"] + + # check for title search + res = qsvc.search('editor_query', 'suggestion', raw_query("Kadinlarinin saglik"), account=acc, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check for publisher name + res = qsvc.search('editor_query', 'suggestion', raw_query("Ankara Universitesi"), account=acc, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + + # check for country + res = qsvc.search('editor_query', 'suggestion', raw_query("Turkiye"), account=acc, + additional_parameters={"ref": "fqw"}) + + assert res['hits']['total']["value"] == 1, res['hits']['total']["value"] + def test_search__invalid_from(self): acc = models.Account(**AccountFixtureFactory.make_managing_editor_source()) acc.save(blocking=True) @@ -604,4 +773,4 @@ def test_search__invalid_from(self): 'sort': [{'_score': {'order': 'desc'}}], 'track_total_hits': 'true'} with pytest.raises(RequestError): - QueryService().search('admin_query', 'journal', query, account=acc, additional_parameters={}) + QueryService().search('admin_query', 'journal', query, account=acc, additional_parameters={}) \ No newline at end of file diff --git a/portality/lib/es_data_mapping.py b/portality/lib/es_data_mapping.py index 9e57d929e..553dd9ddc 100644 --- a/portality/lib/es_data_mapping.py +++ b/portality/lib/es_data_mapping.py @@ -22,6 +22,7 @@ def get_mappings(app): for cname in mapping_daos: klazz = plugin.load_class_raw(cname) mappings[klazz.__type__] = {'mappings': klazz().mappings()} + mappings[klazz.__type__]['settings'] = app.config["DEFAULT_INDEX_SETTINGS"] return mappings diff --git a/portality/migrate/3490_ascii_folding/README.md b/portality/migrate/3490_ascii_folding/README.md new file mode 100644 index 000000000..48a1ac6da --- /dev/null +++ b/portality/migrate/3490_ascii_folding/README.md @@ -0,0 +1,7 @@ +# 2024-03-07; Issue 3490 - ASCII folding 
for search + +## Execution + +Run the migration with + + python portality/scripts/es_reindex.py portality/migrate/3490_ascii_folding/migrate.json \ No newline at end of file diff --git a/portality/migrate/3490_ascii_folding/__init__.py b/portality/migrate/3490_ascii_folding/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/portality/migrate/3490_ascii_folding/migrate.json b/portality/migrate/3490_ascii_folding/migrate.json new file mode 100644 index 000000000..97f17cd40 --- /dev/null +++ b/portality/migrate/3490_ascii_folding/migrate.json @@ -0,0 +1,21 @@ +{ + "new_version": "-20240307_ascii_folding", + "old_version": "", + "types": [ + { + "type" : "article", + "migrate": true, + "set_alias": false + }, + { + "type": "journal", + "migrate": true, + "set_alias": false + }, + { + "type": "application", + "migrate": true, + "set_alias": false + } + ] +} \ No newline at end of file diff --git a/portality/models/article.py b/portality/models/article.py index 3cb2210e5..963506abe 100644 --- a/portality/models/article.py +++ b/portality/models/article.py @@ -6,11 +6,16 @@ from datetime import datetime from portality import datasets, constants +from portality.core import app from portality.dao import DomainObject +from portality.lib import es_data_mapping +from portality.lib.coerce import COERCE_MAP from portality.lib.dates import FMT_DATETIME_STD +from portality.lib.seamless import SeamlessMixin from portality.models import Journal from portality.models.v1.bibjson import GenericBibJSON # NOTE that article specifically uses the v1 BibJSON from portality.models.v1 import shared_structs +from portality.models.v2.shared_structs import ARTICLE_STRUCT from portality.lib import normalise, dates @@ -21,9 +26,75 @@ class NoValidOwnerException(Exception): pass -class Article(DomainObject): +ARTICLE_BIBJSON_EXTENSION = { + "objects" : ["bibjson"], + "structs" : { + "bibjson" : { + "fields" : { + "year" : {"coerce" : "unicode"}, + "month" : {"coerce" 
: "unicode"}, + "start_page" : {"coerce" : "unicode"}, + "end_page" : {"coerce" : "unicode"}, + "abstract" : {"coerce" : "unicode"} + }, + "lists" : { + "author" : {"contains" : "object"} + }, + "objects" : [ + "journal" + ], + + "structs" : { + "author" : { + "fields" : { + "name" : {"coerce" : "unicode"}, + "affiliation" : {"coerce" : "unicode"}, + "email" : {"coerce": "unicode"}, + "orcid_id" : {"coerce" : "unicode"} + } + }, + + "journal" : { + "fields" : { + "volume" : {"coerce" : "unicode"}, + "number" : {"coerce" : "unicode"}, + "publisher" : {"coerce" : "unicode"}, + "title" : {"coerce" : "unicode"}, + "country" : {"coerce" : "unicode"} + }, + "lists" : { + "language" : {"contains" : "field", "coerce" : "unicode"}, + "issns" : {"contains" : "field", "coerce" : "unicode"} + } + } + } + + } + } +} + +MAPPING_OPTS = { + "dynamic": None, + "coerces": app.config["DATAOBJ_TO_MAPPING_DEFAULTS"], + "exceptions": app.config["ARTICLE_EXCEPTION_MAPPING"], + "additional_mappings": {} +} + + +class Article(SeamlessMixin, DomainObject): __type__ = "article" + __SEAMLESS_STRUCT__ = [ + ARTICLE_STRUCT, + shared_structs.SHARED_BIBJSON, + ARTICLE_BIBJSON_EXTENSION + ] + + __SEAMLESS_COERCE__ = COERCE_MAP + + def mappings(self): + return es_data_mapping.create_mapping(self.__seamless_struct__.raw, MAPPING_OPTS) + @classmethod def duplicates(cls, publisher_record_id=None, doi=None, fulltexts=None, title=None, volume=None, number=None, start=None, should_match=None, size=10): # some input sanitisation @@ -840,52 +911,6 @@ def lcc_codes_full_list(self): return ["LCC:" + x for x in full_list if x is not None] -ARTICLE_BIBJSON_EXTENSION = { - "objects" : ["bibjson"], - "structs" : { - "bibjson" : { - "fields" : { - "year" : {"coerce" : "unicode"}, - "month" : {"coerce" : "unicode"}, - "start_page" : {"coerce" : "unicode"}, - "end_page" : {"coerce" : "unicode"}, - "abstract" : {"coerce" : "unicode"} - }, - "lists" : { - "author" : {"contains" : "object"} - }, - "objects" : [ - 
"journal" - ], - - "structs" : { - "author" : { - "fields" : { - "name" : {"coerce" : "unicode"}, - "affiliation" : {"coerce" : "unicode"}, - "email" : {"coerce": "unicode"}, - "orcid_id" : {"coerce" : "unicode"} - } - }, - - "journal" : { - "fields" : { - "volume" : {"coerce" : "unicode"}, - "number" : {"coerce" : "unicode"}, - "publisher" : {"coerce" : "unicode"}, - "title" : {"coerce" : "unicode"}, - "country" : {"coerce" : "unicode"} - }, - "lists" : { - "language" : {"contains" : "field", "coerce" : "unicode"}, - "issns" : {"contains" : "field", "coerce" : "unicode"} - } - } - } - - } - } -} ################################################## diff --git a/portality/models/v2/application.py b/portality/models/v2/application.py index c8f42d5c4..437d049c1 100644 --- a/portality/models/v2/application.py +++ b/portality/models/v2/application.py @@ -266,7 +266,7 @@ class AllPublisherApplications(DomainObject): MAPPING_OPTS = { "dynamic": None, "coerces": Journal.add_mapping_extensions(app.config["DATAOBJ_TO_MAPPING_DEFAULTS"]), - "exceptions": app.config["ADMIN_NOTES_SEARCH_MAPPING"], + "exceptions": {**app.config["ADMIN_NOTES_SEARCH_MAPPING"], **app.config["JOURNAL_EXCEPTION_MAPPING"]}, "additional_mappings": app.config["ADMIN_NOTES_INDEX_ONLY_FIELDS"] } diff --git a/portality/models/v2/journal.py b/portality/models/v2/journal.py index f97f8084f..a8664ddec 100644 --- a/portality/models/v2/journal.py +++ b/portality/models/v2/journal.py @@ -931,7 +931,7 @@ def _calculate_has_apc(self): MAPPING_OPTS = { "dynamic": None, "coerces": Journal.add_mapping_extensions(app.config["DATAOBJ_TO_MAPPING_DEFAULTS"]), - "exceptions": app.config["ADMIN_NOTES_SEARCH_MAPPING"], + "exceptions": {**app.config["ADMIN_NOTES_SEARCH_MAPPING"], **app.config["JOURNAL_EXCEPTION_MAPPING"]}, "additional_mappings": app.config["ADMIN_NOTES_INDEX_ONLY_FIELDS"] } diff --git a/portality/models/v2/shared_structs.py b/portality/models/v2/shared_structs.py index 6c2c031af..9aa222bf7 100644 --- 
a/portality/models/v2/shared_structs.py +++ b/portality/models/v2/shared_structs.py @@ -237,4 +237,47 @@ } } } +} + +ARTICLE_STRUCT = { + "fields" : { + "created_date": {"coerce": "utcdatetime"}, + "es_type": {"coerce": "unicode"}, + "id": {"coerce": "unicode"}, + "last_updated": {"coerce": "utcdatetime"}, + }, + "objects": [ + "admin", "index" + ], + "structs": { + "admin": { + "fields": { + "in_doaj": {"coerce": "bool"}, + "publisher_record_id": {"coerce": "unicode"}, + "seal": {"coerce": "bool"}, + "upload_id": {"coerce": "unicode"} + } + }, + "index": { + "fields": { + "asciiunpunctitle" : {"coerce" : "unicode"}, + "classification" : {"coerce" : "unicode"}, + "classification_paths": {"coerce" : "unicode"}, + "country" : {"coerce" : "unicode"}, + "date" : {"coerce" : "utcdatetime"}, + "date_toc_fv_month": {"coerce" : "utcdatetime"}, + "doi": {"coerce" : "unicode"}, + "fulltext": {"coerce" : "unicode"}, + "has_seal" : {"coerce" : "unicode"}, + "issn": {"coerce" : "unicode"}, + "language": {"coerce" : "unicode"}, + "publisher": {"coerce" : "unicode"}, + "schema_code": {"coerce" : "unicode"}, + "schema_codes_tree": {"coerce" : "unicode"}, + "schema_subject": {"coerce" : "unicode"}, + "subject": {"coerce" : "unicode"}, + "unpunctitle": {"coerce" : "unicode"} + } + } + } } \ No newline at end of file diff --git a/portality/scripts/es_reindex.py b/portality/scripts/es_reindex.py index 4893b1afc..73885c8ed 100644 --- a/portality/scripts/es_reindex.py +++ b/portality/scripts/es_reindex.py @@ -65,60 +65,62 @@ def do_import(config): # 2. re index with old index # 3. 
set alias for new index (if requested) for s in config.get("types", []): - import_type = s["type"] - if import_type in mappings: - - # index names - default_index_name = app.config['ELASTIC_SEARCH_DB_PREFIX'] + import_type - new_index = default_index_name + version - old_index = default_index_name + previous_version - - if not es_connection.indices.exists(new_index): - try: - # create new index - r = es_connection.indices.create(index=new_index, body=mappings[import_type]) - print("Creating ES Type + Mapping in index {0} for {1}; status: {2}".format(new_index, import_type, r)) - - # reindex from the old index - print("Reindexing from {0} to {1}".format(old_index, new_index)) - retry_count = 0 - max_retries = 5 - success = False - while not success and retry_count < max_retries: - try: - result, errors = helpers.reindex(client=es_connection, source_index=old_index, - target_index=new_index) - if errors: - print(f"Some documents failed to reindex: {import_type}", errors) - else: - success = True - print(f"Reindex completed successfully: {import_type}", result) - # add alias - if s.get("set_alias", False): - es_connection.indices.put_alias(index=new_index, name=default_index_name) - print("alias set for {0} as {1}".format(new_index, default_index_name)) + if s.get("migrate", False) is True: + import_type = s["type"] + if import_type in mappings: + + # index names + default_index_name = app.config['ELASTIC_SEARCH_DB_PREFIX'] + import_type + new_index = default_index_name + version + old_index = default_index_name + previous_version + + if not es_connection.indices.exists(new_index): + try: + # create new index + r = es_connection.indices.create(index=new_index, body=mappings[import_type]) + print("Creating ES Type + Mapping in index {0} for {1}; status: {2}".format(new_index, + import_type, r)) + + # reindex from the old index + print("Reindexing from {0} to {1}".format(old_index, new_index)) + retry_count = 0 + max_retries = 5 + success = False + while not success and 
retry_count < max_retries: + try: + result, errors = helpers.reindex(client=es_connection, source_index=old_index, + target_index=new_index) + if errors: + print(f"Some documents failed to reindex: {import_type}", errors) else: - print("alias not set for {0}".format(new_index)) - except ConnectionError: - retry_count += 1 - print(f"Timeout occurred, retrying {retry_count}/{max_retries}") - time.sleep(10) # Wait for 10 seconds before retrying - - if not success: - print("Failed to complete the reindexing after several retries.") - - except ConnectionError as e: - print(f"Failed to connect to Elasticsearch server. {e.info}") - except NotFoundError as e: - print(f"The specified index or alias does not exist. {e.info}") - except RequestError as e: - print(f"Bad request: {e.info}") - except AuthorizationException as e: - print(f"You do not have permission to perform this operation. {e.info}") - except Exception as e: - print(f"An unexpected error occurred: {e}") - else: - print("ES Type + Mapping already exists in index {0} for {1}".format(new_index, import_type)) + success = True + print(f"Reindex completed successfully: {import_type}", result) + # add alias + if s.get("set_alias", False): + es_connection.indices.put_alias(index=new_index, name=default_index_name) + print("alias set for {0} as {1}".format(new_index, default_index_name)) + else: + print("alias not set for {0}".format(new_index)) + except ConnectionError: + retry_count += 1 + print(f"Timeout occurred, retrying {retry_count}/{max_retries}") + time.sleep(10) # Wait for 10 seconds before retrying + + if not success: + print("Failed to complete the reindexing after several retries.") + + except ConnectionError as e: + print(f"Failed to connect to Elasticsearch server. {e.info}") + except NotFoundError as e: + print(f"The specified index or alias does not exist. 
{e.info}") + except RequestError as e: + print(f"Bad request: {e.info}") + except AuthorizationException as e: + print(f"You do not have permission to perform this operation. {e.info}") + except Exception as e: + print(f"An unexpected error occurred: {e}") + else: + print("ES Type + Mapping already exists in index {0} for {1}".format(new_index, import_type)) if __name__ == '__main__': diff --git a/portality/settings.py b/portality/settings.py index 69619ff1e..0ea5c8009 100644 --- a/portality/settings.py +++ b/portality/settings.py @@ -477,6 +477,7 @@ # an array of DAO classes from which to retrieve the type-specific ES mappings # to be loaded into the index during initialisation. ELASTIC_SEARCH_MAPPINGS = [ + "portality.models.Article", "portality.models.Journal", # ~~->Journal:Model~~ "portality.models.Application", # ~~->Application:Model~~ "portality.models.DraftApplication", # ~~-> DraftApplication:Model~~ @@ -650,7 +651,15 @@ DEFAULT_INDEX_SETTINGS = \ { 'number_of_shards': 4, - 'number_of_replicas': 1 + 'number_of_replicas': 1, + 'analysis': { + 'analyzer': { + 'ascii_folded': { + 'tokenizer': 'standard', + 'filter': ['lowercase', 'asciifolding'] + } + } + } } @@ -686,7 +695,6 @@ } } -MAPPINGS['article'] = MAPPINGS["account"] #~~->Article:Model~~ MAPPINGS['upload'] = MAPPINGS["account"] #~~->Upload:Model~~ MAPPINGS['bulk_articles'] = MAPPINGS["account"] #~~->BulkArticles:Model~~ MAPPINGS['cache'] = MAPPINGS["account"] #~~->Cache:Model~~ @@ -976,6 +984,24 @@ } } +ASCII_FOLDED = {"analyzer": "ascii_folded", "search_analyzer": "ascii_folded"} + +JOURNAL_EXCEPTION_MAPPING = { + "bibjson.title" : {**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED}, + "bibjson.alternative_title" : {**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED}, + "bibjson.publisher.name" : {**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED}, + "index.country" : {**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED}, + "index.title": 
{**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED} +} + +ARTICLE_EXCEPTION_MAPPING = { + "bibjson.abstract" : {**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED}, + "bibjson.author.name" : {**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED}, + "bibjson.journal.publisher": {**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED}, + "index.country": {**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED}, + "bibjson.title": {**DATAOBJ_TO_MAPPING_DEFAULTS["unicode"], **ASCII_FOLDED} +} + #################################################### # Autocomplete