From cb2c13c1e37f8019d0a8b455c4acabc6edd22ea9 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Thu, 26 Oct 2023 11:57:55 -0700 Subject: [PATCH 01/20] Run migrations as their own target in Makefile, rather than by default. --- .gitlab-ci.yml | 4 ++-- Makefile | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6d9f1ec1..ae5fd8e9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,7 +25,7 @@ build_qa: - docker push "$QA_ECR_API_BASE_URL:latest" only: - develop - - feature/CV2-3482_deploy-new-model + - bugfix/no-migrations-test deploy_qa: image: python:3-alpine @@ -63,7 +63,7 @@ deploy_qa: - echo "new Image was deployed $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA" only: - develop - - feature/CV2-3482_deploy-new-model + - bugfix/no-migrations-test build_live: image: docker:latest diff --git a/Makefile b/Makefile index 9a687196..d46ab0c1 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,12 @@ .PHONY: run test wait -run: wait +migration: wait python manage.py init_perl_functions python manage.py init python manage.py db stamp head python manage.py db upgrade + +run: wait python manage.py run # The model and worker entry points run repeatedly to From fd0f01833ada7a2d5c7e6940f163f7593a0560e3 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Sun, 29 Oct 2023 14:54:12 -0700 Subject: [PATCH 02/20] Test migration task in QA --- .gitlab-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ae5fd8e9..1f3da7a8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -44,6 +44,12 @@ deploy_qa: - pip install ecs-deploy==1.14.0 - pip install awscli==1.29.59 - aws ssm get-parameters-by-path --region $AWS_DEFAULT_REGION --path /qa/alegre/ --recursive --with-decryption --output text --query "Parameters[].[Name]" | sed -E 's#/qa/alegre/##' > env.qa.names + - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-migration $NAME /qa/alegre/$NAME " >> qa-alegre-migration.env.args; done + - ecs 
update qa-alegre-migration --image qa-alegre-migration $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --exclusive-env -e qa-alegre-migration APP alegre -e qa-alegre-migration DEPLOY_ENV qa -e qa-alegre-migration AWS_REGION $AWS_DEFAULT_REGION -e qa-alegre-migration ALEGRE_PORT 8000 --exclusive-secrets `cat qa-alegre-migration.env.args` + - taskArn=$(aws ecs run-task --cluster ecs-qa --task-definition live-check-api-migration --query 'tasks[].taskArn' --output text) + - echo "Migration task started - $taskArn" + - aws ecs wait tasks-stopped --cluster ecs-qa --tasks $taskArn + - echo "Migration task finished." - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-c $NAME /qa/alegre/$NAME " >> qa-alegre-c.env.args; done - ecs deploy ecs-qa qa-alegre --diff --image qa-alegre-c $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e qa-alegre-c APP alegre -e qa-alegre-c DEPLOY_ENV qa -e qa-alegre-c ALEGRE_PORT 8000 --exclusive-secrets `cat qa-alegre-c.env.args` - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-indiansbert $NAME /qa/alegre/$NAME " >> qa-alegre-indiansbert.env.args; done From 37020106f24227afb6f7cebe34b79889501f1b2e Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Sun, 29 Oct 2023 15:30:58 -0700 Subject: [PATCH 03/20] Fix typo. 
--- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1f3da7a8..d9cae4e5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -46,7 +46,7 @@ deploy_qa: - aws ssm get-parameters-by-path --region $AWS_DEFAULT_REGION --path /qa/alegre/ --recursive --with-decryption --output text --query "Parameters[].[Name]" | sed -E 's#/qa/alegre/##' > env.qa.names - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-migration $NAME /qa/alegre/$NAME " >> qa-alegre-migration.env.args; done - ecs update qa-alegre-migration --image qa-alegre-migration $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --exclusive-env -e qa-alegre-migration APP alegre -e qa-alegre-migration DEPLOY_ENV qa -e qa-alegre-migration AWS_REGION $AWS_DEFAULT_REGION -e qa-alegre-migration ALEGRE_PORT 8000 --exclusive-secrets `cat qa-alegre-migration.env.args` - - taskArn=$(aws ecs run-task --cluster ecs-qa --task-definition live-check-api-migration --query 'tasks[].taskArn' --output text) + - taskArn=$(aws ecs run-task --cluster ecs-qa --task-definition qa-alegre-migration --query 'tasks[].taskArn' --output text) - echo "Migration task started - $taskArn" - aws ecs wait tasks-stopped --cluster ecs-qa --tasks $taskArn - echo "Migration task finished." From 749667cc7b50d371159d9d84666204e0ba661078 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Mon, 30 Oct 2023 08:38:23 -0700 Subject: [PATCH 04/20] Add migration step to Live deployments and remove branch builds in preparation for merge. 
---
 .gitlab-ci.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d9cae4e5..51184ba2 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -25,7 +25,6 @@ build_qa:
     - docker push "$QA_ECR_API_BASE_URL:latest"
   only:
     - develop
-    - bugfix/no-migrations-test
 
 deploy_qa:
   image: python:3-alpine
@@ -69,7 +68,6 @@ deploy_qa:
     - echo "new Image was deployed $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA"
   only:
     - develop
-    - bugfix/no-migrations-test
 
 build_live:
   image: docker:latest
@@ -110,6 +108,12 @@ deploy_live:
     - pip install ecs-deploy==1.14.0
     - pip install awscli==1.29.59
     - aws ssm get-parameters-by-path --region $AWS_DEFAULT_REGION --path /live/alegre/ --recursive --with-decryption --output text --query "Parameters[].[Name]" | sed -E 's#/live/alegre/##' > env.live.names
+    - for NAME in `cat env.live.names`; do echo -n "-s live-alegre-migration $NAME /live/alegre/$NAME " >> live-alegre-migration.env.args; done
+    - ecs update live-alegre-migration --image live-alegre-migration $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA --exclusive-env -e live-alegre-migration APP alegre -e live-alegre-migration DEPLOY_ENV live -e live-alegre-migration AWS_REGION $AWS_DEFAULT_REGION -e live-alegre-migration ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-migration.env.args`
+    - taskArn=$(aws ecs run-task --cluster ecs-live --task-definition live-alegre-migration --query 'tasks[].taskArn' --output text)
+    - echo "Migration task started - $taskArn"
+    - aws ecs wait tasks-stopped --cluster ecs-live --tasks $taskArn
+    - echo "Migration task finished."
- for NAME in `cat env.live.names`; do echo -n "-s live-alegre-c $NAME /live/alegre/$NAME " >> live-alegre-c.env.args; done - ecs deploy ecs-live live-alegre --image live-alegre-c $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e live-alegre-c APP alegre -e live-alegre-c DEPLOY_ENV live -e live-alegre-c ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-c.env.args` - for NAME in `cat env.live.names`; do echo -n "-s live-alegre-indiansbert $NAME /live/alegre/$NAME " >> live-alegre-indiansbert.env.args; done From 0859d6437fbe4fd74360bd7e4837081b3386d721 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Sat, 4 Nov 2023 08:41:13 -0700 Subject: [PATCH 05/20] Build and deploy for this branch. --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 51184ba2..ee6f5990 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,6 +25,7 @@ build_qa: - docker push "$QA_ECR_API_BASE_URL:latest" only: - develop + - bugfix/no-migrations-test deploy_qa: image: python:3-alpine @@ -68,6 +69,7 @@ deploy_qa: - echo "new Image was deployed $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA" only: - develop + - bugfix/no-migrations-test build_live: image: docker:latest From ca2fe08f5148994a93524853cb8bc5ca34e67059 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Mon, 20 Nov 2023 21:52:54 -0700 Subject: [PATCH 06/20] ELASTICSEARCH is now OPENSEARCH --- Makefile | 1 + app/main/config.py | 6 +- .../controller/bulk_similarity_controller.py | 4 +- .../bulk_update_similarity_controller.py | 4 +- app/main/controller/healthcheck_controller.py | 16 ++-- app/main/lib/elasticsearch.py | 20 ++-- app/main/lib/language_analyzers.py | 6 +- app/main/lib/reindex_analyzers.py | 16 ++-- app/main/lib/text_similarity.py | 10 +- app/test/test_bulk_similarity.py | 14 +-- app/test/test_bulk_update_similarity.py | 14 +-- app/test/test_healthcheck.py | 4 +- app/test/test_similarity.py | 92 +++++++++---------- app/test/test_similarity_lang_analyzers.py | 
34 +++---- app/test/test_text_similarity.py | 10 +- manage.py | 38 ++++++-- opensearch/alegre_similarity.json | 30 +++--- opensearch/alegre_similarity_base.json | 10 ++ production/bin/migrate_entrypoint.sh | 12 +++ production/bin/run_migrations.sh | 18 ++++ 20 files changed, 211 insertions(+), 148 deletions(-) create mode 100644 opensearch/alegre_similarity_base.json create mode 100755 production/bin/migrate_entrypoint.sh create mode 100755 production/bin/run_migrations.sh diff --git a/Makefile b/Makefile index d46ab0c1..586c0f65 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ migration: wait python manage.py init python manage.py db stamp head python manage.py db upgrade + echo "Migrations complete." run: wait python manage.py run diff --git a/app/main/config.py b/app/main/config.py index fe4dfba9..4961350a 100644 --- a/app/main/config.py +++ b/app/main/config.py @@ -6,8 +6,8 @@ class Config: SECRET_KEY = os.getenv('SECRET_KEY', 'my_precious_secret_key') DEBUG = False - ELASTICSEARCH_URL = os.getenv('ELASTICSEARCH_URL', 'http://elasticsearch:9200') - ELASTICSEARCH_SIMILARITY = 'alegre_similarity' + OPENSEARCH_URL = os.getenv('OPENSEARCH_URL', 'http://elasticsearch:9200') + OPENSEARCH_SIMILARITY = 'alegre_similarity' REDIS_HOST = os.getenv('REDIS_HOST', 'redis') REDIS_PORT = os.getenv('REDIS_PORT', 6379) REDIS_DATABASE = os.getenv('REDIS_DATABASE', 0) @@ -49,7 +49,7 @@ class TestingConfig(Config): DEBUG = True TESTING = True PRESERVE_CONTEXT_ON_EXCEPTION = False - ELASTICSEARCH_SIMILARITY = 'alegre_similarity_test' + OPENSEARCH_SIMILARITY = 'alegre_similarity_test' REDIS_DATABASE = os.getenv('REDIS_DATABASE', 1) SQLALCHEMY_DATABASE_URI = 'postgresql+psycopg2://%(user)s:%(password)s@%(host)s/%(dbname)s?client_encoding=utf8' % { 'user': os.getenv('DATABASE_USER', 'postgres'), diff --git a/app/main/controller/bulk_similarity_controller.py b/app/main/controller/bulk_similarity_controller.py index 151793e4..15b4a3ae 100644 --- 
a/app/main/controller/bulk_similarity_controller.py +++ b/app/main/controller/bulk_similarity_controller.py @@ -26,7 +26,7 @@ class BulkSimilarityResource(Resource): def get_bulk_write_object(self, doc_id, body, op_type="index"): return { "_op_type": op_type, - '_index': app.config['ELASTICSEARCH_SIMILARITY'], + '_index': app.config['OPENSEARCH_SIMILARITY'], '_id': doc_id, '_source': body } @@ -46,7 +46,7 @@ def get_bodies_for_request(self): return doc_ids, bodies def submit_bulk_request(self, doc_ids, bodies, op_type="index"): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) writables = [] for doc_body_set in each_slice(list(zip(doc_ids, bodies)), 8000): to_write = [] diff --git a/app/main/controller/bulk_update_similarity_controller.py b/app/main/controller/bulk_update_similarity_controller.py index 025f9690..a4b82a5a 100644 --- a/app/main/controller/bulk_update_similarity_controller.py +++ b/app/main/controller/bulk_update_similarity_controller.py @@ -71,9 +71,9 @@ def get_cases(params, existing_docs, updateable=True): class BulkUpdateSimilarityResource(Resource): # Assumes less than 10k documents at a time. 
def get_writeable_data_for_request(self): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) params = request.json - existing_docs = get_documents_by_ids(app.config['ELASTICSEARCH_SIMILARITY'], [e.get("doc_id") for e in params.get("documents", [])], es) + existing_docs = get_documents_by_ids(app.config['OPENSEARCH_SIMILARITY'], [e.get("doc_id") for e in params.get("documents", [])], es) updated_cases = get_cases(params, existing_docs) new_cases = get_cases(params, existing_docs, False) return updated_cases, new_cases diff --git a/app/main/controller/healthcheck_controller.py b/app/main/controller/healthcheck_controller.py index 0659828f..01e2f54e 100644 --- a/app/main/controller/healthcheck_controller.py +++ b/app/main/controller/healthcheck_controller.py @@ -15,8 +15,8 @@ class HealthcheckResource(Resource): @api.doc('Make a healthcheck query') def get(self): result = { - 'ELASTICSEARCH': False, - 'ELASTICSEARCH_SIMILARITY': False, + 'OPENSEARCH': False, + 'OPENSEARCH_SIMILARITY': False, 'REDIS': False, 'DATABASE': False, 'LANGID': False @@ -24,15 +24,15 @@ def get(self): # Elasticsearch try: - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=10, max_retries=3, retry_on_timeout=True) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=10, max_retries=3, retry_on_timeout=True) except Exception as e: - result['ELASTICSEARCH'] = str(e) + result['OPENSEARCH'] = str(e) else: - result['ELASTICSEARCH'] = True - result['ELASTICSEARCH_SIMILARITY'] = True if es.indices.exists( - index=[app.config['ELASTICSEARCH_SIMILARITY']] - ) else 'Index not found `%s`' % app.config['ELASTICSEARCH_SIMILARITY'] + result['OPENSEARCH'] = True + result['OPENSEARCH_SIMILARITY'] = True if es.indices.exists( + index=[app.config['OPENSEARCH_SIMILARITY']] + ) else 'Index not found `%s`' % app.config['OPENSEARCH_SIMILARITY'] # Redis try: diff --git a/app/main/lib/elasticsearch.py b/app/main/lib/elasticsearch.py 
index 18846fe2..c9830f65 100644 --- a/app/main/lib/elasticsearch.py +++ b/app/main/lib/elasticsearch.py @@ -13,7 +13,7 @@ def get_all_documents_matching_context(context): matches, clause_count = generate_matches(context) - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) conditions = [{ 'nested': { 'score_mode': 'none', @@ -36,7 +36,7 @@ def get_all_documents_matching_context(context): docs = scan(es, size=10000, query=body, - index=app.config['ELASTICSEARCH_SIMILARITY'], + index=app.config['OPENSEARCH_SIMILARITY'], ) for hit in docs: yield hit @@ -75,7 +75,7 @@ def merge_contexts(body, found_doc): return body def update_or_create_document(body, doc_id, index): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) result = None if doc_id: try: @@ -107,7 +107,7 @@ def update_or_create_document(body, doc_id, index): return result def store_document(body, doc_id, language=None): - indices = [app.config['ELASTICSEARCH_SIMILARITY']] + indices = [app.config['OPENSEARCH_SIMILARITY']] # 'auto' indicates we should try to guess the appropriate language if language == 'auto': text = body['content'] @@ -118,7 +118,7 @@ def store_document(body, doc_id, language=None): if (language is not None) and (language in SUPPORTED_LANGUAGES): # also cache in the language-specific index - indices.append(app.config['ELASTICSEARCH_SIMILARITY']+"_"+language) + indices.append(app.config['OPENSEARCH_SIMILARITY']+"_"+language) results = [] for index in indices: @@ -138,25 +138,25 @@ def store_document(body, doc_id, language=None): def delete_context_from_found_doc(context, found_doc, doc_id): found_doc["contexts"] = [row for row in found_doc.get("contexts", []) if context != row] - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) result = es.update( id=doc_id, body={"doc": found_doc}, - 
index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) return result def delete_document(doc_id, context, quiet): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) try: - found_doc = es.get(index=app.config['ELASTICSEARCH_SIMILARITY'], id=doc_id) + found_doc = es.get(index=app.config['OPENSEARCH_SIMILARITY'], id=doc_id) except opensearchpy.exceptions.NotFoundError: found_doc = None try: if found_doc and context in found_doc.get("contexts", []) and len(found_doc.get("contexts", [])) > 1: return delete_context_from_found_doc(context, found_doc, doc_id) else: - return es.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], id=doc_id) + return es.delete(index=app.config['OPENSEARCH_SIMILARITY'], id=doc_id) except: if quiet: return { diff --git a/app/main/lib/language_analyzers.py b/app/main/lib/language_analyzers.py index 8859ffed..f7da1b37 100644 --- a/app/main/lib/language_analyzers.py +++ b/app/main/lib/language_analyzers.py @@ -304,17 +304,17 @@ } def init_indices(): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) indices = es.cat.indices(h='index', s='index').split() for lang in SUPPORTED_LANGUAGES: - index_name = app.config['ELASTICSEARCH_SIMILARITY']+"_"+lang + index_name = app.config['OPENSEARCH_SIMILARITY']+"_"+lang if index_name not in indices: es.indices.create(index=index_name) else: es.indices.delete(index=index_name) es.indices.create(index=index_name) es.indices.close(index=index_name) - mapping = json.load(open('./elasticsearch/alegre_similarity_base.json')) + mapping = json.load(open('./opensearch/alegre_similarity_base.json')) mapping["properties"]["content"]["analyzer"] = "rebuilt_"+lang es.indices.put_settings( body=SETTINGS_BY_LANGUAGE[lang], diff --git a/app/main/lib/reindex_analyzers.py b/app/main/lib/reindex_analyzers.py index dc13859c..019dca41 100644 --- a/app/main/lib/reindex_analyzers.py +++ 
b/app/main/lib/reindex_analyzers.py @@ -10,11 +10,11 @@ from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES import cld3 def get_all_documents(): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) try: docs = scan(es, size=10000, - index=app.config['ELASTICSEARCH_SIMILARITY'], + index=app.config['OPENSEARCH_SIMILARITY'], ) for hit in docs: yield hit @@ -23,7 +23,7 @@ def get_all_documents(): return [] def get_docs_to_transform(team_id, language=None): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) docs_to_transform = {} for doc in get_all_documents_matching_context({"team_id": team_id}): if not language: @@ -44,12 +44,12 @@ def get_cached_docs_to_transform(team_id, language=None): return get_docs_to_transform(team_id, language) def store_updated_docs(docs_to_transform): - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) for doc_id, language in docs_to_transform.items(): try: - already_done = es.get(index=app.config['ELASTICSEARCH_SIMILARITY']+"_"+language, id=doc_id) + already_done = es.get(index=app.config['OPENSEARCH_SIMILARITY']+"_"+language, id=doc_id) except opensearchpy.exceptions.NotFoundError: - found_doc = es.get(index=app.config['ELASTICSEARCH_SIMILARITY'], id=doc_id) + found_doc = es.get(index=app.config['OPENSEARCH_SIMILARITY'], id=doc_id) if found_doc: source = found_doc["_source"] keys_to_pop = [e for e in source.keys() if 'vector' in e or 'model_' in e] @@ -59,7 +59,7 @@ def store_updated_docs(docs_to_transform): finished = False while not finished and fail_count < 5: try: - update_or_create_document(source, doc_id, app.config['ELASTICSEARCH_SIMILARITY']+"_"+language) + update_or_create_document(source, doc_id, app.config['OPENSEARCH_SIMILARITY']+"_"+language) finished = True except opensearchpy.exceptions.ConnectionError: 
fail_count += 1 @@ -68,4 +68,4 @@ def run(team_id, language=None): if language is not None and language not in SUPPORTED_LANGUAGES: raise Exception(f"Unsupported language: {language} is not a supported language.") docs_to_transform = get_cached_docs_to_transform(team_id, language) - store_updated_docs(docs_to_transform) \ No newline at end of file + store_updated_docs(docs_to_transform) diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index d2660b56..11ee16d6 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -7,7 +7,7 @@ #from app.main.lib.langid import Cld3LangidProvider as LangidProvider from app.main.lib.langid import GoogleLangidProvider as LangidProvider from app.main.lib.openai import retrieve_openai_embeddings, PREFIX_OPENAI -ELASTICSEARCH_DEFAULT_LIMIT = 10000 +OPENSEARCH_DEFAULT_LIMIT = 10000 def delete_text(doc_id, context, quiet): return delete_document(doc_id, context, quiet) @@ -158,11 +158,11 @@ def search_text_by_model(search_params): model_key, threshold = get_model_and_threshold(search_params) app.logger.info( f"[Alegre Similarity] search_text_by_model:model_key {model_key}, threshold:{threshold}") - es = OpenSearch(app.config['ELASTICSEARCH_URL'], timeout=30) + es = OpenSearch(app.config['OPENSEARCH_URL'], timeout=30) conditions = [] matches = [] clause_count = 0 - search_indices = [app.config['ELASTICSEARCH_SIMILARITY']] + search_indices = [app.config['OPENSEARCH_SIMILARITY']] if 'context' in search_params: matches, clause_count = generate_matches(search_params['context']) if clause_count >= app.config['MAX_CLAUSE_COUNT']: @@ -178,7 +178,7 @@ def search_text_by_model(search_params): app.logger.warning('Detected language in query text {} is not explicitly supported for indexing, defaulting to "none"'.format(language)) language = None if language in SUPPORTED_LANGUAGES: - search_indices.append(app.config['ELASTICSEARCH_SIMILARITY']+"_"+language) + 
search_indices.append(app.config['OPENSEARCH_SIMILARITY']+"_"+language) elif language: error_text = f"[Alegre Similarity] [Similarity type: text] Language parameter value of {language} for text similarity search asserted, but not in SUPPORTED_LANGUAGES" app.logger.info(error_text) @@ -207,7 +207,7 @@ def search_text_by_model(search_params): body = get_body_from_conditions(conditions) app.logger.info(f"Sending OpenSearch query: {body}") result = es.search( - size=limit or ELASTICSEARCH_DEFAULT_LIMIT, #NOTE a default limit is given in similarity.py + size=limit or OPENSEARCH_DEFAULT_LIMIT, #NOTE a default limit is given in similarity.py body=body, index=search_indices ) diff --git a/app/test/test_bulk_similarity.py b/app/test/test_bulk_similarity.py index 3c97c56c..f127d1b0 100644 --- a/app/test/test_bulk_similarity.py +++ b/app/test/test_bulk_similarity.py @@ -15,22 +15,22 @@ class TestBulkSimilarityBlueprint(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) def test_similarity_mapping(self): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) mapping = es.indices.get_mapping( - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) self.assertDictEqual( json.load(open('./elasticsearch/alegre_similarity.json')), - mapping[app.config['ELASTICSEARCH_SIMILARITY']]['mappings'] + 
mapping[app.config['OPENSEARCH_SIMILARITY']]['mappings'] ) def test_elasticsearch_insert_text_with_doc_id(self): diff --git a/app/test/test_bulk_update_similarity.py b/app/test/test_bulk_update_similarity.py index 49d46994..a597e5bd 100644 --- a/app/test/test_bulk_update_similarity.py +++ b/app/test/test_bulk_update_similarity.py @@ -20,12 +20,12 @@ class TestBulkUpdateSimilarityBlueprint(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) r = redis.Redis(host=app.config['REDIS_HOST'], port=app.config['REDIS_PORT'], db=app.config['REDIS_DATABASE']) r.delete(SharedModelStub.model_key) @@ -33,13 +33,13 @@ def setUp(self): r.srem('SharedModel', SharedModelStub.model_key) def test_similarity_mapping(self): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) mapping = es.indices.get_mapping( - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) self.assertDictEqual( json.load(open('./elasticsearch/alegre_similarity.json')), - mapping[app.config['ELASTICSEARCH_SIMILARITY']]['mappings'] + mapping[app.config['OPENSEARCH_SIMILARITY']]['mappings'] ) def test_elasticsearch_insert_text_with_doc_id(self): diff --git a/app/test/test_healthcheck.py b/app/test/test_healthcheck.py index d9d790cb..bf3f9a3e 100644 --- a/app/test/test_healthcheck.py +++ b/app/test/test_healthcheck.py @@ -45,7 +45,7 @@ def 
test_healthcheck_api_with_wrong_server(self): def test_healthcheck_api_elasticsearch_exception(self): with app.app_context(): - app.config['ELASTICSEARCH_URL']= '' + app.config['OPENSEARCH_URL']= '' response = self.client.get('/healthcheck/') self.assertEqual('application/json', response.content_type) self.assertEqual(500, response.status_code) @@ -59,7 +59,7 @@ def test_healthcheck_api_redis_error_connection(self): def test_healthcheck_api_with_bad_config(self): with app.app_context(): - app.config['ELASTICSEARCH_URL']= 'bad' + app.config['OPENSEARCH_URL']= 'bad' app.config['REDIS_HOST']= 'bad' app.config['SQLALCHEMY_DATABASE_URI']= 'bad' response = self.client.get('/healthcheck/') diff --git a/app/test/test_similarity.py b/app/test/test_similarity.py index dc2f70c4..c1fda543 100644 --- a/app/test/test_similarity.py +++ b/app/test/test_similarity.py @@ -16,22 +16,22 @@ class TestSimilarityBlueprint(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) def test_similarity_mapping(self): - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) mapping = es.indices.get_mapping( - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) self.assertDictEqual( json.load(open('./elasticsearch/alegre_similarity.json')), - mapping[app.config['ELASTICSEARCH_SIMILARITY']]['mappings'] + mapping[app.config['OPENSEARCH_SIMILARITY']]['mappings'] 
) def test_elasticsearch_similarity_english(self): @@ -43,8 +43,8 @@ def test_elasticsearch_similarity_english(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.get( '/text/similarity/', @@ -157,8 +157,8 @@ def test_elasticsearch_similarity_english_models_specified(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.get( '/text/similarity/', @@ -285,14 +285,14 @@ def test_elasticsearch_update_text_listed_context(self): with self.client: term = { 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': [54, 55] } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if e["_source"]['content'] == term['text']][0] term2 = { 'text': 'how to slice a pizza', 'model': 'elasticsearch', 'context': { 'dbid': [54, 55] }, 'doc_id': doc["_id"]} post_response2 = self.client.post('/text/similarity/', data=json.dumps(term2), content_type='application/json') - 
es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if doc["_id"] == e["_id"]][0] self.assertEqual(term2['text'], doc['_source']['content']) @@ -300,8 +300,8 @@ def test_elasticsearch_performs_correct_fuzzy_search(self): with self.client: term = { 'text': 'what even is a banana', 'model': 'elasticsearch', 'context': { 'dbid': 54 } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) lookup = { 'text': 'what even is a bananna', 'model': 'elasticsearch', 'context': { 'dbid': 54 } } post_response = self.client.get('/text/similarity/', data=json.dumps(lookup), content_type='application/json') lookup["fuzzy"] = True @@ -315,14 +315,14 @@ def test_elasticsearch_update_text(self): with self.client: term = { 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': 54 } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if 
e["_source"]['content'] == term['text']][0] term2 = { 'text': 'how to slice a pizza', 'model': 'elasticsearch', 'context': { 'dbid': 54 }, 'doc_id': doc["_id"]} post_response2 = self.client.post('/text/similarity/', data=json.dumps(term2), content_type='application/json') - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if doc["_id"] == e["_id"]][0] self.assertEqual(term2['text'], doc['_source']['content']) @@ -330,14 +330,14 @@ def test_elasticsearch_update_text_with_doc_id(self): with self.client: term = { 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': 54 }, 'doc_id': "123456" } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if e["_source"]['content'] == term['text']][0] term2 = { 'text': 'how to slice a pizza', 'model': 'elasticsearch', 'context': { 'dbid': 54 }, 'doc_id': "123456"} post_response2 = self.client.post('/text/similarity/', data=json.dumps(term2), content_type='application/json') - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + 
es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if doc["_id"] == e["_id"]][0] self.assertEqual(term2['text'], doc['_source']['content']) @@ -364,12 +364,12 @@ def test_elasticsearch_delete_text(self): with self.client: term = { 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': 54 } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) result = json.loads(post_response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if e["_source"]['content'] == term['text']][0] delete_response = self.client.delete( '/text/similarity/', @@ -383,12 +383,12 @@ def test_elasticsearch_delete_text(self): post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') term = { 'doc_id': '123', 'text': 'how to slice a banana', 'model': 'elasticsearch', 'context': { 'dbid': 55 } } post_response = self.client.post('/text/similarity/', data=json.dumps(term), content_type='application/json') - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) result = 
json.loads(post_response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - results = es.search(body={"query": {"match_all": {}}},index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + results = es.search(body={"query": {"match_all": {}}},index=app.config['OPENSEARCH_SIMILARITY']) doc = [e for e in results["hits"]["hits"] if e["_source"]['content'] == term['text']][0] delete_response = self.client.delete( '/text/similarity/', @@ -408,8 +408,8 @@ def test_elasticsearch_similarity_hindi(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.get( '/text/similarity/', data=json.dumps({ @@ -440,8 +440,8 @@ def test_model_similarity(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.get( '/text/similarity/', data=json.dumps({ @@ -514,8 +514,8 @@ def test_wrong_model_key(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.get( '/text/similarity/', @@ -559,8 +559,8 @@ def test_model_similarity_with_vector(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = 
OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) model = SharedModel.get_client(TestSimilarityBlueprint.use_model_key) vector = model.get_shared_model_response('how to delete an invoice') @@ -588,8 +588,8 @@ def test_min_es_search(self): result = json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) response = self.client.get( '/text/similarity/', diff --git a/app/test/test_similarity_lang_analyzers.py b/app/test/test_similarity_lang_analyzers.py index b40a7b3b..f1d78354 100644 --- a/app/test/test_similarity_lang_analyzers.py +++ b/app/test/test_similarity_lang_analyzers.py @@ -17,12 +17,12 @@ class TestSimilarityBlueprint(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) # also make sure all the language specific indices have been dropped and recreated # (this is slow and runs before each test) @@ -36,8 +36,8 @@ def test_all_analyzers(self): response = self.client.post('/text/similarity/', data=json.dumps(example), content_type='application/json') result = 
json.loads(response.data.decode()) self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']+"_"+example['language']) response = self.client.get( '/text/similarity/', data=json.dumps({ @@ -48,7 +48,7 @@ def test_all_analyzers(self): content_type='application/json' ) result = json.loads(response.data.decode()) - self.assertTrue(app.config['ELASTICSEARCH_SIMILARITY']+"_"+example['language'] in [e['_index'] for e in result['result']]) + self.assertTrue(app.config['OPENSEARCH_SIMILARITY']+"_"+example['language'] in [e['_index'] for e in result['result']]) def test_auto_language_id(self): # language examples as input to language classifier @@ -67,11 +67,11 @@ def test_auto_language_id(self): response = self.client.post('/text/similarity/', data=json.dumps(example), content_type='application/json') result = json.loads(response.data.decode()) # we are feeding in 'auto' expected correct id back self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) if expected_lang is None: - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) else: - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']+"_"+expected_lang) response = self.client.get( '/text/similarity/', data=json.dumps({ @@ -83,9 +83,9 @@ def test_auto_language_id(self): ) result = json.loads(response.data.decode()) # indirectly checking classification by confirming which index was included in result - index_alias = app.config['ELASTICSEARCH_SIMILARITY'] + index_alias = app.config['OPENSEARCH_SIMILARITY'] if expected_lang is not None: - 
index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang + index_alias = app.config['OPENSEARCH_SIMILARITY']+"_"+expected_lang self.assertTrue(index_alias in [e['_index'] for e in result['result']]) def test_auto_language_query(self): @@ -105,11 +105,11 @@ def test_auto_language_query(self): response = self.client.post('/text/similarity/', data=json.dumps(example), content_type='application/json') result = json.loads(response.data.decode()) # we are feeding in 'auto' expected correct id back self.assertEqual(True, result['success']) - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + es = OpenSearch(app.config['OPENSEARCH_URL']) if expected_lang is None: - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']) else: - es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang) + es.indices.refresh(index=app.config['OPENSEARCH_SIMILARITY']+"_"+expected_lang) response = self.client.get( '/text/similarity/', data=json.dumps({ @@ -121,9 +121,9 @@ def test_auto_language_query(self): ) result = json.loads(response.data.decode()) # indirectly checking classification by confirming which index was included in result - index_alias = app.config['ELASTICSEARCH_SIMILARITY'] + index_alias = app.config['OPENSEARCH_SIMILARITY'] if expected_lang is not None: - index_alias = app.config['ELASTICSEARCH_SIMILARITY']+"_"+expected_lang + index_alias = app.config['OPENSEARCH_SIMILARITY']+"_"+expected_lang self.assertTrue(index_alias in [e['_index'] for e in result['result']]) diff --git a/app/test/test_text_similarity.py b/app/test/test_text_similarity.py index e6661117..c3ae3002 100644 --- a/app/test/test_text_similarity.py +++ b/app/test/test_text_similarity.py @@ -11,12 +11,12 @@ class TestTextSimilarity(BaseTestCase): def setUp(self): super().setUp() - es = OpenSearch(app.config['ELASTICSEARCH_URL']) - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], 
ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es = OpenSearch(app.config['OPENSEARCH_URL']) + es.indices.delete(index=app.config['OPENSEARCH_SIMILARITY'], ignore=[400, 404]) + es.indices.create(index=app.config['OPENSEARCH_SIMILARITY']) es.indices.put_mapping( body=json.load(open('./elasticsearch/alegre_similarity.json')), - index=app.config['ELASTICSEARCH_SIMILARITY'] + index=app.config['OPENSEARCH_SIMILARITY'] ) def test_get_vector_model_base_conditions(self): @@ -57,4 +57,4 @@ def test_get_document_body(self): if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/manage.py b/manage.py index bffdb1d9..0135c9ae 100644 --- a/manage.py +++ b/manage.py @@ -9,6 +9,8 @@ from sqlalchemy.schema import DDL from sqlalchemy_utils import database_exists, create_database import json_logging +import logging +import sys import redis from rq import Connection, Worker @@ -24,6 +26,9 @@ # (by upgrading to tensorflow 2.2 or higher) import tensorflow as tf +alegre_index_name = os.getenv('ALEGRE_INDEX', 'alegre_similarity') +alegre_init_index_name = os.getenv('ALEGRE_INIT_INDEX', 'alegre_similarity') + config_name = os.getenv('BOILERPLATE_ENV', 'dev') app = create_app(config_name) app.register_blueprint(blueprint) @@ -55,6 +60,11 @@ def test_simple_perl_function(): @manager.command def init_simple_perl_function(): with app.app_context(): + json_logging.init_non_web(enable_json=True) + logger = logging.getLogger("init") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.info("Starting init_simple_perl_function ...") sqlalchemy.event.listen( db.metadata, 'before_create', @@ -115,6 +125,11 @@ def init_simple_perl_function(): @manager.command def init_perl_functions(): with app.app_context(): + json_logging.init_non_web(enable_json=True) + logger = logging.getLogger("init") + logger.setLevel(logging.DEBUG) + 
logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.info("Starting init_perl_functions ...") sqlalchemy.event.listen( db.metadata, 'before_create', @@ -262,25 +277,32 @@ def run_video_matcher(): @manager.command def init(): """Initializes the service.""" + json_logging.init_non_web(enable_json=True) + logger = logging.getLogger("init") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.info("Starting init ...") # Create ES indexes. - es = OpenSearch(app.config['ELASTICSEARCH_URL']) + logger.info("Creating indices with init index name: " + alegre_init_index_name) + es = OpenSearch(app.config['OPENSEARCH_URL']) try: if config_name == 'test': - es.indices.delete(index=app.config['ELASTICSEARCH_SIMILARITY'], ignore=[400, 404]) - es.indices.create(index=app.config['ELASTICSEARCH_SIMILARITY']) + es.indices.delete(index=alegre_init_index_name, ignore=[400, 404]) + es.indices.create(alegre_init_index_name) except TransportError as e: # ignore already existing index if e.error == 'resource_already_exists_exception': pass else: raise - es.indices.put_mapping( - body=json.load(open('./elasticsearch/alegre_similarity.json')), - # include_type_name=True, - index=app.config['ELASTICSEARCH_SIMILARITY'] - ) + # For now, omit mapping updates. + #es.indices.put_mapping( + # body=json.load(open('./opensearch/alegre_similarity.json')), + # index=alegre_init_index_name + #) init_indices() # Create database. 
+ logger.info("Creating database ...") with app.app_context(): if not database_exists(db.engine.url): create_database(db.engine.url) diff --git a/opensearch/alegre_similarity.json b/opensearch/alegre_similarity.json index 1fb35c73..49296ab8 100644 --- a/opensearch/alegre_similarity.json +++ b/opensearch/alegre_similarity.json @@ -1,6 +1,18 @@ { "mappings": { "properties": { + "content": { + "type": "text" + }, + "context": { + "type": "nested" + }, + "vector": { + "type": "double" + }, + "model": { + "type": "keyword" + }, "vector_768": { "type": "knn_vector", "dimension": 768 @@ -10,9 +22,9 @@ "dimension": 768 }, "vector_openai-text-embedding-ada-002": { - "type": "dense_vector", - "dims": 1536 - }, + "type": "knn_vector", + "dimension": 1536 + }, "vector_paraphrase-filipino-mpnet-base-v2": { "type": "knn_vector", "dimension": 768 @@ -20,18 +32,6 @@ "vector_indian-sbert": { "type": "knn_vector", "dimension": 768 - }, - "content": { - "type": "text" - }, - "context": { - "type": "nested" - }, - "vector": { - "type": "double" - }, - "model": { - "type": "keyword" } } } diff --git a/opensearch/alegre_similarity_base.json b/opensearch/alegre_similarity_base.json new file mode 100644 index 00000000..df0ec35a --- /dev/null +++ b/opensearch/alegre_similarity_base.json @@ -0,0 +1,10 @@ +{ + "properties": { + "content": { + "type": "text" + }, + "context": { + "type": "nested" + } + } +} diff --git a/production/bin/migrate_entrypoint.sh b/production/bin/migrate_entrypoint.sh new file mode 100755 index 00000000..9205186b --- /dev/null +++ b/production/bin/migrate_entrypoint.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +echo "Begin container entrypoint..." + +# Redirect filehandles +ln -sf /proc/$$/fd/1 /var/log/entrypoint-stdout.log +ln -sf /proc/$$/fd/2 /var/log/entrypoint-stderr.log + +echo "Executing into target..." 
+ +# exec into target process +>/log/stdout.log 2>/log/stderr.log exec /opt/bin/run_migrations.sh diff --git a/production/bin/run_migrations.sh b/production/bin/run_migrations.sh new file mode 100755 index 00000000..ca971eeb --- /dev/null +++ b/production/bin/run_migrations.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +echo "Starting migrations..." +cd /app + +echo "Calling init_perl_functions ..." +python manage.py init_perl_functions + +echo "Initializing db stamp head ..." +python manage.py db stamp head + +echo "Initializing db upgrade ..." +python manage.py db upgrade + +echo "Initializing search ..." +python manage.py init + +echo "Migrations complete." From 0960ceddde3211466a4d6b84f841b12f5f93e611 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Sat, 25 Nov 2023 12:31:16 -0700 Subject: [PATCH 07/20] OpenSearch refactor. --- .env_file.example | 2 +- .travis.yml | 2 +- Makefile | 2 +- app/main/controller/similarity_controller.py | 10 ++--- app/main/lib/graph_writer.py | 4 +- .../lib/{elasticsearch.py => opensearch.py} | 2 +- app/main/lib/reindex_analyzers.py | 2 +- app/main/lib/similarity.py | 2 +- app/main/lib/text_similarity.py | 14 +++---- elasticsearch/Dockerfile | 3 -- elasticsearch/alegre_similarity.json | 40 ------------------- elasticsearch/alegre_similarity_base.json | 10 ----- elasticsearch/alegre_similarity_settings.json | 13 ------ 13 files changed, 20 insertions(+), 86 deletions(-) rename app/main/lib/{elasticsearch.py => opensearch.py} (99%) delete mode 100644 elasticsearch/Dockerfile delete mode 100644 elasticsearch/alegre_similarity.json delete mode 100644 elasticsearch/alegre_similarity_base.json delete mode 100644 elasticsearch/alegre_similarity_settings.json diff --git a/.env_file.example b/.env_file.example index bc229571..d82b18ba 100644 --- a/.env_file.example +++ b/.env_file.example @@ -1,5 +1,5 @@ IMAGE_MODEL=phash -ELASTICSEARCH_URL=http://elasticsearch:9200 +OPENSEARCH_URL=http://elasticsearch:9200 REDIS_HOST=redis REDIS_PORT=6379 
REDIS_DATABASE=0 diff --git a/.travis.yml b/.travis.yml index 649bd7bf..a938ec4c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ before_script: - docker-compose build --pull - docker-compose -f docker-compose.yml -f docker-test.yml up -d - docker-compose logs -t -f & -- echo "Waiting for Elasticsearch indexes..." && until curl --silent --fail -I "http://localhost:9200/alegre_similarity_test"; do sleep 1; done +- echo "Waiting for OpenSearch indexes..." && until curl --silent --fail -I "http://localhost:9200/alegre_similarity_test"; do sleep 1; done - until curl --silent --fail -I "http://localhost:3100"; do sleep 1; done - echo "Waiting for model servers..." && while [[ ! '2' =~ $(redis-cli -n 1 SCARD 'SharedModel') ]]; do sleep 1; done #comment until fix timeout curl: (28) Operation timed out diff --git a/Makefile b/Makefile index 586c0f65..6d76b7b5 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ test: wait coverage run --source=app/main/ manage.py test wait: - until curl --silent -XGET --fail $(ELASTICSEARCH_URL); do printf '.'; sleep 1; done + until curl --silent -XGET --fail $(OPENSEARCH_URL); do printf '.'; sleep 1; done contract_testing: wait curl -vvv -X POST "http://alegre:3100/image/similarity/" -H "Content-Type: application/json" -d '{"url":"https://i.pinimg.com/564x/0f/73/57/0f7357637b2b203e9f32e73c24d126d7.jpg","threshold":0.9,"context":{}}' diff --git a/app/main/controller/similarity_controller.py b/app/main/controller/similarity_controller.py index 63cff601..45490bc2 100644 --- a/app/main/controller/similarity_controller.py +++ b/app/main/controller/similarity_controller.py @@ -9,12 +9,12 @@ similarity_request = api.model('similarity_request', { 'text': fields.String(required=False, description='text to be stored or queried for similarity'), 'doc_id': fields.String(required=False, description='text ID to constrain uniqueness'), - 'model': fields.String(required=False, description='similarity model to use: "elasticsearch" (pure 
Elasticsearch, default) or the key name of an active model'), - 'models': fields.List(required=False, description='similarity models to use: ["elasticsearch"] (pure Elasticsearch, default) or the key name of an active model', cls_or_instance=fields.String), + 'model': fields.String(required=False, description='similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model'), + 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure Elasticsearch, default) or the key name of an active model', cls_or_instance=fields.String), 'language': fields.String(required=False, description='language code for the analyzer to use during the similarity query (defaults to standard analyzer)'), 'threshold': fields.Float(required=False, description='minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)'), 'context': JsonObject(required=False, description='context'), - 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'elasticsearch\')'), + 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'opensearch\')'), }) @api.route('/') class SimilarityResource(Resource): @@ -39,7 +39,7 @@ def post(self): return similarity.add_item(item, "text") @api.response(200, 'text similarity successfully queried.') - @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. 
You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "elasticsearch"}\' "http://[ALEGRE_HOST]/text/similarity"') - @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'}) + @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "opensearch"}\' "http://[ALEGRE_HOST]/text/similarity"') + @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model'}) def get(self): return similarity.get_similar_items(similarity.get_body_for_text_document(request.args or request.json, mode='query'), "text") diff --git a/app/main/lib/graph_writer.py b/app/main/lib/graph_writer.py index 2787b52a..cf7c608c 100644 --- a/app/main/lib/graph_writer.py +++ b/app/main/lib/graph_writer.py @@ -1,4 +1,4 @@ -from app.main.lib.elasticsearch import get_all_documents_matching_context +from app.main.lib.opensearch import get_all_documents_matching_context from app.main.lib import text_similarity from app.main.lib import image_similarity from flask import current_app as app @@ -58,7 +58,7 @@ def package_item_for_query(item, graph, data_type): elif data_type == "text": vector_keys = [k for k in item["_source"].keys() if "vector" in k] vector_key = "" - model = graph.context.get("model") or "elasticsearch" + model = graph.context.get("model") or 
"opensearch" if vector_keys: vector_key = vector_keys[0] return { diff --git a/app/main/lib/elasticsearch.py b/app/main/lib/opensearch.py similarity index 99% rename from app/main/lib/elasticsearch.py rename to app/main/lib/opensearch.py index c9830f65..7afc3245 100644 --- a/app/main/lib/elasticsearch.py +++ b/app/main/lib/opensearch.py @@ -1,4 +1,4 @@ -# Elasticsearch helpers +# OpenSearch helpers import opensearchpy from opensearchpy import OpenSearch diff --git a/app/main/lib/reindex_analyzers.py b/app/main/lib/reindex_analyzers.py index 019dca41..4e507b30 100644 --- a/app/main/lib/reindex_analyzers.py +++ b/app/main/lib/reindex_analyzers.py @@ -1,7 +1,7 @@ import json import opensearchpy from opensearchpy import OpenSearch -from app.main.lib.elasticsearch import get_all_documents_matching_context, update_or_create_document +from app.main.lib.opensearch import get_all_documents_matching_context, update_or_create_document from app.main.lib.error_log import ErrorLog from opensearchpy.helpers import scan diff --git a/app/main/lib/similarity.py b/app/main/lib/similarity.py index 8fd9f608..b6c06432 100644 --- a/app/main/lib/similarity.py +++ b/app/main/lib/similarity.py @@ -53,7 +53,7 @@ def get_body_for_text_document(params, mode): if 'models' in params: models = models|set(params['models']) if not models: - models = ['elasticsearch'] + models = ['opensearch'] params['models'] = list(models) # Rename "text" to "content" if present diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index 11ee16d6..78e36f64 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -1,6 +1,6 @@ from flask import current_app as app from opensearchpy import OpenSearch -from app.main.lib.elasticsearch import generate_matches, truncate_query, store_document, delete_document +from app.main.lib.opensearch import generate_matches, truncate_query, store_document, delete_document from app.main.lib.error_log import ErrorLog from 
app.main.lib.shared_models.shared_model import SharedModel from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES @@ -16,7 +16,7 @@ def get_document_body(body): context = body.get("context", {}) if context: body["contexts"] = [context] - if model_key != 'elasticsearch': + if model_key != 'opensearch': if model_key[:len(PREFIX_OPENAI)] == PREFIX_OPENAI: vector = retrieve_openai_embeddings(body['content'], model_key) if vector == None: @@ -47,7 +47,7 @@ def search_text(search_params): return results def get_model_and_threshold(search_params): - model_key = 'elasticsearch' + model_key = 'opensearch' threshold = 0.9 if 'model' in search_params: model_key = search_params['model'] @@ -73,7 +73,7 @@ def get_body_from_conditions(conditions): body = conditions return body -def get_elasticsearch_base_conditions(search_params, clause_count, threshold): +def get_opensearch_base_conditions(search_params, clause_count, threshold): conditions = [ { 'match': { @@ -142,7 +142,7 @@ def strip_vectors(results): def restrict_results(results, search_params, model_key): out_results = [] - if search_params.get("min_es_score") and model_key == "elasticsearch": + if search_params.get("min_es_score") and model_key == "opensearch": for result in results: if "_score" in result and search_params.get("min_es_score", 0) < result["_score"]: out_results.append(result) @@ -167,8 +167,8 @@ def search_text_by_model(search_params): matches, clause_count = generate_matches(search_params['context']) if clause_count >= app.config['MAX_CLAUSE_COUNT']: return {'error': "Too many clauses specified! Text search will fail if another clause is added. 
Current clause count: "+str(clause_count)} - if model_key.lower() == 'elasticsearch': - conditions = get_elasticsearch_base_conditions(search_params, clause_count, threshold) + if model_key.lower() == 'opensearch': + conditions = get_opensearch_base_conditions(search_params, clause_count, threshold) language = search_params.get("language") # 'auto' indicates we should try to guess the appropriate language if language == 'auto': diff --git a/elasticsearch/Dockerfile b/elasticsearch/Dockerfile deleted file mode 100644 index 7497f36c..00000000 --- a/elasticsearch/Dockerfile +++ /dev/null @@ -1,3 +0,0 @@ -FROM docker.elastic.co/elasticsearch/elasticsearch:7.9.2 -RUN echo y | bin/elasticsearch-plugin install analysis-icu -RUN echo y | bin/elasticsearch-plugin install repository-s3 diff --git a/elasticsearch/alegre_similarity.json b/elasticsearch/alegre_similarity.json deleted file mode 100644 index 96b80f79..00000000 --- a/elasticsearch/alegre_similarity.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "properties": { - "vector_768": { - "type": "dense_vector", - "dims": 768 - }, - "vector_xlm-r-bert-base-nli-stsb-mean-tokens": { - "type": "dense_vector", - "dims": 768 - }, - "vector_paraphrase-filipino-mpnet-base-v2": { - "type": "dense_vector", - "dims": 768 - }, - "vector_indian-sbert": { - "type": "dense_vector", - "dims": 768 - }, - "vector_paraphrase-multilingual-mpnet-base-v2": { - "type": "dense_vector", - "dims": 768 - }, - "vector_openai-text-embedding-ada-002": { - "type": "dense_vector", - "dims": 1536 - }, - "content": { - "type": "text" - }, - "context": { - "type": "nested" - }, - "vector": { - "type": "double" - }, - "model": { - "type": "keyword" - } - } -} diff --git a/elasticsearch/alegre_similarity_base.json b/elasticsearch/alegre_similarity_base.json deleted file mode 100644 index df0ec35a..00000000 --- a/elasticsearch/alegre_similarity_base.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "properties": { - "content": { - "type": "text" - }, - "context": { - 
"type": "nested" - } - } -} diff --git a/elasticsearch/alegre_similarity_settings.json b/elasticsearch/alegre_similarity_settings.json deleted file mode 100644 index 4608723b..00000000 --- a/elasticsearch/alegre_similarity_settings.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "similarity": { - "scripted_tfidf": { - "type": "scripted", - "script": { - "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;" - } - }, - "lm_jelinek_mercer" : { - "type" : "LMJelinekMercer" - } - } -} From 8e3df67253016cb1fb00fa183cba8b4a099ce19d Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Mon, 27 Nov 2023 09:04:30 -0700 Subject: [PATCH 08/20] OpenSearch refactor. --- .env_file.example | 2 +- app/main/config.py | 2 +- app/main/controller/about_controller.py | 4 ++-- app/main/controller/bulk_update_similarity_controller.py | 2 +- app/main/controller/similarity_async_controller.py | 8 ++++---- app/main/controller/similarity_sync_controller.py | 8 ++++---- app/main/lib/language_analyzers.py | 1 - 7 files changed, 13 insertions(+), 14 deletions(-) diff --git a/.env_file.example b/.env_file.example index d82b18ba..93199d5e 100644 --- a/.env_file.example +++ b/.env_file.example @@ -1,5 +1,5 @@ IMAGE_MODEL=phash -OPENSEARCH_URL=http://elasticsearch:9200 +OPENSEARCH_URL=http://opensearch:9200 REDIS_HOST=redis REDIS_PORT=6379 REDIS_DATABASE=0 diff --git a/app/main/config.py b/app/main/config.py index 4961350a..9c3d9a49 100644 --- a/app/main/config.py +++ b/app/main/config.py @@ -6,7 +6,7 @@ class Config: SECRET_KEY = os.getenv('SECRET_KEY', 'my_precious_secret_key') DEBUG = False - OPENSEARCH_URL = os.getenv('OPENSEARCH_URL', 'http://elasticsearch:9200') + OPENSEARCH_URL = os.getenv('OPENSEARCH_URL', 'http://opensearch:9200') OPENSEARCH_SIMILARITY = 'alegre_similarity' REDIS_HOST = os.getenv('REDIS_HOST', 'redis') REDIS_PORT = os.getenv('REDIS_PORT', 6379) diff 
--git a/app/main/controller/about_controller.py b/app/main/controller/about_controller.py index 09f5666b..e12d4e71 100644 --- a/app/main/controller/about_controller.py +++ b/app/main/controller/about_controller.py @@ -18,8 +18,8 @@ def get(self): return { 'text/langid': AboutResource.list_providers('app.main.lib.langid', 'LangidProvider'), 'text/translation': ['google'], - 'text/similarity': ['elasticsearch'] + SharedModel.get_servers(), - 'text/bulk_similarity': ['elasticsearch'], + 'text/similarity': ['opensearch'] + SharedModel.get_servers(), + 'text/bulk_similarity': ['opensearch'], 'text/bulk_upload_similarity': SharedModel.get_servers(), 'image/classification': AboutResource.list_providers('app.main.lib.image_classification', 'ImageClassificationProvider'), 'image/similarity': ['phash'], diff --git a/app/main/controller/bulk_update_similarity_controller.py b/app/main/controller/bulk_update_similarity_controller.py index a4b82a5a..0b59a3e5 100644 --- a/app/main/controller/bulk_update_similarity_controller.py +++ b/app/main/controller/bulk_update_similarity_controller.py @@ -7,7 +7,7 @@ from app.main.controller.bulk_similarity_controller import BulkSimilarityResource from app.main.lib import similarity from app.main.lib.text_similarity import get_document_body -from app.main.lib.elasticsearch import merge_contexts +from app.main.lib.opensearch import merge_contexts def get_documents_by_ids(index, ids, es): query = { "query": { diff --git a/app/main/controller/similarity_async_controller.py b/app/main/controller/similarity_async_controller.py index f88d8f61..bc6daa16 100644 --- a/app/main/controller/similarity_async_controller.py +++ b/app/main/controller/similarity_async_controller.py @@ -10,18 +10,18 @@ 'url': fields.String(required=False, description='url for item to be stored or queried for similarity'), 'callback_url': fields.String(required=False, description='callback_url for final search results'), 'doc_id': fields.String(required=False, 
description='text ID to constrain uniqueness'), - 'models': fields.List(required=False, description='similarity models to use: ["elasticsearch"] (pure Elasticsearch, default) or the key name of an active model', cls_or_instance=fields.String), + 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure OpenSearch, default) or the key name of an active model', cls_or_instance=fields.String), 'language': fields.String(required=False, description='language code for the analyzer to use during the similarity query (defaults to standard analyzer)'), 'threshold': fields.Float(required=False, description='minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)'), 'context': JsonObject(required=True, description='context'), - 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'elasticsearch\')'), + 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'opensearch\')'), 'requires_callback': fields.Boolean(required=False, description='whether or not to trigger a callback event to the provided URL'), }) @api.route('/') class AsyncSimilarityResource(Resource): @api.response(200, 'text similarity successfully queried.') - @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. 
You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "elasticsearch"}\' "http://[ALEGRE_HOST]/text/similarity"') - @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'}) + @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "opensearch"}\' "http://[ALEGRE_HOST]/text/similarity"') + @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model'}) def get(self, similarity_type): if similarity_type == "text": package = similarity.get_body_for_text_document(request.json, 'query') diff --git a/app/main/controller/similarity_sync_controller.py b/app/main/controller/similarity_sync_controller.py index 4f0dd4f6..83c76520 100644 --- a/app/main/controller/similarity_sync_controller.py +++ b/app/main/controller/similarity_sync_controller.py @@ -9,17 +9,17 @@ 'text': fields.String(required=False, description='text to be stored or queried for similarity'), 'url': fields.String(required=False, description='url for item to be stored or queried for similarity'), 'doc_id': fields.String(required=False, description='text ID to constrain uniqueness'), - 'models': fields.List(required=False, description='similarity models to use: ["elasticsearch"] (pure Elasticsearch, default) or the key name of an active model', 
cls_or_instance=fields.String), + 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure OpenSearch, default) or the key name of an active model', cls_or_instance=fields.String), 'language': fields.String(required=False, description='language code for the analyzer to use during the similarity query (defaults to standard analyzer)'), 'threshold': fields.Float(required=False, description='minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)'), 'context': JsonObject(required=True, description='context'), - 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'elasticsearch\')'), + 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'opensearch\')'), }) @api.route('/') class SyncSimilarityResource(Resource): @api.response(200, 'text similarity successfully queried.') - @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "elasticsearch"}\' "http://[ALEGRE_HOST]/text/similarity"') - @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'}) + @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. 
You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "opensearch"}\' "http://[ALEGRE_HOST]/text/similarity"') + @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model'}) def get(self, similarity_type): if similarity_type == "text": package = similarity.get_body_for_text_document(request.json, 'query') diff --git a/app/main/lib/language_analyzers.py b/app/main/lib/language_analyzers.py index f7da1b37..b4726e9e 100644 --- a/app/main/lib/language_analyzers.py +++ b/app/main/lib/language_analyzers.py @@ -2,7 +2,6 @@ from opensearchpy import OpenSearch from flask import request, current_app as app SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn", "pt-br", "ar", "fr", "de", "cjk", "id"] -#via https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#bengali-analyzer SETTINGS_BY_LANGUAGE = { "en": { "analysis": { From 0de0d5dc707fc3b6eefee106e4586798668ea8ed Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Thu, 30 Nov 2023 09:21:28 -0700 Subject: [PATCH 09/20] Support both elasticsearch and opensearch model names for compatibility. 
--- app/main/controller/similarity_async_controller.py | 2 +- app/main/controller/similarity_controller.py | 4 ++-- app/main/controller/similarity_sync_controller.py | 4 ++-- app/main/lib/text_similarity.py | 6 +++++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/app/main/controller/similarity_async_controller.py b/app/main/controller/similarity_async_controller.py index f70c512a..1758d841 100644 --- a/app/main/controller/similarity_async_controller.py +++ b/app/main/controller/similarity_async_controller.py @@ -11,7 +11,7 @@ 'url': fields.String(required=False, description='url for item to be stored or queried for similarity'), 'callback_url': fields.String(required=False, description='callback_url for final search results'), 'doc_id': fields.String(required=False, description='text ID to constrain uniqueness'), - 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure OpenSearch, default) or the key name of an active model', cls_or_instance=fields.String), + 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure OpenSearch, default) or the key name of an active model. 
Legacy elasticsearch model supported for migration purposes.', cls_or_instance=fields.String), 'language': fields.String(required=False, description='language code for the analyzer to use during the similarity query (defaults to standard analyzer)'), 'threshold': fields.Float(required=False, description='minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)'), 'context': JsonObject(required=True, description='context'), diff --git a/app/main/controller/similarity_controller.py b/app/main/controller/similarity_controller.py index aefa1a2b..00a150ab 100644 --- a/app/main/controller/similarity_controller.py +++ b/app/main/controller/similarity_controller.py @@ -10,8 +10,8 @@ similarity_request = api.model('similarity_request', { 'text': fields.String(required=False, description='text to be stored or queried for similarity'), 'doc_id': fields.String(required=False, description='text ID to constrain uniqueness'), - 'model': fields.String(required=False, description='similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model'), - 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure Elasticsearch, default) or the key name of an active model', cls_or_instance=fields.String), + 'model': fields.String(required=False, description='similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model. Legacy elasticsearch model supported for migration purposes.'), + 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure Elasticsearch, default) or the key name of an active model. 
Legacy elasticsearch model supported for migration purposes.', cls_or_instance=fields.String), 'language': fields.String(required=False, description='language code for the analyzer to use during the similarity query (defaults to standard analyzer)'), 'threshold': fields.Float(required=False, description='minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)'), 'context': JsonObject(required=False, description='context'), diff --git a/app/main/controller/similarity_sync_controller.py b/app/main/controller/similarity_sync_controller.py index 5648edd6..b5ffa2b5 100644 --- a/app/main/controller/similarity_sync_controller.py +++ b/app/main/controller/similarity_sync_controller.py @@ -10,7 +10,7 @@ 'text': fields.String(required=False, description='text to be stored or queried for similarity'), 'url': fields.String(required=False, description='url for item to be stored or queried for similarity'), 'doc_id': fields.String(required=False, description='text ID to constrain uniqueness'), - 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure OpenSearch, default) or the key name of an active model', cls_or_instance=fields.String), + 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure OpenSearch, default) or the key name of an active model. Legacy elasticsearch model supported for migration purposes.', cls_or_instance=fields.String), 'language': fields.String(required=False, description='language code for the analyzer to use during the similarity query (defaults to standard analyzer)'), 'threshold': fields.Float(required=False, description='minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)'), 'context': JsonObject(required=True, description='context'), @@ -20,7 +20,7 @@ class SyncSimilarityResource(Resource): @api.response(200, 'text similarity successfully queried.') @api.doc('Make a text similarity query. 
Note that we currently require GET requests with a JSON body rather than embedded params in the URL. You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "opensearch"}\' "http://[ALEGRE_HOST]/text/similarity"') - @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model'}) + @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model. Legacy elasticsearch model supported for migration purposes.'}) def post(self, similarity_type): args = request.json app.logger.debug(f"Args are {args}") diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index b9552426..bb1dd297 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -16,7 +16,7 @@ def get_document_body(body): context = body.get("context", {}) if context: body["contexts"] = [context] - if model_key != 'opensearch': + if model_key != 'opensearch' and model_key != 'elasticsearch': if model_key[:len(PREFIX_OPENAI)] == PREFIX_OPENAI: vector = retrieve_openai_embeddings(body['content'], model_key) if vector == None: @@ -51,6 +51,8 @@ def get_model_and_threshold(search_params): threshold = 0.9 if 'model' in search_params: model_key = search_params['model'] + if model_key == 'elasticsearch': + model_key = 'opensearch' if 'threshold' in search_params: threshold = search_params['threshold'] if 'per_model_threshold' in search_params and search_params['per_model_threshold'].get(model_key): @@ -142,6 +144,8 @@ def strip_vectors(results): def 
restrict_results(results, search_params, model_key): out_results = [] + if model_key == 'elasticsearch': + model_key = 'opensearch' try: min_es_score = float(search_params.get("min_es_score")) except (ValueError, TypeError) as e: From 127c5722c0651daa1e939a0b957501b170c7ecf1 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Mon, 4 Dec 2023 07:43:40 -0700 Subject: [PATCH 10/20] Remove builds for branch in preparation for merge. --- .gitlab-ci.yml | 2 -- docker-compose.yml | 29 +++++++++++++++-------------- opensearch.yml | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 16 deletions(-) create mode 100644 opensearch.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ee6f5990..51184ba2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,7 +25,6 @@ build_qa: - docker push "$QA_ECR_API_BASE_URL:latest" only: - develop - - bugfix/no-migrations-test deploy_qa: image: python:3-alpine @@ -69,7 +68,6 @@ deploy_qa: - echo "new Image was deployed $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA" only: - develop - - bugfix/no-migrations-test build_live: image: docker:latest diff --git a/docker-compose.yml b/docker-compose.yml index 221d8801..c27eef85 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,27 +1,28 @@ version: '2' volumes: - elasticsearch: + opensearch: redis: postgres: services: - elasticsearch: - build: ./elasticsearch + opensearch: + build: ./opensearch ports: - "9200:9200" environment: discovery.type: single-node transport.host: 127.0.0.1 - xpack.security.enabled: "false" + plugins.security.disabled: "true" volumes: - - "elasticsearch:/usr/share/elasticsearch/data" - kibana: - image: docker.elastic.co/kibana/kibana:7.9.2 - ports: - - "5601:5601" - depends_on: - - elasticsearch - environment: - ELASTICSEARCH_URL: http://elasticsearch:9200 + - "./opensearch.yml:/usr/share/opensearch/config/opensearch.yml" + - "opensearch:/usr/share/opensearch/data" +# kibana: +# image: docker.elastic.co/kibana/kibana:7.9.2 +# ports: +# - 
"5601:5601" +# depends_on: +# - opensearch +# environment: +# OPENSEARCH_URL: http://opensearch:9200 redis: image: redis:5 ports: @@ -147,4 +148,4 @@ services: # - xlm_r_bert_base_nli_stsb_mean_tokens # - indian_sbert env_file: - - .env_file \ No newline at end of file + - .env_file diff --git a/opensearch.yml b/opensearch.yml new file mode 100644 index 00000000..17f5fbc2 --- /dev/null +++ b/opensearch.yml @@ -0,0 +1,39 @@ +network.host: 0.0.0.0 +plugins.security.disabled: true + +path: + logs: /usr/share/opensearch/logs + data: /usr/share/opensearch/data + repo: /usr/share/opensearch/snapshots +cluster: + name: ${HOSTNAME}-cluster +node: + name: ${HOSTNAME} +http: + cors: + enabled: true + allow-origin: '*' + +######## Start OpenSearch Security Demo Configuration ######## +# WARNING: revise all the lines below before you go into production +plugins.security.ssl.transport.pemcert_filepath: esnode.pem +plugins.security.ssl.transport.pemkey_filepath: esnode-key.pem +plugins.security.ssl.transport.pemtrustedcas_filepath: root-ca.pem +plugins.security.ssl.transport.enforce_hostname_verification: false +plugins.security.ssl.http.enabled: true +plugins.security.ssl.http.pemcert_filepath: esnode.pem +plugins.security.ssl.http.pemkey_filepath: esnode-key.pem +plugins.security.ssl.http.pemtrustedcas_filepath: root-ca.pem +plugins.security.allow_unsafe_democertificates: true +plugins.security.allow_default_init_securityindex: true +plugins.security.authcz.admin_dn: + - CN=kirk,OU=client,O=client,L=test, C=de + +plugins.security.audit.type: internal_opensearch +plugins.security.enable_snapshot_restore_privilege: true +plugins.security.check_snapshot_restore_write_privileges: true +plugins.security.restapi.roles_enabled: ["all_access", "security_rest_api_access"] +plugins.security.system_indices.enabled: true +plugins.security.system_indices.indices: [".plugins-ml-config", ".plugins-ml-connector", ".plugins-ml-model-group", ".plugins-ml-model", ".plugins-ml-task", 
".plugins-ml-conversation-meta", ".plugins-ml-conversation-interactions", ".opendistro-alerting-config", ".opendistro-alerting-alert*", ".opendistro-anomaly-results*", ".opendistro-anomaly-detector*", ".opendistro-anomaly-checkpoints", ".opendistro-anomaly-detection-state", ".opendistro-reports-*", ".opensearch-notifications-*", ".opensearch-notebooks", ".opensearch-observability", ".ql-datasources", ".opendistro-asynchronous-search-response*", ".replication-metadata-store", ".opensearch-knn-models", ".geospatial-ip2geo-data*"] +node.max_local_storage_nodes: 3 +######## End OpenSearch Security Demo Configuration ######## From 5c43df7c17b36d6f45bb2eb011330ffdeb7138ab Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Mon, 4 Dec 2023 12:03:16 -0700 Subject: [PATCH 11/20] OpenSearch is the future --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d7c330a..a3efa8df 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ A media analysis service. Part of the [Check platform](https://meedan.com/check) The Alegre API Swagger UI unfortunately [does not support sending body payloads to GET methods](https://github.com/swagger-api/swagger-ui/issues/2136). To test those API methods, you can still fill in your arguments, and click "Execute" - Swagger will fail, but show you a `curl` command that you can use in your console. 
- Open http://localhost:5601 for the Kibana UI -- Open http://localhost:9200 for the Elasticsearch API +- Open http://localhost:9200 for the OpenSearch API - `docker-compose exec alegre flask shell` to get inside a Python shell in docker container with the loaded app ## Testing @@ -30,7 +30,7 @@ To test individual modules: ## Troubleshooting -- If you're having trouble starting Elasticsearch on macOS, with the error `container_name exited with code 137`, you will need to adjust your Docker settings, as per https://www.petefreitag.com/item/848.cfm +- If you're having trouble starting OpenSearch on macOS, with the error `container_name exited with code 137`, you will need to adjust your Docker settings, as per https://www.petefreitag.com/item/848.cfm - Note that the alegre docker service definitions in the `alegre` repo may not align with the alegre service definitions in the `check` repository, so different variations of the service may be spun up depending on the directory where `docker-compose up` is executed. From a033a640ed7535ba64f7c95f344977f02ec2a57d Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Tue, 5 Dec 2023 12:09:13 -0700 Subject: [PATCH 12/20] Typo. 
--- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 51184ba2..3744ec0d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -109,7 +109,7 @@ deploy_live: - pip install awscli==1.29.59 - aws ssm get-parameters-by-path --region $AWS_DEFAULT_REGION --path /live/alegre/ --recursive --with-decryption --output text --query "Parameters[].[Name]" | sed -E 's#/live/alegre/##' > env.live.names - for NAME in `cat env.live.names`; do echo -n "-s live-alegre-migration $NAME /live/alegre/$NAME " >> live-alegre-migration.env.args; done - - ecs update live-alegre-migration --image live-alegre-migration $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --exclusive-env -e live-alegre-migration APP alegre -e live-alegre-migration DEPLOY_ENV live -e live-alegre-migration AWS_REGION $AWS_DEFAULT_REGION -e live-alegre-migration ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-migration.env.args` + - ecs update live-alegre-migration --image live-alegre-migration $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA --exclusive-env -e live-alegre-migration APP alegre -e live-alegre-migration DEPLOY_ENV live -e live-alegre-migration AWS_REGION $AWS_DEFAULT_REGION -e live-alegre-migration ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-migration.env.args` - taskArn=$(aws ecs run-task --cluster ecs-live --task-definition live-alegre-migration --query 'tasks[].taskArn' --output text) - echo "Migration task started - $taskArn" - aws ecs wait tasks-stopped --cluster ecs-live --tasks $taskArn From 91b70255602d92afb2ea59b05f36be1c37d60bae Mon Sep 17 00:00:00 2001 From: computermacgyver Date: Thu, 11 Jan 2024 10:48:35 +0000 Subject: [PATCH 13/20] Remove dependency on kibana. 
We don't use it --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 821922be..4b6c7402 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -142,8 +142,8 @@ services: - ".:/app" depends_on: - postgres - - kibana - redis + # - kibana # - video # - xlm_r_bert_base_nli_stsb_mean_tokens # - indian_sbert From dd85c9826f0b4efd679081dc28c9da9b87385f97 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Thu, 11 Jan 2024 12:32:37 -0700 Subject: [PATCH 14/20] More kibana deprecation. --- README.md | 1 - docker-compose.yml | 8 -------- 2 files changed, 9 deletions(-) diff --git a/README.md b/README.md index a3efa8df..bf327423 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ A media analysis service. Part of the [Check platform](https://meedan.com/check) The Alegre API Swagger UI unfortunately [does not support sending body payloads to GET methods](https://github.com/swagger-api/swagger-ui/issues/2136). To test those API methods, you can still fill in your arguments, and click "Execute" - Swagger will fail, but show you a `curl` command that you can use in your console. 
-- Open http://localhost:5601 for the Kibana UI - Open http://localhost:9200 for the OpenSearch API - `docker-compose exec alegre flask shell` to get inside a Python shell in docker container with the loaded app diff --git a/docker-compose.yml b/docker-compose.yml index 4b6c7402..39c5cbee 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,14 +15,6 @@ services: volumes: - "./opensearch.yml:/usr/share/opensearch/config/opensearch.yml" - "opensearch:/usr/share/opensearch/data" -# kibana: -# image: docker.elastic.co/kibana/kibana:7.9.2 -# ports: -# - "5601:5601" -# depends_on: -# - opensearch -# environment: -# OPENSEARCH_URL: http://opensearch:9200 redis: image: redis:6.2 ports: From f4998823b7d902b5081db2cae31f5ef9ec2041f0 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Tue, 16 Jan 2024 09:41:39 -0700 Subject: [PATCH 15/20] Revert API breaking changes. --- app/main/controller/about_controller.py | 4 ++-- app/main/controller/similarity_async_controller.py | 8 ++++---- app/main/controller/similarity_controller.py | 10 +++++----- app/main/controller/similarity_sync_controller.py | 10 +++++----- app/main/lib/graph_writer.py | 2 +- app/main/lib/similarity.py | 2 +- app/main/lib/text_similarity.py | 12 ++++-------- 7 files changed, 22 insertions(+), 26 deletions(-) diff --git a/app/main/controller/about_controller.py b/app/main/controller/about_controller.py index e12d4e71..09f5666b 100644 --- a/app/main/controller/about_controller.py +++ b/app/main/controller/about_controller.py @@ -18,8 +18,8 @@ def get(self): return { 'text/langid': AboutResource.list_providers('app.main.lib.langid', 'LangidProvider'), 'text/translation': ['google'], - 'text/similarity': ['opensearch'] + SharedModel.get_servers(), - 'text/bulk_similarity': ['opensearch'], + 'text/similarity': ['elasticsearch'] + SharedModel.get_servers(), + 'text/bulk_similarity': ['elasticsearch'], 'text/bulk_upload_similarity': SharedModel.get_servers(), 'image/classification': 
AboutResource.list_providers('app.main.lib.image_classification', 'ImageClassificationProvider'), 'image/similarity': ['phash'], diff --git a/app/main/controller/similarity_async_controller.py b/app/main/controller/similarity_async_controller.py index 68783552..b7bfda55 100644 --- a/app/main/controller/similarity_async_controller.py +++ b/app/main/controller/similarity_async_controller.py @@ -11,18 +11,18 @@ 'url': fields.String(required=False, description='url for item to be stored or queried for similarity'), 'callback_url': fields.String(required=False, description='callback_url for final search results'), 'doc_id': fields.String(required=False, description='text ID to constrain uniqueness'), - 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure OpenSearch, default) or the key name of an active model. Legacy elasticsearch model supported for migration purposes.', cls_or_instance=fields.String), + 'models': fields.List(required=False, description='similarity models to use: ["elasticsearch"] (pure Elasticsearch, default) or the key name of an active model', cls_or_instance=fields.String), 'language': fields.String(required=False, description='language code for the analyzer to use during the similarity query (defaults to standard analyzer)'), 'threshold': fields.Float(required=False, description='minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)'), 'context': JsonObject(required=True, description='context'), - 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'opensearch\')'), + 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'elasticsearch\')'), 'requires_callback': fields.Boolean(required=False, description='whether or not to trigger a callback event to the provided URL'), }) @api.route('/') class AsyncSimilarityResource(Resource): 
@api.response(200, 'text similarity successfully queried.') - @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "opensearch"}\' "http://[ALEGRE_HOST]/text/similarity"') - @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model'}) + @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "elasticsearch"}\' "http://[ALEGRE_HOST]/text/similarity"') + @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'}) def post(self, similarity_type): args = request.json app.logger.debug(f"Args are {args}") diff --git a/app/main/controller/similarity_controller.py b/app/main/controller/similarity_controller.py index 7c18794b..c4872911 100644 --- a/app/main/controller/similarity_controller.py +++ b/app/main/controller/similarity_controller.py @@ -10,12 +10,12 @@ similarity_request = api.model('similarity_request', { 'text': fields.String(required=False, description='text to be stored or queried for similarity'), 'doc_id': fields.String(required=False, description='text ID to constrain uniqueness'), - 'model': fields.String(required=False, description='similarity model to use: 
"opensearch" (pure Elasticsearch, default) or the key name of an active model. Legacy elasticsearch model supported for migration purposes.'), - 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure Elasticsearch, default) or the key name of an active model. Legacy elasticsearch model supported for migration purposes.', cls_or_instance=fields.String), + 'model': fields.String(required=False, description='similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'), + 'models': fields.List(required=False, description='similarity models to use: ["elasticsearch"] (pure Elasticsearch, default) or the key name of an active model', cls_or_instance=fields.String), 'language': fields.String(required=False, description='language code for the analyzer to use during the similarity query (defaults to standard analyzer)'), 'threshold': fields.Float(required=False, description='minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)'), 'context': JsonObject(required=False, description='context'), - 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'opensearch\')'), + 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'elasticsearch\')'), }) @api.route('/') class SimilarityResource(Resource): @@ -42,8 +42,8 @@ def post(self): @api.route('/search/') class SimilaritySearchResource(Resource): @api.response(200, 'text similarity successfully queried.') - @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. 
You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "opensearch"}\' "http://[ALEGRE_HOST]/text/similarity"') - @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model'}) + @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "elasticsearch"}\' "http://[ALEGRE_HOST]/text/similarity"') + @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'}) def post(self): args = request.json app.logger.debug(f"Args are {args}") diff --git a/app/main/controller/similarity_sync_controller.py b/app/main/controller/similarity_sync_controller.py index 6dc5810f..30515473 100644 --- a/app/main/controller/similarity_sync_controller.py +++ b/app/main/controller/similarity_sync_controller.py @@ -10,17 +10,17 @@ 'text': fields.String(required=False, description='text to be stored or queried for similarity'), 'url': fields.String(required=False, description='url for item to be stored or queried for similarity'), 'doc_id': fields.String(required=False, description='text ID to constrain uniqueness'), - 'models': fields.List(required=False, description='similarity models to use: ["opensearch"] (pure OpenSearch, default) or the key name of an active model. 
Legacy elasticsearch model supported for migration purposes.', cls_or_instance=fields.String), + 'models': fields.List(required=False, description='similarity models to use: ["elasticsearch"] (pure Elasticsearch, default) or the key name of an active model', cls_or_instance=fields.String), 'language': fields.String(required=False, description='language code for the analyzer to use during the similarity query (defaults to standard analyzer)'), 'threshold': fields.Float(required=False, description='minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)'), - 'context': JsonObject(required=True, description='context'), - 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'opensearch\')'), + 'context': JsonObject(required=False, description='context'), + 'fuzzy': fields.Boolean(required=False, description='whether or not to use fuzzy search on GET queries (only used when model is set to \'elasticsearch\')'), }) @api.route('/') class SyncSimilarityResource(Resource): @api.response(200, 'text similarity successfully queried.') - @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "opensearch"}\' "http://[ALEGRE_HOST]/text/similarity"') - @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "opensearch" (pure Elasticsearch, default) or the key name of an active model. Legacy elasticsearch model supported for migration purposes.'}) + @api.doc('Make a text similarity query. Note that we currently require GET requests with a JSON body rather than embedded params in the URL. 
You can achieve this via curl -X GET -H "Content-type: application/json" -H "Accept: application/json" -d \'{"text":"Some Text", "threshold": 0.5, "model": "elasticsearch"}\' "http://[ALEGRE_HOST]/text/similarity"') + @api.doc(params={'text': 'text to be stored or queried for similarity', 'threshold': 'minimum score to consider, between 0.0 and 1.0 (defaults to 0.9)', 'model': 'similarity model to use: "elasticsearch" (pure Elasticsearch, default) or the key name of an active model'}) def post(self, similarity_type): args = request.json app.logger.debug(f"Args are {args}") diff --git a/app/main/lib/graph_writer.py b/app/main/lib/graph_writer.py index cf7c608c..1fbdf365 100644 --- a/app/main/lib/graph_writer.py +++ b/app/main/lib/graph_writer.py @@ -58,7 +58,7 @@ def package_item_for_query(item, graph, data_type): elif data_type == "text": vector_keys = [k for k in item["_source"].keys() if "vector" in k] vector_key = "" - model = graph.context.get("model") or "opensearch" + model = graph.context.get("model") or "elasticsearch" if vector_keys: vector_key = vector_keys[0] return { diff --git a/app/main/lib/similarity.py b/app/main/lib/similarity.py index fb1dab66..f464018a 100644 --- a/app/main/lib/similarity.py +++ b/app/main/lib/similarity.py @@ -45,7 +45,7 @@ def get_body_for_text_document(params, mode): if 'models' in params: models = models|set(params['models']) if not models: - models = ['opensearch'] + models = ['elasticsearch'] params['models'] = list(models) # Rename "text" to "content" if present diff --git a/app/main/lib/text_similarity.py b/app/main/lib/text_similarity.py index bb1dd297..00543974 100644 --- a/app/main/lib/text_similarity.py +++ b/app/main/lib/text_similarity.py @@ -16,7 +16,7 @@ def get_document_body(body): context = body.get("context", {}) if context: body["contexts"] = [context] - if model_key != 'opensearch' and model_key != 'elasticsearch': + if model_key != 'elasticsearch': if model_key[:len(PREFIX_OPENAI)] == PREFIX_OPENAI: vector = 
retrieve_openai_embeddings(body['content'], model_key) if vector == None: @@ -47,12 +47,10 @@ def search_text(search_params): return results def get_model_and_threshold(search_params): - model_key = 'opensearch' + model_key = 'elasticsearch' threshold = 0.9 if 'model' in search_params: model_key = search_params['model'] - if model_key == 'elasticsearch': - model_key = 'opensearch' if 'threshold' in search_params: threshold = search_params['threshold'] if 'per_model_threshold' in search_params and search_params['per_model_threshold'].get(model_key): @@ -144,14 +142,12 @@ def strip_vectors(results): def restrict_results(results, search_params, model_key): out_results = [] - if model_key == 'elasticsearch': - model_key = 'opensearch' try: min_es_score = float(search_params.get("min_es_score")) except (ValueError, TypeError) as e: app.logger.info(f"search_params failed on min_es_score for {search_params}, raised error as {e}") min_es_score = None - if min_es_score is not None and model_key == "opensearch": + if min_es_score is not None and model_key == "elasticsearch": for result in results: if "_score" in result and min_es_score < result["_score"]: out_results.append(result) @@ -176,7 +172,7 @@ def search_text_by_model(search_params): matches, clause_count = generate_matches(search_params['context']) if clause_count >= app.config['MAX_CLAUSE_COUNT']: return {'error': "Too many clauses specified! Text search will fail if another clause is added. Current clause count: "+str(clause_count)} - if model_key.lower() == 'opensearch': + if model_key.lower() == 'elasticsearch': conditions = get_opensearch_base_conditions(search_params, clause_count, threshold) language = search_params.get("language") if language == 'None': From d137bce50c9a280c69c9066ae0351a4e35a9e101 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Thu, 18 Jan 2024 09:14:57 -0700 Subject: [PATCH 16/20] Deprecate unused model services in ECS deploy. 
--- .gitlab-ci.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7955e482..405e80cd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -58,8 +58,6 @@ deploy_qa: - ecs deploy ecs-qa qa-alegre-multilingual --diff --image qa-alegre-multilingual $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e qa-alegre-multilingual MODEL_NAME paraphrasemultilingualmpnetbasev2 -e qa-alegre-multilingual SENTENCE_TRANSFORMERS_HOME /mnt/models/multilingual-cache -e qa-alegre-multilingual APP alegre -e qa-alegre-multilingual DEPLOY_ENV qa -e qa-alegre-multilingual ALEGRE_PORT 8000 --exclusive-secrets `cat qa-alegre-multilingual.env.args` - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-video $NAME /qa/alegre/$NAME " >> qa-alegre-video.env.args; done - ecs deploy ecs-qa qa-alegre-video --diff --image qa-alegre-video $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e qa-alegre-video MODEL_NAME video -e qa-alegre-video PERSISTENT_DISK_PATH /mnt/models/video -e qa-alegre-video APP alegre -e qa-alegre-video DEPLOY_ENV qa -e qa-alegre-video ALEGRE_PORT 8000 --exclusive-secrets `cat qa-alegre-video.env.args` - - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-audio $NAME /qa/alegre/$NAME " >> qa-alegre-audio.env.args; done - - ecs deploy ecs-qa qa-alegre-audio --diff --image qa-alegre-audio $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e qa-alegre-audio MODEL_NAME audio -e qa-alegre-audio APP alegre -e qa-alegre-audio DEPLOY_ENV qa -e qa-alegre-audio ALEGRE_PORT 8000 --exclusive-secrets `cat qa-alegre-audio.env.args` - for NAME in `cat env.qa.names`; do echo -n "-s qa-alegre-worker-c $NAME /qa/alegre/$NAME " >> qa-alegre-worker.env.args; done - ecs deploy ecs-qa qa-alegre-worker --diff --image qa-alegre-worker-c $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e qa-alegre-worker-c APP alegre -e qa-alegre-worker-c DEPLOY_ENV qa -e qa-alegre-worker-c 
ALEGRE_PORT 8000 --exclusive-secrets `cat qa-alegre-worker.env.args` - echo "new Image was deployed $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA" @@ -120,8 +118,6 @@ deploy_live: - ecs deploy ecs-live live-alegre-multilingual --diff --image live-alegre-multilingual $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e live-alegre-multilingual MODEL_NAME paraphrasemultilingualmpnetbasev2 -e live-alegre-multilingual SENTENCE_TRANSFORMERS_HOME /mnt/models/multilingual-cache -e live-alegre-multilingual APP alegre -e live-alegre-multilingual DEPLOY_ENV live -e live-alegre-multilingual ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-multilingual.env.args` - for NAME in `cat env.live.names`; do echo -n "-s live-alegre-video $NAME /live/alegre/$NAME " >> live-alegre-video.env.args; done - ecs deploy ecs-live live-alegre-video --diff --image live-alegre-video $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e live-alegre-video MODEL_NAME video -e live-alegre-video PERSISTENT_DISK_PATH /mnt/models/video -e live-alegre-video APP alegre -e live-alegre-video DEPLOY_ENV live -e live-alegre-video ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-video.env.args` - - for NAME in `cat env.live.names`; do echo -n "-s live-alegre-audio $NAME /live/alegre/$NAME " >> live-alegre-audio.env.args; done - - ecs deploy ecs-live live-alegre-audio --diff --image live-alegre-audio $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e live-alegre-audio MODEL_NAME audio -e live-alegre-audio APP alegre -e live-alegre-audio DEPLOY_ENV live -e live-alegre-audio ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-audio.env.args` - for NAME in `cat env.live.names`; do echo -n "-s live-alegre-worker-c $NAME /live/alegre/$NAME " >> live-alegre-worker.env.args; done - ecs deploy ecs-live live-alegre-worker --diff --image live-alegre-worker-c $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA --timeout 1200 --exclusive-env -e live-alegre-worker-c APP alegre 
-e live-alegre-worker-c DEPLOY_ENV live -e live-alegre-worker-c ALEGRE_PORT 8000 --exclusive-secrets `cat live-alegre-worker.env.args` - echo "new Image was deployed $LIVE_ECR_API_BASE_URL:$CI_COMMIT_SHA" From cc97ff49f2540dbfcbe684bcf36f446591f9d158 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Thu, 18 Jan 2024 09:22:02 -0700 Subject: [PATCH 17/20] Build and deploy to QA from this branch. --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 405e80cd..2455feff 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -22,6 +22,7 @@ build_qa: - docker push "$ECR_API_BASE_URL/qa/alegre/api:latest" only: - develop + - bugfix/no-migrations-test deploy_qa: image: python:3-alpine @@ -63,6 +64,7 @@ deploy_qa: - echo "new Image was deployed $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA" only: - develop + - bugfix/no-migrations-test build_live: image: registry.gitlab.com/gitlab-org/cloud-deploy/aws-base:latest From ff6c560c2c4c8494955f5a35cddb099e37f1d151 Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Mon, 22 Jan 2024 09:18:21 -0700 Subject: [PATCH 18/20] Remove builds for branch in preparation for merge. --- .gitlab-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2455feff..405e80cd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -22,7 +22,6 @@ build_qa: - docker push "$ECR_API_BASE_URL/qa/alegre/api:latest" only: - develop - - bugfix/no-migrations-test deploy_qa: image: python:3-alpine @@ -64,7 +63,6 @@ deploy_qa: - echo "new Image was deployed $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA" only: - develop - - bugfix/no-migrations-test build_live: image: registry.gitlab.com/gitlab-org/cloud-deploy/aws-base:latest From fc845ae9f423f33a52e4e043002ccd89ac2f712b Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Tue, 23 Jan 2024 16:43:12 -0700 Subject: [PATCH 19/20] Build and deploy to QA from this branch. 
--- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 405e80cd..2455feff 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -22,6 +22,7 @@ build_qa: - docker push "$ECR_API_BASE_URL/qa/alegre/api:latest" only: - develop + - bugfix/no-migrations-test deploy_qa: image: python:3-alpine @@ -63,6 +64,7 @@ deploy_qa: - echo "new Image was deployed $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA" only: - develop + - bugfix/no-migrations-test build_live: image: registry.gitlab.com/gitlab-org/cloud-deploy/aws-base:latest From 5c01ee4aeefdb2c69bd83908fb40a3074d4baadf Mon Sep 17 00:00:00 2001 From: Martin Peck Date: Wed, 24 Jan 2024 09:06:59 -0700 Subject: [PATCH 20/20] Remove builds for branch in preparation for merge. --- .gitlab-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2455feff..405e80cd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -22,7 +22,6 @@ build_qa: - docker push "$ECR_API_BASE_URL/qa/alegre/api:latest" only: - develop - - bugfix/no-migrations-test deploy_qa: image: python:3-alpine @@ -64,7 +63,6 @@ deploy_qa: - echo "new Image was deployed $QA_ECR_API_BASE_URL:$CI_COMMIT_SHA" only: - develop - - bugfix/no-migrations-test build_live: image: registry.gitlab.com/gitlab-org/cloud-deploy/aws-base:latest