From c7b533ecb5299d4511430d33de9a50abfebc9dad Mon Sep 17 00:00:00 2001 From: Amit Galitzky Date: Fri, 7 Jun 2024 11:27:48 -0700 Subject: [PATCH] adding reindex Signed-off-by: Amit Galitzky --- build.gradle | 2 +- .../flowframework/common/CommonValue.java | 2 + .../flowframework/common/DefaultUseCases.java | 8 ++ .../flowframework/workflow/ReindexStep.java | 16 ++- ...brid-search-with-local-model-defaults.json | 4 +- ...ntic-search-with-local-model-defaults.json | 4 +- ...semantic-search-with-reindex-defaults.json | 31 ++++ ...semantic-search-with-reindex-template.json | 135 ++++++++++++++++++ 8 files changed, 194 insertions(+), 8 deletions(-) create mode 100644 src/main/resources/defaults/semantic-search-with-reindex-defaults.json create mode 100644 src/main/resources/substitutionTemplates/semantic-search-with-reindex-template.json diff --git a/build.gradle b/build.gradle index f19f52318..0d874cf7e 100644 --- a/build.gradle +++ b/build.gradle @@ -494,7 +494,7 @@ List> plugins = [ return new RegularFile() { @Override File getAsFile() { - return configurations.zipArchive.asFileTree.getSingleFile() + return configurations.zipArchive.asFileTree.getSingleFile() } } } diff --git a/src/main/java/org/opensearch/flowframework/common/CommonValue.java b/src/main/java/org/opensearch/flowframework/common/CommonValue.java index 87c2f2180..10a23357a 100644 --- a/src/main/java/org/opensearch/flowframework/common/CommonValue.java +++ b/src/main/java/org/opensearch/flowframework/common/CommonValue.java @@ -225,4 +225,6 @@ private CommonValue() {} public static final String CREATE_CONNECTOR_CREDENTIAL_SESSION_TOKEN = "create_connector.credential.session_token"; /** The field name for ingest pipeline model ID substitution */ public static final String CREATE_INGEST_PIPELINE_MODEL_ID = "create_ingest_pipeline.model_id"; + /** The field name for reindex source index substitution */ + public static final String REINDEX_SOURCE_INDEX = "reindex.source_index"; } diff --git 
a/src/main/java/org/opensearch/flowframework/common/DefaultUseCases.java b/src/main/java/org/opensearch/flowframework/common/DefaultUseCases.java index c2c3abdb7..7b8d06f1a 100644 --- a/src/main/java/org/opensearch/flowframework/common/DefaultUseCases.java +++ b/src/main/java/org/opensearch/flowframework/common/DefaultUseCases.java @@ -22,6 +22,7 @@ import static org.opensearch.flowframework.common.CommonValue.CREATE_CONNECTOR_CREDENTIAL_SECRET_KEY; import static org.opensearch.flowframework.common.CommonValue.CREATE_CONNECTOR_CREDENTIAL_SESSION_TOKEN; import static org.opensearch.flowframework.common.CommonValue.CREATE_INGEST_PIPELINE_MODEL_ID; +import static org.opensearch.flowframework.common.CommonValue.REINDEX_SOURCE_INDEX; /** * Enum encapsulating the different default use cases and templates we have stored @@ -147,6 +148,13 @@ public enum DefaultUseCases { "defaults/hybrid-search-with-local-model-defaults.json", "substitutionTemplates/hybrid-search-with-local-model-template.json", Collections.emptyList() + ), + /** defaults file and substitution ready template for semantic search with reindex command*/ + SEMANTIC_SEARCH_WITH_REINDEX( + "semantic_search_with_reindex", + "defaults/semantic-search-with-reindex-defaults.json", + "substitutionTemplates/semantic-search-with-reindex-template.json", + List.of(CREATE_CONNECTOR_CREDENTIAL_KEY, REINDEX_SOURCE_INDEX) ); private final String useCaseName; diff --git a/src/main/java/org/opensearch/flowframework/workflow/ReindexStep.java b/src/main/java/org/opensearch/flowframework/workflow/ReindexStep.java index bc335db97..b46ddecab 100644 --- a/src/main/java/org/opensearch/flowframework/workflow/ReindexStep.java +++ b/src/main/java/org/opensearch/flowframework/workflow/ReindexStep.java @@ -95,10 +95,20 @@ public PlainActionFuture execute( Float requestsPerSecond = inputs.containsKey(REQUESTS_PER_SECOND) ? Float.parseFloat(inputs.get(REQUESTS_PER_SECOND).toString()) : null; + requestsPerSecond = (requestsPerSecond != null && requestsPerSecond < 0) ? 
Float.valueOf(Float.POSITIVE_INFINITY) : requestsPerSecond; Boolean requireAlias = inputs.containsKey(REQUIRE_ALIAS) ? Booleans.parseBoolean(inputs.get(REQUIRE_ALIAS).toString()) : null; - Integer slices = (Integer) inputs.get(SLICES); - Integer maxDocs = (Integer) inputs.get(MAX_DOCS); - + Integer slices; + Integer maxDocs; + if (inputs.get(SLICES) != null) { + slices = Integer.parseInt(String.valueOf(inputs.get(SLICES))); + } else { + slices = (Integer) inputs.get(SLICES); + } + if (inputs.get(MAX_DOCS) != null) { + maxDocs = Integer.parseInt(String.valueOf(inputs.get(MAX_DOCS))); + } else { + maxDocs = (Integer) inputs.get(MAX_DOCS); + } ReindexRequest reindexRequest = new ReindexRequest().setSourceIndices(Strings.splitStringByCommaToArray(sourceIndices)) .setDestIndex(destinationIndex); diff --git a/src/main/resources/defaults/hybrid-search-with-local-model-defaults.json b/src/main/resources/defaults/hybrid-search-with-local-model-defaults.json index 26b389a29..d07cc918d 100644 --- a/src/main/resources/defaults/hybrid-search-with-local-model-defaults.json +++ b/src/main/resources/defaults/hybrid-search-with-local-model-defaults.json @@ -1,11 +1,11 @@ { "template.name": "hybrid-search", "template.description": "Setting up hybrid search, ingest pipeline and index", - "register_local_pretrained_model.name": "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b", + "register_local_pretrained_model.name": "huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2", "register_local_pretrained_model.description": "This is a sentence transformer model", "register_local_pretrained_model.model_format": "TORCH_SCRIPT", "register_local_pretrained_model.deploy": "true", - "register_local_pretrained_model.version": "1.0.2", + "register_local_pretrained_model.version": "1.0.1", "create_ingest_pipeline.pipeline_id": "nlp-ingest-pipeline", "create_ingest_pipeline.description": "A text embedding pipeline", "create_ingest_pipeline.model_id": "123", diff --git 
a/src/main/resources/defaults/semantic-search-with-local-model-defaults.json b/src/main/resources/defaults/semantic-search-with-local-model-defaults.json index 5330d04a5..89fad8465 100644 --- a/src/main/resources/defaults/semantic-search-with-local-model-defaults.json +++ b/src/main/resources/defaults/semantic-search-with-local-model-defaults.json @@ -1,11 +1,11 @@ { "template.name": "semantic search with local pretrained model", "template.description": "Setting up semantic search, with a local pretrained embedding model", - "register_local_pretrained_model.name": "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b", + "register_local_pretrained_model.name": "huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2", "register_local_pretrained_model.description": "This is a sentence transformer model", "register_local_pretrained_model.model_format": "TORCH_SCRIPT", "register_local_pretrained_model.deploy": "true", - "register_local_pretrained_model.version": "1.0.2", + "register_local_pretrained_model.version": "1.0.1", "create_ingest_pipeline.pipeline_id": "nlp-ingest-pipeline", "create_ingest_pipeline.description": "A text embedding pipeline", "text_embedding.field_map.input": "passage_text", diff --git a/src/main/resources/defaults/semantic-search-with-reindex-defaults.json b/src/main/resources/defaults/semantic-search-with-reindex-defaults.json new file mode 100644 index 000000000..b59780ee9 --- /dev/null +++ b/src/main/resources/defaults/semantic-search-with-reindex-defaults.json @@ -0,0 +1,31 @@ +{ + "template.name": "semantic search with cohere embedding", + "template.description": "Setting up semantic search, with a Cohere embedding model", + "create_connector.name": "cohere-embedding-connector", + "create_connector.description": "The connector to Cohere's public embed API", + "create_connector.protocol": "http", + "create_connector.model": "embed-english-v3.0", + "create_connector.input_type": "search_document", + "create_connector.truncate": 
"end", + "create_connector.credential.key": "123", + "create_connector.actions.url": "https://api.cohere.ai/v1/embed", + "create_connector.actions.request_body": "{ \"texts\": ${parameters.texts}, \"truncate\": \"${parameters.truncate}\", \"model\": \"${parameters.model}\", \"input_type\": \"${parameters.input_type}\" }", + "create_connector.actions.pre_process_function": "connector.pre_process.cohere.embedding", + "create_connector.actions.post_process_function": "connector.post_process.cohere.embedding", + "register_remote_model.name": "Cohere english embed model", + "register_remote_model.description": "cohere-embedding-model", + "create_ingest_pipeline.pipeline_id": "nlp-ingest-pipeline", + "create_ingest_pipeline.description": "A text embedding pipeline", + "text_embedding.field_map.input": "passage_text", + "text_embedding.field_map.output": "passage_embedding", + "create_index.name": "my-nlp-index", + "create_index.settings.number_of_shards": "2", + "create_index.mappings.method.engine": "lucene", + "create_index.mappings.method.space_type": "l2", + "create_index.mappings.method.name": "hnsw", + "text_embedding.field_map.output.dimension": "1024", + "create_search_pipeline.pipeline_id": "default_model_pipeline", + "reindex.source_index": "", + "reindex.requests_per_second": "-1", + "reindex.slices": "1" +} diff --git a/src/main/resources/substitutionTemplates/semantic-search-with-reindex-template.json b/src/main/resources/substitutionTemplates/semantic-search-with-reindex-template.json new file mode 100644 index 000000000..6460eabdc --- /dev/null +++ b/src/main/resources/substitutionTemplates/semantic-search-with-reindex-template.json @@ -0,0 +1,135 @@ +{ + "name": "${{template.name}}", + "description": "${{template.description}}", + "use_case": "SEMANTIC_SEARCH", + "version": { + "template": "1.0.0", + "compatibility": [ + "2.12.0", + "3.0.0" + ] + }, + "workflows": { + "provision": { + "nodes": [ + { + "id": "create_connector", + "type": 
"create_connector", + "user_inputs": { + "name": "${{create_connector.name}}", + "description": "${{create_connector.description}}", + "version": "1", + "protocol": "${{create_connector.protocol}}", + "parameters": { + "endpoint": "${{create_connector.endpoint}}", + "model": "${{create_connector.model}}", + "input_type": "search_document", + "truncate": "END" + }, + "credential": { + "key": "${{create_connector.credential.key}}" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "${{create_connector.actions.url}}", + "headers": { + "Authorization": "Bearer ${credential.key}", + "Request-Source": "unspecified:opensearch" + }, + "request_body": "${{create_connector.actions.request_body}}", + "pre_process_function": "${{create_connector.actions.pre_process_function}}", + "post_process_function": "${{create_connector.actions.post_process_function}}" + } + ] + } + }, + { + "id": "register_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_connector": "connector_id" + }, + "user_inputs": { + "name": "${{register_remote_model.name}}", + "function_name": "remote", + "description": "${{register_remote_model.description}}", + "deploy": true + } + }, + { + "id": "create_ingest_pipeline", + "type": "create_ingest_pipeline", + "previous_node_inputs": { + "register_model": "model_id" + }, + "user_inputs": { + "pipeline_id": "${{create_ingest_pipeline.pipeline_id}}", + "configurations": { + "description": "${{create_ingest_pipeline.description}}", + "processors": [ + { + "text_embedding": { + "model_id": "${{register_model.model_id}}", + "field_map": { + "${{text_embedding.field_map.input}}": "${{text_embedding.field_map.output}}" + } + } + } + ] + } + } + }, + { + "id": "create_index", + "type": "create_index", + "previous_node_inputs": { + "create_ingest_pipeline": "pipeline_id" + }, + "user_inputs": { + "index_name": "${{create_index.name}}", + "configurations": { + "settings": { + "index.knn": true, + 
"default_pipeline": "${{create_ingest_pipeline.pipeline_id}}", + "number_of_shards": "${{create_index.settings.number_of_shards}}" + }, + "mappings": { + "properties": { + "${{text_embedding.field_map.output}}": { + "type": "knn_vector", + "dimension": "${{text_embedding.field_map.output.dimension}}", + "method": { + "engine": "${{create_index.mappings.method.engine}}", + "space_type": "${{create_index.mappings.method.space_type}}", + "name": "${{create_index.mappings.method.name}}", + "parameters": {} + } + }, + "${{text_embedding.field_map.input}}": { + "type": "text" + } + } + } + } + } + }, + { + "id": "reindex", + "type": "reindex", + "previous_node_inputs": { + "create_index": "index_name" + }, + "user_inputs": { + "source_index": "${{reindex.source_index}}", + "destination_index": "${{create_index.name}}", + "refresh": false, + "requests_per_second": "${{reindex.requests_per_second}}", + "slices": "${{reindex.slices}}" + } + } + ] + } + } +}