partners: add Elasticsearch package (langchain-ai#17467)

### Description This PR moves the Elasticsearch classes to a partners package. Note that we will not move (and later remove) `ElasticKnnSearch`. It was previously deprecated. `ElasticVectorSearch` is going to stay in the community package since it is used quite a lot still. Also note that I left the `ElasticsearchTranslator` for self query untouched because it resides in main `langchain` package. ### Dependencies There will be another PR that updates the notebooks (potentially pulling them into the partners package) and templates and removes the classes from the community package, see langchain-ai#17468 #### Open question How to make the transition smooth for users? Do we move the import aliases and require people to install `langchain-elasticsearch`? Or do we remove the import aliases from the `langchain` package altogether? What has worked well for other partner packages? --------- Co-authored-by: Erick Friis <[email protected]>
kineticadb · Feb 26, 2024 · 5ab69f9 · 5ab69f9
1 parent a4896da
commit 5ab69f9
Show file tree

Hide file tree

Showing 33 changed files with 4,916 additions and 16 deletions.
diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml
@@ -70,6 +70,9 @@ jobs:
           ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT }}
           ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN }}
           ASTRA_DB_KEYSPACE: ${{ secrets.ASTRA_DB_KEYSPACE }}
+          ES_URL: ${{ secrets.ES_URL }}
+          ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
+          ES_API_KEY: ${{ secrets.ES_API_KEY }}
         run: |
           make integration_tests
 

diff --git a/.github/workflows/_release.yml b/.github/workflows/_release.yml
@@ -191,6 +191,9 @@ jobs:
           ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT }}
           ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN }}
           ASTRA_DB_KEYSPACE: ${{ secrets.ASTRA_DB_KEYSPACE }}
+          ES_URL: ${{ secrets.ES_URL }}
+          ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
+          ES_API_KEY: ${{ secrets.ES_API_KEY }}
         run: make integration_tests
         working-directory: ${{ inputs.working-directory }}
 

diff --git a/cookbook/self_query_hotel_search.ipynb b/cookbook/self_query_hotel_search.ipynb
@@ -1083,7 +1083,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from langchain_community.vectorstores import ElasticsearchStore\n",
+    "from langchain_elasticsearch import ElasticsearchStore\n",
     "from langchain_openai import OpenAIEmbeddings\n",
     "\n",
     "embeddings = OpenAIEmbeddings()"

diff --git a/docs/docs/integrations/providers/elasticsearch.mdx b/docs/docs/integrations/providers/elasticsearch.mdx
@@ -23,15 +23,15 @@ Elastic Cloud is a managed Elasticsearch service. Signup for a [free trial](http
 ### Install Client
 
 ```bash
-pip install elasticsearch
+pip install langchain-elasticsearch
 ```
 
 ## Vector Store
 
 The vector store is a simple wrapper around Elasticsearch. It provides a simple interface to store and retrieve vectors.
 
 ```python
-from langchain_community.vectorstores import ElasticsearchStore
+from langchain_elasticsearch import ElasticsearchStore
 
 from langchain_community.document_loaders import TextLoader
 from langchain.text_splitter import CharacterTextSplitter

diff --git a/docs/docs/integrations/retrievers/self_query/elasticsearch_self_query.ipynb b/docs/docs/integrations/retrievers/self_query/elasticsearch_self_query.ipynb
@@ -60,8 +60,8 @@
     "import getpass\n",
     "import os\n",
     "\n",
-    "from langchain_community.vectorstores import ElasticsearchStore\n",
     "from langchain_core.documents import Document\n",
+    "from langchain_elasticsearch import ElasticsearchStore\n",
     "from langchain_openai import OpenAIEmbeddings\n",
     "\n",
     "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n",

diff --git a/docs/docs/integrations/text_embedding/elasticsearch.ipynb b/docs/docs/integrations/text_embedding/elasticsearch.ipynb
@@ -24,7 +24,7 @@
    },
    "outputs": [],
    "source": [
-    "!pip -q install elasticsearch langchain"
+    "!pip -q install langchain-elasticsearch"
    ]
   },
   {
@@ -36,7 +36,7 @@
    },
    "outputs": [],
    "source": [
-    "from langchain_community.embeddings.elasticsearch import ElasticsearchEmbeddings"
+    "from langchain_elasticsearch import ElasticsearchEmbeddings"
    ]
   },
   {

diff --git a/docs/docs/integrations/vectorstores/elasticsearch.ipynb b/docs/docs/integrations/vectorstores/elasticsearch.ipynb
@@ -21,7 +21,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install --upgrade --quiet  elasticsearch langchain-openai tiktoken langchain"
+    "%pip install --upgrade --quiet  langchain-elasticsearch langchain-openai tiktoken langchain"
    ]
   },
   {
@@ -64,7 +64,7 @@
     "\n",
     "Example:\n",
     "```python\n",
-    "        from langchain_community.vectorstores.elasticsearch import ElasticsearchStore\n",
+    "        from langchain_elasticsearch import ElasticsearchStore\n",
     "        from langchain_openai import OpenAIEmbeddings\n",
     "\n",
     "        embedding = OpenAIEmbeddings()\n",
@@ -79,7 +79,7 @@
     "\n",
     "Example:\n",
     "```python\n",
-    "        from langchain_community.vectorstores import ElasticsearchStore\n",
+    "        from langchain_elasticsearch import ElasticsearchStore\n",
     "        from langchain_openai import OpenAIEmbeddings\n",
     "\n",
     "        embedding = OpenAIEmbeddings()\n",
@@ -97,7 +97,7 @@
     "Example:\n",
     "```python\n",
     "        import elasticsearch\n",
-    "        from langchain_community.vectorstores import ElasticsearchStore\n",
+    "        from langchain_elasticsearch import ElasticsearchStore\n",
     "\n",
     "        es_client= elasticsearch.Elasticsearch(\n",
     "            hosts=[\"http://localhost:9200\"],\n",
@@ -137,7 +137,7 @@
     "\n",
     "Example:\n",
     "```python\n",
-    "        from langchain_community.vectorstores.elasticsearch import ElasticsearchStore\n",
+    "        from langchain_elasticsearch import ElasticsearchStore\n",
     "        from langchain_openai import OpenAIEmbeddings\n",
     "\n",
     "        embedding = OpenAIEmbeddings()\n",
@@ -202,7 +202,7 @@
    },
    "outputs": [],
    "source": [
-    "from langchain_community.vectorstores import ElasticsearchStore\n",
+    "from langchain_elasticsearch import ElasticsearchStore\n",
     "from langchain_openai import OpenAIEmbeddings"
    ]
   },
@@ -817,7 +817,7 @@
    "source": [
     "from typing import Dict\n",
     "\n",
-    "from langchain.docstore.document import Document\n",
+    "from langchain_core.documents import Document\n",
     "\n",
     "\n",
     "def custom_document_builder(hit: Dict) -> Document:\n",
@@ -902,7 +902,7 @@
     "\n",
     "```python\n",
     "\n",
-    "from langchain_community.vectorstores.elasticsearch import ElasticsearchStore\n",
+    "from langchain_elasticsearch import ElasticsearchStore\n",
     "\n",
     "db = ElasticsearchStore(\n",
     "  es_url=\"http://localhost:9200\",\n",
@@ -936,7 +936,7 @@
     "\n",
     "```python\n",
     "\n",
-    "from langchain_community.vectorstores.elasticsearch import ElasticsearchStore\n",
+    "from langchain_elasticsearch import ElasticsearchStore\n",
     "\n",
     "db = ElasticsearchStore(\n",
     "  es_url=\"http://localhost:9200\",\n",

diff --git a/docs/docs/modules/data_connection/indexing.ipynb b/docs/docs/modules/data_connection/indexing.ipynb
@@ -91,8 +91,8 @@
    "outputs": [],
    "source": [
     "from langchain.indexes import SQLRecordManager, index\n",
-    "from langchain_community.vectorstores import ElasticsearchStore\n",
     "from langchain_core.documents import Document\n",
+    "from langchain_elasticsearch import ElasticsearchStore\n",
     "from langchain_openai import OpenAIEmbeddings"
    ]
   },

diff --git a/libs/partners/elasticsearch/.gitignore b/libs/partners/elasticsearch/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/libs/partners/elasticsearch/LICENSE b/libs/partners/elasticsearch/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 LangChain, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/libs/partners/elasticsearch/Makefile b/libs/partners/elasticsearch/Makefile
@@ -0,0 +1,60 @@
+.PHONY: all format lint test tests integration_tests docker_tests help extended_tests
+
+# Default target executed when no arguments are given to make.
+all: help
+
+install:
+	poetry install
+
+# Define a variable for the test file path.
+TEST_FILE ?= tests/unit_tests/
+integration_test integration_tests: TEST_FILE=tests/integration_tests/
+
+test tests integration_test integration_tests:
+	poetry run pytest $(TEST_FILE)
+
+
+######################
+# LINTING AND FORMATTING
+######################
+
+# Define a variable for Python and notebook files.
+PYTHON_FILES=.
+MYPY_CACHE=.mypy_cache
+lint format: PYTHON_FILES=.
+lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/elasticsearch --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
+lint_package: PYTHON_FILES=langchain_elasticsearch
+lint_tests: PYTHON_FILES=tests
+lint_tests: MYPY_CACHE=.mypy_cache_test
+
+lint lint_diff lint_package lint_tests:
+	poetry run ruff .
+	poetry run ruff format $(PYTHON_FILES) --diff
+	poetry run ruff --select I $(PYTHON_FILES)
+	mkdir $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)
+
+format format_diff:
+	poetry run ruff format $(PYTHON_FILES)
+	poetry run ruff --select I --fix $(PYTHON_FILES)
+
+spell_check:
+	poetry run codespell --toml pyproject.toml
+
+spell_fix:
+	poetry run codespell --toml pyproject.toml -w
+
+check_imports: $(shell find langchain_elasticsearch -name '*.py')
+	poetry run python ./scripts/check_imports.py $^
+
+######################
+# HELP
+######################
+
+help:
+	@echo '----'
+	@echo 'check_imports				- check imports'
+	@echo 'format                       - run code formatters'
+	@echo 'lint                         - run linters'
+	@echo 'test                         - run unit tests'
+	@echo 'tests                        - run unit tests'
+	@echo 'test TEST_FILE=<test_file>   - run all tests in file'
diff --git a/libs/partners/elasticsearch/README.md b/libs/partners/elasticsearch/README.md
@@ -0,0 +1,29 @@
+# langchain-elasticsearch
+
+This package contains the LangChain integration with Elasticsearch.
+
+## Installation
+
+```bash
+pip install -U langchain-elasticsearch
+```
+
+TODO document how to get id and key
+
+## Usage
+
+The `ElasticsearchStore` class exposes the connection to the Elasticsearch vector store.
+
+```python
+from langchain_elasticsearch import ElasticsearchStore
+
+embeddings = ... # use a LangChain Embeddings class
+
+vectorstore = ElasticsearchStore(
+    es_cloud_id="your-cloud-id",
+    es_api_key="your-api-key",
+    index_name="your-index-name",
+    embeddings=embeddings,
+)
+```
+
diff --git a/libs/partners/elasticsearch/langchain_elasticsearch/__init__.py b/libs/partners/elasticsearch/langchain_elasticsearch/__init__.py
@@ -0,0 +1,17 @@
+from langchain_elasticsearch.chat_history import ElasticsearchChatMessageHistory
+from langchain_elasticsearch.embeddings import ElasticsearchEmbeddings
+from langchain_elasticsearch.vectorstores import (
+    ApproxRetrievalStrategy,
+    ElasticsearchStore,
+    ExactRetrievalStrategy,
+    SparseRetrievalStrategy,
+)
+
+__all__ = [
+    "ApproxRetrievalStrategy",
+    "ElasticsearchChatMessageHistory",
+    "ElasticsearchEmbeddings",
+    "ElasticsearchStore",
+    "ExactRetrievalStrategy",
+    "SparseRetrievalStrategy",
+]
diff --git a/libs/partners/elasticsearch/langchain_elasticsearch/_utilities.py b/libs/partners/elasticsearch/langchain_elasticsearch/_utilities.py
@@ -0,0 +1,82 @@
+from enum import Enum
+from typing import List, Union
+
+import numpy as np
+
+Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
+
+
+class DistanceStrategy(str, Enum):
+    """Enumeration of the distance strategies for calculating distances
+    between vectors.
+
+    Members also subclass ``str``, and each member's value is identical to
+    its name, so a member compares equal to the matching plain string.
+    """
+
+    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
+    MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
+    DOT_PRODUCT = "DOT_PRODUCT"
+    JACCARD = "JACCARD"
+    COSINE = "COSINE"
+
+
+def maximal_marginal_relevance(
+    query_embedding: np.ndarray,
+    embedding_list: list,
+    lambda_mult: float = 0.5,
+    k: int = 4,
+) -> List[int]:
+    """Calculate maximal marginal relevance.
+
+    Greedily selects indices into ``embedding_list``. The first pick is the
+    embedding with the highest cosine similarity to the query; each later
+    pick is the not-yet-selected candidate ``c`` that maximises
+    ``lambda_mult * sim(query, c) - (1 - lambda_mult) * max_sim(c, selected)``.
+
+    Args:
+        query_embedding: Query vector. A 1-D array is expanded to a
+            single-row 2-D array before similarities are computed.
+        embedding_list: Candidate embeddings to choose from.
+        lambda_mult: Weight given to similarity with the query relative to
+            redundancy with already-selected embeddings. With 1.0 the
+            redundancy term vanishes.
+        k: Maximum number of indices to return.
+
+    Returns:
+        Selected indices into ``embedding_list`` in the order they were
+        picked, at most ``min(k, len(embedding_list))`` of them. An empty
+        list is returned when ``k`` or ``len(embedding_list)`` is not
+        positive.
+    """
+    if min(k, len(embedding_list)) <= 0:
+        return []
+    if query_embedding.ndim == 1:
+        query_embedding = np.expand_dims(query_embedding, axis=0)
+    # Row 0 is the (single) query's similarity to every candidate.
+    similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
+    most_similar = int(np.argmax(similarity_to_query))
+    idxs = [most_similar]
+    selected = np.array([embedding_list[most_similar]])
+    while len(idxs) < min(k, len(embedding_list)):
+        best_score = -np.inf
+        idx_to_add = -1
+        # Recomputed every round because `selected` grows by one row each time.
+        similarity_to_selected = cosine_similarity(embedding_list, selected)
+        for i, query_score in enumerate(similarity_to_query):
+            if i in idxs:
+                continue
+            # Redundancy = similarity to the closest already-selected embedding.
+            redundant_score = max(similarity_to_selected[i])
+            equation_score = (
+                lambda_mult * query_score - (1 - lambda_mult) * redundant_score
+            )
+            if equation_score > best_score:
+                best_score = equation_score
+                idx_to_add = i
+        idxs.append(idx_to_add)
+        selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
+    return idxs
+
+
+def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
+    """Row-wise cosine similarity between two equal-width matrices.
+
+    Uses ``simsimd`` when it can be imported (inputs are cast to float32
+    first) and falls back to a pure NumPy computation otherwise.
+
+    Args:
+        X: Matrix whose rows are vectors.
+        Y: Matrix whose rows are vectors; must have the same number of
+            columns as ``X``.
+
+    Returns:
+        In the NumPy fallback, an array whose entry ``[i, j]`` is the cosine
+        similarity between row ``i`` of ``X`` and row ``j`` of ``Y``, with
+        NaN and infinite entries replaced by 0.0. An empty array is returned
+        when either input has length 0.
+
+    Raises:
+        ValueError: If ``X`` and ``Y`` have a different number of columns.
+    """
+    if len(X) == 0 or len(Y) == 0:
+        return np.array([])
+
+    X = np.array(X)
+    Y = np.array(Y)
+    if X.shape[1] != Y.shape[1]:
+        raise ValueError(
+            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
+            f"and Y has shape {Y.shape}."
+        )
+    try:
+        # Optional accelerated path; an ImportError selects the NumPy fallback.
+        import simsimd as simd  # type: ignore
+
+        X = np.array(X, dtype=np.float32)
+        Y = np.array(Y, dtype=np.float32)
+        # cdist yields cosine distance, so 1 - distance gives the similarity.
+        Z = 1 - simd.cdist(X, Y, metric="cosine")
+        # A bare float result is wrapped so callers always receive an ndarray.
+        if isinstance(Z, float):
+            return np.array([Z])
+        return Z
+    except ImportError:
+        X_norm = np.linalg.norm(X, axis=1)
+        Y_norm = np.linalg.norm(Y, axis=1)
+        # Ignore divide by zero errors run time warnings as those are handled below.
+        with np.errstate(divide="ignore", invalid="ignore"):
+            similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
+        # Division by a zero norm yields NaN/inf; report those pairs as 0.0.
+        similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
+        return similarity