Skip to content

Commit

Permalink
community[patch]: DuckDB VS - expose similarity, improve performance …
Browse files Browse the repository at this point in the history
…of from_texts (langchain-ai#20971)

Three fixes to the DuckDB vector store:
- unify defaults in constructor and from_texts (users no longer have to
specify `vector_key`).
- include search similarity into output metadata (fixes langchain-ai#20969)
- significantly improve performance of `from_documents`

Dependencies: added Pandas to speed up `from_documents`.
I considered CSV and JSON options instead, but I expect trouble loading
JSON values that way, and both the CSV and JSON approaches require storing
the data to disk first.
In any case, the poetry file for langchain-community already declares a
dependency on Pandas.

---------

Co-authored-by: Bagatur <[email protected]>
Co-authored-by: Harrison Chase <[email protected]>
Co-authored-by: ccurme <[email protected]>
  • Loading branch information
4 people authored May 24, 2024
1 parent 42207f5 commit cccc8fb
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 22 deletions.
8 changes: 4 additions & 4 deletions docs/docs/integrations/vectorstores/duckdb.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"metadata": {},
"outputs": [],
"source": [
"! pip install duckdb langchain-community"
"! pip install duckdb langchain langchain-community langchain-openai"
]
},
{
Expand Down Expand Up @@ -86,7 +86,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -100,9 +100,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
74 changes: 56 additions & 18 deletions libs/community/langchain_community/vectorstores/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,23 @@
from __future__ import annotations

import json
import logging
import uuid
import warnings
from typing import Any, Iterable, List, Optional, Type

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VST, VectorStore

logger = logging.getLogger(__name__)

DEFAULT_VECTOR_KEY = "embedding"
DEFAULT_ID_KEY = "id"
DEFAULT_TEXT_KEY = "text"
DEFAULT_TABLE_NAME = "embeddings"
SIMILARITY_ALIAS = "similarity_score"


class DuckDB(VectorStore):
"""`DuckDB` vector store.
Expand Down Expand Up @@ -76,10 +86,10 @@ def __init__(
*,
connection: Optional[Any] = None,
embedding: Embeddings,
vector_key: str = "embedding",
id_key: str = "id",
text_key: str = "text",
table_name: str = "vectorstore",
vector_key: str = DEFAULT_VECTOR_KEY,
id_key: str = DEFAULT_ID_KEY,
text_key: str = DEFAULT_TEXT_KEY,
table_name: str = DEFAULT_TABLE_NAME,
):
"""Initialize with DuckDB connection and setup for vector storage."""
try:
Expand All @@ -100,8 +110,6 @@ def __init__(
raise ValueError("An embedding function or model must be provided.")

if connection is None:
import warnings

warnings.warn(
"No DuckDB connection provided. A new connection will be created."
"This connection is running in memory and no data will be persisted."
Expand Down Expand Up @@ -138,13 +146,25 @@ def add_texts(
Returns:
List of ids of the added texts.
"""
have_pandas = False
try:
import pandas as pd

have_pandas = True
except ImportError:
logger.info(
"Unable to import pandas. "
"Install it with `pip install -U pandas` "
"to improve performance of add_texts()."
)

# Extract ids from kwargs or generate new ones if not provided
ids = kwargs.pop("ids", [str(uuid.uuid4()) for _ in texts])

# Embed texts and create documents
ids = ids or [str(uuid.uuid4()) for _ in texts]
embeddings = self._embedding.embed_documents(list(texts))
data = []
for idx, text in enumerate(texts):
embedding = embeddings[idx]
# Serialize metadata if present, else default to None
Expand All @@ -153,9 +173,26 @@ def add_texts(
if metadatas and idx < len(metadatas)
else None
)
if have_pandas:
data.append(
{
self._id_key: ids[idx],
self._text_key: text,
self._vector_key: embedding,
"metadata": metadata,
}
)
else:
self._connection.execute(
f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
[ids[idx], text, embedding, metadata],
)

if have_pandas:
# noinspection PyUnusedLocal
df = pd.DataFrame.from_dict(data) # noqa: F841
self._connection.execute(
f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
[ids[idx], text, embedding, metadata],
f"INSERT INTO {self._table_name} SELECT * FROM df",
)
return ids

Expand All @@ -181,20 +218,21 @@ def similarity_search(
self._table.select(
*[
self.duckdb.StarExpression(exclude=[]),
list_cosine_similarity.alias("similarity"),
list_cosine_similarity.alias(SIMILARITY_ALIAS),
]
)
.order("similarity desc")
.order(f"{SIMILARITY_ALIAS} desc")
.limit(k)
.select(
self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
)
.fetchdf()
)
return [
Document(
page_content=docs[self._text_key][idx],
metadata=json.loads(docs["metadata"][idx])
metadata={
**json.loads(docs["metadata"][idx]),
# using underscore prefix to avoid conflicts with user metadata keys
f"_{SIMILARITY_ALIAS}": docs[SIMILARITY_ALIAS][idx],
}
if docs["metadata"][idx]
else {},
)
Expand Down Expand Up @@ -231,10 +269,10 @@ def from_texts(

# Extract kwargs for DuckDB instance creation
connection = kwargs.get("connection", None)
vector_key = kwargs.get("vector_key", "vector")
id_key = kwargs.get("id_key", "id")
text_key = kwargs.get("text_key", "text")
table_name = kwargs.get("table_name", "embeddings")
vector_key = kwargs.get("vector_key", DEFAULT_VECTOR_KEY)
id_key = kwargs.get("id_key", DEFAULT_ID_KEY)
text_key = kwargs.get("text_key", DEFAULT_TEXT_KEY)
table_name = kwargs.get("table_name", DEFAULT_TABLE_NAME)

# Create an instance of DuckDB
instance = DuckDB(
Expand Down

0 comments on commit cccc8fb

Please sign in to comment.