From cccc8fbe2fe59bde0846875f67aa046aeb1105a3 Mon Sep 17 00:00:00 2001
From: Jan Soubusta <jan.soubusta@gooddata.com>
Date: Sat, 25 May 2024 00:17:52 +0200
Subject: [PATCH] community[patch]: DuckDB VS - expose similarity, improve
 performance of from_texts (#20971)

Three fixes to the DuckDB vector store:
- unify defaults in constructor and from_texts (users no longer have to
specify `vector_key`).
- include search similarity into output metadata (fixes #20969)
- significantly improve performance of `from_documents`

Dependencies: Pandas is now used, when available, to speed up
`from_documents` (without it, rows are still inserted one at a time).
I considered CSV and JSON bulk-load options instead, but I expect
trouble loading JSON values that way, and both options require writing
the data to disk first.
In any case, the poetry file for langchain-community already declares a
dependency on Pandas.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: ccurme <chester.curme@gmail.com>
---
 .../integrations/vectorstores/duckdb.ipynb    |  8 +-
 .../vectorstores/duckdb.py                    | 74 ++++++++++++++-----
 2 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/docs/docs/integrations/vectorstores/duckdb.ipynb b/docs/docs/integrations/vectorstores/duckdb.ipynb
index e87f1c4142ffd..be23a3f2b8df4 100644
--- a/docs/docs/integrations/vectorstores/duckdb.ipynb
+++ b/docs/docs/integrations/vectorstores/duckdb.ipynb
@@ -14,7 +14,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! pip install duckdb langchain-community"
+    "! pip install duckdb langchain langchain-community langchain-openai"
    ]
   },
   {
@@ -86,7 +86,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -100,9 +100,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.2"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/libs/community/langchain_community/vectorstores/duckdb.py b/libs/community/langchain_community/vectorstores/duckdb.py
index dd3b1611e87ec..e949d6ac1ab33 100644
--- a/libs/community/langchain_community/vectorstores/duckdb.py
+++ b/libs/community/langchain_community/vectorstores/duckdb.py
@@ -2,13 +2,23 @@
 from __future__ import annotations
 
 import json
+import logging
 import uuid
+import warnings
 from typing import Any, Iterable, List, Optional, Type
 
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.vectorstores import VST, VectorStore
 
+logger = logging.getLogger(__name__)
+
+DEFAULT_VECTOR_KEY = "embedding"
+DEFAULT_ID_KEY = "id"
+DEFAULT_TEXT_KEY = "text"
+DEFAULT_TABLE_NAME = "embeddings"
+SIMILARITY_ALIAS = "similarity_score"
+
 
 class DuckDB(VectorStore):
     """`DuckDB` vector store.
@@ -76,10 +86,10 @@ def __init__(
         *,
         connection: Optional[Any] = None,
         embedding: Embeddings,
-        vector_key: str = "embedding",
-        id_key: str = "id",
-        text_key: str = "text",
-        table_name: str = "vectorstore",
+        vector_key: str = DEFAULT_VECTOR_KEY,
+        id_key: str = DEFAULT_ID_KEY,
+        text_key: str = DEFAULT_TEXT_KEY,
+        table_name: str = DEFAULT_TABLE_NAME,
     ):
         """Initialize with DuckDB connection and setup for vector storage."""
         try:
@@ -100,8 +110,6 @@ def __init__(
             raise ValueError("An embedding function or model must be provided.")
 
         if connection is None:
-            import warnings
-
             warnings.warn(
                 "No DuckDB connection provided. A new connection will be created."
                 "This connection is running in memory and no data will be persisted."
@@ -138,6 +146,17 @@ def add_texts(
         Returns:
             List of ids of the added texts.
         """
+        have_pandas = False
+        try:
+            import pandas as pd
+
+            have_pandas = True
+        except ImportError:
+            logger.info(
+                "Unable to import pandas. "
+                "Install it with `pip install -U pandas` "
+                "to improve performance of add_texts()."
+            )
 
         # Extract ids from kwargs or generate new ones if not provided
         ids = kwargs.pop("ids", [str(uuid.uuid4()) for _ in texts])
@@ -145,6 +164,7 @@ def add_texts(
         # Embed texts and create documents
         ids = ids or [str(uuid.uuid4()) for _ in texts]
         embeddings = self._embedding.embed_documents(list(texts))
+        data = []
         for idx, text in enumerate(texts):
             embedding = embeddings[idx]
             # Serialize metadata if present, else default to None
@@ -153,9 +173,26 @@ def add_texts(
                 if metadatas and idx < len(metadatas)
                 else None
             )
+            if have_pandas:
+                data.append(
+                    {
+                        self._id_key: ids[idx],
+                        self._text_key: text,
+                        self._vector_key: embedding,
+                        "metadata": metadata,
+                    }
+                )
+            else:
+                self._connection.execute(
+                    f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
+                    [ids[idx], text, embedding, metadata],
+                )
+
+        if have_pandas:
+            # noinspection PyUnusedLocal
+            df = pd.DataFrame.from_dict(data)  # noqa: F841
             self._connection.execute(
-                f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
-                [ids[idx], text, embedding, metadata],
+                f"INSERT INTO {self._table_name} SELECT * FROM df",
             )
         return ids
 
@@ -181,20 +218,21 @@ def similarity_search(
             self._table.select(
                 *[
                     self.duckdb.StarExpression(exclude=[]),
-                    list_cosine_similarity.alias("similarity"),
+                    list_cosine_similarity.alias(SIMILARITY_ALIAS),
                 ]
             )
-            .order("similarity desc")
+            .order(f"{SIMILARITY_ALIAS} desc")
             .limit(k)
-            .select(
-                self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
-            )
             .fetchdf()
         )
         return [
             Document(
                 page_content=docs[self._text_key][idx],
-                metadata=json.loads(docs["metadata"][idx])
+                metadata={
+                    **json.loads(docs["metadata"][idx]),
+                    # using underscore prefix to avoid conflicts with user metadata keys
+                    f"_{SIMILARITY_ALIAS}": docs[SIMILARITY_ALIAS][idx],
+                }
                 if docs["metadata"][idx]
                 else {},
             )
@@ -231,10 +269,10 @@ def from_texts(
 
         # Extract kwargs for DuckDB instance creation
         connection = kwargs.get("connection", None)
-        vector_key = kwargs.get("vector_key", "vector")
-        id_key = kwargs.get("id_key", "id")
-        text_key = kwargs.get("text_key", "text")
-        table_name = kwargs.get("table_name", "embeddings")
+        vector_key = kwargs.get("vector_key", DEFAULT_VECTOR_KEY)
+        id_key = kwargs.get("id_key", DEFAULT_ID_KEY)
+        text_key = kwargs.get("text_key", DEFAULT_TEXT_KEY)
+        table_name = kwargs.get("table_name", DEFAULT_TABLE_NAME)
 
         # Create an instance of DuckDB
         instance = DuckDB(