Merge branch 'langchain-ai:master' into master
chadj2 authored May 15, 2024
2 parents 9a76986 + f2f970f commit cf2bed9
Showing 11 changed files with 355 additions and 102 deletions.
2 changes: 1 addition & 1 deletion docs/docs/how_to/function_calling.ipynb
@@ -696,7 +696,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.1"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
376 changes: 299 additions & 77 deletions docs/docs/how_to/structured_output.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/docs/integrations/chat/openai.ipynb
@@ -147,7 +147,7 @@
 "\n",
 "### ChatOpenAI.bind_tools()\n",
 "\n",
-"With `ChatAnthropic.bind_tools`, we can easily pass in Pydantic classes, dict schemas, LangChain tools, or even functions as tools to the model. Under the hood these are converted to an Anthropic tool schemas, which looks like:\n",
+"With `ChatOpenAI.bind_tools`, we can easily pass in Pydantic classes, dict schemas, LangChain tools, or even functions as tools to the model. Under the hood these are converted to an OpenAI tool schemas, which looks like:\n",
 "```\n",
 "{\n",
 " \"name\": \"...\",\n",
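For reference, a minimal sketch of the corrected API (the model name and tool definition here are illustrative, not taken from this diff):

    from langchain_core.pydantic_v1 import BaseModel, Field
    from langchain_openai import ChatOpenAI

    class GetWeather(BaseModel):
        """Get the current weather in a given location."""
        location: str = Field(..., description="City and state, e.g. San Francisco, CA")

    llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
    # The Pydantic class is converted to an OpenAI tool schema under the hood.
    llm_with_tools = llm.bind_tools([GetWeather])
    ai_msg = llm_with_tools.invoke("What is the weather like in San Francisco?")
    print(ai_msg.tool_calls)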
6 changes: 3 additions & 3 deletions docs/docusaurus.config.js
@@ -124,7 +124,7 @@ const config = {
       /** @type {import('@docusaurus/preset-classic').ThemeConfig} */
       ({
         announcementBar: {
-          content: 'You are viewing the <strong>preview</strong> LangChain v0.2 docs. Note that 0.2 Search features are currently unstable and in progress. View the <a href="/v0.1/docs/get_started/introduction/">stable 0.1 docs here</a>.',
+          content: 'You are viewing the <strong>preview</strong> LangChain v0.2 docs. View the <a href="/v0.1/docs/get_started/introduction/">stable 0.1 docs here</a>.',
           isCloseable: true,
         },
         docs: {
@@ -310,9 +310,9 @@ const config = {
       // this is linked to [email protected] currently
       apiKey: "6c01842d6a88772ed2236b9c85806441",

-      indexName: "python-langchain",
+      indexName: "python-langchain-0.2",

-      contextualSearch: true,
+      contextualSearch: false,
     },
   }),
4 changes: 0 additions & 4 deletions docs/scripts/notebook_convert.py
@@ -84,12 +84,8 @@ def check_conditions(self, cell):
         pattern = re.compile(r"(?s)(?:\s*\Z)|(?:.*#\s*\|\s*output:\s*false.*)")
         rtn = not pattern.match(cell.source)
         if not rtn:
-            print("--remove--")
-            print(cell.source)
             return False
         else:
-            print("--keep--")
-            print(cell.source)
             return True

     def preprocess(self, nb, resources):
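A quick sketch of what that regex filters (my reading of the pattern, not part of the commit): it matches cells that are empty or whitespace-only, or whose source carries a `# | output: false` marker, and `check_conditions` keeps only cells the pattern does not match.

    import re

    pattern = re.compile(r"(?s)(?:\s*\Z)|(?:.*#\s*\|\s*output:\s*false.*)")

    kept = "print('hello')"                      # no marker -> check_conditions keeps it
    removed_empty = "   \n   "                   # whitespace-only -> removed
    removed_marked = "x = 1  # | output: false"  # marker comment -> removed

    for source in (kept, removed_empty, removed_marked):
        print(bool(pattern.match(source)), repr(source))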
4 changes: 3 additions & 1 deletion libs/core/langchain_core/language_models/base.py
@@ -204,7 +204,9 @@ async def agenerate_prompt(
     def with_structured_output(
         self, schema: Union[Dict, Type[BaseModel]], **kwargs: Any
     ) -> Runnable[LanguageModelInput, Union[Dict, BaseModel]]:
-        """Implement this if there is a way of steering the model to generate responses that match a given schema."""  # noqa: E501
+        """Not implemented on this class."""
+        # Implement this on child class if there is a way of steering the model to
+        # generate responses that match a given schema.
         raise NotImplementedError()

     @deprecated("0.1.7", alternative="invoke", removal="0.3.0")
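Subclasses that can steer generation (e.g. `ChatOpenAI`) override this method; a minimal usage sketch, with the schema class being illustrative:

    from langchain_core.pydantic_v1 import BaseModel
    from langchain_openai import ChatOpenAI

    class Joke(BaseModel):
        setup: str
        punchline: str

    # Returns a Runnable that coerces model output into the given schema.
    structured_llm = ChatOpenAI(model="gpt-3.5-turbo-0125").with_structured_output(Joke)
    result = structured_llm.invoke("Tell me a joke about cats")  # -> Joke instance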
5 changes: 5 additions & 0 deletions libs/core/pyproject.toml
@@ -80,6 +80,11 @@ select = [
 disallow_untyped_defs = "True"
 exclude = ["notebooks", "examples", "example_data", "langchain_core/pydantic"]

+[[tool.mypy.overrides]]
+# conditional dependencies introduced by langsmith-sdk
+module = ["numpy", "pytest"]
+ignore_missing_imports = true
+
 [tool.coverage.run]
 omit = ["tests/*"]
43 changes: 32 additions & 11 deletions libs/partners/mongodb/langchain_mongodb/vectorstores.py
@@ -16,6 +16,7 @@
 )

 import numpy as np
+from bson import ObjectId, json_util
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.runnables.config import run_in_executor
@@ -31,7 +32,7 @@

 logger = logging.getLogger(__name__)

-DEFAULT_INSERT_BATCH_SIZE = 100
+DEFAULT_INSERT_BATCH_SIZE = 100_000


 class MongoDBAtlasVectorSearch(VectorStore):
@@ -150,18 +151,24 @@ def add_texts(
         """
         batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE)
         _metadatas: Union[List, Generator] = metadatas or ({} for _ in texts)
-        texts_batch = []
-        metadatas_batch = []
+        texts_batch = texts
+        metadatas_batch = _metadatas
         result_ids = []
-        for i, (text, metadata) in enumerate(zip(texts, _metadatas)):
-            texts_batch.append(text)
-            metadatas_batch.append(metadata)
-            if (i + 1) % batch_size == 0:
-                result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
-                texts_batch = []
-                metadatas_batch = []
+        if batch_size:
+            texts_batch = []
+            metadatas_batch = []
+            size = 0
+            for i, (text, metadata) in enumerate(zip(texts, _metadatas)):
+                size += len(text) + len(metadata)
+                texts_batch.append(text)
+                metadatas_batch.append(metadata)
+                if (i + 1) % batch_size == 0 or size >= 47_000_000:
+                    result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
+                    texts_batch = []
+                    metadatas_batch = []
+                    size = 0
         if texts_batch:
-            result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
+            result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))  # type: ignore
         return result_ids
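The new flush condition caps each batch at roughly 47 MB of accumulated text, presumably to stay under MongoDB's 48 MB message-size limit (my inference; the diff does not say), while `batch_size` remains overridable per call:

    # Hypothetical usage; `vectorstore` is an initialized MongoDBAtlasVectorSearch.
    ids = vectorstore.add_texts(
        texts=["first document", "second document"],
        metadatas=[{"source": "a"}, {"source": "b"}],
        batch_size=500,  # overrides DEFAULT_INSERT_BATCH_SIZE (now 100_000)
    )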

def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
@@ -210,9 +217,23 @@ def _similarity_search_with_score(
             pipeline.extend(post_filter_pipeline)
         cursor = self._collection.aggregate(pipeline)  # type: ignore[arg-type]
         docs = []
+
+        def _make_serializable(obj: Dict[str, Any]) -> None:
+            for k, v in obj.items():
+                if isinstance(v, dict):
+                    _make_serializable(v)
+                elif isinstance(v, list) and v and isinstance(v[0], ObjectId):
+                    obj[k] = [json_util.default(item) for item in v]
+                elif isinstance(v, ObjectId):
+                    obj[k] = json_util.default(v)
+
         for res in cursor:
             text = res.pop(self._text_key)
             score = res.pop("score")
+            # Make every ObjectId found JSON-Serializable
+            # following format used in bson.json_util.loads
+            # e.g. loads('{"_id": {"$oid": "664..."}}') == {'_id': ObjectId('664..')}  # noqa: E501
+            _make_serializable(res)
             docs.append((Document(page_content=text, metadata=res), score))
         return docs
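What this buys, as a standalone sketch mirroring the unit test below: converted metadata survives a plain `json` round trip, and `bson.json_util` can still recover the original `ObjectId`:

    from json import dumps, loads

    from bson import ObjectId, json_util

    oid = ObjectId()
    metadata = {"_id": json_util.default(oid)}  # -> {"_id": {"$oid": "664..."}}

    assert loads(dumps(metadata)) == metadata              # plain json now round-trips
    assert json_util.loads(dumps(metadata))["_id"] == oid  # ObjectId is recoverable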
4 changes: 2 additions & 2 deletions libs/partners/mongodb/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langchain-mongodb"
-version = "0.1.3"
+version = "0.1.4"
 description = "An integration package connecting MongoDB and LangChain"
 authors = []
 readme = "README.md"
@@ -28,7 +28,7 @@ pytest-watcher = "^0.3.4"
 pytest-asyncio = "^0.21.1"
 langchain = { path = "../../langchain", develop = true }
 langchain-core = { path = "../../core", develop = true }
-langchain-text-splitters = {path = "../../text-splitters", develop = true}
+langchain-text-splitters = { path = "../../text-splitters", develop = true }

 [tool.poetry.group.codespell]
 optional = true
7 changes: 7 additions & 0 deletions libs/partners/mongodb/tests/unit_tests/test_vectorstores.py
@@ -1,6 +1,8 @@
+from json import dumps, loads
 from typing import Any, Optional

 import pytest
+from bson import ObjectId, json_util
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from pymongo.collection import Collection
@@ -75,6 +77,11 @@ def _validate_search(
         output = vectorstore.similarity_search("", k=1)
         assert output[0].page_content == page_content
         assert output[0].metadata.get("c") == metadata
+        # Validate the ObjectId provided is json serializable
+        assert loads(dumps(output[0].page_content)) == output[0].page_content
+        assert loads(dumps(output[0].metadata)) == output[0].metadata
+        json_metadata = dumps(output[0].metadata)  # normal json.dumps
+        assert isinstance(json_util.loads(json_metadata)["_id"], ObjectId)

     def test_from_documents(
         self, embedding_openai: Embeddings, collection: MockCollection
4 changes: 2 additions & 2 deletions libs/partners/mongodb/tests/utils.py
@@ -1,9 +1,9 @@
 from __future__ import annotations

-import uuid
 from copy import deepcopy
 from typing import Any, Dict, List, Mapping, Optional, cast

+from bson import ObjectId
 from langchain_core.callbacks.manager import (
     AsyncCallbackManagerForLLMRun,
     CallbackManagerForLLMRun,
@@ -162,7 +162,7 @@ def delete_many(self, *args, **kwargs) -> DeleteResult:  # type: ignore

     def insert_many(self, to_insert: List[Any], *args, **kwargs) -> InsertManyResult:  # type: ignore
         mongodb_inserts = [
-            {"_id": str(uuid.uuid4()), "score": 1, **insert} for insert in to_insert
+            {"_id": ObjectId(), "score": 1, **insert} for insert in to_insert
         ]
         self._data.extend(mongodb_inserts)
         return self._insert_result or InsertManyResult(
