diff --git a/docs/docs.trychroma.com/app/api/search/route.ts b/docs/docs.trychroma.com/app/api/search/route.ts
new file mode 100644
index 00000000000..2952dda82a1
--- /dev/null
+++ b/docs/docs.trychroma.com/app/api/search/route.ts
@@ -0,0 +1,115 @@
+import { NextResponse } from "next/server";
+import { ChromaClient } from "chromadb";
+// @ts-ignore
+import { Collection } from "chromadb/src/Collection";
+
+const chromaClient = new ChromaClient({
+  path: "https://api.trychroma.com:8000",
+  auth: {
+    provider: "token",
+    credentials: process.env.CHROMA_CLOUD_API_KEY,
+    tokenHeaderType: "X_CHROMA_TOKEN",
+  },
+  tenant: process.env.CHROMA_CLOUD_TENANT,
+  database: "docs",
+});
+
+const collection: Collection = await chromaClient.getOrCreateCollection({
+  name: "docs-content",
+});
+
+/** Metadata fields this route reads from each record of the "docs-content" collection. */
+type DocMetadata = {
+  pageTitle: string;
+  title: string;
+  page: string;
+  section: string;
+  subsection?: string;
+};
+
+/** One entry of the JSON array returned by a successful search. */
+type SearchResult = {
+  title: string;
+  pageTitle: string;
+  pageUrl: string;
+};
+
+/**
+ * Converts a section title into a URL fragment.
+ *
+ * Applies the same normalization as the string branch of `generateId` in
+ * components/markdoc/markdoc-heading.tsx, so that titles containing
+ * punctuation (e.g. "Default: all-MiniLM-L6-v2") produce the same id that
+ * function generates.
+ */
+const toAnchor = (title: string): string =>
+  title
+    .toLowerCase()
+    .replaceAll("_", "-")
+    .replace(/[^a-z0-9\s-]/g, "")
+    .replace(/\s+/g, "-");
+
+/** Maps one metadata record to the `{ title, pageTitle, pageUrl }` result shape. */
+const toSearchResult = (m: DocMetadata): SearchResult => {
+  const path = m.subsection
+    ? `/${m.section}/${m.subsection}/${m.page}`
+    : `/${m.section}/${m.page}`;
+  // Page-level hits (no title, or title equal to the page title) link to the
+  // page itself; section-level hits get a fragment for the section heading.
+  const anchor = m.title && m.title !== m.pageTitle ? toAnchor(m.title) : "";
+
+  return {
+    title: m.title,
+    pageTitle: m.pageTitle,
+    pageUrl: anchor ? `${path}#${anchor}` : path,
+  };
+};
+
+/**
+ * GET /api/search?q=<text>
+ *
+ * Queries the docs collection with the given text and responds with a JSON
+ * array of `{ title, pageTitle, pageUrl }`, in the order returned by the
+ * query, with at most one entry per URL.
+ *
+ * Responds 400 when `q` is missing or blank, and 500 when the query fails.
+ */
+export async function GET(request: Request) {
+  try {
+    const { searchParams } = new URL(request.url);
+    const query = searchParams.get("q")?.trim();
+
+    if (!query) {
+      return NextResponse.json(
+        { error: "Query parameter is required" },
+        { status: 400 },
+      );
+    }
+
+    const queryResults = await collection.query({
+      queryTexts: [query],
+      include: ["metadatas"],
+    });
+
+    // A single query text was sent, so only the first result list is relevant.
+    // NOTE(review): entries are treated as possibly null — confirm against the
+    // chromadb client types.
+    const metadatas: (DocMetadata | null)[] = queryResults.metadatas?.[0] ?? [];
+
+    // Several chunks of the same section produce the same URL; keep the first
+    // occurrence of each so the earliest entry in the query's order is kept.
+    const byUrl = new Map<string, SearchResult>();
+    for (const metadata of metadatas) {
+      if (!metadata) continue;
+      const result = toSearchResult(metadata);
+      if (!byUrl.has(result.pageUrl)) {
+        byUrl.set(result.pageUrl, result);
+      }
+    }
+
+    return NextResponse.json(Array.from(byUrl.values()));
+  } catch (error) {
+    console.error(error);
+    return NextResponse.json({ error: "Search failed" }, { status: 500 });
+  }
+}
diff --git a/docs/docs.trychroma.com/components/header/header.tsx b/docs/docs.trychroma.com/components/header/header.tsx
index 087fbc4fbd4..0914df90b87 100644
--- a/docs/docs.trychroma.com/components/header/header.tsx
+++ b/docs/docs.trychroma.com/components/header/header.tsx
@@ -5,7 +5,7 @@ import GithubLink from "@/components/header/github-link";
 import XLink from "@/components/header/x-link";
 import DiscordLink from "@/components/header/discord-link";
 import Link from "next/link";
-import SearchBox from "@/components/header/search-box";
+import SearchDocs from "@/components/header/search-docs";
 
 const Header: React.FC = () => {
   return (
@@ -14,9 +14,9 @@ const Header: React.FC = () => {
-
+ diff --git a/docs/docs.trychroma.com/components/header/search-docs.tsx b/docs/docs.trychroma.com/components/header/search-docs.tsx new file mode 100644 index 00000000000..605e58fc1d7 --- /dev/null +++ b/docs/docs.trychroma.com/components/header/search-docs.tsx @@ -0,0 +1,157 @@ +"use client"; + +import React, { useEffect, useState } from "react"; +import { Dialog, DialogContent, DialogTrigger } from "../ui/dialog"; +import UIButton from "@/components/ui/ui-button"; +import { Cross2Icon, MagnifyingGlassIcon } from "@radix-ui/react-icons"; +import * as DialogPrimitive from "@radix-ui/react-dialog"; +import _ from "lodash"; +import { Input } from "@/components/ui/input"; +import ChromaIcon from "../../public/chroma-icon.svg"; +import { AlertTriangleIcon, ArrowRight, Loader } from "lucide-react"; +import Link from "next/link"; + +const SearchDocs: React.FC = () => { + const [query, setQuery] = useState(""); + const [results, setResults] = useState< + { title: string; pageTitle: string; pageUrl: string }[] + >([]); + const [isLoading, setIsLoading] = useState(false); + const [error, setError] = useState(null); + + const debouncedSearch = _.debounce(async (searchQuery: string) => { + if (!searchQuery.trim()) { + setResults([]); + return; + } + + try { + setIsLoading(true); + setError(null); + + const response = await fetch( + `/api/search?q=${encodeURIComponent(searchQuery)}`, + ); + + if (!response.ok) { + throw new Error("Search failed"); + } + + const data = await response.json(); + setResults(data); + console.log(data); + } catch (err) { + setError("Failed to perform search"); + setResults([]); + } finally { + setIsLoading(false); + } + }, 300); + + useEffect(() => { + debouncedSearch(query); + + return () => { + debouncedSearch.cancel(); + }; + }, [query]); + + return ( + { + if (!open) { + setQuery(""); + } + }} + > + + + +

Search...

+
+
+ +
+
+ {[...Array(7)].map((_, index) => ( +
+ ))} +
+ Search Docs +
+
+ + + Close + +
+
+
+
+ setQuery(e.target.value)} + placeholder="Search..." + className="w-full p-2 border border-black rounded-none" + /> +
+
+ {isLoading && ( +
+ +
+ )} + {error && ( +
+ +

+ Failed to fetch results. Try again later +

+
+ )} + {!isLoading && !error && ( +
+ {results.map((result, index) => ( + setQuery("")} + > + +
+

+ {result.title || result.pageTitle} +

+ {result.title && result.title !== result.pageTitle && ( +

{result.pageTitle}

+ )} +
+ +
+ + ))} +
+ )} +
+
+ +
+ +

Powered by Chroma Cloud

+
+ +
+ +
+ ); +}; + +export default SearchDocs; diff --git a/docs/docs.trychroma.com/components/markdoc/markdoc-heading.tsx b/docs/docs.trychroma.com/components/markdoc/markdoc-heading.tsx index 09da191177b..aa9508ed343 100644 --- a/docs/docs.trychroma.com/components/markdoc/markdoc-heading.tsx +++ b/docs/docs.trychroma.com/components/markdoc/markdoc-heading.tsx @@ -4,6 +4,7 @@ const generateId = (content: React.ReactNode): string => { if (typeof content === "string") { return content .toLowerCase() + .replaceAll("_", "-") .replace(/[^a-z0-9\s-]/g, "") .replace(/\s+/g, "-") .trim(); diff --git a/docs/docs.trychroma.com/components/ui/dialog.tsx b/docs/docs.trychroma.com/components/ui/dialog.tsx new file mode 100644 index 00000000000..4d3be9123d3 --- /dev/null +++ b/docs/docs.trychroma.com/components/ui/dialog.tsx @@ -0,0 +1,117 @@ +"use client"; + +import * as React from "react"; +import * as DialogPrimitive from "@radix-ui/react-dialog"; +import { cn } from "@/lib/utils"; +import { Cross2Icon } from "@radix-ui/react-icons"; + +const Dialog = DialogPrimitive.Root; + +const DialogTrigger = DialogPrimitive.Trigger; + +const DialogPortal = DialogPrimitive.Portal; + +const DialogClose = DialogPrimitive.Close; + +const DialogOverlay = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)); +DialogOverlay.displayName = DialogPrimitive.Overlay.displayName; + +const DialogContent = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, children, ...props }, ref) => ( + + + + {children} + + +)); +DialogContent.displayName = DialogPrimitive.Content.displayName; + +const DialogHeader = ({ + className, + ...props +}: React.HTMLAttributes) => ( +
+); +DialogHeader.displayName = "DialogHeader"; + +const DialogFooter = ({ + className, + ...props +}: React.HTMLAttributes) => ( +
+); +DialogFooter.displayName = "DialogFooter"; + +const DialogTitle = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)); +DialogTitle.displayName = DialogPrimitive.Title.displayName; + +const DialogDescription = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)); +DialogDescription.displayName = DialogPrimitive.Description.displayName; + +export { + Dialog, + DialogPortal, + DialogOverlay, + DialogTrigger, + DialogClose, + DialogContent, + DialogHeader, + DialogFooter, + DialogTitle, + DialogDescription, +}; diff --git a/docs/docs.trychroma.com/components/ui/input.tsx b/docs/docs.trychroma.com/components/ui/input.tsx new file mode 100644 index 00000000000..d24a287d367 --- /dev/null +++ b/docs/docs.trychroma.com/components/ui/input.tsx @@ -0,0 +1,22 @@ +import * as React from "react" + +import { cn } from "@/lib/utils" + +const Input = React.forwardRef>( + ({ className, type, ...props }, ref) => { + return ( + + ) + } +) +Input.displayName = "Input" + +export { Input } diff --git a/docs/docs.trychroma.com/markdoc/content/docs/guides/embeddings-guide.md b/docs/docs.trychroma.com/markdoc/content/docs/guides/embeddings-guide.md deleted file mode 100644 index 5db0bcf7ddb..00000000000 --- a/docs/docs.trychroma.com/markdoc/content/docs/guides/embeddings-guide.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -{ - "id": "embeddings-guide", - "title": "Embeddings", - "section": "Guides", - "order": 1 -} ---- - -# Embeddings - -Embeddings are the A.I-native way to represent any kind of data, making them the perfect fit for working with all kinds of A.I-powered tools and algorithms. They can represent text, images, and soon audio and video. There are many options for creating embeddings, whether locally using an installed library, or by calling an API. 
- -Chroma provides lightweight wrappers around popular embedding providers, making it easy to use them in your apps. You can set an embedding function when you create a Chroma collection, which will be used automatically, or you can call them directly yourself. - -{% special_table %} -{% /special_table %} - -| | Python | JS | -|--------------|-----------|---------------| -| [OpenAI](/integrations/openai) | ✅ | ✅ | -| [Google Generative AI](/integrations/google-gemini) | ✅ | ✅ | -| [Cohere](/integrations/cohere) | ✅ | ✅ | -| [Hugging Face](/integrations/hugging-face) | ✅ | ➖ | -| [Instructor](/integrations/instructor) | ✅ | ➖ | -| [Hugging Face Embedding Server](/integrations/hugging-face-server) | ✅ | ✅ | -| [Jina AI](/integrations/jinaai) | ✅ | ✅ | - -We welcome pull requests to add new Embedding Functions to the community. - -*** - -## Default: all-MiniLM-L6-v2 - -By default, Chroma uses the [Sentence Transformers](https://www.sbert.net/) `all-MiniLM-L6-v2` model to create embeddings. This embedding model can create sentence and document embeddings that can be used for a wide variety of tasks. This embedding function runs locally on your machine, and may require you download the model files (this will happen automatically). - -```python -from chromadb.utils import embedding_functions -default_ef = embedding_functions.DefaultEmbeddingFunction() -``` - -{% note type="default" %} -Embedding functions can be linked to a collection and used whenever you call `add`, `update`, `upsert` or `query`. You can also use them directly which can be handy for debugging. -```py -val = default_ef(["foo"]) -``` --> [[0.05035809800028801, 0.0626462921500206, -0.061827320605516434...]] -{% /note %} - - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -## Sentence Transformers - -Chroma can also use any [Sentence Transformers](https://www.sbert.net/) model to create embeddings. 
- -```python -sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2") -``` - -You can pass in an optional `model_name` argument, which lets you choose which Sentence Transformers model to use. By default, Chroma uses `all-MiniLM-L6-v2`. You can see a list of all available models [here](https://www.sbert.net/docs/pretrained_models.html). - -{% /tab %} -{% tab label="Javascript" %} -{% /tab %} -{% /tabs %} - - -*** - - -## Custom Embedding Functions - -{% tabs group="code-lang" hideContent=true %} - -{% tab label="Python" %} -{% /tab %} - -{% tab label="Javascript" %} -{% /tab %} - -{% /tabs %} - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -You can create your own embedding function to use with Chroma, it just needs to implement the `EmbeddingFunction` protocol. - -```python -from chromadb import Documents, EmbeddingFunction, Embeddings - -class MyEmbeddingFunction(EmbeddingFunction): - def __call__(self, input: Documents) -> Embeddings: - # embed the documents somehow - return embeddings -``` - -We welcome contributions! If you create an embedding function that you think would be useful to others, please consider [submitting a pull request](https://github.com/chroma-core/chroma) to add it to Chroma's `embedding_functions` module. - - -{% /tab %} -{% tab label="Javascript" %} - -You can create your own embedding function to use with Chroma, it just needs to implement the `EmbeddingFunction` protocol. The `.generate` method in a class is strictly all you need. 
- -```javascript -class MyEmbeddingFunction { - private api_key: string; - - constructor(api_key: string) { - this.api_key = api_key; - } - - public async generate(texts: string[]): Promise { - // do things to turn texts into embeddings with an api_key perhaps - return embeddings; - } -} -``` - -{% /tab %} -{% /tabs %} diff --git a/docs/docs.trychroma.com/markdoc/content/docs/guides/multimodal-guide.md b/docs/docs.trychroma.com/markdoc/content/docs/guides/multimodal-guide.md deleted file mode 100644 index 70d7b30e5dc..00000000000 --- a/docs/docs.trychroma.com/markdoc/content/docs/guides/multimodal-guide.md +++ /dev/null @@ -1,158 +0,0 @@ ---- -{ - "id": "multimodal-guide", - "title": "Multimodal", - "section": "Guides", - "order": 2 -} ---- - -# Multimodal - -{% tabs group="code-lang" hideContent=true %} - -{% tab label="Python" %} -{% /tab %} - -{% tab label="Javascript" %} -{% /tab %} - -{% /tabs %} - ---- - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -Chroma supports multimodal collections, i.e. collections which can store, and can be queried by, multiple modalities of data. - -Try it out in Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/chroma-core/chroma/blob/main/examples/multimodal/multimodal_retrieval.ipynb) - -## Multi-modal Embedding Functions - -Chroma supports multi-modal embedding functions, which can be used to embed data from multiple modalities into a single embedding space. - -Chroma has the OpenCLIP embedding function built in, which supports both text and images. - -```python -from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction -embedding_function = OpenCLIPEmbeddingFunction() -``` - -## Data Loaders - -Chroma supports data loaders, for storing and querying with data stored outside Chroma itself, via URI. Chroma will not store this data, but will instead store the URI, and load the data from the URI when needed. 
- -Chroma has an data loader for loading images from a filesystem built in. - -```python -from chromadb.utils.data_loaders import ImageLoader -data_loader = ImageLoader() -``` - -## Multi-modal Collections - -You can create a multi-modal collection by passing in a multi-modal embedding function. In order to load data from a URI, you must also pass in a data loader. - -```python -import chromadb - -client = chromadb.Client() - -collection = client.create_collection( - name='multimodal_collection', - embedding_function=embedding_function, - data_loader=data_loader) - -``` - -### Adding data - -You can add data to a multi-modal collection by specifying the data modality. For now, images are supported: - -```python -collection.add( - ids=['id1', 'id2', 'id3'], - images=[...] # A list of numpy arrays representing images -) -``` - -Note that Chroma will not store the data for you, and you will have to maintain a mapping from IDs to data yourself. - -However, you can use Chroma in combination with data stored elsewhere, by adding it via URI. Note that this requires that you have specified a data loader when creating the collection. - -```python -collection.add( - ids=['id1', 'id2', 'id3'], - uris=[...] # A list of strings representing URIs to data -) -``` - -Since the embedding function is multi-modal, you can also add text to the same collection: - -```python -collection.add( - ids=['id4', 'id5', 'id6'], - documents=["This is a document", "This is another document", "This is a third document"] -) -``` - -### Querying - -You can query a multi-modal collection with any of the modalities that it supports. For example, you can query with images: - -```python -results = collection.query( - query_images=[...] 
# A list of numpy arrays representing images -) -``` - -Or with text: - -```python -results = collection.query( - query_texts=["This is a query document", "This is another query document"] -) -``` - -If a data loader is set for the collection, you can also query with URIs which reference data stored elsewhere of the supported modalities: - -```python -results = collection.query( - query_uris=[...] # A list of strings representing URIs to data -) -``` - -Additionally, if a data loader is set for the collection, and URIs are available, you can include the data in the results: - -```python -results = collection.query( - query_images=[...], # # list of numpy arrays representing images - includes=['data'] -) -``` - -This will automatically call the data loader for any available URIs, and include the data in the results. `uris` are also available as an `includes` field. - -### Updating - -You can update a multi-modal collection by specifying the data modality, in the same way as `add`. For now, images are supported: - -```python -collection.update( - ids=['id1', 'id2', 'id3'], - images=[...] # A list of numpy arrays representing images -) -``` - -Note that a given entry with a specific ID can only have one associated modality at a time. Updates will over-write the existing modality, so for example, an entry which originally has corresponding text and updated with an image, will no longer have that text after an update with images. - -{% /tab %} -{% tab label="Javascript" %} - -Support for multi-modal retrieval for Chroma's JavaScript client is coming soon! 
- -{% /tab %} - -{% /tabs %} - diff --git a/docs/docs.trychroma.com/markdoc/content/docs/guides/usage-guide.md b/docs/docs.trychroma.com/markdoc/content/docs/guides/usage-guide.md deleted file mode 100644 index ce37ff89487..00000000000 --- a/docs/docs.trychroma.com/markdoc/content/docs/guides/usage-guide.md +++ /dev/null @@ -1,851 +0,0 @@ ---- -{ - "id": "usage-guide", - "title": "Usage Guide", - "section": "Guides", - "order": 0 -} ---- - - -# Usage Guide - - -{% tabs group="code-lang" hideContent=true %} - -{% tab label="Python" %} -{% /tab %} - -{% tab label="Javascript" %} -{% /tab %} - -{% /tabs %} - ---- - -## Initiating a persistent Chroma client - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -```python -import chromadb -``` - -You can configure Chroma to save and load the database from your local machine. Data will be persisted automatically and loaded on start (if it exists). - -```python -client = chromadb.PersistentClient(path="/path/to/save/to") -``` - -The `path` is where Chroma will store its database files on disk, and load them on start. - -{% /tab %} -{% tab label="Javascript" %} - -```js -// CJS -const { ChromaClient } = require("chromadb"); - -// ESM -import { ChromaClient } from "chromadb"; -``` - -{% note type="note" title="Connecting to the backend" %} -To connect with the JS client, you must connect to a backend running Chroma. See [Running Chroma in client-server mode](#running-chroma-in-client-server-mode) for how to do this. -{% /note %} - -```js -const client = new ChromaClient(); -``` - -{% /tab %} - -{% /tabs %} - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -The client object has a few useful convenience methods. - -```python -client.heartbeat() # returns a nanosecond heartbeat. Useful for making sure the client remains connected. -client.reset() # Empties and completely resets the database. ⚠️ This is destructive and not reversible. 
-``` - -{% /tab %} -{% tab label="Javascript" %} - -The client object has a few useful convenience methods. - -```javascript -await client.reset() # Empties and completely resets the database. ⚠️ This is destructive and not reversible. -``` - -{% /tab %} - -{% /tabs %} - -## Running Chroma in client-server mode - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -Chroma can also be configured to run in client/server mode. In this mode, the Chroma client connects to a Chroma server running in a separate process. - -To start the Chroma server, run the following command: - -```bash -chroma run --path /db_path -``` - -Then use the Chroma HTTP client to connect to the server: - -```python -import chromadb -chroma_client = chromadb.HttpClient(host='localhost', port=8000) -``` - -That's it! Chroma's API will run in `client-server` mode with just this change. - ---- - -Chroma also provides an async HTTP client. The behaviors and method signatures are identical to the synchronous client, but all methods that would block are now async. To use it, call `AsyncHttpClient` instead: - -```python -import asyncio -import chromadb - -async def main(): - client = await chromadb.AsyncHttpClient() - collection = await client.create_collection(name="my_collection") - - await collection.add( - documents=["hello world"], - ids=["id1"] - ) - -asyncio.run(main()) -``` - - - -#### Using the Python HTTP-only client - -If you are running Chroma in client-server mode, you may not need the full Chroma library. Instead, you can use the lightweight client-only library. -In this case, you can install the `chromadb-client` package. This package is a lightweight HTTP client for the server with a minimal dependency footprint. 
- -```python -pip install chromadb-client -``` - -```python -import chromadb -# Example setup of the client to connect to your chroma server -client = chromadb.HttpClient(host='localhost', port=8000) - -# Or for async usage: -async def main(): - client = await chromadb.AsyncHttpClient(host='localhost', port=8000) -``` - -Note that the `chromadb-client` package is a subset of the full Chroma library and does not include all the dependencies. If you want to use the full Chroma library, you can install the `chromadb` package instead. -Most importantly, there is no default embedding function. If you add() documents without embeddings, you must have manually specified an embedding function and installed the dependencies for it. - -{% /tab %} -{% tab label="Javascript" %} - -To run Chroma in client server mode, first install the chroma library and CLI via pypi: - -```bash -pip install chromadb -``` - -Then start the Chroma server: - -```bash -chroma run --path /db_path -``` - -The JS client then talks to the chroma server backend. - -```js -// CJS -const { ChromaClient } = require("chromadb"); - -// ESM -import { ChromaClient } from "chromadb"; - -const client = new ChromaClient(); -``` - -You can also run the Chroma server in a docker container, or deployed to a cloud provider. See the [deployment docs](./deployment.md) for more information. - -{% /tab %} - -{% /tabs %} - -## Using collections - -Chroma lets you manage collections of embeddings, using the `collection` primitive. - -### Creating, inspecting, and deleting Collections - -Chroma uses collection names in the url, so there are a few restrictions on naming them: - -- The length of the name must be between 3 and 63 characters. -- The name must start and end with a lowercase letter or a digit, and it can contain dots, dashes, and underscores in between. -- The name must not contain two consecutive dots. -- The name must not be a valid IP address. 
- -Chroma collections are created with a name and an optional embedding function. If you supply an embedding function, you must supply it every time you get the collection. - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -```python -collection = client.create_collection(name="my_collection", embedding_function=emb_fn) -collection = client.get_collection(name="my_collection", embedding_function=emb_fn) -``` - -{% note type="caution" %} -If you later wish to `get_collection`, you MUST do so with the embedding function you supplied while creating the collection -{% /note %} - -The embedding function takes text as input, and performs tokenization and embedding. If no embedding function is supplied, Chroma will use [sentence transformer](https://www.sbert.net/index.html) as a default. - -{% /tab %} -{% tab label="Javascript" %} - -```js -// CJS -const { ChromaClient } = require("chromadb"); - -// ESM -import { ChromaClient } from "chromadb"; -``` - -The JS client talks to a chroma server backend. This can run on your local computer or be easily deployed to AWS. - -```js -let collection = await client.createCollection({ - name: "my_collection", - embeddingFunction: emb_fn, -}); -let collection2 = await client.getCollection({ - name: "my_collection", - embeddingFunction: emb_fn, -}); -``` - -{% note type="caution" %} -If you later wish to `getCollection`, you MUST do so with the embedding function you supplied while creating the collection -{% /note %} - -The embedding function takes text as input, and performs tokenization and embedding. - -{% /tab %} - -{% /tabs %} - -You can learn more about [🧬 embedding functions](./guides/embeddings), and how to create your own. - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -Existing collections can be retrieved by name with `.get_collection`, and deleted with `.delete_collection`. 
You can also use `.get_or_create_collection` to get a collection if it exists, or create it if it doesn't. - -```python -collection = client.get_collection(name="test") # Get a collection object from an existing collection, by name. Will raise an exception if it's not found. -collection = client.get_or_create_collection(name="test") # Get a collection object from an existing collection, by name. If it doesn't exist, create it. -client.delete_collection(name="my_collection") # Delete a collection and all associated embeddings, documents, and metadata. ⚠️ This is destructive and not reversible -``` - -{% /tab %} -{% tab label="Javascript" %} - -Existing collections can be retrieved by name with `.getCollection`, and deleted with `.deleteCollection`. - -```javascript -const collection = await client.getCollection({ name: "test" }); // Get a collection object from an existing collection, by name. Will raise an exception of it's not found. -collection = await client.getOrCreateCollection({ name: "test" }); // Get a collection object from an existing collection, by name. If it doesn't exist, create it. -await client.deleteCollection(collection); // Delete a collection and all associated embeddings, documents, and metadata. ⚠️ This is destructive and not reversible -``` - -{% /tab %} - -{% /tabs %} - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -Collections have a few useful convenience methods. - -```python -collection.peek() # returns a list of the first 10 items in the collection -collection.count() # returns the number of items in the collection -collection.modify(name="new_name") # Rename the collection -``` - -{% /tab %} -{% tab label="Javascript" %} - -There are a few useful convenience methods for working with Collections. 
- -```javascript -await collection.peek(); // returns a list of the first 10 items in the collection -await collection.count(); // returns the number of items in the collection -``` - -{% /tab %} - -{% /tabs %} - -### Changing the distance function - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -`create_collection` also takes an optional `metadata` argument which can be used to customize the distance method of the embedding space by setting the value of `hnsw:space`. - -```python - collection = client.create_collection( - name="collection_name", - metadata={"hnsw:space": "cosine"} # l2 is the default - ) -``` - -{% /tab %} -{% tab label="Javascript" %} - -`createCollection` also takes an optional `metadata` argument which can be used to customize the distance method of the embedding space by setting the value of `hnsw:space` - -```js -let collection = client.createCollection({ - name: "collection_name", - metadata: { "hnsw:space": "cosine" }, -}); -``` - -{% /tab %} - -{% /tabs %} - -Valid options for `hnsw:space` are "l2", "ip, "or "cosine". The **default** is "l2" which is the squared L2 norm. - -{% special_table %} -{% /special_table %} - -| Distance | parameter | Equation | -| ----------------- | :-------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Squared L2 | `l2` | {% math latexText="d = \\sum\\left(A_i-B_i\\right)^2" %}{% /math %} | -| Inner product | `ip` | {% math latexText="d = 1.0 - \\sum\\left(A_i \\times B_i\\right) " %}{% /math %} | -| Cosine similarity | `cosine` | {% math latexText="d = 1.0 - \\frac{\\sum\\left(A_i \\times B_i\\right)}{\\sqrt{\\sum\\left(A_i^2\\right)} \\cdot \\sqrt{\\sum\\left(B_i^2\\right)}}" %}{% /math %} | - -### Adding data to a Collection - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -Add data to Chroma with `.add`. 
- -Raw documents: - -```python -collection.add( - documents=["lorem ipsum...", "doc2", "doc3", ...], - metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...], - ids=["id1", "id2", "id3", ...] -) -``` - -{% /tab %} -{% tab label="Javascript" %} - -Add data to Chroma with `.addRecords`. - -Raw documents: - -```javascript -await collection.add({ - ids: ["id1", "id2", "id3", ...], - metadatas: [{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...], - documents: ["lorem ipsum...", "doc2", "doc3", ...], -}) -// input order -// ids - required -// embeddings - optional -// metadata - optional -// documents - optional -``` - -{% /tab %} - -{% /tabs %} - -If Chroma is passed a list of `documents`, it will automatically tokenize and embed them with the collection's embedding function (the default will be used if none was supplied at collection creation). Chroma will also store the `documents` themselves. If the documents are too large to embed using the chosen embedding function, an exception will be raised. - -Each document must have a unique associated `id`. Trying to `.add` the same ID twice will result in only the initial value being stored. An optional list of `metadata` dictionaries can be supplied for each document, to store additional information and enable filtering. - -Alternatively, you can supply a list of document-associated `embeddings` directly, and Chroma will store the associated documents without embedding them itself. - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -```python -collection.add( - documents=["doc1", "doc2", "doc3", ...], - embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...], - metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...], - ids=["id1", "id2", "id3", ...] 
-) -``` - -{% /tab %} -{% tab label="Javascript" %} - -```javascript -await collection.add({ - ids: ["id1", "id2", "id3", ...], - embeddings: [[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...], - metadatas: [{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...], - documents: ["lorem ipsum...", "doc2", "doc3", ...], -}) - -``` - -{% /tab %} - -{% /tabs %} - -If the supplied `embeddings` are not the same dimension as the collection, an exception will be raised. - -You can also store documents elsewhere, and just supply a list of `embeddings` and `metadata` to Chroma. You can use the `ids` to associate the embeddings with your documents stored elsewhere. - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -```python -collection.add( - ids=["id1", "id2", "id3", ...], - embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...], - metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...] -) -``` - -{% /tab %} -{% tab label="Javascript" %} - -```javascript -await collection.add({ - ids: ["id1", "id2", "id3", ...], - embeddings: [[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...], - metadatas: [{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...], -}) -``` - -{% /tab %} - -{% /tabs %} - -### Querying a Collection - -You can query by a set of `query_embeddings`. - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -Chroma collections can be queried in a variety of ways, using the `.query` method. - -```python -collection.query( - query_embeddings=[[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...], - n_results=10, - where={"metadata_field": "is_equal_to_this"}, - where_document={"$contains":"search_string"} -) -``` - -{% /tab %} -{% tab label="Javascript" %} - -Chroma collections can be queried in a variety of ways, using the `.queryRecords` method. 
- -```javascript -const result = await collection.query({ - queryEmbeddings: [[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...], - nResults: 10, - where: {"metadata_field": "is_equal_to_this"}, -}) -// input order -// queryEmbeddings - optional, exactly one of queryEmbeddings and queryTexts must be provided -// queryTexts - optional -// nResults - required -// where - optional -``` - -{% /tab %} - -{% /tabs %} - -The query will return the `n_results` closest matches to each `query_embedding`, in order. -An optional `where` filter dictionary can be supplied to filter by the `metadata` associated with each document. -Additionally, an optional `where_document` filter dictionary can be supplied to filter by contents of the document. - -If the supplied `query_embeddings` are not the same dimension as the collection, an exception will be raised. - -You can also query by a set of `query_texts`. Chroma will first embed each `query_text` with the collection's embedding function, and then perform the query with the generated embedding. - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -```python -collection.query( - query_texts=["doc10", "thus spake zarathustra", ...], - n_results=10, - where={"metadata_field": "is_equal_to_this"}, - where_document={"$contains":"search_string"} -) -``` - -You can also retrieve items from a collection by `id` using `.get`. - -```python -collection.get( - ids=["id1", "id2", "id3", ...], - where={"style": "style1"} -) -``` - -{% /tab %} -{% tab label="Javascript" %} - -```javascript -await collection.query({ - nResults: 10, // n_results - where: {"metadata_field": "is_equal_to_this"}, // where - queryTexts: ["doc10", "thus spake zarathustra", ...], // query_text -}) -``` - -You can also retrieve records from a collection by `id` using `.get`. 
- -```javascript -await collection.get( { - ids: ["id1", "id2", "id3", ...], //ids - where: {"style": "style1"} // where -}) -``` - -{% /tab %} - -{% /tabs %} - -`.get` also supports the `where` and `where_document` filters. If no `ids` are supplied, it will return all items in the collection that match the `where` and `where_document` filters. - -##### Choosing which data is returned - -When using get or query you can use the `include` parameter to specify which data you want returned - any of `embeddings`, `documents`, `metadatas`, and for query, `distances`. By default, Chroma will return the `documents`, `metadatas` and in the case of query, the `distances` of the results. `embeddings` are excluded by default for performance and the `ids` are always returned. You can specify which of these you want returned by passing an array of included field names to the `include` parameter of the query or get method. Note that embeddings will be returned as a 2-d numpy array in `.get` and a python list of 2-d numpy arrays in `.query`. - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -```python -# Only get documents and ids -collection.get( - include=["documents"] -) - -collection.query( - query_embeddings=[[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...], - include=["documents"] -) -``` - -{% /tab %} -{% tab label="Javascript" %} - -```javascript -// Only get documents and ids -collection.get( - {include: ["documents"]} -) - -collection.query({ - queryEmbeddings: [[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...], - include: ["documents"] -}) -``` - -{% /tab %} - -{% /tabs %} - -### Using Where filters - -Chroma supports filtering queries by `metadata` and `document` contents. The `where` filter is used to filter by `metadata`, and the `where_document` filter is used to filter by `document` contents. - -##### Filtering by metadata - -In order to filter on metadata, you must supply a `where` filter dictionary to the query. 
The dictionary must have the following structure: - -```python -{ - "metadata_field": { - <Operator>: <Value> - } -} -``` - -Filtering metadata supports the following operators: - -- `$eq` - equal to (string, int, float) -- `$ne` - not equal to (string, int, float) -- `$gt` - greater than (int, float) -- `$gte` - greater than or equal to (int, float) -- `$lt` - less than (int, float) -- `$lte` - less than or equal to (int, float) - -Using the `$eq` operator is equivalent to passing the value directly in the `where` filter. - -```python -{ - "metadata_field": "search_string" -} - -# is equivalent to - -{ - "metadata_field": { - "$eq": "search_string" - } -} -``` - -{% note type="note" %} -Where filters only search embeddings where the key exists. If you search `collection.get(where={"version": {"$ne": 1}})`, metadata that does not have the key `version` will not be returned. -{% /note %} - -##### Filtering by document contents - -In order to filter on document contents, you must supply a `where_document` filter dictionary to the query. We support two filtering keys: `$contains` and `$not_contains`. The dictionary must have the following structure: - -```python -# Filtering for a search_string -{ - "$contains": "search_string" -} -``` - -```python -# Filtering for not contains -{ - "$not_contains": "search_string" -} -``` - -##### Using logical operators - -You can also use the logical operators `$and` and `$or` to combine multiple filters. - -An `$and` operator will return results that match all of the filters in the list. - -```python -{ - "$and": [ - { - "metadata_field": { - <Operator>: <Value> - } - }, - { - "metadata_field": { - <Operator>: <Value> - } - } - ] -} -``` - -An `$or` operator will return results that match any of the filters in the list. 
- -```python -{ - "$or": [ - { - "metadata_field": { - <Operator>: <Value> - } - }, - { - "metadata_field": { - <Operator>: <Value> - } - } - ] -} -``` - -##### Using inclusion operators (`$in` and `$nin`) - -The following inclusion operators are supported: - -- `$in` - a value is in predefined list (string, int, float, bool) -- `$nin` - a value is not in predefined list (string, int, float, bool) - -An `$in` operator will return results where the metadata attribute is part of a provided list: - -```json -{ - "metadata_field": { - "$in": ["value1", "value2", "value3"] - } -} -``` - -An `$nin` operator will return results where the metadata attribute is not part of a provided list: - -```json -{ - "metadata_field": { - "$nin": ["value1", "value2", "value3"] - } -} -``` - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -{% note type="note" title="Practical examples" %} -For additional examples and a demo of how to use the inclusion operators, please see the provided notebook [here](https://github.com/chroma-core/chroma/blob/main/examples/basic_functionality/in_not_in_filtering.ipynb) -{% /note %} - -{% /tab %} -{% tab label="Javascript" %} -{% /tab %} - -{% /tabs %} - -### Updating data in a collection - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -Any property of records in a collection can be updated using `.update`. - -```python -collection.update( - ids=["id1", "id2", "id3", ...], - embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...], - metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...], - documents=["doc1", "doc2", "doc3", ...], -) -``` - -{% /tab %} -{% tab label="Javascript" %} - -Any property of records in a collection can be updated using `.update`. 
- -```javascript -collection.update( - { - ids: ["id1", "id2", "id3", ...], - embeddings: [[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...], - metadatas: [{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...], - documents: ["doc1", "doc2", "doc3", ...], - }, -) -``` - -{% /tab %} - -{% /tabs %} - -If an `id` is not found in the collection, an error will be logged and the update will be ignored. If `documents` are supplied without corresponding `embeddings`, the embeddings will be recomputed with the collection's embedding function. - -If the supplied `embeddings` are not the same dimension as the collection, an exception will be raised. - -Chroma also supports an `upsert` operation, which updates existing items, or adds them if they don't yet exist. - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -```python -collection.upsert( - ids=["id1", "id2", "id3", ...], - embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...], - metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...], - documents=["doc1", "doc2", "doc3", ...], -) -``` - -{% /tab %} -{% tab label="Javascript" %} - -```javascript -await collection.upsert({ - ids: ["id1", "id2", "id3"], - embeddings: [ - [1.1, 2.3, 3.2], - [4.5, 6.9, 4.4], - [1.1, 2.3, 3.2], - ], - metadatas: [ - { chapter: "3", verse: "16" }, - { chapter: "3", verse: "5" }, - { chapter: "29", verse: "11" }, - ], - documents: ["doc1", "doc2", "doc3"], -}); -``` - -{% /tab %} - -{% /tabs %} - -If an `id` is not present in the collection, the corresponding items will be created as per `add`. Items with existing `id`s will be updated as per `update`. - -### Deleting data from a collection - -Chroma supports deleting items from a collection by `id` using `.delete`. The embeddings, documents, and metadata associated with each item will be deleted. 
-⚠️ Naturally, this is a destructive operation, and cannot be undone. - -{% tabs group="code-lang" hideTabs=true %} -{% tab label="Python" %} - -```python -collection.delete( - ids=["id1", "id2", "id3",...], - where={"chapter": "20"} -) -``` - -{% /tab %} -{% tab label="Javascript" %} - -```javascript -await collection.delete({ - ids: ["id1", "id2", "id3",...], //ids - where: {"chapter": "20"} //where -}) -``` - -{% /tab %} - -{% /tabs %} - -`.delete` also supports the `where` filter. If no `ids` are supplied, it will delete all items in the collection that match the `where` filter. diff --git a/docs/docs.trychroma.com/markdoc/content/reference/python/client.md b/docs/docs.trychroma.com/markdoc/content/reference/python/client.md index 1584a28d6df..7b47ab1a9ce 100644 --- a/docs/docs.trychroma.com/markdoc/content/reference/python/client.md +++ b/docs/docs.trychroma.com/markdoc/content/reference/python/client.md @@ -159,7 +159,7 @@ Used to check if the server is alive. - `int` - The current time in nanoseconds since epoch -## count\_collections +## count_collections ```python def count_collections() -> int @@ -179,7 +179,7 @@ client.count_collections() # 1 ``` -## delete\_collection +## delete_collection ```python def delete_collection(name: str) -> None @@ -215,7 +215,7 @@ Resets the database. This will delete all collections and entries. - `bool` - True if the database was reset successfully. -## get\_version +## get_version ```python def get_version() -> str @@ -227,7 +227,7 @@ Get the version of Chroma. - `str` - The version of Chroma -## get\_settings +## get_settings ```python def get_settings() -> Settings @@ -239,7 +239,7 @@ Get the settings used to initialize. - `Settings` - The settings used to initialize. 
-## get\_max\_batch\_size +## get_max_batch_size ```python def get_max_batch_size() -> int @@ -255,7 +255,7 @@ Return the maximum number of records that can be created or mutated in a single class ClientAPI(BaseAPI, ABC) ``` -## list\_collections +## list_collections ```python def list_collections(limit: Optional[int] = None, @@ -366,7 +366,7 @@ Get a collection with the given name. # collection(name="my_collection", metadata={}) ``` -## get\_or\_create\_collection +## get_or_create_collection ```python def get_or_create_collection( diff --git a/docs/docs.trychroma.com/next.config.mjs b/docs/docs.trychroma.com/next.config.mjs index 9aca8484266..295262aefae 100644 --- a/docs/docs.trychroma.com/next.config.mjs +++ b/docs/docs.trychroma.com/next.config.mjs @@ -1,6 +1,6 @@ /** @type {import('next').NextConfig} */ const nextConfig = { - webpack(config) { + webpack(config, { isServer }) { config.module.rules.push({ test: /\.svg$/, use: [ @@ -12,8 +12,15 @@ const nextConfig = { }, ], }); + + config.externals = [ + ...(config.externals || []), + "@xenova/transformers", + "chromadb", + ]; + return config; - } + }, }; export default nextConfig; diff --git a/docs/docs.trychroma.com/package.json b/docs/docs.trychroma.com/package.json index b19d75c6cf5..648c755afcc 100644 --- a/docs/docs.trychroma.com/package.json +++ b/docs/docs.trychroma.com/package.json @@ -6,15 +6,17 @@ "start": "next start", "index": "node scripts/indexContent.mjs", "gen-js": "sh scripts/jsDocs.sh", - "gen-python": "sh scripts/pythonDocs.sh" + "gen-python": "sh scripts/pythonDocs.sh", + "ingest-docs": "tsx search/ingest-docs.ts" }, "dependencies": { "@docsearch/react": "^3.8.0", "@heroicons/react": "^2.2.0", + "@langchain/textsplitters": "^0.1.0", "@markdoc/markdoc": "latest", "@markdoc/next.js": "^0.3.7", "@matejmazur/react-katex": "^3.1.3", - "@radix-ui/react-dialog": "^1.1.2", + "@radix-ui/react-dialog": "^1.1.4", "@radix-ui/react-dropdown-menu": "^2.0.6", "@radix-ui/react-icons": "^1.3.0", 
"@radix-ui/react-menubar": "^1.0.4", @@ -27,13 +29,18 @@ "@sindresorhus/slugify": "^2.2.1", "@svgr/webpack": "^8.1.0", "@tailwindcss/typography": "^0.5.15", + "@xenova/transformers": "^2.17.2", "algoliasearch": "^5.17.1", + "chromadb": "^1.9.4", + "chromadb-default-embed": "^2.13.2", "class-variance-authority": "^0.7.0", "clsx": "^2.1.0", + "dotenv": "^16.4.7", "flexsearch": "^0.7.43", "gray-matter": "^4.0.3", "image-size": "^1.1.1", "katex": "^0.16.9", + "lodash": "^4.17.21", "lucide": "^0.464.0", "lucide-react": "^0.464.0", "next": "^14.2.13", @@ -50,10 +57,12 @@ "tailwind-merge": "^2.2.1", "tailwindcss-animate": "^1.0.7", "unified": "^11.0.5", + "uuid": "^11.0.3", "vaul": "^1.1.1" }, "devDependencies": { "@types/katex": "^0.16.7", + "@types/lodash": "^4.17.13", "@types/node": "latest", "@types/prismjs": "^1.26.5", "@types/react": "latest", @@ -63,6 +72,8 @@ "postcss": "^8.4.35", "prettier": "^3.4.1", "tailwindcss": "^3.4.1", + "ts-node": "^10.9.2", + "tsx": "^4.19.2", "typedoc": "^0.24.7", "typedoc-plugin-markdown": "^3.15.3", "typescript": "latest" diff --git a/docs/docs.trychroma.com/public/chroma-icon.svg b/docs/docs.trychroma.com/public/chroma-icon.svg new file mode 100644 index 00000000000..fc07764e435 --- /dev/null +++ b/docs/docs.trychroma.com/public/chroma-icon.svg @@ -0,0 +1,59 @@ + + + + + Chroma + + + + + + diff --git a/docs/docs.trychroma.com/search/ingest-docs.ts b/docs/docs.trychroma.com/search/ingest-docs.ts new file mode 100644 index 00000000000..fb1e8059604 --- /dev/null +++ b/docs/docs.trychroma.com/search/ingest-docs.ts @@ -0,0 +1,212 @@ +import { promises as fs } from "fs"; +import path from "path"; +import { fileURLToPath } from "url"; +import { ChromaClient } from "chromadb"; +import "dotenv/config"; +// @ts-ignore +import { Collection } from "chromadb/src/Collection"; +import { + RecursiveCharacterTextSplitter, + TokenTextSplitter, +} from "@langchain/textsplitters"; +import { v4 as uuidv4 } from "uuid"; + +const __filename = 
fileURLToPath(import.meta.url);
// The line above completes `const __filename = ...`, giving this ES module the
// CommonJS-style `__filename` / `__dirname` pair.
const __dirname = path.dirname(__filename);

/** A header-delimited slice of a markdown page. */
type Section = { title: string; content: string };

/** A piece of text ready for embedding, tagged with the title of its section. */
type TitledChunk = { chunk: string; title: string };

/**
 * Splits markdown into sections, starting a new section at every level-1 or
 * level-2 ATX header (`# ...` / `## ...`).
 *
 * - Header-looking lines inside fenced code blocks (``` or ~~~) are ignored,
 *   so e.g. `# comment` lines in Python samples do not start a section.
 * - Lines that appear before the first header are folded into the first
 *   section.
 * - A document without any level-1/2 header yields no sections at all.
 *
 * @param markdownContent Raw markdown; CRLF line endings are normalised to LF.
 * @returns The sections in document order. Each `content` includes its own
 *   header line and is trimmed.
 */
const splitMarkdownByHeaders = (markdownContent: string): Section[] => {
  const lines = markdownContent.replace(/\r\n/g, "\n").split("\n");

  const sections: Section[] = [];
  let currentSection: string[] = [];
  let currentTitle = "";
  let hasStarted = false;
  // The opening fence run (e.g. "```" or "~~~~") while inside a fenced code
  // block, otherwise null.
  let openFence: string | null = null;

  const addCurrentSection = () => {
    // Nothing is flushed (or cleared) until the first header has been seen;
    // this is what folds any preamble into the first section.
    if (currentSection.length > 0 && hasStarted) {
      sections.push({
        content: currentSection.join("\n").trim(),
        title: currentTitle,
      });
      currentSection = [];
    }
  };

  for (const line of lines) {
    const fenceRun = line.match(/^(`{3,}|~{3,})/)?.[1];

    if (openFence !== null) {
      // A fence is closed only by a run of the same character that is at
      // least as long as the opening run and carries no info string.
      // Anything else, including shorter or different fences, is code.
      const closesFence =
        fenceRun !== undefined &&
        fenceRun[0] === openFence[0] &&
        fenceRun.length >= openFence.length &&
        line.slice(fenceRun.length).trim() === "";
      if (closesFence) {
        openFence = null;
      }
      currentSection.push(line);
      continue;
    }

    if (fenceRun !== undefined) {
      openFence = fenceRun;
      currentSection.push(line);
      continue;
    }

    const headerText = line.match(/^#{1,2}\s+(.+)$/)?.[1];
    if (headerText !== undefined) {
      addCurrentSection();
      currentTitle = headerText.trim();
      hasStarted = true;
    }
    currentSection.push(line);
  }

  addCurrentSection();
  return sections;
};

/**
 * Strips Markdoc tags (`{% ... %}`), splits the page into header-delimited
 * sections and then splits every section with a
 * `RecursiveCharacterTextSplitter`.
 *
 * @param inputData Raw Markdoc/markdown page content.
 * @param chunkSize Forwarded to the splitter as `chunkSize`.
 * @param chunkOverlap Forwarded to the splitter as `chunkOverlap`.
 * @returns Every chunk paired with the title of the section it came from.
 */
export const recursiveChunker = async (
  inputData: string,
  chunkSize: number,
  chunkOverlap: number,
): Promise<{ chunk: string; title: string }[]> => {
  const markdocTagPattern = /{%\s.*?\s%}/g;
  const data = inputData.replace(markdocTagPattern, "");

  const sections = splitMarkdownByHeaders(data);
  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize,
    chunkOverlap,
    separators: ["\n\n", "\n", ".", " ", ""],
  });

  const results: TitledChunk[] = [];
  for (const { title, content } of sections) {
    const sectionChunks = await splitter.splitText(content);
    for (const c of sectionChunks) {
      results.push({ chunk: c, title });
    }
  }

  return results;
};

/**
 * Re-splits a single chunk with a `TokenTextSplitter`, keeping its title.
 *
 * @param data The chunk to split further.
 * @param chunkSize Forwarded to the splitter as `chunkSize`.
 * @param chunkOverlap Forwarded to the splitter as `chunkOverlap`.
 * @returns The resulting pieces, each carrying `data.title`.
 */
const tokenChunker = async (
  data: { chunk: string; title: string },
  chunkSize: number,
  chunkOverlap: number,
): Promise<TitledChunk[]> => {
  const splitter = new TokenTextSplitter({ chunkSize, chunkOverlap });
  const pieces = await splitter.splitText(data.chunk);
  return pieces.map((chunk) => ({ chunk, title: data.title }));
};

/**
 * Chunks one markdown file and adds the chunks to `collection`.
 *
 * Failures are logged and swallowed so that one bad file does not abort the
 * whole ingestion run.
 *
 * @param collection Target Chroma collection.
 * @param filePath Path of the markdown file on disk.
 * @param pageLink Site-relative link of the page, e.g. `/section/page` or
 *   `/section/subsection/page`; its segments become chunk metadata.
 */
const ingestDocs = async (
  collection: Collection,
  filePath: string,
  pageLink: string,
) => {
  try {
    const content = await fs.readFile(filePath, "utf8");

    // The first "# ..." line is used as the page title.
    const match = content.match(/^# (.+)/m);
    const pageTitle = match ? match[1].trim() : "Chroma Docs";

    const splitTexts = await recursiveChunker(content, 1000, 100);
    const tokenSplitTexts: TitledChunk[] = [];
    for (const chunk of splitTexts) {
      tokenSplitTexts.push(...(await tokenChunker(chunk, 256, 0)));
    }

    // Pages without a level-1/2 header (or without any text) produce no
    // chunks; skip them instead of sending an empty batch.
    if (tokenSplitTexts.length === 0) {
      console.warn("Skipping file with no indexable content:", pageLink);
      return;
    }

    // Named `segments` rather than `path` so the `path` module is not shadowed.
    const segments = pageLink.split("/").slice(1);
    const hasSubsection = segments.length === 3;

    await collection.add({
      ids: tokenSplitTexts.map(() => uuidv4()),
      documents: tokenSplitTexts.map((chunk) => chunk.chunk),
      metadatas: tokenSplitTexts.map((chunk) => ({
        section: segments[0],
        // Only present for three-segment links; omitted otherwise rather
        // than being set to `undefined`.
        ...(hasSubsection ? { subsection: segments[1] } : {}),
        page: hasSubsection ? segments[2] : segments[1],
        title: chunk.title,
        pageTitle,
      })),
    });
  } catch (err) {
    console.error("Error ingesting file:", `${pageLink}\n${err}`);
  }
};

/**
 * Recursively collects every `.md` file below `dir`.
 *
 * @param dir Directory to scan.
 * @param allFiles Accumulator; matching file paths are appended to it.
 */
const collectMarkdownFiles = async (
  dir: string,
  allFiles: string[],
): Promise<void> => {
  const entries = await fs.readdir(dir, { withFileTypes: true });

  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);

    if (entry.isDirectory()) {
      await collectMarkdownFiles(fullPath, allFiles);
    } else if (entry.isFile() && entry.name.endsWith(".md")) {
      allFiles.push(fullPath);
    }
  }
};

/**
 * Converts a markdown file path into its site-relative page link:
 * `<contentDir>/a/b.md` becomes `/a/b`. Only the trailing `.md` is removed
 * and the platform path separator is normalised to `/`.
 */
const toPageLink = (contentDir: string, filePath: string): string => {
  const relativePath = path.relative(contentDir, filePath);
  const withoutExtension = relativePath.endsWith(".md")
    ? relativePath.slice(0, -".md".length)
    : relativePath;
  return `/${withoutExtension.split(path.sep).join("/")}`;
};

/**
 * Ingests every markdown page under `markdoc/content` into the
 * `docs-content` collection of the `docs` database, one file at a time.
 * Credentials are read from `CHROMA_CLOUD_API_KEY` and `CHROMA_CLOUD_TENANT`.
 */
const main = async (): Promise<void> => {
  const markdocContentDir = path.join(__dirname, "..", "markdoc", "content");
  const allMarkdowns: string[] = [];
  await collectMarkdownFiles(markdocContentDir, allMarkdowns);

  const chromaClient = new ChromaClient({
    path: "https://api.trychroma.com:8000",
    auth: {
      provider: "token",
      credentials: process.env.CHROMA_CLOUD_API_KEY,
      tokenHeaderType: "X_CHROMA_TOKEN",
    },
    tenant: process.env.CHROMA_CLOUD_TENANT,
    database: "docs",
  });

  const collection: Collection = await chromaClient.getOrCreateCollection({
    name: "docs-content",
  });

  for (const doc of allMarkdowns) {
    await ingestDocs(collection, doc, toPageLink(markdocContentDir, doc));
  }
};

/**
 * Reports whether this module is the script Node was started with.
 *
 * Comparing `import.meta.url` with `file://${__filename}` cannot answer this:
 * `__filename` is derived from `import.meta.url`, so that comparison also
 * holds when the module is merely imported. The entry script from
 * `process.argv[1]` is compared instead, with symlinks resolved on both sides.
 */
const isRunAsScript = async (): Promise<boolean> => {
  const entry = process.argv[1];
  if (entry === undefined) {
    return false;
  }
  try {
    const [entryPath, modulePath] = await Promise.all([
      fs.realpath(entry),
      fs.realpath(__filename),
    ]);
    return entryPath === modulePath;
  } catch {
    // The entry point is not a resolvable file, so it is not this module.
    return false;
  }
};

// Promise chain instead of top-level await, so this works whichever module
// format the runner compiles the file to.
isRunAsScript()
  .then((direct) => (direct ? main() : undefined))
  .catch((err) => console.error("Error:", err));

export default main;