From 15d6b72a028c878e74c894939f7e3b171bef0505 Mon Sep 17 00:00:00 2001
From: fzowl
Date: Sun, 22 Dec 2024 15:53:26 +0100
Subject: [PATCH] Adding VoyageAI text embedding integration

---
 chromadb/test/ef/test_ef.py                   |  1 +
 .../voyageai_embedding_function.py            | 37 ++++++++
 clients/js/package.json                       |  8 +-
 .../embeddings/VoyageAIEmbeddingFunction.ts   | 91 +++++++++++++++++++
 clients/js/test/add.collections.test.ts       | 25 ++++++
 .../integrations/embedding-models/voyageai.md | 86 ++++++++++++++++++
 6 files changed, 246 insertions(+), 2 deletions(-)
 create mode 100644 chromadb/utils/embedding_functions/voyageai_embedding_function.py
 create mode 100644 clients/js/src/embeddings/VoyageAIEmbeddingFunction.ts
 create mode 100644 docs/docs.trychroma.com/markdoc/content/integrations/embedding-models/voyageai.md

diff --git a/chromadb/test/ef/test_ef.py b/chromadb/test/ef/test_ef.py
index c93502e3fc8..603adf5d994 100644
--- a/chromadb/test/ef/test_ef.py
+++ b/chromadb/test/ef/test_ef.py
@@ -15,6 +15,7 @@ def test_get_builtins_holds() -> None:
     expected_builtins = {
         "AmazonBedrockEmbeddingFunction",
         "CohereEmbeddingFunction",
+        "VoyageAIEmbeddingFunction",
         "GoogleGenerativeAiEmbeddingFunction",
         "GooglePalmEmbeddingFunction",
         "GoogleVertexEmbeddingFunction",
diff --git a/chromadb/utils/embedding_functions/voyageai_embedding_function.py b/chromadb/utils/embedding_functions/voyageai_embedding_function.py
new file mode 100644
index 00000000000..65b553f5932
--- /dev/null
+++ b/chromadb/utils/embedding_functions/voyageai_embedding_function.py
@@ -0,0 +1,37 @@
+import logging
+
+from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
+
+logger = logging.getLogger(__name__)
+
+
+class VoyageAIEmbeddingFunction(EmbeddingFunction[Documents]):
+    """Embedding function that calls the hosted VoyageAI text embedding API."""
+
+    def __init__(self, api_key: str, model_name: str):
+        """
+        Create the VoyageAI client used by this embedding function.
+
+        Args:
+            api_key: VoyageAI API key passed to `voyageai.Client`.
+            model_name: Name of the VoyageAI embedding model to request.
+
+        Raises:
+            ValueError: If the `voyageai` python package is not installed.
+        """
+        try:
+            import voyageai
+        except ImportError:
+            raise ValueError(
+                "The voyageai python package is not installed. Please install it with `pip install voyageai`"
+            )
+
+        self._client = voyageai.Client(api_key=api_key)
+        self._model_name = model_name
+
+    def __call__(self, input: Documents) -> Embeddings:
+        """Embed all documents in `input` with one call to the VoyageAI API."""
+        # `embed` returns a response object rather than a list; the vectors
+        # are exposed on its `embeddings` attribute.
+        response = self._client.embed(texts=input, model=self._model_name)
+        return [embedding for embedding in response.embeddings]
diff --git a/clients/js/package.json b/clients/js/package.json
index 72655607578..601c2f78580 100644
--- a/clients/js/package.json
+++ b/clients/js/package.json
@@ -9,9 +9,9 @@
     "@jest/globals": "^29.7.0",
     "@jest/types": "^29.6.3",
     "@openapi-generator-plus/typescript-fetch-client-generator": "^1.5.0",
+    "@types/bcrypt": "^5.0.2",
     "@types/jest": "^29.5.0",
     "@types/node": "^20.8.10",
-    "@types/bcrypt": "^5.0.2",
     "bcrypt": "^5.1.1",
     "jest": "^29.5.0",
     "npm-run-all": "^4.1.5",
@@ -69,7 +69,8 @@
   "peerDependencies": {
     "@google/generative-ai": "^0.1.1",
     "cohere-ai": "^5.0.0 || ^6.0.0 || ^7.0.0",
-    "openai": "^3.0.0 || ^4.0.0"
+    "openai": "^3.0.0 || ^4.0.0",
+    "voyageai": "^0.0.3-1"
   },
   "peerDependenciesMeta": {
     "@google/generative-ai": {
@@ -80,6 +81,9 @@
     },
     "openai": {
       "optional": true
+    },
+    "voyageai": {
+      "optional": true
     }
   }
 }
diff --git a/clients/js/src/embeddings/VoyageAIEmbeddingFunction.ts b/clients/js/src/embeddings/VoyageAIEmbeddingFunction.ts
new file mode 100644
index 00000000000..1d1e51e44ad
--- /dev/null
+++ b/clients/js/src/embeddings/VoyageAIEmbeddingFunction.ts
@@ -0,0 +1,91 @@
+import { IEmbeddingFunction } from "./IEmbeddingFunction";
+
+/** Subset of the VoyageAI embed response that this module reads. */
+interface VoyageAIEmbedResponse {
+  data?: Array<{ embedding?: number[] }>;
+}
+
+/** Subset of the `voyageai` SDK client that this module calls. */
+interface VoyageAIClientLike {
+  embed(request: {
+    input: string[];
+    model: string;
+  }): Promise<VoyageAIEmbedResponse>;
+}
+
+/** Subset of the `voyageai` module namespace that this module uses. */
+interface VoyageAIModuleLike {
+  VoyageAIClient: new (options: { apiKey: string }) => VoyageAIClientLike;
+}
+
+/**
+ * Embedding function that calls the hosted VoyageAI text embedding API.
+ *
+ * `voyageai` is an optional peer dependency, so the SDK is imported lazily on
+ * the first call to `generate` instead of when this module is loaded.
+ */
+export class VoyageAIEmbeddingFunction implements IEmbeddingFunction {
+  private client?: VoyageAIClientLike;
+  private readonly model: string;
+  private readonly apiKey: string;
+
+  /**
+   * @param api_key - VoyageAI API key handed to the SDK client.
+   * @param model - Name of the VoyageAI embedding model to request.
+   */
+  constructor({ api_key, model }: { api_key: string; model: string }) {
+    this.model = model;
+    this.apiKey = api_key;
+  }
+
+  /**
+   * Import the `voyageai` SDK on first use and cache the created client.
+   *
+   * @throws Error with install instructions when `voyageai` is not installed.
+   */
+  private async loadClient(): Promise<VoyageAIClientLike> {
+    if (this.client) return this.client;
+    try {
+      // @ts-ignore -- optional peer dependency; its types may be absent.
+      const sdk: VoyageAIModuleLike = await import("voyageai");
+      const client = new sdk.VoyageAIClient({ apiKey: this.apiKey });
+      this.client = client;
+      return client;
+    } catch (e: unknown) {
+      const code =
+        typeof e === "object" && e !== null
+          ? (e as { code?: unknown }).code
+          : undefined;
+      // `require` reports MODULE_NOT_FOUND, while a native ESM `import()`
+      // reports ERR_MODULE_NOT_FOUND; which one applies depends on how this
+      // file was compiled.
+      if (code === "MODULE_NOT_FOUND" || code === "ERR_MODULE_NOT_FOUND") {
+        throw new Error(
+          "Please install the voyageai package to use the VoyageAIEmbeddingFunction, `npm install -S voyageai`",
+        );
+      }
+      throw e;
+    }
+  }
+
+  /**
+   * Embed `texts` with a single request to the VoyageAI API.
+   *
+   * @param texts - Documents to embed.
+   * @returns One embedding vector per item returned by the API.
+   * @throws Error when the response does not contain embedding vectors.
+   */
+  public async generate(texts: string[]): Promise<number[][]> {
+    const client = await this.loadClient();
+    const response = await client.embed({ input: texts, model: this.model });
+    if (!response.data) {
+      throw new Error("VoyageAI embed response did not contain any data");
+    }
+    return response.data.map((item, index) => {
+      if (!item.embedding) {
+        throw new Error(`VoyageAI returned no embedding for input ${index}`);
+      }
+      return item.embedding;
+    });
+  }
+}
diff --git a/clients/js/test/add.collections.test.ts b/clients/js/test/add.collections.test.ts
index 837051b9daa..cfac1ba6119 100644
--- a/clients/js/test/add.collections.test.ts
+++ b/clients/js/test/add.collections.test.ts
@@ -12,6 +12,7 @@ import { IncludeEnum } from "../src/types";
 import { OpenAIEmbeddingFunction } from "../src/embeddings/OpenAIEmbeddingFunction";
 import { CohereEmbeddingFunction } from "../src/embeddings/CohereEmbeddingFunction";
 import { OllamaEmbeddingFunction } from "../src/embeddings/OllamaEmbeddingFunction";
+import { VoyageAIEmbeddingFunction } from "../src/embeddings/VoyageAIEmbeddingFunction";
 import { InvalidCollectionError } from "../src/Errors";
 import { ChromaClient } from "../src/ChromaClient";
 
@@ -150,6 +151,30 @@ describe("add collections", () => {
     });
   }
 
+  if (!process.env.VOYAGE_API_KEY) {
+    test.skip("it should add VoyageAI embeddings", async () => {});
+  } else {
+    test("it should add VoyageAI embeddings", async () => {
+      const embedder = new VoyageAIEmbeddingFunction({
+        api_key: process.env.VOYAGE_API_KEY || "",
+        model: "voyage-3-large",
+      });
+      const collection = await client.createCollection({
+        name: "test",
+        embeddingFunction: embedder,
+      });
+      const embeddings = await embedder.generate(DOCUMENTS);
+      await collection.add({ ids: IDS, embeddings: embeddings });
+      const count = await collection.count();
+      expect(count).toBe(3);
+      const res = await collection.get({
+        ids: IDS,
+        include: [IncludeEnum.Embeddings],
+      });
+      expect(res.embeddings).toEqual(embeddings);
+    });
+  }
+
   test("add documents", async () => {
     const collection = await client.createCollection({ name: "test" });
     await collection.add({
diff --git a/docs/docs.trychroma.com/markdoc/content/integrations/embedding-models/voyageai.md b/docs/docs.trychroma.com/markdoc/content/integrations/embedding-models/voyageai.md
new file mode 100644
index 00000000000..48594b3abeb
--- /dev/null
+++ b/docs/docs.trychroma.com/markdoc/content/integrations/embedding-models/voyageai.md
@@ -0,0 +1,86 @@
+---
+id: 'voyageai'
+name: 'VoyageAI'
+---
+
+# VoyageAI
+
+Chroma also provides a convenient wrapper around VoyageAI's embedding API. This embedding function runs remotely on VoyageAI's servers, and requires an API key. You can get an API key by signing up for an account at [VoyageAI](https://dash.voyageai.com/).
+
+{% Tabs %}
+{% Tab label="python" %}
+
+This embedding function relies on the `voyageai` python package, which you can install with `pip install voyageai`.
+
+```python
+import chromadb.utils.embedding_functions as embedding_functions
+voyageai_ef = embedding_functions.VoyageAIEmbeddingFunction(api_key="YOUR_API_KEY", model_name="voyage-3-large")
+voyageai_ef(input=["document1","document2"])
+```
+
+{% /Tab %}
+
+{% Tab label="typescript" %}
+
+```typescript
+import { VoyageAIEmbeddingFunction } from 'chromadb';
+
+const embedder = new VoyageAIEmbeddingFunction({ api_key: "apiKey", model: "model_name" })
+
+// use directly
+const embeddings = await embedder.generate(["document1","document2"])
+
+// pass documents to query for .add and .query
+const collection = await client.createCollection({name: "name", embeddingFunction: embedder})
+const collectionGet = await client.getCollection({name: "name", embeddingFunction: embedder})
+```
+
+{% /Tab %}
+
+{% /Tabs %}
+
+### Multilingual model example
+
+{% TabbedCodeBlock %}
+
+{% Tab label="python" %}
+
+```python
+voyageai_ef = embedding_functions.VoyageAIEmbeddingFunction(
+        api_key="YOUR_API_KEY",
+        model_name="voyage-3-large")
+
+multilingual_texts = [ 'Hello from VoyageAI!', 'مرحباً من VoyageAI!!',
+        'Hallo von VoyageAI!', 'Bonjour de VoyageAI!',
+        '¡Hola desde VoyageAI!', 'Olá do VoyageAI!',
+        'Ciao da VoyageAI!', '您好,来自 VoyageAI!',
+        'VoyageAI की ओर से नमस्ते!' ]
+
+voyageai_ef(input=multilingual_texts)
+
+```
+
+{% /Tab %}
+
+{% Tab label="typescript" %}
+
+```typescript
+import { VoyageAIEmbeddingFunction } from 'chromadb';
+
+const embedder = new VoyageAIEmbeddingFunction({ api_key: "apiKey", model: "voyage-3-large" })
+
+const multilingual_texts = [ 'Hello from VoyageAI!', 'مرحباً من VoyageAI!!',
+        'Hallo von VoyageAI!', 'Bonjour de VoyageAI!',
+        '¡Hola desde VoyageAI!', 'Olá do VoyageAI!',
+        'Ciao da VoyageAI!', '您好,来自 VoyageAI!',
+        'VoyageAI की ओर से नमस्ते!' ]
+
+const embeddings = await embedder.generate(multilingual_texts)
+
+```
+
+{% /Tab %}
+
+{% /TabbedCodeBlock %}
+
+For further details on VoyageAI's models check the [documentation](https://docs.voyageai.com/docs/introduction) and the [blogs](https://blog.voyageai.com/).