diff --git a/.yarnrc.yml b/.yarnrc.yml index 31a96a1bfb59..0352824903cf 100644 --- a/.yarnrc.yml +++ b/.yarnrc.yml @@ -1,5 +1,7 @@ nodeLinker: node-modules +npmRegistryServer: "https://npm.pkg.github.com/lemonade-hq" + plugins: - path: .yarn/plugins/@yarnpkg/plugin-typescript.cjs spec: "@yarnpkg/plugin-typescript" diff --git a/docs/docs/modules/indexes/vector_stores/integrations/opensearch.md b/docs/docs/modules/indexes/vector_stores/integrations/opensearch.md new file mode 100644 index 000000000000..32e86bec0472 --- /dev/null +++ b/docs/docs/modules/indexes/vector_stores/integrations/opensearch.md @@ -0,0 +1,103 @@ +# OpenSearch + +LangChain.js accepts [@opensearch-project/opensearch](https://opensearch.org/docs/latest/clients/javascript/index/) +as the client for the OpenSearch vector store. Install the client with: + +```bash npm2yarn +npm install -S dotenv langchain @opensearch-project/opensearch +``` + +## Index docs + +```typescript +import { Client } from "@opensearch-project/opensearch"; +import * as dotenv from "dotenv"; +import { Document } from "langchain/document"; +import { OpenAIEmbeddings } from "langchain/embeddings"; +import { OpenSearchVectorStore } from "langchain/vectorstores"; + +dotenv.config(); + +const client = new Client({ + nodes: [process.env.OPENSEARCH_URL ?? 
"http://127.0.0.1:9200"], +}); + +const docs = [ + new Document({ + metadata: { foo: "bar" }, + pageContent: "opensearch is also a vector db", + }), + new Document({ + metadata: { foo: "bar" }, + pageContent: "the quick brown fox jumped over the lazy dog", + }), + new Document({ + metadata: { baz: "qux" }, + pageContent: "lorem ipsum dolor sit amet", + }), + new Document({ + metadata: { baz: "qux" }, + pageContent: "OpenSearch is a scalable, flexible, and extensible open-source software suite for search, analytics, and observability applications", + }), +]; + +await OpenSearchVectorStore.fromDocuments(docs, new OpenAIEmbeddings(), { + client, + indexName: process.env.OPENSEARCH_INDEX, // Will default to `documents` +}); +``` + +## Query docs + +```typescript +import { Client } from "@opensearch-project/opensearch"; +import * as dotenv from "dotenv"; +import { VectorDBQAChain } from "langchain/chains"; +import { OpenAIEmbeddings } from "langchain/embeddings"; +import { OpenAI } from "langchain/llms"; +import { OpenSearchVectorStore } from "langchain/vectorstores"; + +dotenv.config(); + +const client = new Client({ + nodes: [process.env.OPENSEARCH_URL ?? "http://127.0.0.1:9200"], +}); + +const vectorStore = new OpenSearchVectorStore(new OpenAIEmbeddings(), { + client, +}); + +/* Search the vector DB independently with meta filters */ +const results = await vectorStore.similaritySearch("hello world", 1); +console.log(JSON.stringify(results, null, 2)); +/* [ + { + "pageContent": "Hello world", + "metadata": { + "id": 2 + } + } + ] */ + +/* Use as part of a chain (currently no metadata filters) */ +const model = new OpenAI(); +const chain = VectorDBQAChain.fromLLM(model, vectorStore, { + k: 1, + returnSourceDocuments: true, +}); +const response = await chain.call({ query: "What is opensearch?" 
}); +console.log(JSON.stringify(response, null, 2)); +/* + { + "text": " Opensearch is a collection of technologies that allow search engines to publish search results in a standard format, making it easier for users to search across multiple sites.", + "sourceDocuments": [ + { + "pageContent": "What's this?", + "metadata": { + "id": 3 + } + } + ] + } + */ +``` diff --git a/examples/.env.example b/examples/.env.example index d71b30773787..79a9b492110b 100644 --- a/examples/.env.example +++ b/examples/.env.example @@ -2,6 +2,7 @@ ANTHROPIC_API_KEY=ADD_YOURS_HERE COHERE_API_KEY=ADD_YOURS_HERE HUGGINGFACEHUB_API_KEY=ADD_YOURS_HERE OPENAI_API_KEY=ADD_YOURS_HERE +OPENSEARCH_URL=http://127.0.0.1:9200 PINECONE_API_KEY=ADD_YOURS_HERE PINECONE_ENVIRONMENT=ADD_YOURS_HERE PINECONE_INDEX=ADD_YOURS_HERE diff --git a/examples/package.json b/examples/package.json index 9db2dfe8ad60..2787fa080005 100644 --- a/examples/package.json +++ b/examples/package.json @@ -24,6 +24,7 @@ "dependencies": { "@dqbd/tiktoken": "^1.0.2", "@getmetal/metal-sdk": "^1.0.12", + "@opensearch-project/opensearch": "^2.2.0", "@pinecone-database/pinecone": "^0.0.10", "@prisma/client": "^4.11.0", "@supabase/supabase-js": "^2.10.0", diff --git a/examples/src/indexes/vector_stores/opensearch/docker-compose.yml b/examples/src/indexes/vector_stores/opensearch/docker-compose.yml new file mode 100644 index 000000000000..4278767bdf12 --- /dev/null +++ b/examples/src/indexes/vector_stores/opensearch/docker-compose.yml @@ -0,0 +1,42 @@ +# Reference: +# https://opensearch.org/docs/latest/install-and-configure/install-opensearch/docker/#sample-docker-composeyml +version: '3' +services: + opensearch: + image: opensearchproject/opensearch:2.6.0 + container_name: opensearch + environment: + - cluster.name=opensearch + - node.name=opensearch + - discovery.type=single-node + - bootstrap.memory_lock=true + - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" + - "DISABLE_INSTALL_DEMO_CONFIG=true" + - "DISABLE_SECURITY_PLUGIN=true" + 
ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    volumes:
+      - opensearch_data:/usr/share/opensearch/data
+    ports:
+      - 9200:9200
+      - 9600:9600
+    networks:
+      - opensearch
+  opensearch-dashboards:
+    image: opensearchproject/opensearch-dashboards:2.6.0 # Make sure the version of opensearch-dashboards matches the version of opensearch installed on other nodes
+    container_name: opensearch-dashboards
+    ports:
+      - 5601:5601 # Map host port 5601 to container port 5601
+    expose:
+      - "5601" # Expose port 5601 for web access to OpenSearch Dashboards
+    environment:
+      OPENSEARCH_HOSTS: '["http://opensearch:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query
+      DISABLE_SECURITY_DASHBOARDS_PLUGIN: "true" # disables security dashboards plugin in OpenSearch Dashboards
+    networks:
+      - opensearch
+networks:
+  opensearch:
+volumes:
+  opensearch_data:
\ No newline at end of file
diff --git a/examples/src/indexes/vector_stores/opensearch/opensearch.ts b/examples/src/indexes/vector_stores/opensearch/opensearch.ts
new file mode 100644
index 000000000000..377aa3db4e08
--- /dev/null
+++ b/examples/src/indexes/vector_stores/opensearch/opensearch.ts
@@ -0,0 +1,29 @@
+// Run with: yarn example src/indexes/vector_stores/opensearch/opensearch.ts
+
+import { Client } from "@opensearch-project/opensearch";
+import { OpenAIEmbeddings } from "langchain/embeddings";
+import { OpenSearchVectorStore } from "langchain/vectorstores";
+
+/**
+ * Indexes three sample texts into the `documents` index of the OpenSearch
+ * node at `OPENSEARCH_URL` (falling back to http://127.0.0.1:9200), then
+ * logs the single nearest match for the query "Hello world".
+ */
+export async function run() {
+  const client = new Client({
+    nodes: [process.env.OPENSEARCH_URL ?? "http://127.0.0.1:9200"],
+  });
+
+  const vectorStore = await OpenSearchVectorStore.fromTexts(
+    ["Hello world", "Bye bye", "What's this?"],
+    [{ id: 2 }, { id: 1 }, { id: 3 }],
+    new OpenAIEmbeddings(),
+    {
+      client,
+      indexName: "documents",
+    }
+  );
+
+  const resultOne = await vectorStore.similaritySearch("Hello world", 1);
+  console.log(resultOne);
+}
diff --git a/langchain/.env.example b/langchain/.env.example
index de2ed2d75d5a..5ce8ab1c483b 100644
--- a/langchain/.env.example
+++ b/langchain/.env.example
@@ -2,6 +2,7 @@ ANTHROPIC_API_KEY=ADD_YOURS_HERE
 COHERE_API_KEY=ADD_YOURS_HERE
 HUGGINGFACEHUB_API_KEY=ADD_YOURS_HERE
 OPENAI_API_KEY=ADD_YOURS_HERE
+OPENSEARCH_URL=http://127.0.0.1:9200
 PINECONE_API_KEY=ADD_YOURS_HERE
 PINECONE_ENVIRONMENT=ADD_YOURS_HERE
 PINECONE_INDEX=ADD_YOURS_HERE
diff --git a/langchain/package.json b/langchain/package.json
index c4e132bab837..6e84f6eae464 100644
--- a/langchain/package.json
+++ b/langchain/package.json
@@ -76,7 +76,7 @@
   ],
   "repository": {
     "type": "git",
-    "url": "git@github.com:hwchase17/langchainjs.git"
+    "url": "git@github.com:lemonade-hq/langchainjs.git"
   },
   "scripts": {
     "build": "yarn clean && yarn build:esm && yarn build:cjs && node scripts/create-entrypoints.js && node scripts/check-tree-shaking.js",
@@ -106,6 +106,7 @@
     "@getmetal/metal-sdk": "^1.0.12",
     "@huggingface/inference": "^1.5.1",
     "@jest/globals": "^29.5.0",
+    "@opensearch-project/opensearch": "^2.2.0",
     "@pinecone-database/pinecone": "^0.0.10",
     "@supabase/supabase-js": "^2.10.0",
     "@tsconfig/recommended": "^1.0.2",
@@ -153,6 +154,7 @@
     "@dqbd/tiktoken": "^1.0.2",
     "@getmetal/metal-sdk": "*",
     "@huggingface/inference": "^1.5.1",
+    "@opensearch-project/opensearch": "*",
     "@pinecone-database/pinecone": "^0.0.10",
     "@supabase/supabase-js": "^2.10.0",
     "cheerio": "^1.0.0-rc.12",
@@ -246,7 +248,8 @@
     "zod": "^3.21.4"
   },
   "publishConfig": {
-    "access": "public"
+    "access": "public",
+    "registry": "https://npm.pkg.github.com/lemonade-hq"
   },
   "keywords": [
     "llm",
@@ -370,4 
+373,4 @@
     },
     "./package.json": "./package.json"
   }
-}
+}
\ No newline at end of file
diff --git a/langchain/src/vectorstores/index.ts b/langchain/src/vectorstores/index.ts
index 5834faed9194..da678365032f 100644
--- a/langchain/src/vectorstores/index.ts
+++ b/langchain/src/vectorstores/index.ts
@@ -4,3 +4,4 @@ export { PineconeStore } from "./pinecone.js";
 export { VectorStore, SaveableVectorStore } from "./base.js";
 export { SupabaseVectorStore } from "./supabase.js";
 export { PrismaVectorStore } from "./prisma.js";
+export { OpenSearchVectorStore } from "./opensearch.js";
diff --git a/langchain/src/vectorstores/opensearch.ts b/langchain/src/vectorstores/opensearch.ts
new file mode 100644
index 000000000000..dbbe3b2b09bb
--- /dev/null
+++ b/langchain/src/vectorstores/opensearch.ts
@@ -0,0 +1,275 @@
+/* eslint-disable no-instanceof/no-instanceof */
+import { Embeddings } from "../embeddings/base.js";
+import { Client, RequestParams, errors } from "@opensearch-project/opensearch";
+import { v4 as uuid } from "uuid";
+import { Document } from "../document.js";
+import { VectorStore } from "./base.js";
+
+type OpenSearchEngine = "nmslib" | "hnsw";
+type OpenSearchSpaceType = "l2" | "cosinesimil" | "ip";
+
+/** Tuning knobs for the k-NN index; every field is optional. */
+interface VectorSearchOptions {
+  readonly engine?: OpenSearchEngine;
+  readonly spaceType?: OpenSearchSpaceType;
+  readonly m?: number;
+  readonly efConstruction?: number;
+  readonly efSearch?: number;
+}
+
+/** Constructor arguments for {@link OpenSearchVectorStore}. */
+export interface OpenSearchClientArgs {
+  readonly client: Client;
+  /** Target index; defaults to `documents`. */
+  readonly indexName?: string;
+
+  readonly vectorSearchOptions?: VectorSearchOptions;
+}
+
+/**
+ * Vector store backed by an OpenSearch k-NN index. Each document is stored
+ * with three fields: `embedding`, `metadata` and `text`.
+ */
+export class OpenSearchVectorStore extends VectorStore {
+  private readonly client: Client;
+
+  private readonly indexName: string;
+
+  private readonly engine: OpenSearchEngine;
+
+  private readonly spaceType: OpenSearchSpaceType;
+
+  private readonly efConstruction: number;
+
+  private readonly efSearch: number;
+
+  private readonly m: number;
+
+  /**
+   * @param embeddings Embedding model used by `addDocuments`.
+   * @param args Client, index name and optional k-NN settings. Unset options
+   *   fall back to: spaceType `l2`, engine `nmslib`, m 16, efConstruction
+   *   512, efSearch 512, indexName `documents`.
+   */
+  constructor(embeddings: Embeddings, args: OpenSearchClientArgs) {
+    super(embeddings, args);
+
+    this.spaceType = args.vectorSearchOptions?.spaceType ?? "l2";
+    this.engine = args.vectorSearchOptions?.engine ?? "nmslib";
+    this.m = args.vectorSearchOptions?.m ?? 16;
+    this.efConstruction = args.vectorSearchOptions?.efConstruction ?? 512;
+    this.efSearch = args.vectorSearchOptions?.efSearch ?? 512;
+
+    this.client = args.client;
+    this.indexName = args.indexName ?? "documents";
+  }
+
+  /** Embeds each document's `pageContent` and indexes the results. */
+  async addDocuments(documents: Document[]): Promise<void> {
+    const texts = documents.map(({ pageContent }) => pageContent);
+    return this.addVectors(
+      await this.embeddings.embedDocuments(texts),
+      documents
+    );
+  }
+
+  /**
+   * Bulk-indexes precomputed vectors alongside their documents, creating the
+   * index on first use, then refreshes the index.
+   *
+   * @param vectors One embedding per document, in the same order.
+   * @param documents Source documents; `metadata` and `pageContent` are stored.
+   */
+  async addVectors(vectors: number[][], documents: Document[]): Promise<void> {
+    // Nothing to index: `vectors[0]` would be undefined below and there is no
+    // dimension to create the index with.
+    if (vectors.length === 0) return;
+
+    await this.ensureIndexExists(
+      vectors[0].length,
+      this.engine,
+      this.spaceType,
+      this.efSearch,
+      this.efConstruction,
+      this.m
+    );
+    // The bulk body alternates an action line with the document it applies to.
+    const operations = vectors.flatMap((embedding, idx) => [
+      {
+        index: {
+          _index: this.indexName,
+          _id: uuid(),
+        },
+      },
+      {
+        embedding,
+        metadata: documents[idx].metadata,
+        text: documents[idx].pageContent,
+      },
+    ]);
+    await this.client.bulk({ body: operations });
+    await this.client.indices.refresh({ index: this.indexName });
+  }
+
+  /**
+   * Runs a k-NN query on the `embedding` field, optionally restricted by
+   * exact-match terms on metadata keys.
+   *
+   * @param query Query embedding.
+   * @param k Number of neighbours requested; also used as the result size.
+   * @param filter Optional object whose entries become `metadata.<key>` terms.
+   * @returns `[document, score]` pairs in the order OpenSearch returned them.
+   */
+  async similaritySearchVectorWithScore(
+    query: number[],
+    k: number,
+    filter?: object | undefined
+  ): Promise<[Document, number][]> {
+    const search: RequestParams.Search = {
+      index: this.indexName,
+      body: {
+        query: {
+          bool: {
+            filter: { bool: { must: this.buildMetadataTerms(filter) } },
+            must: [
+              {
+                knn: {
+                  embedding: { vector: query, k },
+                },
+              },
+            ],
+          },
+        },
+        size: k,
+      },
+    };
+
+    const { body } = await this.client.search(search);
+
+    return body.hits.hits.map((hit: any) => [
+      new Document({
+        pageContent: hit._source.text,
+        metadata: hit._source.metadata,
+      }),
+      hit._score,
+    ]);
+  }
+
+  /**
+   * Builds a store from raw texts. `metadatas` is either one object per text
+   * or a single object shared by every text.
+   */
+  static fromTexts(
+    texts: string[],
+    metadatas: object[] | object,
+    embeddings: Embeddings,
+    args: OpenSearchClientArgs
+  ): Promise<OpenSearchVectorStore> {
+    const documents = texts.map((text, idx) => {
+      const metadata = Array.isArray(metadatas) ? metadatas[idx] : metadatas;
+      return new Document({ pageContent: text, metadata });
+    });
+
+    return OpenSearchVectorStore.fromDocuments(documents, embeddings, args);
+  }
+
+  /** Creates a store and indexes `docs` into it before resolving. */
+  static async fromDocuments(
+    docs: Document[],
+    embeddings: Embeddings,
+    dbConfig: OpenSearchClientArgs
+  ): Promise<OpenSearchVectorStore> {
+    const store = new OpenSearchVectorStore(embeddings, dbConfig);
+    await store.addDocuments(docs);
+    return store;
+  }
+
+  /**
+   * Creates the k-NN index with the given vector dimension and HNSW
+   * parameters unless it already exists.
+   */
+  private async ensureIndexExists(
+    dimension: number,
+    engine = "nmslib",
+    spaceType = "l2",
+    efSearch = 512,
+    efConstruction = 512,
+    m = 16
+  ): Promise<void> {
+    // Check first so the settings object is only built when it is needed.
+    const indexExists = await this.doesIndexExist();
+    if (indexExists) return;
+
+    const body = {
+      settings: {
+        index: {
+          number_of_shards: 5,
+          number_of_replicas: 1,
+          knn: true,
+          "knn.algo_param.ef_search": efSearch,
+        },
+      },
+      mappings: {
+        dynamic_templates: [
+          {
+            // map all metadata properties to be keyword
+            "metadata.*": {
+              match_mapping_type: "*",
+              mapping: { type: "keyword" },
+            },
+          },
+        ],
+        properties: {
+          text: { type: "text" },
+          metadata: { type: "object" },
+          embedding: {
+            type: "knn_vector",
+            dimension,
+            method: {
+              name: "hnsw",
+              engine,
+              space_type: spaceType,
+              parameters: { ef_construction: efConstruction, m },
+            },
+          },
+        },
+      },
+    };
+
+    await this.client.indices.create({ index: this.indexName, body });
+  }
+
+  /** Turns each filter entry into a `term` clause on `metadata.<key>`. */
+  private buildMetadataTerms(
+    filter?: object
+  ): { term: Record<string, unknown> }[] {
+    if (filter == null) return [];
+    return Object.entries(filter).map(([key, value]) => ({
+      term: { [`metadata.${key}`]: value },
+    }));
+  }
+
+  /**
+   * Resolves `true` when the index is listed by the cat-indices API and
+   * `false` on a 404 response; any other error is rethrown.
+   */
+  async doesIndexExist(): Promise<boolean> {
+    try {
+      await this.client.cat.indices({ index: this.indexName });
+      return true;
+    } catch (err: unknown) {
+      if (err instanceof errors.ResponseError && err.statusCode === 404) {
+        return false;
+      }
+      throw err;
+    }
+  }
+
+  /** Deletes the index when it exists; does nothing otherwise. */
+  async deleteIfExists(): Promise<void> {
+    const indexExists = await this.doesIndexExist();
+    if (!indexExists) return;
+
+    await this.client.indices.delete({ index: this.indexName });
+  }
+}
diff --git a/langchain/src/vectorstores/tests/opensearch.int.test.ts b/langchain/src/vectorstores/tests/opensearch.int.test.ts
new file mode 100644
index 000000000000..5fc311ff66db
--- /dev/null
+++ b/langchain/src/vectorstores/tests/opensearch.int.test.ts
@@ -0,0 +1,42 @@
+/* eslint-disable no-process-env */
+import { test, expect } from "@jest/globals";
+import { Client } from "@opensearch-project/opensearch";
+import { OpenAIEmbeddings } from "../../embeddings/index.js";
+import { OpenSearchVectorStore } from "../opensearch.js";
+import { Document } from "../../document.js";
+
+test("OpenSearchVectorStore integration", async () => {
+  const client = new Client({
+    nodes: [process.env.OPENSEARCH_URL!],
+  });
+
+  const indexName = "test_index";
+
+  const embeddings = new OpenAIEmbeddings(undefined, {
+    baseOptions: { temperature: 0 },
+  });
+  const store = new OpenSearchVectorStore(embeddings, { client, indexName });
+  await store.deleteIfExists();
+
+  expect(store).toBeDefined();
+
+  await store.addDocuments([
+    { pageContent: "hello", metadata: { a: 2 } },
+    { pageContent: "car", metadata: { a: 1 } },
+    { pageContent: "adjective", metadata: { a: 1 } },
+    { pageContent: "hi", metadata: { a: 1 } },
+  ]);
+
+  const results1 = await store.similaritySearch("hello!", 1);
+
+  expect(results1).toHaveLength(1);
+  expect(results1).toEqual([
+    new Document({ metadata: { a: 2 }, pageContent: "hello" }),
+  ]);
+
+  const results2 = await store.similaritySearchWithScore("hello!", 1, {
+    a: 1,
+  });
+
+  expect(results2).toHaveLength(1);
+});
diff --git a/package.json b/package.json
index 5c58b84827f3..734454ce66f3 100644
--- a/package.json
+++ b/package.json
@@ -14,7 +14,7 @@
   ],
   "repository": {
     "type": "git",
-    "url": "https://github.com/hwchase17/langchainjs.git"
+    "url": "https://github.com/lemonade-hq/langchainjs.git"
   },
"packageManager": "yarn@3.4.1", "scripts": { @@ -27,7 +27,7 @@ "test": "yarn run test:unit && yarn run test:int", "test:unit": "turbo run test", "test:int": "yarn workspace langchain run test:integration", - "publish": "bash scripts/release-branch.sh && turbo run build lint test && yarn run test:int && yarn workspace langchain run release", + "publish": "yarn workspace langchain run release", "example": "turbo run start --filter langchain-examples --", "prepare": "husky install", "precommit": "turbo run precommit", diff --git a/yarn.lock b/yarn.lock index 36105d74614e..000a9fe46569 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3180,6 +3180,19 @@ __metadata: languageName: node linkType: hard +"@opensearch-project/opensearch@npm:^2.2.0": + version: 2.2.0 + resolution: "@opensearch-project/opensearch@npm:2.2.0" + dependencies: + aws4: ^1.11.0 + debug: ^4.3.1 + hpagent: ^1.2.0 + ms: ^2.1.3 + secure-json-parse: ^2.4.0 + checksum: cceb5bb2c194a7d4bfab3c1b4a3230ea1457ae8976f39bb9e0c5e0067dc450a418a4649536988f0d48a746d7d3ed2002c32d9fde48dfc3112158e964bafa6e76 + languageName: node + linkType: hard + "@pinecone-database/pinecone@npm:^0.0.10": version: 0.0.10 resolution: "@pinecone-database/pinecone@npm:0.0.10" @@ -5064,6 +5077,13 @@ __metadata: languageName: node linkType: hard +"aws4@npm:^1.11.0": + version: 1.12.0 + resolution: "aws4@npm:1.12.0" + checksum: 68f79708ac7c335992730bf638286a3ee0a645cf12575d557860100767c500c08b30e24726b9f03265d74116417f628af78509e1333575e9f8d52a80edfe8cbc + languageName: node + linkType: hard + "axe-core@npm:^4.6.2": version: 4.6.3 resolution: "axe-core@npm:4.6.3" @@ -6852,7 +6872,7 @@ __metadata: languageName: node linkType: hard -"debug@npm:4, debug@npm:4.3.4, debug@npm:^4.1.0, debug@npm:^4.1.1, debug@npm:^4.3.2, debug@npm:^4.3.3, debug@npm:^4.3.4": +"debug@npm:4, debug@npm:4.3.4, debug@npm:^4.1.0, debug@npm:^4.1.1, debug@npm:^4.3.1, debug@npm:^4.3.2, debug@npm:^4.3.3, debug@npm:^4.3.4": version: 4.3.4 resolution: "debug@npm:4.3.4" dependencies: 
@@ -9601,6 +9621,13 @@ __metadata: languageName: node linkType: hard +"hpagent@npm:^1.2.0": + version: 1.2.0 + resolution: "hpagent@npm:1.2.0" + checksum: b029da695edae438cee4da2a437386f9db4ac27b3ceb7306d02e1b586c9c194741ed2e943c8a222e0cfefaf27ee3f863aca7ba1721b0950a2a19bf25bc0d85e2 + languageName: node + linkType: hard + "html-entities@npm:^2.3.2": version: 2.3.3 resolution: "html-entities@npm:2.3.3" @@ -11443,6 +11470,7 @@ __metadata: dependencies: "@dqbd/tiktoken": ^1.0.2 "@getmetal/metal-sdk": ^1.0.12 + "@opensearch-project/opensearch": ^2.2.0 "@pinecone-database/pinecone": ^0.0.10 "@prisma/client": ^4.11.0 "@supabase/supabase-js": ^2.10.0 @@ -11482,6 +11510,7 @@ __metadata: "@getmetal/metal-sdk": ^1.0.12 "@huggingface/inference": ^1.5.1 "@jest/globals": ^29.5.0 + "@opensearch-project/opensearch": ^2.2.0 "@pinecone-database/pinecone": ^0.0.10 "@supabase/supabase-js": ^2.10.0 "@tsconfig/recommended": ^1.0.2 @@ -11540,6 +11569,7 @@ __metadata: "@dqbd/tiktoken": ^1.0.2 "@getmetal/metal-sdk": "*" "@huggingface/inference": ^1.5.1 + "@opensearch-project/opensearch": "*" "@pinecone-database/pinecone": ^0.0.10 "@supabase/supabase-js": ^2.10.0 cheerio: ^1.0.0-rc.12 @@ -12505,7 +12535,7 @@ __metadata: languageName: node linkType: hard -"ms@npm:2.1.3, ms@npm:^2.0.0, ms@npm:^2.1.1": +"ms@npm:2.1.3, ms@npm:^2.0.0, ms@npm:^2.1.1, ms@npm:^2.1.3": version: 2.1.3 resolution: "ms@npm:2.1.3" checksum: aa92de608021b242401676e35cfa5aa42dd70cbdc082b916da7fb925c542173e36bce97ea3e804923fe92c0ad991434e4a38327e15a1b5b5f945d66df615ae6d @@ -15459,6 +15489,13 @@ __metadata: languageName: node linkType: hard +"secure-json-parse@npm:^2.4.0": + version: 2.7.0 + resolution: "secure-json-parse@npm:2.7.0" + checksum: d9d7d5a01fc6db6115744ba23cf9e67ecfe8c524d771537c062ee05ad5c11b64c730bc58c7f33f60bd6877f96b86f0ceb9ea29644e4040cb757f6912d4dd6737 + languageName: node + linkType: hard + "selderee@npm:^0.11.0": version: 0.11.0 resolution: "selderee@npm:0.11.0"