From 52504bff4e3897cd1e099f86dbd8aa32c4301f49 Mon Sep 17 00:00:00 2001 From: mrmer1 Date: Wed, 2 Oct 2024 11:16:16 +0800 Subject: [PATCH] add semantic search docs --- .../text-embeddings/semantic-search-embed.mdx | 239 ++++++++++++++++++ .../text-embeddings/semantic-search-embed.mdx | 239 ++++++++++++++++++ fern/v1.yml | 2 + fern/v2.yml | 2 + 4 files changed, 482 insertions(+) create mode 100644 fern/pages/text-embeddings/semantic-search-embed.mdx create mode 100644 fern/pages/v2/text-embeddings/semantic-search-embed.mdx diff --git a/fern/pages/text-embeddings/semantic-search-embed.mdx b/fern/pages/text-embeddings/semantic-search-embed.mdx new file mode 100644 index 00000000..5b7610e4 --- /dev/null +++ b/fern/pages/text-embeddings/semantic-search-embed.mdx @@ -0,0 +1,239 @@ +--- +title: "Semantic Search with Embeddings" +slug: "docs/semantic-search-embed" + +hidden: false +description: >- + Examples on how to use the Embed endpoint to perform semantic search (API v1). +image: "../../assets/images/fa074c3-cohere_docs_preview_image_1200x630_copy.jpg" +keywords: "vector embeddings, embeddings, natural language processing" + +--- + +This section provides examples on how to use the Embed endpoint to perform semantic search. + +Semantic search solves the problem faced by the more traditional approach of lexical search, which is great at finding keyword matches, but struggles to capture the context or meaning of a piece of text. + + +```python PYTHON +import cohere +import numpy as np +co = cohere.Client(api_key="YOUR_API_KEY") # Get your free API key: https://dashboard.cohere.com/api-keys +``` + +The Embed endpoint takes in texts as input and returns embeddings as output. + +For semantic search, there are two types of documents we need to turn into embeddings. + +- The list of documents to search from. +- The query that will be used to search the documents. + +### Step 1: Embed the documents +We call the Embed endpoint using `co.embed()` and pass the required arguments: +- `texts`: The list of texts +- `model`: Here we choose `embed-english-v3.0`, which generates embeddings of size 1024 +- `input_type`: We choose `search_document` to ensure the model treats these as the documents for search +- `embedding_types`: We choose `float` to get a float array as the output + +### Step 2: Embed the query +Next, we add and embed a query. We choose `search_query` as the `input_type` to ensure the model treats this as the query (instead of documents) for search. + +### Step 3: Return the most similar documents +Next, we calculate and sort similarity scores between a query and document embeddings, then display the top N most similar documents. Here, we are using the numpy library for calculating similarity using a dot product approach. + + + +```python PYTHON +### STEP 1: Embed the documents + +# Define the documents +documents = [ + "Joining Slack Channels: You will receive an invite via email. Be sure to join relevant channels to stay informed and engaged.", + "Finding Coffee Spots: For your caffeine fix, head to the break room's coffee machine or cross the street to the café for artisan coffee.", + "Team-Building Activities: We foster team spirit with monthly outings and weekly game nights. Feel free to suggest new activity ideas anytime!", + "Working Hours Flexibility: We prioritize work-life balance. While our core hours are 9 AM to 5 PM, we offer flexibility to adjust as needed.", +] + +# Embed the documents +doc_emb = co.embed( + texts=documents, + model="embed-english-v3.0", + input_type="search_document", + embedding_types=["float"] +).embeddings.float + +### STEP 2: Embed the query + +# Add the user query +query = "How to connect with my teammates?" + +# Embed the query +query_emb = co.embed( + texts=[query], + model="embed-english-v3.0", + input_type="search_query", + embedding_types=["float"] +).embeddings.float + +### STEP 3: Return the most similar documents + +# Calculate similarity scores +scores = np.dot(query_emb, np.transpose(doc_emb))[0] + +# Sort and filter documents based on scores +top_n = 2 +top_doc_idxs = np.argsort(-scores)[:top_n] + +# Display search results +for idx, docs_idx in enumerate(top_doc_idxs): + print(f"Rank: {idx+1}") + print(f"Document: {documents[docs_idx]}\n") +``` +``` +Rank: 1 +Document: Team-Building Activities: We foster team spirit with monthly outings and weekly game nights. Feel free to suggest new activity ideas anytime! + +Rank: 2 +Document: Joining Slack Channels: You will receive an invite via email. Be sure to join relevant channels to stay informed and engaged. +``` + + +## Content quality measure with Embed v3 + +A standard text embeddings model is optimized for only topic similarity between a query and candidate documents. But in many real-world applications, you have redundant information with varying content quality. + +For instance, consider a user query of “COVID-19 Symptoms” and compare that to candidate document, “COVID-19 has many symptoms”. This document does not offer high-quality and rich information. However, with a typical embedding model, it will appear high on search results because it is highly similar to the query. + + +The Embed v3 model is trained to capture both content quality and topic similarity. Through this approach, a search system can extract richer information from documents and is robust against noise. + +As an example below, give a query ("COVID-19 Symptoms"), the document with the highest quality ("COVID-19 symptoms can include: a high temperature or shivering...") is ranked first. + +Another document ("COVID-19 has many symptoms") is arguably more similar to the query based on what information it contains, yet it is ranked lower as it doesn’t contain that much information. + +This demonstrates how Embed v3 helps to surface high-quality documents for a given query. + + +```python PYTHON +### STEP 1: Embed the documents + +documents = [ + "COVID-19 has many symptoms.", + "COVID-19 symptoms are bad.", + "COVID-19 symptoms are not nice", + "COVID-19 symptoms are bad. 5G capabilities include more expansive service coverage, a higher number of available connections, and lower power consumption.", + "COVID-19 is a disease caused by a virus. The most common symptoms are fever, chills, and sore throat, but there are a range of others.", + "COVID-19 symptoms can include: a high temperature or shivering (chills); a new, continuous cough; a loss or change to your sense of smell or taste; and many more", + "Dementia has the following symptom: Experiencing memory loss, poor judgment, and confusion.", + "COVID-19 has the following symptom: Experiencing memory loss, poor judgment, and confusion.", +] + +# Embed the documents +doc_emb = co.embed( + texts=documents, + model="embed-english-v3.0", + input_type="search_document", + embedding_types=["float"] +).embeddings.float + +### STEP 2: Embed the query + +# Add the user query +query = "COVID-19 Symptoms" + +# Embed the query +query_emb = co.embed( + texts=[query], + model="embed-english-v3.0", + input_type="search_query", + embedding_types=["float"] +).embeddings.float + +### STEP 3: Return the most similar documents + +# Calculate similarity scores +scores = np.dot(query_emb, np.transpose(doc_emb))[0] + +# Sort and filter documents based on scores +top_n = 5 +top_doc_idxs = np.argsort(-scores)[:top_n] + +# Display search results +for idx, docs_idx in enumerate(top_doc_idxs): + print(f"Rank: {idx+1}") + print(f"Document: {documents[docs_idx]}\n") +``` +``` +Rank: 1 +Document: COVID-19 symptoms can include: a high temperature or shivering (chills); a new, continuous cough; a loss or change to your sense of smell or taste; and many more + +Rank: 2 +Document: COVID-19 is a disease caused by a virus. The most common symptoms are fever, chills, and sore throat, but there are a range of others. + +Rank: 3 +Document: COVID-19 has the following symptom: Experiencing memory loss, poor judgment, and confusion. + +Rank: 4 +Document: COVID-19 has many symptoms. + +Rank: 5 +Document: COVID-19 symptoms are not nice +``` + + +## Multilingual semantic search + +The Embed endpoint also supports multilingual semantic search via the `embed-multilingual-...` models. This means you can perform semantic search on texts in different languages. + +Specifically, you can do both multilingual and cross-lingual searches using one single model. + + +```python PYTHON +### STEP 1: Embed the documents + +documents = [ + "Remboursement des frais de voyage : Gérez facilement vos frais de voyage en les soumettant via notre outil financier. Les approbations sont rapides et simples.", + "Travailler de l'étranger : Il est possible de travailler à distance depuis un autre pays. Il suffit de coordonner avec votre responsable et de vous assurer d'être disponible pendant les heures de travail.", + "Avantages pour la santé et le bien-être : Nous nous soucions de votre bien-être et proposons des adhésions à des salles de sport, des cours de yoga sur site et une assurance santé complète.", + "Fréquence des évaluations de performance : Nous organisons des bilans informels tous les trimestres et des évaluations formelles deux fois par an.", +] + +# Embed the documents +doc_emb = co.embed( + texts=documents, + model="embed-english-v3.0", + input_type="search_document", + embedding_types=["float"] +).embeddings.float + +### STEP 2: Embed the query + +# Add the user query +query = "What's your remote-working policy?" + +# Embed the query +query_emb = co.embed( + texts=[query], + model="embed-english-v3.0", + input_type="search_query", + embedding_types=["float"] +).embeddings.float + +### STEP 3: Return the most similar documents + +# Calculate similarity scores +scores = np.dot(query_emb, np.transpose(doc_emb))[0] + +# Sort and filter documents based on scores +top_n = 1 +top_doc_idxs = np.argsort(-scores)[:top_n] + +# Display search results +for idx, docs_idx in enumerate(top_doc_idxs): + print(f"Rank: {idx+1}") + print(f"Document: {documents[docs_idx]}\n") +``` +``` +Rank: 1 +Document: Travailler de l'étranger : Il est possible de travailler à distance depuis un autre pays. Il suffit de coordonner avec votre responsable et de vous assurer d'être disponible pendant les heures de travail. +``` \ No newline at end of file diff --git a/fern/pages/v2/text-embeddings/semantic-search-embed.mdx b/fern/pages/v2/text-embeddings/semantic-search-embed.mdx new file mode 100644 index 00000000..6b0be710 --- /dev/null +++ b/fern/pages/v2/text-embeddings/semantic-search-embed.mdx @@ -0,0 +1,239 @@ +--- +title: "Semantic Search with Embeddings" +slug: "v2/docs/semantic-search-embed" + +hidden: false +description: >- + Examples on how to use the Embed endpoint to perform semantic search (API v2). +image: "../../../assets/images/fa074c3-cohere_docs_preview_image_1200x630_copy.jpg" +keywords: "vector embeddings, embeddings, natural language processing" + +--- + +This section provides examples on how to use the Embed endpoint to perform semantic search. + +Semantic search solves the problem faced by the more traditional approach of lexical search, which is great at finding keyword matches, but struggles to capture the context or meaning of a piece of text. + + +```python PYTHON +import cohere +import numpy as np +co = cohere.Client(api_key="YOUR_API_KEY") # Get your free API key: https://dashboard.cohere.com/api-keys +``` + +The Embed endpoint takes in texts as input and returns embeddings as output. + +For semantic search, there are two types of documents we need to turn into embeddings. + +- The list of documents to search from. +- The query that will be used to search the documents. + +### Step 1: Embed the documents +We call the Embed endpoint using `co.embed()` and pass the required arguments: +- `texts`: The list of texts +- `model`: Here we choose `embed-english-v3.0`, which generates embeddings of size 1024 +- `input_type`: We choose `search_document` to ensure the model treats these as the documents for search +- `embedding_types`: We choose `float` to get a float array as the output + +### Step 2: Embed the query +Next, we add and embed a query. We choose `search_query` as the `input_type` to ensure the model treats this as the query (instead of documents) for search. + +### Step 3: Return the most similar documents +Next, we calculate and sort similarity scores between a query and document embeddings, then display the top N most similar documents. Here, we are using the numpy library for calculating similarity using a dot product approach. + + + +```python PYTHON +### STEP 1: Embed the documents + +# Define the documents +documents = [ + "Joining Slack Channels: You will receive an invite via email. Be sure to join relevant channels to stay informed and engaged.", + "Finding Coffee Spots: For your caffeine fix, head to the break room's coffee machine or cross the street to the café for artisan coffee.", + "Team-Building Activities: We foster team spirit with monthly outings and weekly game nights. Feel free to suggest new activity ideas anytime!", + "Working Hours Flexibility: We prioritize work-life balance. While our core hours are 9 AM to 5 PM, we offer flexibility to adjust as needed.", +] + +# Embed the documents +doc_emb = co.embed( + texts=documents, + model="embed-english-v3.0", + input_type="search_document", + embedding_types=["float"] +).embeddings.float + +### STEP 2: Embed the query + +# Add the user query +query = "How to connect with my teammates?" + +# Embed the query +query_emb = co.embed( + texts=[query], + model="embed-english-v3.0", + input_type="search_query", + embedding_types=["float"] +).embeddings.float + +### STEP 3: Return the most similar documents + +# Calculate similarity scores +scores = np.dot(query_emb, np.transpose(doc_emb))[0] + +# Sort and filter documents based on scores +top_n = 2 +top_doc_idxs = np.argsort(-scores)[:top_n] + +# Display search results +for idx, docs_idx in enumerate(top_doc_idxs): + print(f"Rank: {idx+1}") + print(f"Document: {documents[docs_idx]}\n") +``` +``` +Rank: 1 +Document: Team-Building Activities: We foster team spirit with monthly outings and weekly game nights. Feel free to suggest new activity ideas anytime! + +Rank: 2 +Document: Joining Slack Channels: You will receive an invite via email. Be sure to join relevant channels to stay informed and engaged. +``` + + +## Content quality measure with Embed v3 + +A standard text embeddings model is optimized for only topic similarity between a query and candidate documents. But in many real-world applications, you have redundant information with varying content quality. + +For instance, consider a user query of “COVID-19 Symptoms” and compare that to candidate document, “COVID-19 has many symptoms”. This document does not offer high-quality and rich information. However, with a typical embedding model, it will appear high on search results because it is highly similar to the query. + + +The Embed v3 model is trained to capture both content quality and topic similarity. Through this approach, a search system can extract richer information from documents and is robust against noise. + +As an example below, give a query ("COVID-19 Symptoms"), the document with the highest quality ("COVID-19 symptoms can include: a high temperature or shivering...") is ranked first. + +Another document ("COVID-19 has many symptoms") is arguably more similar to the query based on what information it contains, yet it is ranked lower as it doesn’t contain that much information. + +This demonstrates how Embed v3 helps to surface high-quality documents for a given query. + + +```python PYTHON +### STEP 1: Embed the documents + +documents = [ + "COVID-19 has many symptoms.", + "COVID-19 symptoms are bad.", + "COVID-19 symptoms are not nice", + "COVID-19 symptoms are bad. 5G capabilities include more expansive service coverage, a higher number of available connections, and lower power consumption.", + "COVID-19 is a disease caused by a virus. The most common symptoms are fever, chills, and sore throat, but there are a range of others.", + "COVID-19 symptoms can include: a high temperature or shivering (chills); a new, continuous cough; a loss or change to your sense of smell or taste; and many more", + "Dementia has the following symptom: Experiencing memory loss, poor judgment, and confusion.", + "COVID-19 has the following symptom: Experiencing memory loss, poor judgment, and confusion.", +] + +# Embed the documents +doc_emb = co.embed( + texts=documents, + model="embed-english-v3.0", + input_type="search_document", + embedding_types=["float"] +).embeddings.float + +### STEP 2: Embed the query + +# Add the user query +query = "COVID-19 Symptoms" + +# Embed the query +query_emb = co.embed( + texts=[query], + model="embed-english-v3.0", + input_type="search_query", + embedding_types=["float"] +).embeddings.float + +### STEP 3: Return the most similar documents + +# Calculate similarity scores +scores = np.dot(query_emb, np.transpose(doc_emb))[0] + +# Sort and filter documents based on scores +top_n = 5 +top_doc_idxs = np.argsort(-scores)[:top_n] + +# Display search results +for idx, docs_idx in enumerate(top_doc_idxs): + print(f"Rank: {idx+1}") + print(f"Document: {documents[docs_idx]}\n") +``` +``` +Rank: 1 +Document: COVID-19 symptoms can include: a high temperature or shivering (chills); a new, continuous cough; a loss or change to your sense of smell or taste; and many more + +Rank: 2 +Document: COVID-19 is a disease caused by a virus. The most common symptoms are fever, chills, and sore throat, but there are a range of others. + +Rank: 3 +Document: COVID-19 has the following symptom: Experiencing memory loss, poor judgment, and confusion. + +Rank: 4 +Document: COVID-19 has many symptoms. + +Rank: 5 +Document: COVID-19 symptoms are not nice +``` + + +## Multilingual semantic search + +The Embed endpoint also supports multilingual semantic search via the `embed-multilingual-...` models. This means you can perform semantic search on texts in different languages. + +Specifically, you can do both multilingual and cross-lingual searches using one single model. + + +```python PYTHON +### STEP 1: Embed the documents + +documents = [ + "Remboursement des frais de voyage : Gérez facilement vos frais de voyage en les soumettant via notre outil financier. Les approbations sont rapides et simples.", + "Travailler de l'étranger : Il est possible de travailler à distance depuis un autre pays. Il suffit de coordonner avec votre responsable et de vous assurer d'être disponible pendant les heures de travail.", + "Avantages pour la santé et le bien-être : Nous nous soucions de votre bien-être et proposons des adhésions à des salles de sport, des cours de yoga sur site et une assurance santé complète.", + "Fréquence des évaluations de performance : Nous organisons des bilans informels tous les trimestres et des évaluations formelles deux fois par an.", +] + +# Embed the documents +doc_emb = co.embed( + texts=documents, + model="embed-english-v3.0", + input_type="search_document", + embedding_types=["float"] +).embeddings.float + +### STEP 2: Embed the query + +# Add the user query +query = "What's your remote-working policy?" + +# Embed the query +query_emb = co.embed( + texts=[query], + model="embed-english-v3.0", + input_type="search_query", + embedding_types=["float"] +).embeddings.float + +### STEP 3: Return the most similar documents + +# Calculate similarity scores +scores = np.dot(query_emb, np.transpose(doc_emb))[0] + +# Sort and filter documents based on scores +top_n = 1 +top_doc_idxs = np.argsort(-scores)[:top_n] + +# Display search results +for idx, docs_idx in enumerate(top_doc_idxs): + print(f"Rank: {idx+1}") + print(f"Document: {documents[docs_idx]}\n") +``` +``` +Rank: 1 +Document: Travailler de l'étranger : Il est possible de travailler à distance depuis un autre pays. Il suffit de coordonner avec votre responsable et de vous assurer d'être disponible pendant les heures de travail. +``` \ No newline at end of file diff --git a/fern/v1.yml b/fern/v1.yml index fefce975..c79f587c 100644 --- a/fern/v1.yml +++ b/fern/v1.yml @@ -135,6 +135,8 @@ navigation: contents: - page: Introduction to Embeddings at Cohere path: pages/text-embeddings/embeddings.mdx + - page: Semantic Search with Embeddings + path: pages/text-embeddings/semantic-search-embed.mdx - page: Batch Embedding Jobs path: pages/text-embeddings/embed-jobs-api.mdx - section: Reranking diff --git a/fern/v2.yml b/fern/v2.yml index 0020adcf..94e91b20 100644 --- a/fern/v2.yml +++ b/fern/v2.yml @@ -116,6 +116,8 @@ navigation: contents: - page: Introduction to Embeddings at Cohere path: pages/v2/text-embeddings/embeddings.mdx + - page: Semantic Search with Embeddings + path: pages/v2/text-embeddings/semantic-search-embed.mdx - page: Batch Embedding Jobs path: pages/v2/text-embeddings/embed-jobs-api.mdx - section: Reranking