diff --git a/fern/pages/fine-tuning/fine-tuning-with-the-python-sdk.mdx b/fern/pages/fine-tuning/fine-tuning-with-the-python-sdk.mdx index 339d950e..0927be14 100644 --- a/fern/pages/fine-tuning/fine-tuning-with-the-python-sdk.mdx +++ b/fern/pages/fine-tuning/fine-tuning-with-the-python-sdk.mdx @@ -19,13 +19,13 @@ The snippet below creates a dataset for fine-tuning a model on records of custom ```python PYTHON # create a dataset -co = cohere.Client('Your API key') +co = cohere.Client("Your API key") my_dataset = co.datasets.create( - name="customer service logs", - type="chat-finetune-input", - data=open("./customer-chat.jsonl", "rb"), - eval_data=open("./customer-chat-eval.jsonl", "rb") + name="customer service logs", + type="chat-finetune-input", + data=open("./customer-chat.jsonl", "rb"), + eval_data=open("./customer-chat-eval.jsonl", "rb"), ) result = co.wait(my_dataset) @@ -40,15 +40,15 @@ from cohere.finetuning import FinetunedModel, Settings, BaseModel # start training a custom model using the dataset finetuned_model = co.finetuning.create_finetuned_model( - request=FinetunedModel( - name="customer-service-chat-model", - settings=Settings( - base_model=BaseModel( - base_type="BASE_TYPE_CHAT", - ), - dataset_id=my_dataset.id, + request=FinetunedModel( + name="customer-service-chat-model", + settings=Settings( + base_model=BaseModel( + base_type="BASE_TYPE_CHAT", + ), + dataset_id=my_dataset.id, + ), ), - ), ) ``` diff --git a/fern/pages/get-started/datasets.mdx b/fern/pages/get-started/datasets.mdx index 0b492347..ba36fd13 100644 --- a/fern/pages/get-started/datasets.mdx +++ b/fern/pages/get-started/datasets.mdx @@ -35,7 +35,7 @@ You should also be aware of how Cohere handles data retention. This is the most First, let's install the SDK -```python PYTHON +```bash pip install cohere ``` @@ -164,20 +164,20 @@ Datasets of type `chat-finetune-input`, for example, are expected to have a json ```python PYTHON { - "messages": [ - { - "role": "System", - "content": "You are a large language model trained by Cohere." - }, - { - "role": "User", - "content": "Hi! What were Time magazines top 10 cover stories in the last 10 years?" - }, - { - "role": "Chatbot", - "content": "Time magazines top 10 cover stories in the last 10 years were:\\n\\n1. Volodymyr Zelenskyy\\n2. Elon Musk\\n3. Martin Luther King Jr.\\n4. How Earth Survived\\n5. Her Lasting Impact\\n6. Nothing to See Here\\n7. Meltdown\\n8. Deal With It\\n9. The Top of America\\n10. Bitter Pill" - } - ] + "messages": [ + { + "role": "System", + "content": "You are a large language model trained by Cohere.", + }, + { + "role": "User", + "content": "Hi! What were Time magazines top 10 cover stories in the last 10 years?", + }, + { + "role": "Chatbot", + "content": "Time magazines top 10 cover stories in the last 10 years were:\\n\\n1. Volodymyr Zelenskyy\\n2. Elon Musk\\n3. Martin Luther King Jr.\\n4. How Earth Survived\\n5. Her Lasting Impact\\n6. Nothing to See Here\\n7. Meltdown\\n8. Deal With It\\n9. The Top of America\\n10. Bitter Pill", + }, + ] } ``` diff --git a/fern/pages/text-embeddings/embed-jobs-api.mdx b/fern/pages/text-embeddings/embed-jobs-api.mdx index 77b7f448..9684ddab 100644 --- a/fern/pages/text-embeddings/embed-jobs-api.mdx +++ b/fern/pages/text-embeddings/embed-jobs-api.mdx @@ -60,15 +60,15 @@ As seen in the example above, the following would be a valid `create_dataset` ca ```python PYTHON # Upload a dataset for embed jobs -ds=co.datasets.create( - name='sample_file', - # insert your file path here - you can upload it on the right - we accept .csv and jsonl files - data=open('embed_jobs_sample_data.jsonl', 'rb'), - keep_fields=['wiki_id','url','views','title'] - optional_fields=['langs'] - dataset_type="embed-input", - embedding_types=['float'] - ) +ds = co.datasets.create( + name="sample_file", + # insert your file path here - you can upload it on the right - we accept .csv and jsonl files + data=open("embed_jobs_sample_data.jsonl", "rb"), + keep_fields=["wiki_id", "url", "views", "title"], + optional_fields=["langs"], + dataset_type="embed-input", + embedding_types=["float"], +) # wait for the dataset to finish validation print(co.wait(ds)) @@ -82,13 +82,14 @@ The Embed Jobs API takes in `dataset IDs` as an input. Uploading a local file to ```python PYTHON import cohere + co = cohere.Client(api_key="") -input_dataset=co.datasets.create( - name='your_file_name', - data=open('/content/your_file_path', 'rb'), - dataset_type="embed-input" - ) +input_dataset = co.datasets.create( + name="your_file_name", + data=open("/content/your_file_path", "rb"), + dataset_type="embed-input", +) # block on server-side validation print(co.wait(input_dataset)) @@ -114,11 +115,12 @@ Your dataset is now ready to be embedded. Here's a code snippet illustrating wha ```python PYTHON embed_job = co.embed_jobs.create( - dataset_id=input_dataset.id, - input_type='search_document' , - model='embed-english-v3.0', - embedding_types=['float'], - truncate='END') + dataset_id=input_dataset.id, + input_type="search_document", + model="embed-english-v3.0", + embedding_types=["float"], + truncate="END", +) # block until the job is complete co.wait(embed_job) @@ -131,17 +133,17 @@ Since we’d like to search over these embeddings and we can think of them as co The output of embed jobs is a dataset object which you can download or pipe directly to a database of your choice: ```python PYTHON -output_dataset=co.datasets.get(id=embed_job.output.id) -co.utils.save(filepath='/content/embed_job_output.csv', format="csv") +output_dataset = co.datasets.get(id=embed_job.output.id) +co.utils.save(filepath="/content/embed_job_output.csv", format="csv") ``` Alternatively if you would like to pass the dataset into a downstream function you can do the following: ```python PYTHON -output_dataset=co.datasets.get(id=embed_job.output.id) -results=[] +output_dataset = co.datasets.get(id=embed_job.output.id) +results = [] for record in output_dataset: - results.append(record) + results.append(record) ``` ### Sample Output diff --git a/fern/pages/text-embeddings/embeddings.mdx b/fern/pages/text-embeddings/embeddings.mdx index cf77e4cd..c257fa26 100644 --- a/fern/pages/text-embeddings/embeddings.mdx +++ b/fern/pages/text-embeddings/embeddings.mdx @@ -26,22 +26,26 @@ co = cohere.Client(api_key="YOUR_API_KEY") # get the embeddings phrases = ["i love soup", "soup is my favorite", "london is far away"] -model="embed-english-v3.0" -input_type="search_query" +model = "embed-english-v3.0" +input_type = "search_query" -res = co.embed(texts=phrases, - model=model, - input_type=input_type, - embedding_types=['float']) +res = co.embed( + texts=phrases, + model=model, + input_type=input_type, + embedding_types=["float"], +) (soup1, soup2, london) = res.embeddings.float + # compare them def calculate_similarity(a, b): - return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) + return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) + -calculate_similarity(soup1, soup2) # 0.85 - very similar! -calculate_similarity(soup1, london) # 0.16 - not similar! +calculate_similarity(soup1, soup2) # 0.85 - very similar! +calculate_similarity(soup1, london) # 0.16 - not similar! ``` ## The `input_type` parameter @@ -58,24 +62,31 @@ Cohere embeddings are optimized for different types of inputs. In addition to `embed-english-v3.0` we offer a best-in-class multilingual model [embed-multilingual-v3.0](/docs/embed-2#multi-lingual-models) with support for over 100 languages, including Chinese, Spanish, and French. This model can be used with the Embed API, just like its English counterpart: ```python PYTHON -import cohere +import cohere + co = cohere.Client(api_key="") -texts = [ - 'Hello from Cohere!', 'مرحبًا من كوهير!', 'Hallo von Cohere!', - 'Bonjour de Cohere!', '¡Hola desde Cohere!', 'Olá do Cohere!', - 'Ciao da Cohere!', '您好,来自 Cohere!', 'कोहेरे से नमस्ते!' -] +texts = [ + "Hello from Cohere!", + "مرحبًا من كوهير!", + "Hallo von Cohere!", + "Bonjour de Cohere!", + "¡Hola desde Cohere!", + "Olá do Cohere!", + "Ciao da Cohere!", + "您好,来自 Cohere!", + "कोहेरे से नमस्ते!", +] response = co.embed( - model='embed-multilingual-v3.0', - texts=texts, - input_type='classification', - embedding_types=['float']) - -embeddings = response.embeddings.float # All text embeddings -print(embeddings[0][:5]) # Print embeddings for the first text - + model="embed-multilingual-v3.0", + texts=texts, + input_type="classification", + embedding_types=["float"], +) + +embeddings = response.embeddings.float # All text embeddings +print(embeddings[0][:5]) # Print embeddings for the first text ``` ## Image Embeddings @@ -96,12 +107,13 @@ Be aware that image embedding has the following restrictions: import cohere from PIL import Image from io import BytesIO -import base64 +import base64 co = cohere.Client(api_key="") # The model accepts input in base64 as a Data URL + def image_to_base64_data_url(image_path): # Open the image file with Image.open(image_path) as img: @@ -110,19 +122,23 @@ def image_to_base64_data_url(image_path): # Save the image as PNG to the BytesIO object img.save(buffered, format="PNG") # Encode the image data in base64 - img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") - + img_base64 = base64.b64encode(buffered.getvalue()).decode( + "utf-8" + ) + # Create the Data URL and assumes the original image file type was png data_url = f"data:image/png;base64,{img_base64}" return data_url - + + processed_image = image_to_base64_data_url("") - + ret = co.embed( - images=[processed_image], - model='embed-english-v3.0', - embedding_types= ["float"], - input_type='image') + images=[processed_image], + model="embed-english-v3.0", + embedding_types=["float"], + input_type="image", +) ret.embeddings.float ``` @@ -142,39 +158,41 @@ The following embedding types are supported: The parameter defaults to `float`, so if you pass in no argument you'll get back `float` embeddings: ```python PYTHON -ret = co.embed(texts=phrases, - model=model, - input_type=input_type) +ret = co.embed(texts=phrases, model=model, input_type=input_type) -ret.embeddings # This contains the float embeddings +ret.embeddings # This contains the float embeddings ``` However we recommend being explicit about the `embedding type(s)`. To specify an embedding types, pass one of the types from the list above in as list containing a string: ```python PYTHON -ret = co.embed(texts=phrases, - model=model, - input_type=input_type, - embedding_types=['int8']) - -ret.embeddings.int8 # This contains your int8 embeddings -ret.embeddings.float # This will be empty -ret.embeddings.uint8 # This will be empty -ret.embeddings.ubinary # This will be empty -ret.embeddings.binary # This will be empty +ret = co.embed( + texts=phrases, + model=model, + input_type=input_type, + embedding_types=["int8"], +) + +ret.embeddings.int8 # This contains your int8 embeddings +ret.embeddings.float # This will be empty +ret.embeddings.uint8 # This will be empty +ret.embeddings.ubinary # This will be empty +ret.embeddings.binary # This will be empty ``` Finally, you can also pass several `embedding types` in as a list, in which case the endpoint will return a dictionary with both types available: ```python PYTHON -ret = co.embed(texts=phrases, - model=model, - input_type=input_type, - embedding_types=['int8', 'float']) - -ret.embeddings.int8 # This contains your int8 embeddings -ret.embeddings.float # This contains your float embeddings -ret.embeddings.uint8 # This will be empty -ret.embeddings.ubinary # This will be empty -ret.embeddings.binary # This will be empty +ret = co.embed( + texts=phrases, + model=model, + input_type=input_type, + embedding_types=["int8", "float"], +) + +ret.embeddings.int8 # This contains your int8 embeddings +ret.embeddings.float # This contains your float embeddings +ret.embeddings.uint8 # This will be empty +ret.embeddings.ubinary # This will be empty +ret.embeddings.binary # This will be empty ``` diff --git a/fern/pages/text-embeddings/multimodal-embeddings.mdx b/fern/pages/text-embeddings/multimodal-embeddings.mdx index 9ff75a50..045a5605 100644 --- a/fern/pages/text-embeddings/multimodal-embeddings.mdx +++ b/fern/pages/text-embeddings/multimodal-embeddings.mdx @@ -35,29 +35,32 @@ The Embed API takes in images with the following file formats: `png`, `jpeg`,`We import os import base64 + # Defining the function to convert an image to a base 64 Data URL def image_to_base64_data_url(image_path): - _, file_extension = os.path.splitext(image_path) - file_type=(file_extension[1:]) - - with open(image_path, "rb") as f: - enc_img = base64.b64encode(f.read()).decode('utf-8') - enc_img = f"data:image/{file_type};base64,{enc_img}" - return enc_img - -image_path='' -processed_image=image_to_base64_data_url(image_path) + _, file_extension = os.path.splitext(image_path) + file_type = file_extension[1:] + + with open(image_path, "rb") as f: + enc_img = base64.b64encode(f.read()).decode("utf-8") + enc_img = f"data:image/{file_type};base64,{enc_img}" + return enc_img + + +image_path = "" +processed_image = image_to_base64_data_url(image_path) ``` #### 2\. Call the Embed Endpoint ```python PYTHON # Import the necessary packages import cohere + co = cohere.Client(api_key="") co.embed( - model='embed-english-v3.0', + model="embed-english-v3.0", images=[processed_image], - input_type='image' + input_type="image", ) ``` ## Sample Output diff --git a/fern/pages/text-embeddings/semantic-search-embed.mdx b/fern/pages/text-embeddings/semantic-search-embed.mdx index 5b7610e4..5ef6515c 100644 --- a/fern/pages/text-embeddings/semantic-search-embed.mdx +++ b/fern/pages/text-embeddings/semantic-search-embed.mdx @@ -18,7 +18,9 @@ Semantic search solves the problem faced by the more traditional approach of lex ```python PYTHON import cohere import numpy as np -co = cohere.Client(api_key="YOUR_API_KEY") # Get your free API key: https://dashboard.cohere.com/api-keys + +# Get your free API key: https://dashboard.cohere.com/api-keys +co = cohere.Client(api_key="YOUR_API_KEY") ``` The Embed endpoint takes in texts as input and returns embeddings as output. @@ -59,7 +61,7 @@ doc_emb = co.embed( texts=documents, model="embed-english-v3.0", input_type="search_document", - embedding_types=["float"] + embedding_types=["float"], ).embeddings.float ### STEP 2: Embed the query @@ -72,7 +74,7 @@ query_emb = co.embed( texts=[query], model="embed-english-v3.0", input_type="search_query", - embedding_types=["float"] + embedding_types=["float"], ).embeddings.float ### STEP 3: Return the most similar documents @@ -133,7 +135,7 @@ doc_emb = co.embed( texts=documents, model="embed-english-v3.0", input_type="search_document", - embedding_types=["float"] + embedding_types=["float"], ).embeddings.float ### STEP 2: Embed the query @@ -146,7 +148,7 @@ query_emb = co.embed( texts=[query], model="embed-english-v3.0", input_type="search_query", - embedding_types=["float"] + embedding_types=["float"], ).embeddings.float ### STEP 3: Return the most similar documents @@ -203,7 +205,7 @@ doc_emb = co.embed( texts=documents, model="embed-english-v3.0", input_type="search_document", - embedding_types=["float"] + embedding_types=["float"], ).embeddings.float ### STEP 2: Embed the query @@ -216,7 +218,7 @@ query_emb = co.embed( texts=[query], model="embed-english-v3.0", input_type="search_query", - embedding_types=["float"] + embedding_types=["float"], ).embeddings.float ### STEP 3: Return the most similar documents diff --git a/fern/pages/text-embeddings/text-classification-1.mdx b/fern/pages/text-embeddings/text-classification-1.mdx index da38bb82..1d9adc91 100644 --- a/fern/pages/text-embeddings/text-classification-1.mdx +++ b/fern/pages/text-embeddings/text-classification-1.mdx @@ -31,7 +31,7 @@ import cohere from cohere import ClassifyExample ``` ```python PYTHON -co = cohere.Client("COHERE_API_KEY") # Your Cohere API key +co = cohere.Client("COHERE_API_KEY") # Your Cohere API key ``` ### Preparing the Data and Inputs @@ -41,38 +41,49 @@ With the `classify` endpoint, you can create a text classifier with as few as tw Here are examples, created as `ClassifyExample` objects: ```python PYTHON -examples = [ClassifyExample(text="I’m so proud of you", label="positive"), - ClassifyExample(text="What a great time to be alive", label="positive"), - ClassifyExample(text="That’s awesome work", label="positive"), - ClassifyExample(text="The service was amazing", label="positive"), - ClassifyExample(text="I love my family", label="positive"), - ClassifyExample(text="They don't care about me", label="negative"), - ClassifyExample(text="I hate this place", label="negative"), - ClassifyExample(text="The most ridiculous thing I've ever heard", label="negative"), - ClassifyExample(text="I am really frustrated", label="negative"), - ClassifyExample(text="This is so unfair", label="negative"), - ClassifyExample(text="This made me think", label="neutral"), - ClassifyExample(text="The good old days", label="neutral"), - ClassifyExample(text="What's the difference", label="neutral"), - ClassifyExample(text="You can't ignore this", label="neutral"), - ClassifyExample(text="That's how I see it", label="neutral")] +examples = [ + ClassifyExample(text="I’m so proud of you", label="positive"), + ClassifyExample( + text="What a great time to be alive", label="positive" + ), + ClassifyExample(text="That’s awesome work", label="positive"), + ClassifyExample(text="The service was amazing", label="positive"), + ClassifyExample(text="I love my family", label="positive"), + ClassifyExample( + text="They don't care about me", label="negative" + ), + ClassifyExample(text="I hate this place", label="negative"), + ClassifyExample( + text="The most ridiculous thing I've ever heard", + label="negative", + ), + ClassifyExample(text="I am really frustrated", label="negative"), + ClassifyExample(text="This is so unfair", label="negative"), + ClassifyExample(text="This made me think", label="neutral"), + ClassifyExample(text="The good old days", label="neutral"), + ClassifyExample(text="What's the difference", label="neutral"), + ClassifyExample(text="You can't ignore this", label="neutral"), + ClassifyExample(text="That's how I see it", label="neutral"), +] ``` Besides the examples, you'll also need the 'inputs,' which are the strings of text you want the classifier to sort. Here are the ones we'll be using: ```python PYTHON -inputs = ["Hello, world! What a beautiful day", - "It was a great time with great people", - "Great place to work", - "That was a wonderful evening", - "Maybe this is why", - "Let's start again", - "That's how I see it", - "These are all facts", - "This is the worst thing", - "I cannot stand this any longer", - "This is really annoying", - "I am just plain fed up"] +inputs = [ + "Hello, world! What a beautiful day", + "It was a great time with great people", + "Great place to work", + "That was a wonderful evening", + "Maybe this is why", + "Let's start again", + "That's how I see it", + "These are all facts", + "This is the worst thing", + "I cannot stand this any longer", + "This is really annoying", + "I am just plain fed up", +] ``` ### Generate Predictions @@ -81,7 +92,6 @@ Setting up the model is quite straightforward with the `classify` endpoint. We'l ```python PYTHON def classify_text(inputs, examples): - """ Classifies a list of input texts given the examples Arguments: @@ -91,21 +101,21 @@ def classify_text(inputs, examples): Returns: classifications (list): each result contains the text, labels, and conf values """ - + # Classify text by calling the Classify endpoint response = co.classify( - model='embed-english-v3.0', - inputs=inputs, - examples=examples) + model="embed-english-v3.0", inputs=inputs, examples=examples + ) classifications = response.classifications return classifications + # Classify the inputs predictions = classify_text(inputs, examples) -print(predictions) +print(predictions) ``` Here’s a sample output returned (note that this output has been truncated to make it easier to read, you'll get much more in return if you run the code yourself): diff --git a/fern/pages/text-embeddings/text-classification-with-cohere.mdx b/fern/pages/text-embeddings/text-classification-with-cohere.mdx index ab481dd1..7d5c1190 100644 --- a/fern/pages/text-embeddings/text-classification-with-cohere.mdx +++ b/fern/pages/text-embeddings/text-classification-with-cohere.mdx @@ -27,7 +27,7 @@ from cohere import ClassifyExample ``` ```python PYTHON -co = cohere.Client("COHERE_API_KEY") # Your Cohere API key +co = cohere.Client("COHERE_API_KEY") # Your Cohere API key ``` ### Preparing the Data and Inputs @@ -37,39 +37,49 @@ With the `classify` endpoint, you can create a text classifier with as few as tw Here are examples, created as `ClassifyExample` objects: ```python PYTHON -examples = [ClassifyExample(text="I’m so proud of you", label="positive"), - ClassifyExample(text="What a great time to be alive", label="positive"), - ClassifyExample(text="That’s awesome work", label="positive"), - ClassifyExample(text="The service was amazing", label="positive"), - ClassifyExample(text="I love my family", label="positive"), - ClassifyExample(text="They don't care about me", label="negative"), - ClassifyExample(text="I hate this place", label="negative"), - ClassifyExample(text="The most ridiculous thing I've ever heard", label="negative"), - ClassifyExample(text="I am really frustrated", label="negative"), - ClassifyExample(text="This is so unfair", label="negative"), - ClassifyExample(text="This made me think", label="neutral"), - ClassifyExample(text="The good old days", label="neutral"), - ClassifyExample(text="What's the difference", label="neutral"), - ClassifyExample(text="You can't ignore this", label="neutral"), - ClassifyExample(text="That's how I see it", label="neutral")] - +examples = [ + ClassifyExample(text="I’m so proud of you", label="positive"), + ClassifyExample( + text="What a great time to be alive", label="positive" + ), + ClassifyExample(text="That’s awesome work", label="positive"), + ClassifyExample(text="The service was amazing", label="positive"), + ClassifyExample(text="I love my family", label="positive"), + ClassifyExample( + text="They don't care about me", label="negative" + ), + ClassifyExample(text="I hate this place", label="negative"), + ClassifyExample( + text="The most ridiculous thing I've ever heard", + label="negative", + ), + ClassifyExample(text="I am really frustrated", label="negative"), + ClassifyExample(text="This is so unfair", label="negative"), + ClassifyExample(text="This made me think", label="neutral"), + ClassifyExample(text="The good old days", label="neutral"), + ClassifyExample(text="What's the difference", label="neutral"), + ClassifyExample(text="You can't ignore this", label="neutral"), + ClassifyExample(text="That's how I see it", label="neutral"), +] ``` Besides the examples, you'll also need the 'inputs,' which are the strings of text you want the classifier to sort. Here are the ones we'll be using: ```python PYTHON -inputs = ["Hello, world! What a beautiful day", - "It was a great time with great people", - "Great place to work", - "That was a wonderful evening", - "Maybe this is why", - "Let's start again", - "That's how I see it", - "These are all facts", - "This is the worst thing", - "I cannot stand this any longer", - "This is really annoying", - "I am just plain fed up"] +inputs = [ + "Hello, world! What a beautiful day", + "It was a great time with great people", + "Great place to work", + "That was a wonderful evening", + "Maybe this is why", + "Let's start again", + "That's how I see it", + "These are all facts", + "This is the worst thing", + "I cannot stand this any longer", + "This is really annoying", + "I am just plain fed up", +] ``` ### Generate Predictions @@ -78,7 +88,6 @@ Setting up the model is quite straightforward with the `classify` endpoint. We'l ```python PYTHON def classify_text(inputs, examples): - """ Classifies a list of input texts given the examples Arguments: @@ -88,21 +97,21 @@ def classify_text(inputs, examples): Returns: classifications (list): each result contains the text, labels, and conf values """ - + # Classify text by calling the Classify endpoint response = co.classify( - model='embed-english-v3.0', - inputs=inputs, - examples=examples) + model="embed-english-v3.0", inputs=inputs, examples=examples + ) classifications = response.classifications return classifications + # Classify the inputs predictions = classify_text(inputs, examples) -print(predictions) +print(predictions) ``` Here’s a sample output returned (note that this output has been truncated to make it easier to read, you'll get much more in return if you run the code yourself):