Reformat python code snippets for test-embeddings pages

Max Shkutnyk committed Dec 18, 2024
1 parent 373b29b commit 9ac11ae
Showing 8 changed files with 240 additions and 196 deletions.
26 changes: 13 additions & 13 deletions fern/pages/fine-tuning/fine-tuning-with-the-python-sdk.mdx
@@ -19,13 +19,13 @@ The snippet below creates a dataset for fine-tuning a model on records of custom

```diff
 # create a dataset
-co = cohere.Client('Your API key')
+co = cohere.Client("Your API key")

 my_dataset = co.datasets.create(
-  name="customer service logs",
-  type="chat-finetune-input",
-  data=open("./customer-chat.jsonl", "rb"),
-  eval_data=open("./customer-chat-eval.jsonl", "rb")
+    name="customer service logs",
+    type="chat-finetune-input",
+    data=open("./customer-chat.jsonl", "rb"),
+    eval_data=open("./customer-chat-eval.jsonl", "rb"),
 )

 result = co.wait(my_dataset)
```

@@ -40,15 +40,15 @@ from cohere.finetuning import FinetunedModel, Settings, BaseModel

```diff
 # start training a custom model using the dataset
 finetuned_model = co.finetuning.create_finetuned_model(
-  request=FinetunedModel(
-    name="customer-service-chat-model",
-    settings=Settings(
-      base_model=BaseModel(
-        base_type="BASE_TYPE_CHAT",
-      ),
-      dataset_id=my_dataset.id,
-    ),
-  ),
+    request=FinetunedModel(
+        name="customer-service-chat-model",
+        settings=Settings(
+            base_model=BaseModel(
+                base_type="BASE_TYPE_CHAT",
+            ),
+            dataset_id=my_dataset.id,
+        ),
+    ),
 )
```

30 changes: 15 additions & 15 deletions fern/pages/get-started/datasets.mdx
@@ -35,7 +35,7 @@ You should also be aware of how Cohere handles data retention. This is the most

First, let's install the SDK

````diff
-```python PYTHON
+```bash
 pip install cohere
 ```
````

Expand Down Expand Up @@ -164,20 +164,20 @@ Datasets of type `chat-finetune-input`, for example, are expected to have a json

```python PYTHON
{
"messages": [
{
"role": "System",
"content": "You are a large language model trained by Cohere."
},
{
"role": "User",
"content": "Hi! What were Time magazines top 10 cover stories in the last 10 years?"
},
{
"role": "Chatbot",
"content": "Time magazines top 10 cover stories in the last 10 years were:\\n\\n1. Volodymyr Zelenskyy\\n2. Elon Musk\\n3. Martin Luther King Jr.\\n4. How Earth Survived\\n5. Her Lasting Impact\\n6. Nothing to See Here\\n7. Meltdown\\n8. Deal With It\\n9. The Top of America\\n10. Bitter Pill"
}
]
"messages": [
{
"role": "System",
"content": "You are a large language model trained by Cohere.",
},
{
"role": "User",
"content": "Hi! What were Time magazines top 10 cover stories in the last 10 years?",
},
{
"role": "Chatbot",
"content": "Time magazines top 10 cover stories in the last 10 years were:\\n\\n1. Volodymyr Zelenskyy\\n2. Elon Musk\\n3. Martin Luther King Jr.\\n4. How Earth Survived\\n5. Her Lasting Impact\\n6. Nothing to See Here\\n7. Meltdown\\n8. Deal With It\\n9. The Top of America\\n10. Bitter Pill",
},
]
}
```
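The record above is shown with Python-style trailing commas, but on disk each dataset row is one JSON object per line. A minimal sketch of writing such a record to a `.jsonl` file with only the standard library (the file name and short message contents are illustrative, not from the commit):

```python
import json

# A chat-finetune-input record, matching the "messages" schema shown above
record = {
    "messages": [
        {
            "role": "System",
            "content": "You are a large language model trained by Cohere.",
        },
        {"role": "User", "content": "Hi!"},
        {"role": "Chatbot", "content": "Hello! How can I help?"},
    ]
}

# jsonl convention: one JSON object serialized onto a single line
line = json.dumps(record)
with open("customer-chat.jsonl", "w") as f:
    f.write(line + "\n")
```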

50 changes: 26 additions & 24 deletions fern/pages/text-embeddings/embed-jobs-api.mdx
@@ -60,15 +60,15 @@ As seen in the example above, the following would be a valid `create_dataset` ca

```diff
 # Upload a dataset for embed jobs
-ds=co.datasets.create(
-  name='sample_file',
-  # insert your file path here - you can upload it on the right - we accept .csv and jsonl files
-  data=open('embed_jobs_sample_data.jsonl', 'rb'),
-  keep_fields=['wiki_id','url','views','title']
-  optional_fields=['langs']
-  dataset_type="embed-input",
-  embedding_types=['float']
-)
+ds = co.datasets.create(
+    name="sample_file",
+    # insert your file path here - you can upload it on the right - we accept .csv and jsonl files
+    data=open("embed_jobs_sample_data.jsonl", "rb"),
+    keep_fields=["wiki_id", "url", "views", "title"],
+    optional_fields=["langs"],
+    dataset_type="embed-input",
+    embedding_types=["float"],
+)

 # wait for the dataset to finish validation
 print(co.wait(ds))
```
@@ -82,13 +82,14 @@ The Embed Jobs API takes in `dataset IDs` as an input. Uploading a local file to

```diff
 import cohere
+
 co = cohere.Client(api_key="<YOUR API KEY>")

-input_dataset=co.datasets.create(
-  name='your_file_name',
-  data=open('/content/your_file_path', 'rb'),
-  dataset_type="embed-input"
-)
+input_dataset = co.datasets.create(
+    name="your_file_name",
+    data=open("/content/your_file_path", "rb"),
+    dataset_type="embed-input",
+)

 # block on server-side validation
 print(co.wait(input_dataset))
```
@@ -114,11 +115,12 @@ Your dataset is now ready to be embedded. Here's a code snippet illustrating wha

```diff
 embed_job = co.embed_jobs.create(
-  dataset_id=input_dataset.id,
-  input_type='search_document' ,
-  model='embed-english-v3.0',
-  embedding_types=['float'],
-  truncate='END')
+    dataset_id=input_dataset.id,
+    input_type="search_document",
+    model="embed-english-v3.0",
+    embedding_types=["float"],
+    truncate="END",
+)

 # block until the job is complete
 co.wait(embed_job)
```
@@ -131,17 +133,17 @@ Since we’d like to search over these embeddings and we can think of them as co

The output of embed jobs is a dataset object which you can download or pipe directly to a database of your choice:

```diff
-output_dataset=co.datasets.get(id=embed_job.output.id)
-co.utils.save(filepath='/content/embed_job_output.csv', format="csv")
+output_dataset = co.datasets.get(id=embed_job.output.id)
+co.utils.save(filepath="/content/embed_job_output.csv", format="csv")
```

Alternatively if you would like to pass the dataset into a downstream function you can do the following:

```diff
-output_dataset=co.datasets.get(id=embed_job.output.id)
-results=[]
+output_dataset = co.datasets.get(id=embed_job.output.id)
+results = []
 for record in output_dataset:
-  results.append(record)
+    results.append(record)
```

### Sample Output
130 changes: 74 additions & 56 deletions fern/pages/text-embeddings/embeddings.mdx
@@ -26,22 +26,26 @@ co = cohere.Client(api_key="YOUR_API_KEY")

```diff
 # get the embeddings
 phrases = ["i love soup", "soup is my favorite", "london is far away"]

-model="embed-english-v3.0"
-input_type="search_query"
+model = "embed-english-v3.0"
+input_type = "search_query"

-res = co.embed(texts=phrases,
-               model=model,
-               input_type=input_type,
-               embedding_types=['float'])
+res = co.embed(
+    texts=phrases,
+    model=model,
+    input_type=input_type,
+    embedding_types=["float"],
+)

 (soup1, soup2, london) = res.embeddings.float

+
 # compare them
 def calculate_similarity(a, b):
-  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

-calculate_similarity(soup1, soup2) # 0.85 - very similar!
-calculate_similarity(soup1, london) # 0.16 - not similar!
+
+calculate_similarity(soup1, soup2)  # 0.85 - very similar!
+calculate_similarity(soup1, london)  # 0.16 - not similar!
```

## The `input_type` parameter
@@ -58,24 +62,31 @@ Cohere embeddings are optimized for different types of inputs.

In addition to `embed-english-v3.0` we offer a best-in-class multilingual model [embed-multilingual-v3.0](/docs/embed-2#multi-lingual-models) with support for over 100 languages, including Chinese, Spanish, and French. This model can be used with the Embed API, just like its English counterpart:

```diff
 import cohere

 co = cohere.Client(api_key="<YOUR API KEY>")

-texts = [
-  'Hello from Cohere!', 'مرحبًا من كوهير!', 'Hallo von Cohere!',
-  'Bonjour de Cohere!', '¡Hola desde Cohere!', 'Olá do Cohere!',
-  'Ciao da Cohere!', '您好,来自 Cohere!', 'कोहेरे से नमस्ते!'
-]
+texts = [
+    "Hello from Cohere!",
+    "مرحبًا من كوهير!",
+    "Hallo von Cohere!",
+    "Bonjour de Cohere!",
+    "¡Hola desde Cohere!",
+    "Olá do Cohere!",
+    "Ciao da Cohere!",
+    "您好,来自 Cohere!",
+    "कोहेरे से नमस्ते!",
+]

 response = co.embed(
-  model='embed-multilingual-v3.0',
-  texts=texts,
-  input_type='classification',
-  embedding_types=['float'])
-
-embeddings = response.embeddings.float # All text embeddings
-print(embeddings[0][:5]) # Print embeddings for the first text
+    model="embed-multilingual-v3.0",
+    texts=texts,
+    input_type="classification",
+    embedding_types=["float"],
+)
+
+embeddings = response.embeddings.float  # All text embeddings
+print(embeddings[0][:5])  # Print embeddings for the first text
```

## Image Embeddings
@@ -96,12 +107,13 @@ Be aware that image embedding has the following restrictions:

```diff
 import cohere
 from PIL import Image
 from io import BytesIO
 import base64

 co = cohere.Client(api_key="<YOUR API KEY>")

 # The model accepts input in base64 as a Data URL
+
+
 def image_to_base64_data_url(image_path):
     # Open the image file
     with Image.open(image_path) as img:
```
@@ -110,19 +122,23 @@ def image_to_base64_data_url(image_path):

```diff
         # Save the image as PNG to the BytesIO object
         img.save(buffered, format="PNG")
         # Encode the image data in base64
-        img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        img_base64 = base64.b64encode(buffered.getvalue()).decode(
+            "utf-8"
+        )

     # Create the Data URL and assumes the original image file type was png
     data_url = f"data:image/png;base64,{img_base64}"
     return data_url


 processed_image = image_to_base64_data_url("<PATH_TO_IMAGE>")

 ret = co.embed(
-  images=[processed_image],
-  model='embed-english-v3.0',
-  embedding_types= ["float"],
-  input_type='image')
+    images=[processed_image],
+    model="embed-english-v3.0",
+    embedding_types=["float"],
+    input_type="image",
+)

 ret.embeddings.float
```
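The base64 step in the helper above is independent of PIL; a minimal standalone sketch of the encode-and-wrap logic using only the standard library (the byte string stands in for real PNG data and is purely illustrative):

```python
import base64


def to_png_data_url(png_bytes):
    # Encode raw bytes in base64 and wrap them as a PNG Data URL,
    # mirroring the f-string in the diff above
    img_base64 = base64.b64encode(png_bytes).decode("utf-8")
    return f"data:image/png;base64,{img_base64}"


# stand-in bytes; a real call would pass the contents of a PNG file
data_url = to_png_data_url(b"fake-png-bytes")
print(data_url)  # data:image/png;base64,ZmFrZS1wbmctYnl0ZXM=
```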
@@ -142,39 +158,41 @@ The following embedding types are supported:

The parameter defaults to `float`, so if you pass in no argument you'll get back `float` embeddings:

```diff
-ret = co.embed(texts=phrases,
-               model=model,
-               input_type=input_type)
+ret = co.embed(texts=phrases, model=model, input_type=input_type)

-ret.embeddings # This contains the float embeddings
+ret.embeddings  # This contains the float embeddings
```

However we recommend being explicit about the `embedding type(s)`. To specify an embedding types, pass one of the types from the list above in as list containing a string:

```diff
-ret = co.embed(texts=phrases,
-               model=model,
-               input_type=input_type,
-               embedding_types=['int8'])
-
-ret.embeddings.int8 # This contains your int8 embeddings
-ret.embeddings.float # This will be empty
-ret.embeddings.uint8 # This will be empty
-ret.embeddings.ubinary # This will be empty
-ret.embeddings.binary # This will be empty
+ret = co.embed(
+    texts=phrases,
+    model=model,
+    input_type=input_type,
+    embedding_types=["int8"],
+)
+
+ret.embeddings.int8  # This contains your int8 embeddings
+ret.embeddings.float  # This will be empty
+ret.embeddings.uint8  # This will be empty
+ret.embeddings.ubinary  # This will be empty
+ret.embeddings.binary  # This will be empty
```

Finally, you can also pass several `embedding types` in as a list, in which case the endpoint will return a dictionary with both types available:

```diff
-ret = co.embed(texts=phrases,
-               model=model,
-               input_type=input_type,
-               embedding_types=['int8', 'float'])
-
-ret.embeddings.int8 # This contains your int8 embeddings
-ret.embeddings.float # This contains your float embeddings
-ret.embeddings.uint8 # This will be empty
-ret.embeddings.ubinary # This will be empty
-ret.embeddings.binary # This will be empty
+ret = co.embed(
+    texts=phrases,
+    model=model,
+    input_type=input_type,
+    embedding_types=["int8", "float"],
+)
+
+ret.embeddings.int8  # This contains your int8 embeddings
+ret.embeddings.float  # This contains your float embeddings
+ret.embeddings.uint8  # This will be empty
+ret.embeddings.ubinary  # This will be empty
+ret.embeddings.binary  # This will be empty
```
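The re-indented `calculate_similarity` helper in `embeddings.mdx` behaves identically before and after this commit; a quick standalone check with made-up vectors (the sample vectors are not from the docs):

```python
import numpy as np


def calculate_similarity(a, b):
    # cosine similarity: dot product divided by the product of the norms
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])  # same direction as a
c = np.array([3.0, -1.5, 0.0])  # orthogonal to a

print(round(calculate_similarity(a, b), 3))  # 1.0 (parallel vectors)
print(round(calculate_similarity(a, c), 3))  # 0.0 (orthogonal vectors)
```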