Update Anthropic token counting #85

Merged (12 commits) on Nov 15, 2024
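
This PR replaces tokencost's offline Claude token estimate (the old `anthropic.Client().count_tokens(...)` path) with Anthropic's server-side token counting (beta) API, and updates the tests and price expectations accordingly. A minimal sketch of the call the new code path relies on, assuming the `anthropic` SDK is installed and `ANTHROPIC_API_KEY` is set in the environment:

```python
import anthropic

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
count = client.beta.messages.count_tokens(
    model="claude-3-opus-latest",
    messages=[{"role": "user", "content": "Hello, world"}],
)
print(count.input_tokens)  # exact input token count from the server
```
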
tests/test_costs.py (7 additions, 8 deletions)

@@ -46,7 +46,7 @@
("gpt-4-vision-preview", 15),
("gpt-4o", 15),
("azure/gpt-4o", 15),
("claude-2.1", 4),
("claude-3-opus-latest", 11),
],
)
def test_count_message_tokens(model, expected_output):
@@ -74,8 +74,7 @@ def test_count_message_tokens(model, expected_output):
         ("gpt-4-vision-preview", 17),
         ("gpt-4o", 17),
         ("azure/gpt-4o", 17),
-        ("claude-2.1", 4),
-
+        # ("claude-3-opus-latest", 4),  # NOTE: Claude only supports messages without extra inputs
     ],
 )
 def test_count_message_tokens_with_name(model, expected_output):
@@ -116,7 +115,7 @@ def test_count_message_tokens_invalid_model():
         ("gpt-4-vision-preview", 4),
         ("text-embedding-ada-002", 4),
         ("gpt-4o", 4),
-        ("claude-2.1", 4)
+        # ("claude-3-opus-latest", 4),  # NOTE: Claude only supports messages
     ],
 )
 def test_count_string_tokens(model, expected_output):
@@ -154,9 +153,9 @@ def test_count_string_invalid_model():
         (MESSAGES, "gpt-4-0613", Decimal("0.00045")),
         (MESSAGES, "gpt-4-1106-preview", Decimal("0.00015")),
         (MESSAGES, "gpt-4-vision-preview", Decimal("0.00015")),
-        (MESSAGES, "gpt-4o", Decimal("0.000075")),
+        (MESSAGES, "gpt-4o", Decimal("0.0000375")),
         (MESSAGES, "azure/gpt-4o", Decimal("0.000075")),
-        (MESSAGES, "claude-2.1", Decimal("0.000032")),
+        (MESSAGES, "claude-3-opus-latest", Decimal("0.000165")),
         (STRING, "text-embedding-ada-002", Decimal("0.0000004")),
     ],
 )
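
The updated expectations follow from the per-token prices implied by the diff: the MESSAGES fixture counts as 11 tokens for claude-3-opus-latest and 15 tokens for gpt-4o, so the costs work out to 11 x $0.000015 and 15 x $0.0000025 respectively (assuming $15 per 1M input tokens for Opus and $2.50 per 1M for the current gpt-4o snapshot). A quick check:

```python
# Sanity check of the new expected prompt costs (per-token prices are
# assumptions inferred from the test values above).
from decimal import Decimal

assert 11 * Decimal("0.000015") == Decimal("0.000165")    # claude-3-opus-latest
assert 15 * Decimal("0.0000025") == Decimal("0.0000375")  # gpt-4o
```
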
@@ -191,9 +190,9 @@ def test_invalid_prompt_format():
         (STRING, "gpt-4-0613", Decimal("0.00024")),
         (STRING, "gpt-4-1106-preview", Decimal("0.00012")),
         (STRING, "gpt-4-vision-preview", Decimal("0.00012")),
-        (STRING, "gpt-4o", Decimal("0.000060")),
+        (STRING, "gpt-4o", Decimal("0.00004")),
         (STRING, "azure/gpt-4o", Decimal("0.000060")),
-        (STRING, "claude-2.1", Decimal("0.000096")),
+        # (STRING, "claude-3-opus-latest", Decimal("0.000096")),  # NOTE: Claude only supports messages
         (STRING, "text-embedding-ada-002", 0),
     ],
 )
tests/test_llama_index_callbacks.py (1 addition, 2 deletions)

@@ -1,8 +1,7 @@
 # test_llama_index.py
 import pytest
 from tokencost.callbacks import llama_index
-from llama_index.core.callbacks.schema import CBEventType, EventPayload
-from unittest.mock import MagicMock
+from llama_index.core.callbacks.schema import EventPayload
 
 # Mock the calculate_prompt_cost and calculate_completion_cost functions
 
tokencost/__init__.py (1 addition, 1 deletion)

@@ -4,6 +4,6 @@
     calculate_completion_cost,
     calculate_prompt_cost,
     calculate_all_costs_and_tokens,
-    calculate_cost_by_tokens
+    calculate_cost_by_tokens,
 )
 from .constants import TOKEN_COSTS_STATIC, TOKEN_COSTS, update_token_costs
tokencost/constants.py (6 additions, 3 deletions)

@@ -39,7 +39,9 @@ async def fetch_costs():
         if response.status == 200:
             return await response.json(content_type=None)
         else:
-            raise Exception(f"Failed to fetch token costs, status code: {response.status}")
+            raise Exception(
+                f"Failed to fetch token costs, status code: {response.status}"
+            )
 
 
 async def update_token_costs():
@@ -49,11 +51,12 @@ async def update_token_costs():
         fetched_costs = await fetch_costs()
         # Safely remove 'sample_spec' if it exists
         TOKEN_COSTS.update(fetched_costs)
-        TOKEN_COSTS.pop('sample_spec', None)
+        TOKEN_COSTS.pop("sample_spec", None)
     except Exception as e:
         logger.error(f"Failed to update TOKEN_COSTS: {e}")
+        raise
 
 
 with open(os.path.join(os.path.dirname(__file__), "model_prices.json"), "r") as f:
     TOKEN_COSTS_STATIC = json.load(f)
 
@@ -63,4 +66,4 @@ async def update_token_costs():
     TOKEN_COSTS = TOKEN_COSTS_STATIC
     asyncio.run(update_token_costs())
 except Exception:
-    logger.error('Failed to update token costs. Using static costs.')
+    logger.error("Failed to update token costs. Using static costs.")
tokencost/costs.py (57 additions, 31 deletions)
@@ -1,8 +1,8 @@
-
 """
 Costs dictionary and utility tool for counting tokens
 """
 
+import os
 import tiktoken
 import anthropic
 from typing import Union, List, Dict
@@ -12,12 +12,30 @@
 
 logger = logging.getLogger(__name__)
 
-# TODO: Add Claude support
-# https://www-files.anthropic.com/production/images/model_pricing_july2023.pdf
-# Note: cl100k is the openai base tokenizer. Nothing to do with Claude. Tiktoken doesn't have claude yet.
-# https://github.com/anthropics/anthropic-tokenizer-typescript/blob/main/index.ts
 
 
+def get_anthropic_token_count(messages: List[Dict[str, str]], model: str) -> int:
+    if not any(
+        supported_model in model for supported_model in [
+            "claude-3-5-sonnet", "claude-3-5-haiku", "claude-3-haiku", "claude-3-opus"
+        ]
+    ):
+        raise ValueError(
+            f"{model} is not supported in token counting (beta) API. Use the `usage` property in the response for exact counts."
+        )
+    try:
+        return anthropic.Anthropic().beta.messages.count_tokens(
+            model=model,
+            messages=messages,
+        ).input_tokens
+    except TypeError as e:
+        raise e
+    except Exception as e:
+        raise e
+
+
 def strip_ft_model_name(model: str) -> str:
     """
     Finetuned models format: ft:gpt-3.5-turbo:my-org:custom_suffix:id
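
Usage of the new helper looks roughly like this (a sketch, assuming a valid `ANTHROPIC_API_KEY` is set; the message payload is illustrative):

```python
from tokencost.costs import get_anthropic_token_count

messages = [{"role": "user", "content": "Hello"}]
print(get_anthropic_token_count(messages, "claude-3-opus-latest"))  # exact server-side count

try:
    get_anthropic_token_count(messages, "claude-2.1")  # not in the supported families
except ValueError as e:
    print(e)  # "claude-2.1 is not supported in token counting (beta) API. ..."
```
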
@@ -42,14 +60,12 @@ def count_message_tokens(messages: List[Dict[str, str]], model: str) -> int:
     model = model.lower()
     model = strip_ft_model_name(model)
 
+    # Anthropic token counting requires a valid API key
     if "claude-" in model:
-        """
-        Note that this is only accurate for older models, e.g. `claude-2.1`.
-        For newer models this can only be used as a _very_ rough estimate,
-        instead you should rely on the `usage` property in the response for exact counts.
-        """
-        prompt = "".join(message["content"] for message in messages)
-        return count_string_tokens(prompt, model)
+        logger.warning(
+            "Warning: Anthropic token counting API is currently in beta. Please expect differences in costs!"
+        )
+        return get_anthropic_token_count(messages, model)
 
     try:
         encoding = tiktoken.encoding_for_model(model)
@@ -80,8 +96,9 @@ def count_message_tokens(messages: List[Dict[str, str]], model: str) -> int:
         )
         return count_message_tokens(messages, model="gpt-3.5-turbo-0613")
     elif "gpt-4o" in model:
-        print(
-            "Warning: gpt-4o may update over time. Returning num tokens assuming gpt-4o-2024-05-13.")
+        logger.warning(
+            "Warning: gpt-4o may update over time. Returning num tokens assuming gpt-4o-2024-05-13."
+        )
         return count_message_tokens(messages, model="gpt-4o-2024-05-13")
     elif "gpt-4" in model:
         logger.warning(
@@ -121,18 +138,9 @@ def count_string_tokens(prompt: str, model: str) -> int:
     model = model.split("/")[-1]
 
     if "claude-" in model:
-        """
-        Note that this is only accurate for older models, e.g. `claude-2.1`.
-        For newer models this can only be used as a _very_ rough estimate,
-        instead you should rely on the `usage` property in the response for exact counts.
-        """
-        if "claude-3" in model:
-            logger.warning(
-                "Warning: Claude-3 models are not yet supported. Returning num tokens assuming claude-2.1."
-            )
-        client = anthropic.Client()
-        token_count = client.count_tokens(prompt)
-        return token_count
+        raise ValueError(
+            "Warning: Anthropic does not support this method. Please use the `count_message_tokens` function for the exact counts."
+        )
 
     try:
         encoding = tiktoken.encoding_for_model(model)
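
String-based counting is now an explicit error for Claude models, so callers have to route through `count_message_tokens` instead. Roughly (a sketch; the message-count call needs an API key):

```python
from tokencost.costs import count_message_tokens, count_string_tokens

try:
    count_string_tokens("Hello, world", "claude-3-opus-latest")
except ValueError:
    # Claude text must be wrapped as messages and counted server-side:
    n = count_message_tokens(
        [{"role": "user", "content": "Hello, world"}], "claude-3-opus-latest"
    )
```
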
@@ -200,13 +208,11 @@ def calculate_prompt_cost(prompt: Union[List[dict], str], model: str) -> Decimal
     )
     if not isinstance(prompt, (list, str)):
         raise TypeError(
-            f"""Prompt must be either a string or list of message objects.
-            it is {type(prompt)} instead.
-            """
+            f"Prompt must be either a string or list of message objects but found {type(prompt)} instead."
         )
     prompt_tokens = (
         count_string_tokens(prompt, model)
-        if isinstance(prompt, str)
+        if isinstance(prompt, str) and "claude-" not in model
        else count_message_tokens(prompt, model)
    )
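
The added `and "claude-" not in model` guard means Claude prompts always go through message counting, never the (now-raising) string path. In use (a sketch; requires an API key):

```python
from tokencost import calculate_prompt_cost

cost = calculate_prompt_cost(
    [{"role": "user", "content": "Hello world"}], "claude-3-opus-latest"
)  # Decimal: beta-API input token count times the per-token input price
```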

@@ -235,7 +241,18 @@ def calculate_completion_cost(completion: str, model: str) -> Decimal:
             f"""Model {model} is not implemented.
             Double-check your spelling, or submit an issue/PR"""
         )
-    completion_tokens = count_string_tokens(completion, model)
+
+    if not isinstance(completion, str):
+        raise TypeError(
+            f"Prompt must be a string but found {type(completion)} instead."
+        )
+
+    if "claude-" in model:
+        completion_list = [{"role": "assistant", "content": completion}]
+        # Anthropic appends some 13 additional tokens to the actual completion tokens
+        completion_tokens = count_message_tokens(completion_list, model) - 13
+    else:
+        completion_tokens = count_string_tokens(completion, model)
 
     return calculate_cost_by_tokens(completion_tokens, model, "output")
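
Since the beta endpoint only counts whole message lists, the completion string is wrapped as a single assistant message and the roughly 13 tokens of message framing are subtracted back out. In use (a sketch; requires an API key):

```python
from tokencost import calculate_completion_cost

cost = calculate_completion_cost(
    "Hello there, how may I assist you today?", "claude-3-opus-latest"
)  # counts [{"role": "assistant", "content": ...}] and subtracts the ~13-token wrapper
```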

@@ -264,10 +281,19 @@ def calculate_all_costs_and_tokens(
     completion_cost = calculate_completion_cost(completion, model)
     prompt_tokens = (
         count_string_tokens(prompt, model)
-        if isinstance(prompt, str)
+        if isinstance(prompt, str) and "claude-" not in model
         else count_message_tokens(prompt, model)
     )
-    completion_tokens = count_string_tokens(completion, model)
+
+    if "claude-" in model:
+        logger.warning(
+            "Warning: Token counting is estimated for "
+        )
+        completion_list = [{"role": "assistant", "content": completion}]
+        # Anthropic appends some 13 additional tokens to the actual completion tokens
+        completion_tokens = count_message_tokens(completion_list, model) - 13
+    else:
+        completion_tokens = count_string_tokens(completion, model)
 
     return {
         "prompt_cost": prompt_cost,
update_prices.py (40 additions, 20 deletions)

@@ -9,7 +9,9 @@
 def diff_dicts(dict1, dict2):
     diff_keys = dict1.keys() ^ dict2.keys()
     differences = {k: (dict1.get(k), dict2.get(k)) for k in diff_keys}
-    differences.update({k: (dict1[k], dict2[k]) for k in dict1 if k in dict2 and dict1[k] != dict2[k]})
+    differences.update(
+        {k: (dict1[k], dict2[k]) for k in dict1 if k in dict2 and dict1[k] != dict2[k]}
+    )
 
     if differences:
         print("Differences found:")
@@ -24,56 +26,74 @@ def diff_dicts(dict1, dict2):
     return False
 
 
-with open('tokencost/model_prices.json', 'r') as f:
+with open("tokencost/model_prices.json", "r") as f:
     model_prices = json.load(f)
 
 if diff_dicts(model_prices, tokencost.TOKEN_COSTS):
-    print('Updating model_prices.json')
-    with open('tokencost/model_prices.json', 'w') as f:
+    print("Updating model_prices.json")
+    with open("tokencost/model_prices.json", "w") as f:
         json.dump(tokencost.TOKEN_COSTS, f, indent=4)
 # Load the data
 df = pd.DataFrame(tokencost.TOKEN_COSTS).T
-df.loc[df.index[1:], 'max_input_tokens'] = df['max_input_tokens'].iloc[1:].apply(lambda x: '{:,.0f}'.format(x))
-df.loc[df.index[1:], 'max_tokens'] = df['max_tokens'].iloc[1:].apply(lambda x: '{:,.0f}'.format(x))
+df.loc[df.index[1:], "max_input_tokens"] = (
+    df["max_input_tokens"].iloc[1:].apply(lambda x: "{:,.0f}".format(x))
+)
+df.loc[df.index[1:], "max_tokens"] = (
+    df["max_tokens"].iloc[1:].apply(lambda x: "{:,.0f}".format(x))
+)
 
 
 # Updated function to format the cost or handle NaN
 
 
 def format_cost(x):
     if pd.isna(x):
-        return '--'
+        return "--"
     else:
         price_per_million = Decimal(str(x)) * Decimal(str(1_000_000))
         # print(price_per_million)
         normalized = price_per_million.normalize()
-        formatted_price = '{:2f}'.format(normalized)
+        formatted_price = "{:2f}".format(normalized)
 
-        formatted_price = formatted_price.rstrip('0').rstrip('.') if '.' in formatted_price else formatted_price + '.00'
+        formatted_price = (
+            formatted_price.rstrip("0").rstrip(".")
+            if "." in formatted_price
+            else formatted_price + ".00"
+        )
 
         return f"${formatted_price}"
 
 
 # Apply the formatting function using DataFrame.apply and lambda
-df[['input_cost_per_token', 'output_cost_per_token']] = df[[
-    'input_cost_per_token', 'output_cost_per_token']].apply(lambda x: x.map(format_cost))
+df[["input_cost_per_token", "output_cost_per_token"]] = df[
+    ["input_cost_per_token", "output_cost_per_token"]
+].apply(lambda x: x.map(format_cost))
 
 
 column_mapping = {
-    'input_cost_per_token': 'Prompt Cost (USD) per 1M tokens',
-    'output_cost_per_token': 'Completion Cost (USD) per 1M tokens',
-    'max_input_tokens': 'Max Prompt Tokens',
-    'max_output_tokens': 'Max Output Tokens',
-    'model_name': 'Model Name'
+    "input_cost_per_token": "Prompt Cost (USD) per 1M tokens",
+    "output_cost_per_token": "Completion Cost (USD) per 1M tokens",
+    "max_input_tokens": "Max Prompt Tokens",
+    "max_output_tokens": "Max Output Tokens",
+    "model_name": "Model Name",
 }
 
 # Assuming the keys of the JSON data represent the model names and have been set as the index
-df['Model Name'] = df.index
+df["Model Name"] = df.index
 
 # Apply the column renaming
 df.rename(columns=column_mapping, inplace=True)
 
 # Write the DataFrame with the correct column names as markdown to a file
-with open('pricing_table.md', 'w') as f:
-    f.write(df[['Model Name', 'Prompt Cost (USD) per 1M tokens', 'Completion Cost (USD) per 1M tokens',
-                'Max Prompt Tokens', 'Max Output Tokens']].to_markdown(index=False))
+with open("pricing_table.md", "w") as f:
+    f.write(
+        df[
+            [
+                "Model Name",
+                "Prompt Cost (USD) per 1M tokens",
+                "Completion Cost (USD) per 1M tokens",
+                "Max Prompt Tokens",
+                "Max Output Tokens",
+            ]
+        ].to_markdown(index=False)
+    )
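
For reference, `format_cost` turns a per-token price into a dollars-per-1M-tokens label. A worked example of the logic above (values chosen to hit both branches):

```python
# format_cost(1.5e-05): Decimal("1.5e-05") * 1_000_000 = 15 -> "{:2f}" -> "15"
# -> no "." in the string -> ".00" appended -> "$15.00"
assert format_cost(1.5e-05) == "$15.00"

# format_cost(2.5e-06): -> "2.5" -> has "." -> trailing zeros stripped -> "$2.5"
assert format_cost(2.5e-06) == "$2.5"

assert format_cost(float("nan")) == "--"  # missing prices render as "--"
```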