Misc maintenance (#40)
* Update API version to 2024-08-01-preview

* Improve upon credential use
simonkurtz-MSFT authored Oct 18, 2024
1 parent bf11a0a commit 3a0cebe
Showing 5 changed files with 39 additions and 32 deletions.
16 changes: 8 additions & 8 deletions PACKAGE_README.md
@@ -105,7 +105,7 @@ from openai_priority_loadbalancer import AsyncLoadBalancer, Backend
 client = AzureOpenAI(
     azure_endpoint = f"https://{backends[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
     azure_ad_token_provider = token_provider, # Your authentication may vary. Please adjust accordingly.
-    api_version = "2024-04-01-preview",
+    api_version = "2024-08-01-preview",
     http_client = httpx.Client(transport = lb) # Inject the synchronous load balancer as the transport in a new default httpx client.
 )
 ```
@@ -118,7 +118,7 @@ from openai_priority_loadbalancer import AsyncLoadBalancer, Backend
 client = AsyncAzureOpenAI(
     azure_endpoint = f"https://{backends[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
     azure_ad_token_provider = token_provider, # Your authentication may vary. Please adjust accordingly.
-    api_version = "2024-04-01-preview",
+    api_version = "2024-08-01-preview",
     http_client = httpx.AsyncClient(transport = lb) # Inject the asynchronous load balancer as the transport in a new default async httpx client.
 )
 ```
@@ -152,7 +152,7 @@ When a backend's `api_key` property is set, the `api-key` header will be replace
 client = AzureOpenAI(
     azure_endpoint = f"https://{backends[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
     api_key = "obtain_from_load_balancer", # the value is not used, but it must be set
-    api_version = "2024-04-01-preview",
+    api_version = "2024-08-01-preview",
     http_client = httpx.Client(transport = lb) # Inject the synchronous load balancer as the transport in a new default httpx client.
 )
 ```
@@ -165,7 +165,7 @@ When a backend's `api_key` property is set, the `api-key` header will be replace
 client = AsyncAzureOpenAI(
     azure_endpoint = f"https://{backends[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
     api_key = "obtain_from_load_balancer", # the value is not used, but it must be set
-    api_version = "2024-04-01-preview",
+    api_version = "2024-08-01-preview",
     http_client = httpx.AsyncClient(transport = lb) # Inject the asynchronous load balancer as the transport in a new default async httpx client.
 )
 ```
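For context, the api-key flow above only needs each backend to carry its own key. A minimal, hypothetical sketch of seeding the `api_key` property referenced in the surrounding prose (hostnames and keys are placeholders; the real constructor may accept the key directly):

```python
from openai_priority_loadbalancer import Backend

# Hypothetical setup: one Backend per region, each carrying its own key.
# The load balancer then swaps in the matching api-key header per selected backend.
backends_with_api_keys = []

for host, key in [
    ("oai-eastus-xxxxxxxx.openai.azure.com", "<eastus-api-key>"),
    ("oai-westus-xxxxxxxx.openai.azure.com", "<westus-api-key>"),
]:
    backend = Backend(host, 1)  # (host, priority), matching the shape used in config.py
    backend.api_key = key       # the api_key property referenced in the prose above
    backends_with_api_keys.append(backend)
```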
@@ -286,14 +286,14 @@ In this log excerpt, we see that all three backends are timing out. As the stand
 The wait periods are 44 seconds (westus), 4 seconds (eastus), and 7 seconds (southcentralus) in this log. Our logic determines that eastus will become available soonest. Therefore, we return a `Retry-After` header with a value of `4`. The OpenAI Python library then adds its exponential backoff (~2 seconds here).
 
 ```text
-2024-05-11 00:56:32.299477: Request sent to server: https://oai-westus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-04-01-preview, Status Code: 429 - FAIL
+2024-05-11 00:56:32.299477: Request sent to server: https://oai-westus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-08-01-preview, Status Code: 429 - FAIL
 2024-05-11 00:56:32.299477: Backend oai-westus-20240509.openai.azure.com is throttling. Retry after 44 second(s).
-2024-05-11 00:56:32.394350: Request sent to server: https://oai-eastus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-04-01-preview, Status Code: 429 - FAIL
+2024-05-11 00:56:32.394350: Request sent to server: https://oai-eastus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-08-01-preview, Status Code: 429 - FAIL
 2024-05-11 00:56:32.395578: Backend oai-eastus-20240509.openai.azure.com is throttling. Retry after 4 second(s).
-2024-05-11 00:56:32.451891: Request sent to server: https://oai-southcentralus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-04-01-preview, Status Code: 429 - FAIL
+2024-05-11 00:56:32.451891: Request sent to server: https://oai-southcentralus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-08-01-preview, Status Code: 429 - FAIL
 2024-05-11 00:56:32.452883: Backend oai-southcentralus-20240509.openai.azure.com is throttling. Retry after 7 second(s).
 2024-05-11 00:56:32.452883: No backends available. Exiting.
 2024-05-11 00:56:32.453891: Soonest Retry After: oai-eastus-20240509.openai.azure.com - 4 second(s)
 2024-05-11 00:56:38.551672: Backend oai-eastus-20240509.openai.azure.com is no longer throttling.
-2024-05-11 00:56:39.851076: Request sent to server: https://oai-eastus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-04-01-preview, Status code: 200
+2024-05-11 00:56:39.851076: Request sent to server: https://oai-eastus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-08-01-preview, Status code: 200
 ```
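The "Soonest Retry After" line above is the heart of the throttling logic: when every backend is cooling down, pick the one whose cooldown ends first and surface that wait in the `Retry-After` header. A minimal sketch of that selection, using illustrative data from the log (the package's internals may differ):

```python
from datetime import datetime, timedelta

# Retry-After values reported by each throttled backend, per the log above.
throttled = {
    "oai-westus-20240509.openai.azure.com": 44,
    "oai-eastus-20240509.openai.azure.com": 4,
    "oai-southcentralus-20240509.openai.azure.com": 7,
}

now = datetime.now()
available_at = {host: now + timedelta(seconds = s) for host, s in throttled.items()}

# Choose the backend that becomes available soonest and report its remaining wait.
soonest = min(available_at, key = available_at.get)
retry_after = max(0, round((available_at[soonest] - now).total_seconds()))

print(f"Soonest Retry After: {soonest} - {retry_after} second(s)")  # eastus - 4
```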
8 changes: 4 additions & 4 deletions README.md
@@ -123,16 +123,16 @@ In this log excerpt, we see that all three backends are timing out. As the stand
 The wait periods are 44 seconds (westus), 4 seconds (eastus), and 7 seconds (southcentralus) in this log. Our logic determines that eastus will become available soonest. Therefore, we return a `Retry-After` header with a value of `4`. The OpenAI Python library then adds its exponential backoff (~2 seconds here).
 
 ```text
-2024-05-11 00:56:32.299477: Request sent to server: https://oai-westus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-04-01-preview, Status Code: 429 - FAIL
+2024-05-11 00:56:32.299477: Request sent to server: https://oai-westus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-08-01-preview, Status Code: 429 - FAIL
 2024-05-11 00:56:32.299477: Backend oai-westus-20240509.openai.azure.com is throttling. Retry after 44 second(s).
-2024-05-11 00:56:32.394350: Request sent to server: https://oai-eastus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-04-01-preview, Status Code: 429 - FAIL
+2024-05-11 00:56:32.394350: Request sent to server: https://oai-eastus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-08-01-preview, Status Code: 429 - FAIL
 2024-05-11 00:56:32.395578: Backend oai-eastus-20240509.openai.azure.com is throttling. Retry after 4 second(s).
-2024-05-11 00:56:32.451891: Request sent to server: https://oai-southcentralus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-04-01-preview, Status Code: 429 - FAIL
+2024-05-11 00:56:32.451891: Request sent to server: https://oai-southcentralus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-08-01-preview, Status Code: 429 - FAIL
 2024-05-11 00:56:32.452883: Backend oai-southcentralus-20240509.openai.azure.com is throttling. Retry after 7 second(s).
 2024-05-11 00:56:32.452883: No backends available. Exiting.
 2024-05-11 00:56:32.453891: Soonest Retry After: oai-eastus-20240509.openai.azure.com - 4 second(s)
 2024-05-11 00:56:38.551672: Backend oai-eastus-20240509.openai.azure.com is no longer throttling.
-2024-05-11 00:56:39.851076: Request sent to server: https://oai-eastus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-04-01-preview, Status code: 200
+2024-05-11 00:56:39.851076: Request sent to server: https://oai-eastus-20240509.openai.azure.com/openai/deployments/gpt-35-turbo-sjk-001/chat/completions?api-version=2024-08-01-preview, Status code: 200
 ```
 
 ## Load Balancer Backend Configuration
22 changes: 14 additions & 8 deletions aoai.py
@@ -36,7 +36,13 @@ def __init__(self):
 
 # get_bearer_token_provider automatically caches and refreshes tokens.
 # https://github.com/openai/openai-python/blob/main/examples/azure_ad.py#L5
-token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
+
+# Sometimes, especially if you receive 400s from Azure OpenAI, you may need to use fresh credentials after an az logout / az login. Experiment with excluding the cached credential, if need be.
+# You can also remove the MSAL cache files in C:\Users\<user>\AppData\Local\.IdentityService: msal.cache, msalV2.cache
+# Set logging to DEBUG above to see where it's failing.
+# https://github.com/Azure/azure-sdk-for-python/issues/29040
+credential = DefaultAzureCredential(exclude_shared_token_cache_credential = False)
+token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
 
 # Standard Azure OpenAI Implementation (One Backend)
 def send_request(num_of_requests: int, azure_endpoint: str):
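As the new comments note, credential failures are easiest to pin down with verbose logging. A small sketch of what that might look like, using the standard `logging` module and the documented `exclude_shared_token_cache_credential` flag (values here are illustrative, not the repository's defaults):

```python
import logging

from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# Surface each credential in the DefaultAzureCredential chain as it is attempted.
logging.basicConfig(level = logging.DEBUG)
logging.getLogger("azure.identity").setLevel(logging.DEBUG)

# If stale cached tokens cause 400s, try excluding the shared token cache credential.
credential = DefaultAzureCredential(exclude_shared_token_cache_credential = True)
token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
```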
@@ -48,7 +54,7 @@ def send_request(num_of_requests: int, azure_endpoint: str):
     client = AzureOpenAI(
         azure_endpoint = azure_endpoint,
         azure_ad_token_provider = token_provider,
-        api_version = "2024-04-01-preview"
+        api_version = config.API_VERSION
     )
 
     for i in range(num_of_requests):
@@ -86,7 +92,7 @@ def send_loadbalancer_request(num_of_requests: int):
     client = AzureOpenAI(
         azure_endpoint = f"https://{config.backends[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
         azure_ad_token_provider = token_provider,
-        api_version = "2024-04-01-preview",
+        api_version = config.API_VERSION,
         http_client = httpx.Client(transport = lb) # Inject the load balancer as the transport in a new default httpx client
     )
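For reference, a sketch of how such a load-balanced client might be used end to end. It assumes the synchronous `LoadBalancer` class mirrors the `AsyncLoadBalancer` import shown in the READMEs and that `config` exposes the fields touched in this commit:

```python
import httpx
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI
from openai_priority_loadbalancer import LoadBalancer  # assumed sync counterpart to AsyncLoadBalancer

import config

token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
lb = LoadBalancer(config.backends)

client = AzureOpenAI(
    azure_endpoint = f"https://{config.backends[0].host}",  # seed only; the transport rewrites the host
    azure_ad_token_provider = token_provider,
    api_version = config.API_VERSION,
    http_client = httpx.Client(transport = lb)
)

# The call itself is unchanged; backend selection happens inside the injected transport.
response = client.chat.completions.create(
    model = config.MODEL,
    messages = [{"role": "user", "content": "Say hello."}]
)
print(response.choices[0].message.content)
```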

@@ -137,7 +143,7 @@ def send_loadbalancer_request_with_api_keys(num_of_requests: int):
     client = AzureOpenAI(
         azure_endpoint = f"https://{config.backends_with_api_keys[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
         api_key = "obtain_from_load_balancer", # the value is not used, but it must be set
-        api_version = "2024-04-01-preview",
+        api_version = config.API_VERSION,
         http_client = httpx.Client(transport = lb) # Inject the load balancer as the transport in a new default httpx client
     )
 
@@ -187,7 +193,7 @@ async def send_async_loadbalancer_request(num_of_requests: int):
     client = AsyncAzureOpenAI(
         azure_endpoint = f"https://{config.backends[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
         azure_ad_token_provider = token_provider,
-        api_version = "2024-04-01-preview",
+        api_version = config.API_VERSION,
         http_client = httpx.AsyncClient(transport = lb) # Inject the load balancer as the transport in a new default httpx client
     )
 
@@ -237,7 +243,7 @@ async def send_async_loadbalancer_request_with_api_keys(num_of_requests: int):
     client = AsyncAzureOpenAI(
         azure_endpoint = f"https://{config.backends_with_api_keys[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
         api_key = "obtain_from_load_balancer", # the value is not used, but it must be set
-        api_version = "2024-04-01-preview",
+        api_version = config.API_VERSION,
         http_client = httpx.AsyncClient(transport = lb) # Inject the load balancer as the transport in a new default httpx client
     )
 
@@ -288,7 +294,7 @@ def send_stream_loadbalancer_request(num_of_requests: int):
     client = AzureOpenAI(
         azure_endpoint = f"https://{config.backends[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
         azure_ad_token_provider = token_provider,
-        api_version = "2024-04-01-preview",
+        api_version = config.API_VERSION,
         http_client = httpx.Client(transport = lb) # Inject the load balancer as the transport in a new default httpx client
     )
 
@@ -361,7 +367,7 @@ async def send_async_stream_loadbalancer_request(num_of_requests: int):
     client = AsyncAzureOpenAI(
         azure_endpoint = f"https://{config.backends[0].host}", # Must be seeded, so we use the first host. It will get overwritten by the load balancer.
         azure_ad_token_provider = token_provider,
-        api_version = "2024-04-01-preview",
+        api_version = config.API_VERSION,
         http_client = httpx.AsyncClient(transport = lb) # Inject the load balancer as the transport in a new default httpx client
     )
 
1 change: 1 addition & 0 deletions config.py
@@ -8,6 +8,7 @@
 NUM_OF_REQUESTS = 5
 MODEL = "<your-aoai-model>" # the model, also known as the Deployment in Azure OpenAI, is common across standard and load-balanced requests
 AZURE_ENDPOINT = "https://oai-eastus-xxxxxxxx.openai.azure.com"
+API_VERSION = "2024-08-01-preview"
 
 backends: List[Backend] = [
     Backend("oai-eastus-xxxxxxxx.openai.azure.com", 1),
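The new `API_VERSION` constant keeps the version string in one place; the `backends` list beneath it is where priorities are tuned. A sketch of a fuller, tiered configuration in the same shape (hosts are placeholders; lower numbers are assumed to mean higher priority, per the package name):

```python
from typing import List

from openai_priority_loadbalancer import Backend

API_VERSION = "2024-08-01-preview"

# Priority 1 backends are tried first; priority 2 acts as spillover when tier 1 throttles.
backends: List[Backend] = [
    Backend("oai-eastus-xxxxxxxx.openai.azure.com", 1),
    Backend("oai-westus-xxxxxxxx.openai.azure.com", 1),
    Backend("oai-southcentralus-xxxxxxxx.openai.azure.com", 2),
]
```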
24 changes: 12 additions & 12 deletions tests/lib/test_openai_priority_loadbalancer.py
@@ -31,7 +31,7 @@ def create_async_client(backends: List[Backend]) -> AsyncAzureOpenAI:
     return AsyncAzureOpenAI(
         azure_endpoint = "https://foo.openai.azure.com",
         api_key = "example API key",
-        api_version = "2024-04-01-preview",
+        api_version = "2024-08-01-preview",
         http_client = httpx.AsyncClient(transport = lb)
     )
 
@@ -41,7 +41,7 @@ def create_client(backends: List[Backend]) -> AzureOpenAI:
     return AzureOpenAI(
         azure_endpoint = "https://foo.openai.azure.com",
         api_key = "example API key",
-        api_version = "2024-04-01-preview",
+        api_version = "2024-08-01-preview",
         http_client = httpx.Client(transport = lb)
     )
 
@@ -288,14 +288,14 @@ def test_loadbalancer_modify_request_url_path(self, client_same_priority_custom_
         with patch('httpx.Client.send', return_value = mock_response):
             req = client._build_request(create_final_request_options())
 
-            assert req.url == 'https://foo.openai.azure.com/openai/completions?api-version=2024-04-01-preview'
+            assert req.url == 'https://foo.openai.azure.com/openai/completions?api-version=2024-08-01-preview'
 
             client._client._transport.handle_request(req)
 
             assert req.url in (
-                'https://oai-eastus.openai.azure.com/ai/openai/completions?api-version=2024-04-01-preview',
-                'https://oai-westus.openai.azure.com/ai/openai/completions?api-version=2024-04-01-preview',
-                'https://oai-southcentralus.openai.azure.com/ai/openai/completions?api-version=2024-04-01-preview'
+                'https://oai-eastus.openai.azure.com/ai/openai/completions?api-version=2024-08-01-preview',
+                'https://oai-westus.openai.azure.com/ai/openai/completions?api-version=2024-08-01-preview',
+                'https://oai-southcentralus.openai.azure.com/ai/openai/completions?api-version=2024-08-01-preview'
             )
 
     @pytest.mark.loadbalancer
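These tests never hit the network: `httpx.Client.send` is patched, so only the transport's URL rewriting is exercised. A sketch of the kind of canned response such a fixture might return (the repository's actual `mock_response` may be constructed differently):

```python
from unittest.mock import patch

import httpx

# A minimal 200 response; its body is irrelevant to the URL-rewrite assertions.
mock_response = httpx.Response(200, json = {"choices": []})

with patch('httpx.Client.send', return_value = mock_response):
    pass  # build the request and call the transport here, as in the tests above
```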
@@ -308,7 +308,7 @@ def test_loadbalancer_use_api_keys(self, client_same_priority_api_keys):
         with patch('httpx.Client.send', return_value = mock_response):
             req = client._build_request(create_final_request_options())
 
-            assert req.url == 'https://foo.openai.azure.com/openai/completions?api-version=2024-04-01-preview'
+            assert req.url == 'https://foo.openai.azure.com/openai/completions?api-version=2024-08-01-preview'
 
             client._client._transport.handle_request(req)
 
@@ -512,14 +512,14 @@ async def test_async_loadbalancer_modify_request_url_path(self, async_client_sam
         with patch('httpx.AsyncClient.send', return_value = mock_response):
             req = client._build_request(create_final_request_options())
 
-            assert req.url == 'https://foo.openai.azure.com/openai/completions?api-version=2024-04-01-preview'
+            assert req.url == 'https://foo.openai.azure.com/openai/completions?api-version=2024-08-01-preview'
 
             await client._client._transport.handle_async_request(req)
 
             assert req.url in (
-                'https://oai-eastus.openai.azure.com/ai/openai/completions?api-version=2024-04-01-preview',
-                'https://oai-westus.openai.azure.com/ai/openai/completions?api-version=2024-04-01-preview',
-                'https://oai-southcentralus.openai.azure.com/ai/openai/completions?api-version=2024-04-01-preview'
+                'https://oai-eastus.openai.azure.com/ai/openai/completions?api-version=2024-08-01-preview',
+                'https://oai-westus.openai.azure.com/ai/openai/completions?api-version=2024-08-01-preview',
+                'https://oai-southcentralus.openai.azure.com/ai/openai/completions?api-version=2024-08-01-preview'
             )
 
     @pytest.mark.asyncio
@@ -533,7 +533,7 @@ async def test_async_loadbalancer_use_api_keys(self, async_client_same_priority_
         with patch('httpx.AsyncClient.send', return_value = mock_response):
             req = client._build_request(create_final_request_options())
 
-            assert req.url == 'https://foo.openai.azure.com/openai/completions?api-version=2024-04-01-preview'
+            assert req.url == 'https://foo.openai.azure.com/openai/completions?api-version=2024-08-01-preview'
 
             await client._client._transport.handle_async_request(req)
 
