diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index f5f38ac6c0f..60146ad15a1 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -333,6 +333,8 @@ def local_launcher( max_input_length: Optional[int] = None, max_batch_prefill_tokens: Optional[int] = None, max_total_tokens: Optional[int] = None, + lora_adapters: Optional[List[str]] = None, + cuda_graphs: Optional[List[int]] = None, ): port = random.randint(8000, 10_000) master_port = random.randint(10_000, 20_000) @@ -379,6 +381,14 @@ def local_launcher( if max_total_tokens: args.append("--max-total-tokens") args.append(str(max_total_tokens)) + if lora_adapters: + args.append("--lora-adapters") + args.append(",".join(lora_adapters)) + if cuda_graphs: + args.append("--cuda-graphs") + args.append(",".join(map(str, cuda_graphs))) + + print(" ".join(args), file=sys.stderr) env["LOG_LEVEL"] = "info,text_generation_router=debug" @@ -418,6 +428,8 @@ def docker_launcher( max_input_length: Optional[int] = None, max_batch_prefill_tokens: Optional[int] = None, max_total_tokens: Optional[int] = None, + lora_adapters: Optional[List[str]] = None, + cuda_graphs: Optional[List[int]] = None, ): port = random.randint(8000, 10_000) @@ -447,6 +459,12 @@ def docker_launcher( if max_total_tokens: args.append("--max-total-tokens") args.append(str(max_total_tokens)) + if lora_adapters: + args.append("--lora-adapters") + args.append(",".join(lora_adapters)) + if cuda_graphs: + args.append("--cuda-graphs") + args.append(",".join(map(str, cuda_graphs))) client = docker.from_env() diff --git a/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_customer_support_adapter.json b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_customer_support_adapter.json new file mode 100644 index 00000000000..dfdd2cc3f18 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_customer_support_adapter.json @@ -0,0 +1,251 @@ +{ + "details": { + "finish_reason": "length", + "generated_tokens": 40, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -0.27416992, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.17016602, + "special": false, + "text": "\n" + }, + { + "id": 28737, + "logprob": -2.7109375, + "special": false, + "text": "I" + }, + { + "id": 28809, + "logprob": -1.5, + "special": false, + "text": "’" + }, + { + "id": 28719, + "logprob": -0.34204102, + "special": false, + "text": "m" + }, + { + "id": 459, + "logprob": -1.6914062, + "special": false, + "text": " not" + }, + { + "id": 1864, + "logprob": -0.69140625, + "special": false, + "text": " sure" + }, + { + "id": 513, + "logprob": -1.6171875, + "special": false, + "text": " if" + }, + { + "id": 315, + "logprob": -1.3837891, + "special": false, + "text": " I" + }, + { + "id": 541, + "logprob": -1.2226562, + "special": false, + "text": " can" + }, + { + "id": 1567, + "logprob": -1.8652344, + "special": false, + "text": " come" + }, + { + "id": 582, + "logprob": -0.0070228577, + "special": false, + "text": " up" + }, + { + "id": 395, + "logprob": -0.0054092407, + "special": false, + "text": " with" + }, + { + "id": 28705, + "logprob": -0.62597656, + "special": false, + "text": " " + }, + { + "id": 28770, + "logprob": -0.0035572052, + "special": false, + "text": "3" + }, + { + "id": 4842, + "logprob": -0.93603516, + "special": false, + "text": " unique" + }, + { + "id": 3085, + "logprob": -0.028411865, + "special": false, + "text": " words" + }, + { + "id": 369, + "logprob": -1.0400391, + "special": false, + "text": " that" + }, + { + "id": 6685, + "logprob": -0.09710693, + "special": false, + "text": " describe" + }, + { + "id": 528, + "logprob": -0.066467285, + "special": false, + "text": " me" + }, + { + "id": 28725, + "logprob": -1.0722656, + "special": false, + "text": "," + }, + { + "id": 562, + "logprob": -0.33422852, + "special": false, + "text": " but" + }, + { + "id": 315, + "logprob": -0.5136719, + "special": false, + "text": " I" + }, + { + "id": 28809, + "logprob": -0.8989258, + "special": false, + "text": "’" + }, + { + "id": 584, + "logprob": -0.2076416, + "special": false, + "text": "ll" + }, + { + "id": 1464, + "logprob": -0.8808594, + "special": false, + "text": " try" + }, + { + "id": 28723, + "logprob": -0.88427734, + "special": false, + "text": "." + }, + { + "id": 13, + "logprob": -0.91064453, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.08105469, + "special": false, + "text": "\n" + }, + { + "id": 28740, + "logprob": -1.8486328, + "special": false, + "text": "1" + }, + { + "id": 28723, + "logprob": -0.111572266, + "special": false, + "text": "." + }, + { + "id": 23626, + "logprob": -3.15625, + "special": false, + "text": " Creative" + }, + { + "id": 13, + "logprob": -0.9194336, + "special": false, + "text": "\n" + }, + { + "id": 28750, + "logprob": -0.24841309, + "special": false, + "text": "2" + }, + { + "id": 28723, + "logprob": -9.393692e-05, + "special": false, + "text": "." + }, + { + "id": 6785, + "logprob": -3.1386719, + "special": false, + "text": " Fun" + }, + { + "id": 1780, + "logprob": -0.53564453, + "special": false, + "text": "ny" + }, + { + "id": 13, + "logprob": -0.09033203, + "special": false, + "text": "\n" + }, + { + "id": 28770, + "logprob": -0.00466156, + "special": false, + "text": "3" + }, + { + "id": 28723, + "logprob": -0.00016450882, + "special": false, + "text": "." + } + ] + }, + "generated_text": "\n\nI’m not sure if I can come up with 3 unique words that describe me, but I’ll try.\n\n1. Creative\n2. Funny\n3." +} diff --git a/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_dbpedia_adapter.json b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_dbpedia_adapter.json new file mode 100644 index 00000000000..91eb5edffb4 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_dbpedia_adapter.json @@ -0,0 +1,53 @@ +{ + "details": { + "finish_reason": "eos_token", + "generated_tokens": 7, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 1, + "logprob": -0.49658203, + "special": true, + "text": "" + }, + { + "id": 28705, + "logprob": -0.0016384125, + "special": false, + "text": " " + }, + { + "id": 1, + "logprob": -1.4931641, + "special": true, + "text": "" + }, + { + "id": 28705, + "logprob": -0.00075769424, + "special": false, + "text": " " + }, + { + "id": 28740, + "logprob": -0.25024414, + "special": false, + "text": "1" + }, + { + "id": 28740, + "logprob": -0.2631836, + "special": false, + "text": "1" + }, + { + "id": 2, + "logprob": -0.0003285408, + "special": true, + "text": "" + } + ] + }, + "generated_text": " 11" +} diff --git a/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_adapter.json b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_adapter.json new file mode 100644 index 00000000000..130186884d6 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_adapter.json @@ -0,0 +1,251 @@ +{ + "details": { + "finish_reason": "length", + "generated_tokens": 40, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -1.0488281, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -1.0800781, + "special": false, + "text": "\n" + }, + { + "id": 27332, + "logprob": -2.1152344, + "special": false, + "text": "###" + }, + { + "id": 28705, + "logprob": -1.6748047, + "special": false, + "text": " " + }, + { + "id": 28740, + "logprob": -0.097229004, + "special": false, + "text": "1" + }, + { + "id": 28723, + "logprob": -0.16467285, + "special": false, + "text": "." + }, + { + "id": 7615, + "logprob": -2.2246094, + "special": false, + "text": " News" + }, + { + "id": 13, + "logprob": -1.0488281, + "special": false, + "text": "\n" + }, + { + "id": 27332, + "logprob": -0.69189453, + "special": false, + "text": "###" + }, + { + "id": 28705, + "logprob": -0.013343811, + "special": false, + "text": " " + }, + { + "id": 28750, + "logprob": -0.011230469, + "special": false, + "text": "2" + }, + { + "id": 28723, + "logprob": -0.00096845627, + "special": false, + "text": "." + }, + { + "id": 21095, + "logprob": -2.5605469, + "special": false, + "text": " Blog" + }, + { + "id": 13, + "logprob": -0.19458008, + "special": false, + "text": "\n" + }, + { + "id": 27332, + "logprob": -0.031280518, + "special": false, + "text": "###" + }, + { + "id": 28705, + "logprob": -0.0030708313, + "special": false, + "text": " " + }, + { + "id": 28770, + "logprob": -0.0029277802, + "special": false, + "text": "3" + }, + { + "id": 28723, + "logprob": -0.0012350082, + "special": false, + "text": "." + }, + { + "id": 20108, + "logprob": -2.1582031, + "special": false, + "text": " Article" + }, + { + "id": 13, + "logprob": -0.05810547, + "special": false, + "text": "\n" + }, + { + "id": 27332, + "logprob": -0.35083008, + "special": false, + "text": "###" + }, + { + "id": 28705, + "logprob": -0.034332275, + "special": false, + "text": " " + }, + { + "id": 28781, + "logprob": -0.009666443, + "special": false, + "text": "4" + }, + { + "id": 28723, + "logprob": -0.0013113022, + "special": false, + "text": "." + }, + { + "id": 8349, + "logprob": -2.6191406, + "special": false, + "text": " Review" + }, + { + "id": 13, + "logprob": -0.04031372, + "special": false, + "text": "\n" + }, + { + "id": 27332, + "logprob": -0.45239258, + "special": false, + "text": "###" + }, + { + "id": 28705, + "logprob": -0.045410156, + "special": false, + "text": " " + }, + { + "id": 28782, + "logprob": -0.0041236877, + "special": false, + "text": "5" + }, + { + "id": 28723, + "logprob": -0.0010223389, + "special": false, + "text": "." + }, + { + "id": 5299, + "logprob": -2.8066406, + "special": false, + "text": " Other" + }, + { + "id": 13, + "logprob": -0.12054443, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.44580078, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -1.4921875, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -1.3574219, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -1.0039062, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.5859375, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.43481445, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.2783203, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.20410156, + "special": false, + "text": "\n" + } + ] + }, + "generated_text": "\n\n### 1. News\n### 2. Blog\n### 3. Article\n### 4. Review\n### 5. Other\n\n\n\n\n\n\n\n\n" +} diff --git a/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_customer_support_adapter.json b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_customer_support_adapter.json new file mode 100644 index 00000000000..8c00dee75a2 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_customer_support_adapter.json @@ -0,0 +1,251 @@ +{ + "details": { + "finish_reason": "length", + "generated_tokens": 40, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -0.31347656, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.27441406, + "special": false, + "text": "\n" + }, + { + "id": 28737, + "logprob": -2.2285156, + "special": false, + "text": "I" + }, + { + "id": 28809, + "logprob": -1.4677734, + "special": false, + "text": "’" + }, + { + "id": 28719, + "logprob": -0.31762695, + "special": false, + "text": "m" + }, + { + "id": 264, + "logprob": -1.6865234, + "special": false, + "text": " a" + }, + { + "id": 1215, + "logprob": -3.2695312, + "special": false, + "text": " very" + }, + { + "id": 20640, + "logprob": -3.1230469, + "special": false, + "text": " passionate" + }, + { + "id": 1338, + "logprob": -0.48339844, + "special": false, + "text": " person" + }, + { + "id": 28723, + "logprob": -0.9970703, + "special": false, + "text": "." + }, + { + "id": 315, + "logprob": -0.5498047, + "special": false, + "text": " I" + }, + { + "id": 28809, + "logprob": -1.1923828, + "special": false, + "text": "’" + }, + { + "id": 28719, + "logprob": -0.080444336, + "special": false, + "text": "m" + }, + { + "id": 1215, + "logprob": -1.8271484, + "special": false, + "text": " very" + }, + { + "id": 12215, + "logprob": -2.8847656, + "special": false, + "text": " driven" + }, + { + "id": 28723, + "logprob": -1.0927734, + "special": false, + "text": "." + }, + { + "id": 315, + "logprob": -0.4584961, + "special": false, + "text": " I" + }, + { + "id": 28809, + "logprob": -0.5019531, + "special": false, + "text": "’" + }, + { + "id": 28719, + "logprob": -0.030715942, + "special": false, + "text": "m" + }, + { + "id": 1215, + "logprob": -0.96972656, + "special": false, + "text": " very" + }, + { + "id": 7798, + "logprob": -2.8847656, + "special": false, + "text": " determined" + }, + { + "id": 28723, + "logprob": -0.27319336, + "special": false, + "text": "." + }, + { + "id": 13, + "logprob": -0.56396484, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.011016846, + "special": false, + "text": "\n" + }, + { + "id": 3195, + "logprob": -0.7163086, + "special": false, + "text": "What" + }, + { + "id": 349, + "logprob": -1.1611328, + "special": false, + "text": " is" + }, + { + "id": 574, + "logprob": -0.515625, + "special": false, + "text": " your" + }, + { + "id": 6656, + "logprob": -1.0253906, + "special": false, + "text": " favorite" + }, + { + "id": 1970, + "logprob": -2.1738281, + "special": false, + "text": " thing" + }, + { + "id": 684, + "logprob": -0.48364258, + "special": false, + "text": " about" + }, + { + "id": 1250, + "logprob": -1.8876953, + "special": false, + "text": " being" + }, + { + "id": 264, + "logprob": -0.41967773, + "special": false, + "text": " a" + }, + { + "id": 8626, + "logprob": -2.9160156, + "special": false, + "text": " teacher" + }, + { + "id": 28804, + "logprob": -0.11920166, + "special": false, + "text": "?" + }, + { + "id": 13, + "logprob": -0.023727417, + "special": false, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.010848999, + "special": false, + "text": "\n" + }, + { + "id": 28737, + "logprob": -1.0566406, + "special": false, + "text": "I" + }, + { + "id": 2016, + "logprob": -0.7163086, + "special": false, + "text": " love" + }, + { + "id": 272, + "logprob": -1.9169922, + "special": false, + "text": " the" + }, + { + "id": 1639, + "logprob": -2.03125, + "special": false, + "text": " fact" + } + ] + }, + "generated_text": "\n\nI’m a very passionate person. I’m very driven. I’m very determined.\n\nWhat is your favorite thing about being a teacher?\n\nI love the fact" +} diff --git a/integration-tests/models/test_lora_mistral.py b/integration-tests/models/test_lora_mistral.py new file mode 100644 index 00000000000..ccdc148635b --- /dev/null +++ b/integration-tests/models/test_lora_mistral.py @@ -0,0 +1,134 @@ +import pytest +import requests + + +@pytest.fixture(scope="module") +def lora_mistral_handle(launcher): + with launcher( + "mistralai/Mistral-7B-v0.1", + lora_adapters=[ + "predibase/dbpedia", + "predibase/customer_support", + ], + cuda_graphs=[0], + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def lora_mistral(lora_mistral_handle): + await lora_mistral_handle.health(300) + return lora_mistral_handle.client + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_lora_mistral(lora_mistral, response_snapshot): + response = await lora_mistral.generate( + "Test request", max_new_tokens=10, decoder_input_details=True + ) + assert response.details.generated_tokens == 10 + + +classification_prompt = """You are given the title and the body of an article below. Please determine the type of the article.\n### Title: Great White Whale\n\n### Body: Great White Whale is the debut album by the Canadian rock band Secret and Whisper. The album was in the works for about a year and was released on February 12 2008. A music video was shot in Pittsburgh for the album's first single XOXOXO. The album reached number 17 on iTunes's top 100 albums in its first week on sale.\n\n### Article Type:""" + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_lora_mistral_without_adapter(lora_mistral, response_snapshot): + response = requests.post( + f"{lora_mistral.base_url}/generate", + headers=lora_mistral.headers, + json={ + "inputs": classification_prompt, + "parameters": { + "max_new_tokens": 40, + "details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + assert ( + data["generated_text"] + == "\n\n### 1. News\n### 2. Blog\n### 3. Article\n### 4. Review\n### 5. Other\n\n\n\n\n\n\n\n\n" + ) + assert data == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_lora_mistral_with_dbpedia_adapter(lora_mistral, response_snapshot): + response = requests.post( + f"{lora_mistral.base_url}/generate", + headers=lora_mistral.headers, + json={ + "inputs": classification_prompt, + "parameters": { + "max_new_tokens": 40, + "adapter_id": "predibase/dbpedia", + "details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + assert data["generated_text"] == " 11" + assert data == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_lora_mistral_with_customer_support_adapter( + lora_mistral, response_snapshot +): + print(lora_mistral.base_url) + print(lora_mistral.headers) + response = requests.post( + f"{lora_mistral.base_url}/generate", + headers=lora_mistral.headers, + json={ + "inputs": "What are 3 unique words that describe you?", + "parameters": { + "max_new_tokens": 40, + "adapter_id": "predibase/customer_support", + "details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + assert ( + data["generated_text"] + == "\n\nI’m not sure if I can come up with 3 unique words that describe me, but I’ll try.\n\n1. Creative\n2. Funny\n3." + ) + assert data == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_lora_mistral_without_customer_support_adapter( + lora_mistral, response_snapshot +): + response = requests.post( + f"{lora_mistral.base_url}/generate", + headers=lora_mistral.headers, + json={ + "inputs": "What are 3 unique words that describe you?", + "parameters": { + "max_new_tokens": 40, + "details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + assert ( + data["generated_text"] + == "\n\nI’m a very passionate person. I’m very driven. I’m very determined.\n\nWhat is your favorite thing about being a teacher?\n\nI love the fact" + ) + assert data == response_snapshot diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index 71ad18f7920..eab6483128f 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -91,6 +91,15 @@ def serve( f"LoRA adapters are enabled. This is an experimental feature and may not work as expected." ) + # TODO: enable lora with cuda graphs. for now disable cuda graphs if lora is enabled + # and warn the user + if len(lora_adapter_ids) > 0 and os.getenv("CUDA_GRAPHS", None) is not None: + logger.warning( + f"LoRa adapter are not supported with CUDA Graphs. Disabling CUDA Graphs." + ) + global CUDA_GRAPHS + CUDA_GRAPHS = None + # Downgrade enum into str for easier management later on quantize = None if quantize is None else quantize.value dtype = None if dtype is None else dtype.value