From 57340d285179163fbb8027dbbb37d66c05954ee4 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 7 Jan 2025 02:46:32 +0000 Subject: [PATCH] update --- tests/tpu/test_quantization_accuracy.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py index 3444117ae3d86..f20dd95b1cfd8 100644 --- a/tests/tpu/test_quantization_accuracy.py +++ b/tests/tpu/test_quantization_accuracy.py @@ -15,17 +15,20 @@ class GSM8KAccuracyTestConfig: def get_model_args(self) -> str: return (f"pretrained={self.model_name}," - "max_model_len=4096,max_num_seqs=128,enforce_eager=True") + "max_model_len=4096,max_num_seqs=128") -# NOTE(rob): Accuracy scores measured on GPUs. +# NOTE: Accuracy scores measured on GPUs. ACCURACY_CONFIGS = [ GSM8KAccuracyTestConfig( model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", excepted_value=0.76), # no bias - GSM8KAccuracyTestConfig( - model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8", - excepted_value=0.66), # bias + # NOTE(rob): We cannot re-initialize VLLM in the same process for TPU, + # so only one of these tests can run in a single call to pytest. As + # a follow up, move this into the LM-EVAL section of the CI. + # GSM8KAccuracyTestConfig( + # model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8", + # excepted_value=0.66), # bias in QKV layers ] @@ -37,7 +40,6 @@ def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig): model_args=config.get_model_args(), tasks="gsm8k", batch_size="auto", - limit=1, ) # EXPECTED_VALUE = config.excepted_value