fix parse response #262

Merged 1 commit on Dec 25, 2024
2 changes: 1 addition & 1 deletion evalscope/perf/benchmark.py
@@ -157,7 +157,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
try:
# Attempt to get benchmark data from the queue with a timeout
benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=1)
benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
benchmark_data_queue.task_done()
except asyncio.TimeoutError:
# If timeout, continue to the next iteration
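The shorter timeout keeps the statistics worker responsive: `asyncio.wait_for` now returns after roughly 10 ms when the queue is idle, so the loop re-checks the completion event frequently instead of potentially sitting a full second in `wait_for` after the last request finishes. A minimal sketch of the same polling pattern (standalone names, not the actual benchmark worker):

```python
import asyncio

async def drain(queue: asyncio.Queue, done: asyncio.Event):
    """Consume items until the producer signals completion and the queue is empty."""
    while not (done.is_set() and queue.empty()):
        try:
            # Short timeout: wake up often to re-check the exit condition
            item = await asyncio.wait_for(queue.get(), timeout=0.01)
            queue.task_done()
        except asyncio.TimeoutError:
            continue  # nothing arrived within 10 ms; loop and check again
        print('got', item)

async def main():
    queue: asyncio.Queue = asyncio.Queue()
    done = asyncio.Event()
    for i in range(3):
        queue.put_nowait(i)
    done.set()
    await drain(queue, done)

asyncio.run(main())
```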
20 changes: 11 additions & 9 deletions evalscope/perf/plugin/api/openai_api.py
@@ -96,19 +96,21 @@ def __compose_query_from_parameter(self, payload: Dict, param: Arguments):

def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
"""Parser responses and return number of request and response tokens.
One response for non-stream, multiple responses for stream.
Only one response for non-stream, multiple responses for stream.
"""
delta_contents = {}
input_tokens = None
output_tokens = None

# when stream, the last response is the full usage
# when non-stream, the last response is the first response
last_response_js = json.loads(responses[-1])
if 'usage' in last_response_js and last_response_js['usage']:
input_tokens = last_response_js['usage']['prompt_tokens']
output_tokens = last_response_js['usage']['completion_tokens']
return input_tokens, output_tokens

# no usage information in the response, parse the response to get the tokens
delta_contents = {}
for response in responses:
js = json.loads(response)
if 'usage' in js and js['usage']:
input_tokens = js['usage']['prompt_tokens']
output_tokens = js['usage']['completion_tokens']
return input_tokens, output_tokens

if 'object' in js:
self.__process_response_object(js, delta_contents)
else:
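For reference, a minimal sketch of the usage-first parsing order the new code follows, assuming OpenAI-compatible payloads (the function name and chunks below are illustrative, not the plugin's actual helpers):

```python
import json
from typing import List, Optional, Tuple

def parse_token_usage(responses: List[str]) -> Tuple[Optional[int], Optional[int]]:
    """Return (prompt_tokens, completion_tokens) from a list of raw JSON responses.

    For streaming requests the server may append a final chunk whose 'usage'
    field summarizes the whole request; for non-streaming requests the single
    response carries 'usage' directly. Checking the last element first covers
    both cases without re-reading the intermediate delta chunks.
    """
    last = json.loads(responses[-1])
    if last.get('usage'):
        return last['usage']['prompt_tokens'], last['usage']['completion_tokens']

    # No usage reported: a caller would have to reconstruct the output from the
    # streamed deltas and count tokens itself (omitted here).
    return None, None

# Example: a stream whose final chunk reports usage
chunks = [
    '{"object": "chat.completion.chunk", "choices": [{"delta": {"content": "Hi"}}]}',
    '{"object": "chat.completion.chunk", "choices": [], "usage": {"prompt_tokens": 12, "completion_tokens": 1}}',
]
print(parse_token_usage(chunks))  # (12, 1)
```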
6 changes: 3 additions & 3 deletions tests/perf/test_perf.py
@@ -19,13 +19,13 @@ def tearDown(self) -> None:
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_perf(self):
task_cfg = {
'url': 'http://127.0.0.1:8000/v1/chat/completions',
'url': 'http://127.0.0.1:8001/v1/chat/completions',
'parallel': 1,
'model': 'qwen2.5',
'number': 15,
'api': 'openai',
'dataset': 'openqa',
'stream': True,
# 'stream': True,
'debug': True,
}
run_perf_benchmark(task_cfg)
@@ -47,7 +47,7 @@ def test_run_perf_stream(self):
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_perf_speed_benchmark(self):
task_cfg = {
'url': 'http://127.0.0.1:8801/v1/completions',
'url': 'http://127.0.0.1:8001/v1/completions',
'parallel': 1,
'model': 'qwen2.5',
'api': 'openai',