diff --git a/evals/benchmark/stresscli/locust/aistress.py b/evals/benchmark/stresscli/locust/aistress.py
index e9377a27..1264ac41 100644
--- a/evals/benchmark/stresscli/locust/aistress.py
+++ b/evals/benchmark/stresscli/locust/aistress.py
@@ -120,6 +120,7 @@ def bench_main(self):
             "faqgenfixed",
             "faqgenbench",
         ]
+        test_start_time = time.time()
         try:
             start_ts = time.perf_counter()
             with self.client.post(
@@ -150,6 +151,7 @@ def bench_main(self):
                         "response_string": resp.text,
                         "first_token_latency": time.perf_counter() - start_ts,
                         "total_latency": time.perf_counter() - start_ts,
+                        "test_start_time": test_start_time,
                     }
                 else:
                     first_token_ts = None
@@ -184,6 +186,7 @@ def bench_main(self):
                         "response_string": complete_response,
                         "first_token_latency": first_token_ts - start_ts,
                         "total_latency": end_ts - start_ts,
+                        "test_start_time": test_start_time,
                     }
                 reqdata = bench_package.respStatics(self.environment, reqData, respData)
                 logging.debug(f"Request data collected {reqdata}")
diff --git a/evals/benchmark/stresscli/locust/tokenresponse.py b/evals/benchmark/stresscli/locust/tokenresponse.py
index 77f9740b..4b6bfe75 100644
--- a/evals/benchmark/stresscli/locust/tokenresponse.py
+++ b/evals/benchmark/stresscli/locust/tokenresponse.py
@@ -26,9 +26,7 @@ def respStatics(environment, req, resp):
         num_token_input_prompt = -1
 
     num_token_output = len(
-        tokenizer.encode(
-            resp["response_string"].lstrip().encode("utf-8").decode("unicode_escape"), add_special_tokens=False
-        )
+        tokenizer.encode(resp["response_string"].encode("utf-8").decode("unicode_escape"), add_special_tokens=False)
     )
 
     return {
@@ -37,6 +35,7 @@ def respStatics(environment, req, resp):
         "first_token": resp["first_token_latency"] * 1000,
         "next_token": (resp["total_latency"] - resp["first_token_latency"]) / (num_token_output - 1) * 1000,
         "total_latency": resp["total_latency"] * 1000,
+        "test_start_time": resp["test_start_time"],
     }
 
 
@@ -47,7 +46,6 @@ def staticsOutput(environment, reqlist):
     e2e_lat = []
     tokens_input = 0
     tokens_output = 0
-    duration = environment.runner.stats.last_request_timestamp - environment.runner.stats.start_time
 
     if len(reqlist) == 0:
         logging.debug(f"len(reqlist): {len(reqlist)}, skip printing")
@@ -60,6 +58,8 @@ def staticsOutput(environment, reqlist):
         e2e_lat.append(req["total_latency"])
         tokens_output += req["tokens_output"]
         tokens_input += req["tokens_input"]
+        test_start_time = req["test_start_time"]
+        duration = environment.runner.stats.last_request_timestamp - test_start_time
 
     # Statistics for success response data only
     if tokens_output == 0:
@@ -70,8 +70,8 @@ def staticsOutput(environment, reqlist):
         " Output Tokens: {}, RPS: {:.2f}, Input Tokens per Second: {:.2f}, Output Tokens per Second: {:.2f}"
     )
     e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
-    first_msg = "First token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
-    next_msg = "Next token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
+    first_msg = "Time to First Token-TTFT(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
+    next_msg = "Time Per Output Token-TPOT(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
     average_msg = "Average token latency(ms) : {:.2f}"
     console_logger.warning("\n=================Total statistics=====================")
     if tokens_output == 0: