From 63878fc3ed6b7998abad17362bc3cbe80e188576 Mon Sep 17 00:00:00 2001
From: Irina Efode
Date: Sat, 21 Dec 2024 09:04:42 +0400
Subject: [PATCH] [Prompt lookup] Bench

---
 .../prompt_lookup_decoding_lm/CMakeLists.txt  |  30 ++
 .../prompt_lookup_decoding_lm.cpp             |  13 +-
 .../prompt_lookup_decoding_lm_sdpa.cpp        | 279 ++++++++++++++++++
 .../cpp/text_generation/greedy_causal_lm.cpp  |   7 +-
 .../get_statistic_pl.py                       | 207 +++++++++++++
 5 files changed, 527 insertions(+), 9 deletions(-)
 create mode 100644 samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm_sdpa.cpp
 create mode 100644 samples/python/prompt_lookup_decoding_lm/get_statistic_pl.py

diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt
index b0ce8b1b60..e474bd92df 100644
--- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt
+++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt
@@ -21,3 +21,33 @@ install(TARGETS ${TARGET_NAME}
     RUNTIME DESTINATION samples_bin/
     COMPONENT samples_bin
     EXCLUDE_FROM_ALL)
+
+
+# Copyright (C) 2023-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
+find_package(OpenVINOGenAI REQUIRED
+    PATHS
+        "${CMAKE_BINARY_DIR}"  # Reuse the package from the build.
+        ${OpenVINO_DIR}  # GenAI may be installed alongside OpenVINO.
+    NO_CMAKE_FIND_ROOT_PATH
+)
+
+add_executable(prompt_lookup_decoding_lm_sdpa prompt_lookup_decoding_lm_sdpa.cpp)
+target_link_libraries(prompt_lookup_decoding_lm_sdpa PRIVATE openvino::runtime openvino::threading)
+set_target_properties(prompt_lookup_decoding_lm_sdpa PROPERTIES
+    COMPILE_PDB_NAME prompt_lookup_decoding_lm_sdpa
+    # Ensure out of box LC_RPATH on macOS with SIP
+    INSTALL_RPATH_USE_LINK_PATH ON)
+target_compile_features(prompt_lookup_decoding_lm_sdpa PRIVATE cxx_std_17)
+get_target_property(genai_imported openvino::genai IMPORTED_LOCATION)
+set(OPENVINO_TOKENIZERS_PATH $<IF:$<BOOL:${genai_imported}>,${genai_imported},$<TARGET_FILE_DIR:openvino::genai>>)
+set(OPENVINO_TOKENIZERS_FILENAME "${CMAKE_SHARED_LIBRARY_PREFIX}openvino_tokenizers${CMAKE_SHARED_LIBRARY_SUFFIX}")
+target_compile_definitions(prompt_lookup_decoding_lm_sdpa PRIVATE
+    OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}/${OPENVINO_TOKENIZERS_FILENAME}")
+
+install(TARGETS prompt_lookup_decoding_lm_sdpa
+    RUNTIME DESTINATION samples_bin/
+    COMPONENT samples_bin
+    EXCLUDE_FROM_ALL)
\ No newline at end of file
diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
index e692110027..d54e1a6ffd 100644
--- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
+++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
@@ -6,19 +6,20 @@
 #include "openvino/genai/llm_pipeline.hpp"
 
 int main(int argc, char* argv[]) try {
-    if (3 != argc) {
-        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODELS_PATH> '<PROMPT>'");
+    if (4 != argc) {
+        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODELS_PATH> '<PROMPT>' <GENERATION_LEN>");
     }
 
+    std::string model_path = argv[1];
+    std::string prompt = argv[2];
+    size_t gen_len = std::stoi(argv[3]);
+
     ov::genai::GenerationConfig config;
-    config.max_new_tokens = 100;
+    config.max_new_tokens = gen_len;
     // Define candidates number for candidate generation
     config.num_assistant_tokens = 5;
     // Define max_ngram_size
     config.max_ngram_size = 3;
-
-    std::string model_path = argv[1];
-    std::string prompt = argv[2];
 
     std::string device = "CPU";
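
For reference only: the same prompt-lookup configuration can also be exercised through the openvino_genai Python bindings. The snippet below is a minimal sketch, not something this patch adds; the prompt_lookup constructor flag and the GenerationConfig field names are assumed to mirror the C++ sample above.

import sys

import openvino_genai

def main():
    # hypothetical CLI mirroring the C++ sample: models path, prompt, generation length
    models_path, prompt, gen_len = sys.argv[1], sys.argv[2], int(sys.argv[3])

    config = openvino_genai.GenerationConfig()
    config.max_new_tokens = gen_len    # third CLI argument, as in the C++ sample
    config.num_assistant_tokens = 5    # number of candidate tokens proposed per step
    config.max_ngram_size = 3          # longest n-gram searched in the prompt

    # prompt_lookup=True is assumed to select the prompt-lookup pipeline in the bindings
    pipe = openvino_genai.LLMPipeline(models_path, "CPU", prompt_lookup=True)
    print(pipe.generate(prompt, config))

if __name__ == "__main__":
    main()
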
diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm_sdpa.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm_sdpa.cpp
new file mode 100644
index 0000000000..1d1c333c50
--- /dev/null
+++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm_sdpa.cpp
@@ -0,0 +1,279 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <openvino/core/parallel.hpp>
+#include <openvino/openvino.hpp>
+#include <string_view>
+
+namespace {
+
+// only batch_size = 1 is currently supported
+constexpr size_t BATCH_SIZE = 1;
+
+size_t get_seq_len_axis(std::shared_ptr<ov::Model> model) {
+    // sequence length axis in key/values tensors; for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size],
+    // therefore usually seq_length_axis = 2
+    size_t seq_length_axis = 2;
+
+    // a "ReadValue" node is the KV cache representation in a stateful model
+    std::string kv_node_type_name = std::string(ov::op::v6::ReadValue::get_type_info_static().name);
+
+    for (const auto op : model->get_ops()) {
+        if (op->get_type_name() != kv_node_type_name) {
+            continue;
+        }
+
+        // Shape example: [-1,4,0,64]
+        auto shape = op->get_input_partial_shape(0);
+
+        for (size_t i = 0; i < shape.rank().get_length(); i++) {
+            // Find axis = 0. This would be the sequence length axis.
+            if (shape[i] == 0) {
+                seq_length_axis = i;
+            }
+        }
+        break;
+    }
+    return seq_length_axis;
+}
+
+std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::string&& prompt) {
+    tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt});
+    tokenizer.infer();
+    return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")};
+}
+
+std::string detokenize(ov::InferRequest& detokenizer, std::vector<int64_t>& tokens) {
+    detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()});
+    detokenizer.infer();
+    return detokenizer.get_output_tensor().data<std::string>()[0];
+}
+
+// The following reasons require TextStreamer to keep a cache of previous tokens:
+// the detokenizer removes a starting ' '. For example detokenize(tokenize(" a")) == "a",
+// but detokenize(tokenize("prefix a")) == "prefix a";
+// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�"
+struct TextStreamer {
+    ov::InferRequest detokenizer;
+    std::vector<int64_t> token_cache;
+    size_t print_len = 0;
+
+    void put(int64_t token) {
+        token_cache.push_back(token);
+        std::string text = detokenize(detokenizer, token_cache);
+        if (!text.empty() && '\n' == text.back() && text.size() > print_len) {
+            // Flush the cache after the new line symbol
+            std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
+            token_cache.clear();
+            print_len = 0;
+            return;
+        }
+        constexpr char replacement[] = "\xef\xbf\xbd";  // MSVC with /utf-8 fails to compile � directly with a "newline in string literal" error.
+        if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) {
+            // Don't print incomplete text
+            return;
+        } else if (text.size() > print_len) {
+            // It is possible to have a shorter text after adding a new token.
+            // Print to output only if the text length has increased.
+            std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
+            print_len = text.size();
+        }
+    }
+
+    void end() {
+        std::string text = detokenize(detokenizer, token_cache);
+        if (text.size() <= print_len)
+            return;
+        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
+        token_cache.clear();
+        print_len = 0;
+    }
+};
+
+ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) {
+    // Copy elements from the old to a new tensor and return it.
+    // Trim the kv tensor on the sequence length axis.
+    // key/values tensor shape example: [BATCH_SIZE, num_kv_heads, seq_len, head_size]
+    // The sequence length axis position may vary from one model to another.
+
+    auto shape = tensor.get_shape();
+
+    OPENVINO_ASSERT(seq_len_axis < shape.size(),
+                    "Sequence length axis: ",
+                    seq_len_axis,
+                    " should be less than shape size: ",
+                    shape.size());
+
+    size_t old_seq_len = shape[seq_len_axis];
+
+    OPENVINO_ASSERT(new_seq_len <= old_seq_len);
+
+    // if new_seq_len is equal to the old one, there is no need to copy the tensor, return it as is
+    if (old_seq_len == new_seq_len)
+        return tensor;
+
+    shape[seq_len_axis] = new_seq_len;
+
+    if (seq_len_axis == 0) {
+        tensor.set_shape(shape);
+        return tensor;
+    }
+
+    ov::Coordinate new_shape_begin{0, 0, 0, 0};
+    ov::Coordinate new_shape_end{shape};
+
+    auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end);
+
+    return new_tensor;
+}
+
+void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) {
+    // trim the kv_cache values up to new_seq_len
+    auto states = request.query_state();
+    ov::parallel_for(states.size(), [&](size_t i) {
+        ov::Tensor old_tensor = states.at(i).get_state();
+        states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len));
+    });
+}
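+
+// Prompt lookup accepts a variable number of candidate tokens per step, so the key/value
+// entries produced for rejected candidates have to be dropped: the generation loop in main()
+// calls update_kv_cache() after every speculative step to trim each state back to the
+// accepted sequence length along the detected sequence length axis.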
+
+class PromptLookupCandidateGenerator {
+private:
+    const size_t max_ngram_size = 3;
+    size_t num_pred_tokens = 5;
+    const size_t max_pred_tokens = 20;
+
+public:
+    PromptLookupCandidateGenerator(const size_t max_ngram_size, const size_t num_pred_tokens)
+        : max_ngram_size{max_ngram_size},
+          num_pred_tokens{num_pred_tokens} {};
+
+    std::vector<int64_t> generate_candidates(const std::vector<int64_t>& input_ids) {
+        const size_t input_length = input_ids.size();
+
+        for (int32_t ngram_size = max_ngram_size; ngram_size > 0; ngram_size--) {
+            // extract the last ngram_size tokens as the search ngram
+            std::vector<int64_t> ngram = std::vector<int64_t>{input_ids.cend() - ngram_size, input_ids.cend()};
+
+            // find an ngram match in input_ids
+            size_t ngram_i = 0;
+            for (size_t input_i = 0; input_i < input_length - ngram_size; input_i++) {
+                if (ngram[ngram_i] != input_ids[input_i]) {
+                    ngram_i = 0;
+                    continue;
+                }
+
+                ngram_i++;
+
+                if (ngram_i < ngram_size) {
+                    continue;
+                }
+
+                // match found with the end at input_i
+                size_t available_num_pred = std::min(input_length - (input_i + 1), num_pred_tokens);
+
+                // return candidates with the length of available_num_pred
+                return std::vector<int64_t>{input_ids.cbegin() + input_i + 1,
+                                            input_ids.cbegin() + input_i + 1 + available_num_pred};
+            }
+        }
+
+        return std::vector<int64_t>{};
+    }
+
+    void update_candidate_strategy(const size_t num_matches) {
+        // dynamically adjust the number of generated candidates based on the number of matches:
+        // we want to balance the benefit of getting assistant tokens correct against the
+        // cost of forecasting incorrect assistant tokens.
+        if (num_matches == num_pred_tokens) {
+            num_pred_tokens = std::min(num_pred_tokens + 2, max_pred_tokens);
+        } else {
+            num_pred_tokens = std::max(num_pred_tokens - 1, size_t(1));
+        }
+    }
+};
+
+int64_t get_eos_token(const std::shared_ptr<ov::Model> tokenizer) {
+    auto rt_info = tokenizer->get_rt_info();  // Get the runtime info for the model
+
+    auto it = rt_info.find("eos_token_id");
+    if (it == rt_info.end()) {
+        throw std::runtime_error("EOS token ID not found in model's runtime information.");
+    }
+    return it->second.as<int64_t>();
+}
+
+}  // namespace
+
+int main(int argc, char* argv[]) try {
+    if (argc != 4) {
+        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT>' <MAX_SEQUENCE_LENGTH>");
+    }
+
+    // tokenizer model
+    ov::Core core;
+    core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+
+    const std::string model_dir = std::string{argv[1]};
+
+    auto tokenizer_model = core.read_model(model_dir + "/openvino_tokenizer.xml");
+    // tokenizer and detokenizer work on CPU only
+    ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request();
+    auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
+
+    std::vector<int64_t> full_input_ids{input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size()};
+
+    ov::InferRequest detokenizer =
+        core.compile_model(model_dir + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+    TextStreamer text_streamer{std::move(detokenizer)};
+
+    std::shared_ptr<ov::Model> ov_model = core.read_model(model_dir + "/openvino_model.xml");
+
+    size_t seq_len_axis = get_seq_len_axis(ov_model);
+
+    ov::InferRequest model = core.compile_model(ov_model, "CPU").create_infer_request();
+
+    model.set_tensor("input_ids", input_ids);
+    model.set_tensor("attention_mask", attention_mask);
+
+    ov::Tensor position_ids = model.get_tensor("position_ids");
+    position_ids.set_shape(input_ids.get_shape());
+    std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
+
+    size_t seq_len = input_ids.get_shape()[1];
+
+    // set beam_idx for the stateful model: no beam search is used and BATCH_SIZE = 1
+    model.get_tensor("beam_idx").set_shape({BATCH_SIZE});
+    model.get_tensor("beam_idx").data<int32_t>()[0] = 0;
+
+    // To collect the kv-cache for the <PROMPT> and to get the next token, run the very first infer request
+    model.infer();
+
+    // logits shape is [BATCH_SIZE, seq_len, vocab_size]
+    auto logits = model.get_tensor("logits");
+    size_t vocab_size = logits.get_shape().back();
+    auto data_logits = logits.data<float>() + (seq_len - 1) * vocab_size;
+    int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits;
+
+    full_input_ids.push_back(out_token);
+
+    auto first_token = out_token;
+    text_streamer.put(out_token);
+
+    const int64_t EOS_TOKEN = get_eos_token(tokenizer_model);
+
+    // Prompt lookup decoding is a speculative decoding technique where the draft model is replaced
+    // with string matching over the prompt to generate candidate token sequences.
+    int gen_len = 0;
+    int max_sequence_length = std::stoi(argv[3]);
+
+    PromptLookupCandidateGenerator candidateGenerator{3, 5};
+
+    while (out_token != EOS_TOKEN && gen_len < max_sequence_length) {
+        auto candidates = candidateGenerator.generate_candidates(full_input_ids);
+
+        // cut redundant candidates on the last iteration
+        size_t tokens_to_generate = max_sequence_length - gen_len;
+        candidates.resize(std::min(candidates.size(), tokens_to_generate - 1));
+        size_t candidates_size = candidates.size();
+
+        // candidates_size + 1 tokens will be fed at once in a single infer request.
+        input_ids.set_shape({BATCH_SIZE, candidates_size + 1});
+        input_ids.data<int64_t>()[0] = first_token;
+        std::copy_n(candidates.begin(), candidates_size, input_ids.data<int64_t>() + 1);
+
+        attention_mask.set_shape({BATCH_SIZE, seq_len + candidates_size + 1});
+        std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1);
+
+        position_ids.set_shape({BATCH_SIZE, candidates_size + 1});
+        std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), seq_len);
+
+        model.infer();
+
+        data_logits = logits.data<float>();  // [BATCH_SIZE, 1 + candidates_size, vocab_size]
+
+        // 1. accept the current out token (if it is not eos)
+        // 2. check whether it matches the corresponding candidate
+        //      2.1 if it is a match, continue - accept the next token
+        //      2.2 if it is a mismatch, stop the iteration but still accept the current token as it was the
+        //          last token generated by the model from a valid sequence.
+        size_t accepted_tokens_number = 0;
+        for (size_t i = 0; i < candidates_size + 1; i++) {
+            auto start = data_logits + vocab_size * i;
+            auto stop = data_logits + vocab_size * (i + 1);
+            out_token = std::max_element(start, stop) - start;
+
+            if (out_token == EOS_TOKEN) {
+                break;
+            }
+
+            text_streamer.put(out_token);
+            full_input_ids.push_back(out_token);
+            accepted_tokens_number++;
+
+            if (i == candidates_size || out_token != candidates[i]) {
+                break;
+            }
+        }
+
+        if (accepted_tokens_number > 0) {
+            candidateGenerator.update_candidate_strategy(accepted_tokens_number - 1);
+        }
+
+        // After the inference request, key/values have the shape [BATCH_SIZE, seq_len + candidates_size, vocab_size].
+        // Increment the sequence length by the number of matched tokens, and
+        // trim the KV cache to match the new sequence length.
+        gen_len += accepted_tokens_number;
+        seq_len += accepted_tokens_number;
+        update_kv_cache(model, seq_len_axis, seq_len);
+
+        first_token = out_token;
+    }
+
+    text_streamer.end();
+
+    // The model is stateful, which means that the context (kv-cache) which belongs to a particular
+    // text sequence is accumulated inside the model during the generation loop above.
+    // This context should be reset before processing the next text sequence.
+    // While it is not required to reset the context in this sample as only one sequence is processed,
+    // it is called for education purposes:
+    model.reset_state();
+} catch (const std::exception& error) {
+    try {
+        std::cerr << error.what() << '\n';
+    } catch (const std::ios_base::failure&) {}
+    return EXIT_FAILURE;
+} catch (...) {
+    try {
+        std::cerr << "Non-exception object thrown\n";
+    } catch (const std::ios_base::failure&) {}
+    return EXIT_FAILURE;
+}
\ No newline at end of file
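
For readers comparing this stateful implementation with the library pipeline, the core of PromptLookupCandidateGenerator can be summarised as a plain-Python sketch (illustrative only, function names are hypothetical; the C++ above is the reference): take the trailing n-gram of everything generated so far, find its first earlier occurrence, and propose the tokens that followed it; then widen the proposal window after a fully accepted batch and narrow it otherwise.

def generate_candidates(input_ids, max_ngram_size=3, num_pred_tokens=5):
    # try the longest trailing n-gram first, falling back to shorter ones
    for ngram_size in range(min(max_ngram_size, len(input_ids) - 1), 0, -1):
        ngram = input_ids[-ngram_size:]
        # scan the prompt plus generated tokens, excluding the trailing n-gram itself
        for start in range(len(input_ids) - ngram_size):
            if input_ids[start:start + ngram_size] == ngram:
                begin = start + ngram_size
                # propose up to num_pred_tokens tokens that followed the match
                return input_ids[begin:begin + num_pred_tokens]
    return []

def update_candidate_strategy(num_matches, num_pred_tokens, max_pred_tokens=20):
    # widen the window after a fully accepted batch, narrow it otherwise
    if num_matches == num_pred_tokens:
        return min(num_pred_tokens + 2, max_pred_tokens)
    return max(num_pred_tokens - 1, 1)
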
diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp
index b5ca59095b..faccbb2690 100644
--- a/samples/cpp/text_generation/greedy_causal_lm.cpp
+++ b/samples/cpp/text_generation/greedy_causal_lm.cpp
@@ -4,16 +4,17 @@
 #include "openvino/genai/llm_pipeline.hpp"
 
 int main(int argc, char* argv[]) try {
-    if (3 > argc)
-        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODELS_PATH> \"<PROMPT>\"");
+    if (4 > argc)
+        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODELS_PATH> \"<PROMPT>\" <GENERATION_LEN>");
 
     std::string models_path = argv[1];
     std::string prompt = argv[2];
     std::string device = "CPU";  // GPU can be used as well
+    size_t gen_len = std::stoi(argv[3]);
 
     ov::genai::LLMPipeline pipe(models_path, device);
     ov::genai::GenerationConfig config;
-    config.max_new_tokens = 100;
+    config.max_new_tokens = gen_len;
     std::string result = pipe.generate(prompt, config);
     std::cout << result << std::endl;
 } catch (const std::exception& error) {
diff --git a/samples/python/prompt_lookup_decoding_lm/get_statistic_pl.py b/samples/python/prompt_lookup_decoding_lm/get_statistic_pl.py
new file mode 100644
index 0000000000..b4745c9757
--- /dev/null
+++ b/samples/python/prompt_lookup_decoding_lm/get_statistic_pl.py
@@ -0,0 +1,207 @@
+import csv
+import os
+from subprocess import Popen, TimeoutExpired, run, call
+from statistics import median
+import datetime
+
+_model_path = "/home/efode/repo/openvino.genai/temp/speculative_decoding/customer_test"
+
+models = {
+    # model_name: main_model
+    "llama": [
+        "llama-2-7b-chat",
+        "CodeLlama-7b-hf",
+    ],
+    # "opt": ["opt-6.7b"],
+    # "phi": ["Phi-3-medium-4k-instruct"],
+    # "qwen": ["qwen1.5-7b", ],
+    # "starcoder": ["starcoder2-7b"],
+    # "dolly-v2": ["dolly-v2-3b", "dolly-v2-12b"]
+}
+
+element_types = [""]
+
+num_run = range(0, 3)
+
+prompts = [
+    # "return 0;",
+    "Please, implement the C++ class for Timer to get the duration of other class methods.",
+    '''
+    The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.
+    '''
+]
+
+gen_len = [ 100, 300, 500 ]
+
+devices = [ "CPU" ]
+
+row_desc = [
+    "prompt",
+
+    "model",
+    "model_type",
+
+    "generation_len",
+    "device",
+
+    "sdpa_duration, s",
+    "sdpa_prompt_lookup_duration, s",
+    "prompt_lookup_duration, s",
+
+    "old_prompt_boost",
+    "old_prompt_boost, %",
+    "new_prompt_lookup_boost",
+    "new_prompt_lookup_boost, %",
+    "new_vs_old_prompt_lookup_boost",
+    "new_vs_old_prompt_lookup_boost, %",
+]
+
+codenames = {
+    "total_duration": "Total duration, ms: ",
+    "main_duration": "Main model duration, ms: ",
+    "main_duration_percentage": "Main model duration, %: ",
+    "token_per_second": "Token per sec: ",
+    "iterations": "Main model iterations: ",
+    "avg_acceptance_rate": "AVG acceptance rate, %: ",
+    "accepted_token_cnt": "Accepted tokens by draft model: ",
+    "accepted_token_rate": "Accepted token rate, %: "
+}
+
+binary_path = "/home/efode/repo/openvino/bin/intel64/Release"
+
+replace = "REPLACE_"
+cb_binary_name = "prompt_lookup_decoding_lm"
+old_binary_name = "prompt_lookup_decoding_lm_sdpa"
+llm_binary_name = "greedy_causal_lm"
+
+log_path = "/home/efode/repo/openvino.genai/temp/speculative_decoding/log.log"
+csv_path = "/home/efode/repo/openvino.genai/temp/speculative_decoding/results_avg.csv"
+csv_path_all = "/home/efode/repo/openvino.genai/temp/speculative_decoding/results_all.csv"
+
+with open(csv_path_all, "w", encoding='UTF-8') as csv_all_file:
+    csv_writer_all = csv.writer(csv_all_file, dialect="excel")
+    csv_writer_all.writerow(row_desc)
+
+    with open(csv_path, "w", encoding='UTF-8') as csv_file:
+        csv_writer = csv.writer(csv_file, dialect="excel")
+        csv_writer.writerow(row_desc)
+
+        for prompt in prompts:
+            for model_name in models:
+                for model in models[model_name]:
+                    for element_type in element_types:
+                        for device in devices:
+                            for generation_len in gen_len:
+                                avg_sdpa_duration = []
+                                avg_sdpa_prompt_lookup_duration = []
+                                avg_pa_prompt_lookup_duration = []
+
+                                for run_id in num_run:
+                                    sdpa_duration = 0
+                                    pl_sdpa_duration = 0
+                                    pl_pa_duration = 0
+
+                                    sdpa_exec_path = os.path.join(binary_path, llm_binary_name)
+                                    sdpa_pl_exec_path = os.path.join(binary_path, old_binary_name)
+                                    pa_pl_exec_path = os.path.join(binary_path, cb_binary_name)
+
+                                    model_path = os.path.join(_model_path, model_name, model, element_type)
+
+                                    if not os.path.exists(model_path):
+                                        print(model_path)
+                                        continue
+
+                                    command_line_sdpa = f"{sdpa_exec_path} {model_path} \"{prompt}\" {generation_len} > {log_path}"
+                                    command_line_sdpa_pl = f"{sdpa_pl_exec_path} {model_path} \"{prompt}\" {generation_len} > {log_path}"
+                                    command_line_pa_pl = f"{pa_pl_exec_path} {model_path} \"{prompt}\" {generation_len} > {log_path}"
+
+                                    try:
+                                        print(command_line_sdpa)
+                                        time_start = datetime.datetime.now()
+                                        run(command_line_sdpa, check=True, shell=True)
+                                        time_end = datetime.datetime.now()
+                                        sdpa_duration = (time_end - time_start).total_seconds()
+                                        avg_sdpa_duration.append(sdpa_duration)
+                                    except:
+                                        pass
+
+                                    try:
+                                        print(command_line_sdpa_pl)
+                                        time_start = datetime.datetime.now()
+                                        run(command_line_sdpa_pl, check=True, shell=True)
+                                        time_end = datetime.datetime.now()
+                                        pl_sdpa_duration = (time_end - time_start).total_seconds()
+                                        avg_sdpa_prompt_lookup_duration.append(pl_sdpa_duration)
+                                    except:
+                                        pass
+
+                                    try:
+                                        print(command_line_pa_pl)
+                                        time_start = datetime.datetime.now()
+                                        run(command_line_pa_pl, check=True, shell=True)
+                                        time_end = datetime.datetime.now()
+                                        pl_pa_duration = (time_end - time_start).total_seconds()
+                                        avg_pa_prompt_lookup_duration.append(pl_pa_duration)
+                                    except:
+                                        pass
+
+                                    pl_sdpa_boost = sdpa_duration / pl_sdpa_duration
+                                    pl_sdpa_boost_percent = pl_sdpa_boost * 100
+
+                                    pl_pa_boost = sdpa_duration / pl_pa_duration
+                                    pl_pa_boost_percent = pl_pa_boost * 100
+
+                                    sdpa_vs_pa_boost = pl_sdpa_duration / pl_pa_duration
+                                    sdpa_vs_pa_boost_percent = sdpa_vs_pa_boost * 100
+
+                                    csv_writer_all.writerow([
+                                        prompt,
+                                        model,
+                                        element_type,
+                                        generation_len,
+                                        device,
+                                        sdpa_duration,
+                                        pl_sdpa_duration,
+                                        pl_pa_duration,
+                                        pl_sdpa_boost,
+                                        pl_sdpa_boost_percent,
+                                        pl_pa_boost,
+                                        pl_pa_boost_percent,
+                                        sdpa_vs_pa_boost,
+                                        sdpa_vs_pa_boost_percent
+                                    ])
+
+                                    if run_id == len(num_run) - 1:
+                                        avg_sdpa_duration = median(avg_sdpa_duration)
+                                        avg_pl_sdpa_duration = median(avg_sdpa_prompt_lookup_duration)
+                                        avg_pl_pa_duration = median(avg_pa_prompt_lookup_duration)
+
+                                        pl_sdpa_boost = avg_sdpa_duration / avg_pl_sdpa_duration
+                                        pl_sdpa_boost_percent = pl_sdpa_boost * 100
+
+                                        pl_pa_boost = avg_sdpa_duration / avg_pl_pa_duration
+                                        pl_pa_boost_percent = pl_pa_boost * 100
+
+                                        sdpa_vs_pa_boost = avg_pl_sdpa_duration / avg_pl_pa_duration
+                                        sdpa_vs_pa_boost_percent = sdpa_vs_pa_boost * 100
+
+                                        csv_writer.writerow([
+                                            prompt,
+                                            model,
+                                            element_type,
+                                            generation_len,
+                                            device,
+                                            avg_sdpa_duration,
+                                            avg_pl_sdpa_duration,
+                                            avg_pl_pa_duration,
+                                            pl_sdpa_boost,
+                                            pl_sdpa_boost_percent,
+                                            pl_pa_boost,
+                                            pl_pa_boost_percent,
+                                            sdpa_vs_pa_boost,
+                                            sdpa_vs_pa_boost_percent
+                                        ])
+
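
A note on how the spreadsheet columns are derived: every "boost" value is a baseline duration divided by a measured duration, so values above 1.0 mean the prompt-lookup build finished faster, and the "%" columns are the same ratio scaled by 100. results_all.csv stores per-run values while results_avg.csv uses the median over the three runs. The sketch below restates that aggregation with a guard for runs that failed (the helper name is hypothetical, it is not part of the script):

from statistics import median

def boost(baseline_durations, measured_durations):
    # durations are wall-clock seconds collected per run; an empty list means every run failed
    if not baseline_durations or not measured_durations:
        return None  # leave the cell empty instead of dividing by zero
    return median(baseline_durations) / median(measured_durations)

# example: boost([10.2, 10.4, 10.3], [7.1, 7.0, 7.2]) is roughly 1.45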