From b139e6945a5c2ad62ffdca6017d948ebe55879a6 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Wed, 3 Apr 2024 10:14:06 +0200
Subject: [PATCH] Greedy search works

---
 .../generate_pipeline/generate_pipeline.hpp   | 98 +++++++++++++++++--
 .../causal_lm/cpp/generate_pipeline/main.cpp  | 43 +++-----
 .../generate_pipeline/sampling_parameters.hpp | 21 ++--
 .../causal_lm/cpp/group_beam_searcher.hpp     |  7 +-
 4 files changed, 125 insertions(+), 44 deletions(-)

diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
index 783b5a5474..e3bcc52473 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
@@ -6,8 +6,10 @@
 #include
 #include "sampling_parameters.hpp"
 #include
+#include "group_beam_searcher.hpp"

-using GenerationResult = ov::Tensor;
+// using GenerationResult = ov::Tensor;
+using GenerationResult = std::vector<std::vector<int64_t>>;

 class LLMEngine {
     ov::InferRequest m_model_runner;
@@ -15,10 +17,9 @@ class LLMEngine {
     GenerationResult greedy_search(ov::Tensor prompts, SamplingParameters sampling_params) {
         ov::Shape prompts_shape = prompts.get_shape();
         size_t batch_size = prompts_shape[0];
-        // todo: implement for batch > 1
         OPENVINO_ASSERT(batch_size == 1);

-        GenerationResult results = ov::Tensor{ov::element::i64, {batch_size, sampling_params.max_new_tokens}};
+        GenerationResult results(batch_size);

         auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()};
         std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1);
@@ -45,12 +46,13 @@
             std::fill_n(m_model_runner.get_tensor("attention_mask").data<int64_t>(), m_model_runner.get_tensor("attention_mask").get_size(), 1);

             m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});
-            
+
             for (size_t batch = 0; batch < batch_size; ++batch) {
                 const float * logits_data = logits.data<const float>() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size;
                 int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data;
-                results.data<int64_t>()[sampling_params.max_new_tokens * batch + i] = out_token;
+                results[batch].emplace_back(out_token);
+                // todo: add exit criteria when pad or EOS is met
                 m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
                 m_model_runner.get_tensor("position_ids").data<int64_t>()[batch] = int64_t(initial_seq_len + i);
             }

@@ -59,8 +61,72 @@ class LLMEngine {
     }

     GenerationResult beam_search(ov::Tensor prompts, SamplingParameters sampling_params) {
-        // todo: implement
+        ov::Shape prompts_shape = prompts.get_shape();
+        size_t batch_size = prompts_shape[0];
+        // todo: implement for batch > 1
+        OPENVINO_ASSERT(batch_size == 1);
+
+        // initialize inputs
+        auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()};
+        std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1);
+        auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()};
+        std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
+        auto initial_seq_len = prompts.get_shape()[1];
+
+        m_model_runner.set_tensor("input_ids", prompts);
+        m_model_runner.set_tensor("attention_mask", attention_mask);
+        m_model_runner.set_tensor("position_ids", position_ids);
+
+        // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1
+        m_model_runner.get_tensor("beam_idx").set_shape({batch_size});
+        m_model_runner.get_tensor("beam_idx").data<int32_t>()[0] = 0;
+
+        const int64_t* prompt_data = prompts.data<int64_t>();
+
+        // todo: remove this duplication and use the same SamplingParameters for both greedy and beam
+        Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + prompts.get_size()}};
+        parameters.n_groups = sampling_params.n_groups;
+        parameters.diversity_penalty = sampling_params.diversity_penalty;
+        parameters.group_size = sampling_params.group_size;
+
+        GroupBeamSearcher group_beam_searcher{parameters};
+        std::vector<int64_t> next_tokens;
+        std::vector<int32_t> next_beams;
+        for (size_t length_count = 0; length_count < sampling_params.max_new_tokens; ++length_count) {
+            m_model_runner.infer();
+            std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits"));
+            if (next_tokens.empty()) {
+                break;
+            }
+            size_t batch_size = next_tokens.size();
+            // Set pointers
+            m_model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()});
+            m_model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()});
+            // Set auxiliary inputs
+            ov::Tensor attention_mask = m_model_runner.get_tensor("attention_mask");
+            ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1};
+            attention_mask.set_shape(mask_shape);
+            std::fill_n(attention_mask.data<int64_t>(), ov::shape_size(mask_shape), 1);
+
+            m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});
+            std::fill_n(m_model_runner.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1);
+        }
+
+        std::vector<Beam> beams;
+        for (const std::vector<Beam>& group : finalize(std::move(group_beam_searcher))) {
+            for (const Beam& beam : group) {
+                beams.emplace_back(beam);
+                // results.emplace_back(beam.tokens);
+            }
+        }
+
+        auto compare_scores = [](Beam left, Beam right) { return (left.score < right.score); };
+        std::sort(beams.begin(), beams.end(), compare_scores);
+        GenerationResult results;
+        for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.num_return_sequences; ++beam) {
+            results.emplace_back(beam->tokens);
+        }

         return results;
     }

@@ -124,6 +190,23 @@ std::vector<std::string> detokenize(ov::InferRequest& detokenizer, ov::Tensor to
     return strings;
 }

+std::vector<std::string> detokenize(ov::InferRequest& detokenizer,
+                                    std::vector<std::vector<int64_t>> lines,
+                                    int64_t pad_token_idx) {
+    // todo: implement calling detokenizer in a single batch
+
+    std::vector<std::string> strings;
+    for (auto& line: lines){
+        ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()};
+        detokenizer.set_input_tensor(tokens);
+        detokenizer.infer();
+        auto res = detokenizer.get_output_tensor();
+        strings.emplace_back(res.data<std::string>()[0]);
+    }
+
+    return strings;
+}
+
 // The following reasons require TextStreamer to keep a cache of previous tokens:
 // detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a",
 // but detokenize(tokenize("prefix a")) == "prefix a"
@@ -171,6 +254,7 @@ class LLMPipeline {
         if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) {
             m_sampling_parameters = SamplingParameters(m_path + "/generation_config.json");
         }
+        m_sampling_parameters = SamplingParameters(m_path + "/generation_config_beam.json");

         ov::Core core;
         // The model can be compiled for GPU as well
@@ -188,6 +272,6 @@ class LLMPipeline {

         auto generate_results = m_model_runner.generate(input_ids, m_sampling_parameters);

-        return detokenize(m_detokenizer, generate_results)[0];
+        return detokenize(m_detokenizer, generate_results, 0)[0];
     }
 };
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
index 88a4d3621c..12b85bbe02 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
@@ -11,38 +11,27 @@ constexpr size_t BATCH_SIZE = 1;
 } // namespace

-// void print_generation_results(GenerationResult results, ov::InferRequest& detokenizer) {
-//     TextStreamer text_streamer{std::move(detokenizer)};
-//     for (const auto& result: results) {
-//         text_streamer.put(result);
-//     }
-//     text_streamer.end();
-// }

 int main(int argc, char* argv[]) try {
+    // PIPELINE
     std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
-    LLMPipeline pipe(model_path);
-    std::cout << pipe.call("Alan Turing was a");
+    // std::cout << pipe.call("Alan Turing was a");

-    // ov::Core core;
-    // // core.add_extension("libuser_ov_extensions.so");
-    // core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
-    // // tokenizer and detokenizer work on CPU only
-    // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
-    // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();
-    
-    // // The model can be compiled for GPU as well
-    // std::shared_ptr<ov::Model> model = core.read_model(model_path + "/openvino_model.xml");
-    // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();
+    // GENERATE
+    ov::Core core;
+    core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+    ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+    ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+
+    // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams it is not broadcast
+    std::shared_ptr<ov::Model> model = core.read_model(model_path + "/openvino_model.xml");
+    ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();

-    // auto [input_ids, attention_mask] = tokenize(tokenizer, argv[1]);
-    
-    // SamplingParameters sampling_params = SamplingParameters::greedy();
-    
-    // LLMEngine engine(request);
-    // GenerationResult generation_results = engine.generate(input_ids, sampling_params);
-    // print_generation_results(generation_results, detokenizer);
+    auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
+    SamplingParameters sampling_params = SamplingParameters::beam_search();
+    LLMEngine engine(request);
+    GenerationResult generation_results = engine.generate(input_ids, sampling_params);
+    std::cout << detokenize(detokenizer, generation_results[0]);
 } catch (const std::exception& error) {
     std::cerr << error.what() << '\n';
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp
index 2ee4a88096..b12a25bbe7 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp
@@ -1,4 +1,3 @@
-
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0

@@ -8,8 +7,7 @@
 #include
 #include
 #include
-
-enum class StopCriteria {early, heuristic, never};
+#include // used only for StopCriteria

 // forward declaration
 class Sequence;
@@ -18,15 +16,17 @@
 // and has parameters that are not present in the original SamplingParameters for continuous batching
 struct SamplingParameters {
     // Generic
-    size_t max_new_tokens = 100;
+    size_t max_new_tokens = 10;
     size_t max_length = 100; // max_new_tokens should have priority over max_length
     bool ignore_eos = false;
     int64_t eos_token = 2; // There's no way to extract special token values from the tokenizer for now
+    size_t num_return_sequences = 3;

     // Beam search specific
     size_t n_groups = 1;
     size_t group_size = 1; // beam_width
     float diversity_penalty = 1.0f; // 0.0 means no diversity
+    StopCriteria stop_criteria = StopCriteria::heuristic;
     float length_penalty = 1.0f;
     size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
@@ -53,10 +53,17 @@ struct SamplingParameters {
         eos_token_id = data.value("eos_token_id", 0);
         max_length = data.value("max_length", 0);
         pad_token_id = data.value("pad_token_id", 0);
+        num_return_sequences = data.value("num_return_sequences", 1);
         temperature = data.value("temperature", 0.0f);
         do_sample = data.value("do_sample", false);
         top_p = data.value("top_p", 0.0f);
+
+        // beam_search_params
+        n_groups = data.value("num_beam_groups", 1);
+        diversity_penalty = data.value("diversity_penalty", 1.0f);
+        int num_beams = data.value("num_beams", 1);
+        group_size = num_beams / n_groups;
     }

     static SamplingParameters greedy() {
@@ -68,9 +75,9 @@ struct SamplingParameters {

     static SamplingParameters beam_search() {
         SamplingParameters beam_search;
-        beam_search.n_groups = 2;
-        beam_search.group_size = 2;
-        beam_search.max_new_tokens = 100;
+        beam_search.n_groups = 3;
+        beam_search.group_size = 5;
+        beam_search.max_new_tokens = 10;
         beam_search.diversity_penalty = 2.0f;
         return beam_search;
     }
diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
index 732e2a7c22..ac33e9cf97 100644
--- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp
+++ b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -1,5 +1,6 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
+#pragma once

 #include

@@ -84,9 +85,9 @@ enum class StopCriteria { early, heuristic, never };

 struct Parameters {
     std::vector<int64_t> prompt;
-    size_t n_groups = 2;
-    size_t group_size = 2;
-    float diversity_penalty = 2.0;
+    size_t n_groups = 3;
+    size_t group_size = 5;
+    float diversity_penalty = 1.0;

     size_t max_new_tokens = 100;
     StopCriteria stop_criteria = StopCriteria::heuristic;
     float length_penalty = 1.0;
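
Usage note (illustrative sketch, not part of the patch): the snippet below shows how the pieces this commit touches are meant to fit together, using only calls visible in the diff (LLMEngine::generate, SamplingParameters::beam_search(), the new GenerationResult type and detokenize overload). The model path is a placeholder, and it assumes tokenize/detokenize and LLMEngine are reachable through generate_pipeline.hpp, as they are in main.cpp.

    // Sketch only: API as shown in the diff above; paths are placeholders.
    #include <iostream>
    #include <string>
    #include "generate_pipeline.hpp"

    int main() try {
        std::string model_path = "<dir with openvino_model.xml and the (de)tokenizer models>";

        ov::Core core;
        core.add_extension(OPENVINO_TOKENIZERS_PATH);  // defined in CMakeLists.txt
        ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
        ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();
        ov::InferRequest request = core.compile_model(model_path + "/openvino_model.xml", "CPU").create_infer_request();

        auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
        LLMEngine engine(request);

        // GenerationResult is now std::vector<std::vector<int64_t>>: one token sequence per returned candidate.
        GenerationResult beams = engine.generate(input_ids, SamplingParameters::beam_search());

        // The new detokenize overload turns every candidate back into text (pad token id 0, as used in LLMPipeline::call).
        for (const std::string& text : detokenize(detokenizer, beams, 0)) {
            std::cout << text << '\n';
        }
    } catch (const std::exception& error) {
        std::cerr << error.what() << '\n';
        return 1;
    }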