From d551da3913d2691c9a1bd300086b35eec1b51985 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Thu, 11 Apr 2024 15:18:15 +0200
Subject: [PATCH] cleanup generate_sample.cpp

---
 text_generation/causal_lm/cpp/CMakeLists.txt  |   6 +-
 .../generate_pipeline/generate_pipeline.hpp   |  22 +++-
 .../cpp/generate_pipeline/generate_sample.cpp | 113 ++++++++++++++++++
 .../generate_pipeline/generation_config.hpp   |   1 -
 .../causal_lm/cpp/generate_pipeline/main.cpp  |  93 --------------
 .../causal_lm/cpp/group_beam_searcher.hpp     |   6 +-
 6 files changed, 135 insertions(+), 106 deletions(-)
 create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
 delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/main.cpp

diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt
index 7c75aad0af..26e99843d1 100644
--- a/text_generation/causal_lm/cpp/CMakeLists.txt
+++ b/text_generation/causal_lm/cpp/CMakeLists.txt
@@ -17,7 +17,7 @@
 target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
 set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
 set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)
-set(TARGET_NAME beam_search_sample)
+set(TARGET_NAME beam_search_causal_lm)
 add_executable(${TARGET_NAME} beam_search_causal_lm.cpp)
 target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
 target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\")
@@ -35,8 +35,8 @@
 target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
 set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
 set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)
-set(TARGET_NAME generate_pipeline)
-add_executable(${TARGET_NAME} generate_pipeline/main.cpp)
+set(TARGET_NAME generate_sample)
+add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp)
 target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
 target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\")
 find_package(OpenVINO REQUIRED COMPONENTS Runtime)
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
index 66cb2f8eaa..5ed6d1b65d 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
@@ -9,7 +9,7 @@
 #include <openvino/openvino.hpp>
 #include "group_beam_searcher.hpp"
 
-using GenerationResult = std::vector<std::vector<int64_t>>;
+using GenerationResult = std::vector<std::pair<float, std::vector<int64_t>>>;
 using namespace std;
 
 std::pair<ov::Tensor, ov::Tensor> pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2) {
@@ -124,7 +124,8 @@
         if (!is_xml(full_path))
             full_path += "/openvino_model.xml";
         m_model_runner = core.compile_model(full_path, device, config).create_infer_request();
-        
+
+        // todo: add loading EOS_TOKEN_ID from IR
         core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
         // tokenizer and detokenizer work on CPU only
         full_path = tokenizer_path;
@@ -205,6 +206,11 @@
 
         return {m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")};
     }
+
+    std::pair<ov::Tensor, ov::Tensor> tokenize(std::initializer_list<std::string> text) {
+        return tokenize(std::vector<std::string>(text.begin(), text.end()));
+    }
+
 
     std::string detokenize(std::vector<int64_t> tokens) {
         size_t batch_size = 1;
@@ -231,7 +237,7 @@
 
         // todo: implement calling detokenizer in a single batch
         std::vector<std::string> strings;
-        for (auto& line: lines){
+        for (auto& [score, line]: lines){
             ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()};
             m_detokenizer.set_input_tensor(tokens);
             m_detokenizer.infer();
@@ -281,13 +287,14 @@
         for (size_t batch = 0; batch < batch_size; ++batch) {
             const float * logits_data = logits.data<float>() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size;
             int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data;
-            results[batch].emplace_back(out_token);
+            results[batch].second.emplace_back(out_token);
             token_iter_results[batch] = out_token;
             eos_met[batch] != (out_token == sampling_params.m_eos_token_id);
 
             m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
             m_model_runner.get_tensor("position_ids").data<int64_t>()[batch] = int64_t(prompt_len + i);
         }
+        // place
         sampling_params.m_callback(std::move(token_iter_results), *this);
 
         // stop generation when EOS is met in all batches
@@ -348,13 +355,16 @@
 
             m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});
             std::fill_n(m_model_runner.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1);
+
+            // place
+            sampling_params.m_callback(std::move(next_tokens), *this);
+
         }
 
         std::vector<Beam> beams;
         for (const std::vector<Beam>& group : finalize(std::move(group_beam_searcher))) {
             for (const Beam& beam : group) {
                 beams.emplace_back(beam);
-                // results.emplace_back(beam.tokens);
             }
         }
 
@@ -363,7 +373,7 @@
 
         GenerationResult results;
         for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) {
-            results.emplace_back(beam->tokens);
+            results.emplace_back(std::pair(beam->score, beam->tokens));
         }
         return results;
     }
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
new file mode 100644
index 0000000000..9af5e474a6
--- /dev/null
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
@@ -0,0 +1,113 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <openvino/openvino.hpp>
+#include "generate_pipeline.hpp"
+
+
+// The following reasons require TextStreamer to keep a cache of previous tokens:
+// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a",
+// but detokenize(tokenize("prefix a")) == "prefix a"
+// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�"
+struct TextStreamer {
+    LLMPipeline pipe;
+    std::vector<int64_t> token_cache;
+    size_t print_len = 0;
+
+    void put(int64_t token) {
+        token_cache.push_back(token);
+        std::string text = pipe.detokenize(token_cache);
+        if (!text.empty() && '\n' == text.back()) {
+            // Flush the cache after the new line symbol
+            std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
+            token_cache.clear();
+            print_len = 0;
+            return;
+        }
+        if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
+            // Don't print incomplete text
+            return;
+        }
+        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
+        print_len = text.size();
+    }
+
+    void end() {
+        std::string text = pipe.detokenize(token_cache);
+        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
+        token_cache.clear();
+        print_len = 0;
+    }
+};
+
+int main(int argc, char* argv[]) try {
+    if (argc < 2 || argc > 4)
+        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\" <DEVICE>");
+
+    std::string prompt = "table is made of";
+    std::string device = "CPU"; // can be replaced with GPU
+
+    std::string model_path = argv[1];
+    if (argc > 2)
+        prompt = argv[2];
+    if (argc > 3)
+        device = argv[3];
+
+    // Example 1: TextStreaming example with greedy search
+    LLMPipeline pipe(model_path, device);
+    // Will try to load config from generation_config.json,
+    // but if not found default values for greedy search will be used.
+    GenerationConfig config = pipe.generation_config();
+
+    auto text_streamer = TextStreamer{pipe};
+    auto text_streamer_callback = [&text_streamer](std::vector<int64_t>&& tokens, LLMPipeline& pipe){
+        text_streamer.put(tokens[0]);
+    };
+
+    cout << "greedy generate streaming mode:" << endl;
+    config.max_new_tokens(20).set_callback(text_streamer_callback);
+    pipe(prompt, config);
+    text_streamer.end();
+
+    // Example 2: Grouped Beam Search decoding example
+    pipe = LLMPipeline(model_path, device);
+    config = pipe.generation_config();
+
+    // will return a vector with num_return_sequences strings
+    auto num_return_sequences = 3;
+    config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences);
+
+    cout << endl << "grouped beam search generated candidates:" << endl;
+    auto generation_results = pipe({prompt}, config);
+    for (int i = 0; i < num_return_sequences; ++i)
+        cout << "candidate " << i << ": " << generation_results[i] << endl;
+
+    // Example 3: Greedy decoding with multiple batches
+    pipe = LLMPipeline(model_path, device);
+    config = pipe.generation_config();
+
+    cout << endl << "greedy decoding with multiple batches:" << endl;
+    std::vector<std::string> prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"};
+    auto results = pipe(prompts, config.max_new_tokens(20));
+    for (int i = 0; i < prompts.size(); i++)
+        cout << prompts[i] << ": " << results[i] << endl;
+
+    // Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates
+    pipe = LLMPipeline(model_path);
+    auto [input_ids, attention_mask] = pipe.tokenize({prompt});
+    config = GenerationConfig::beam_search();
+    // config for grouped beam search
+    config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15);
+
+    cout << endl << "beam search with printing of all candidates:" << endl;
+    auto beams = pipe.generate(input_ids, attention_mask, config);
+    for (const auto& beam : beams)
+        std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl;
+
+} catch (const std::exception& error) {
+    std::cerr << error.what() << '\n';
+    return EXIT_FAILURE;
+} catch (...) {
+    std::cerr << "Non-exception object thrown\n";
+    return EXIT_FAILURE;
+}
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp
index 348559f3a0..ce250696e4 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp
@@ -33,7 +33,6 @@ struct GenerationConfig {
 
     size_t m_num_return_sequences = 3;  // is used by beam search, in other case is equal to batch size
     StopCriteria stop_criteria = StopCriteria::heuristic;
-    float m_repetition_penalty = 1.0f;
     float m_length_penalty = 1.0f;
     size_t m_no_repeat_ngram_size = std::numeric_limits<size_t>::max();
 
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
deleted file mode 100644
index 61aa4e274b..0000000000
--- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (C) 2023-2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include <openvino/openvino.hpp>
-#include "generate_pipeline.hpp"
-
-namespace {
-
-constexpr size_t BATCH_SIZE = 1;
-
-}  // namespace
-
-using namespace std;
-
-struct TextStreamer {
-    LLMPipeline pipe;
-    std::vector<int64_t> token_cache;
-    size_t print_len = 0;
-
-    void put(int64_t token) {
-        token_cache.push_back(token);
-        std::string text = pipe.detokenize(token_cache);
-        if (!text.empty() && '\n' == text.back()) {
-            // Flush the cache after the new line symbol
-            std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
-            token_cache.clear();
-            print_len = 0;
-            return;
-        }
-        if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
-            // Don't print incomplete text
-            return;
-        }
-        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
-        print_len = text.size();
-    }
-
-    void end() {
-        std::string text = pipe.detokenize(token_cache);
-        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
-        token_cache.clear();
-        print_len = 0;
-    }
-};
-
-int main(int argc, char* argv[]) try {
-    {
-        // PIPELINE Ex.1
-        std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
-        LLMPipeline pipe(model_path, "CPU");
-        GenerationConfig config = pipe.generation_config();
-
-        auto text_streamer = TextStreamer{pipe};
-        auto print_text_callback = [&text_streamer](std::vector<int64_t>&& tokens, LLMPipeline& pipe){
-            text_streamer.put(tokens[0]);
-        };
-
-        pipe("table is made of", config.max_new_tokens(100).set_callback(print_text_callback));
-        text_streamer.end();
-        cout << endl << "------------- END OF GENERATE -------------" << endl;
-    }
-
-    {
-        // PIPELINE Ex.2
-        std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
-        LLMPipeline pipe(model_path, "CPU");
-        GenerationConfig config = pipe.generation_config();
-        // batched inputs
-        auto results = pipe({"table is made of",
-                             "Alan Turing was a",
-                             "1 + 1 = ",
-                             "Why is the Sun yellow?"
-                             }, config.do_sample(false).max_new_tokens(100));
-
-        for (const auto& res: results) {
-            cout << res << endl;
-            cout << "-------------------" << endl;
-        }
-    }
-
-    // GENERATE
-    std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
-    LLMPipeline pipe(model_path);
-    auto [input_ids, attention_mask] = pipe.tokenize("table is made of");
-    auto res = pipe.generate(input_ids, attention_mask);
-    std::cout << pipe.detokenize(res)[0];
-} catch (const std::exception& error) {
-    std::cerr << error.what() << '\n';
-    return EXIT_FAILURE;
-} catch (...) {
-    std::cerr << "Non-exception object thrown\n";
-    return EXIT_FAILURE;
-}
diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
index 2128f988ca..e23c277e52 100644
--- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp
+++ b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -108,9 +108,9 @@ struct Group {
         }
 
         min_heap.push_back(std::move(beam));
-        std::push_heap(min_heap.begin(), min_heap.end(), greater);
+        std::push_heap(min_heap.begin(), min_heap.end(), ::greater);
         if (min_heap.size() > parameters.group_size) {
-            std::pop_heap(min_heap.begin(), min_heap.end(), greater);
+            std::pop_heap(min_heap.begin(), min_heap.end(), ::greater);
             min_heap.pop_back();
         }
     }
@@ -223,7 +223,7 @@ struct GroupBeamSearcher {
                 throw std::runtime_error("No beams left to search");
             }
             auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size);
-            std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater);
+            std::partial_sort(candidates.begin(), to_sort, candidates.end(), ::greater);
             group->ongoing.clear();
             for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) {
                 if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) {