cleanup generate_sample.cpp
pavel-esir committed Apr 11, 2024
1 parent 4cc0c64 commit d551da3
Showing 6 changed files with 135 additions and 106 deletions.
6 changes: 3 additions & 3 deletions text_generation/causal_lm/cpp/CMakeLists.txt
@@ -17,7 +17,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)

set(TARGET_NAME beam_search_sample)
set(TARGET_NAME beam_search_causal_lm)
add_executable(${TARGET_NAME} beam_search_causal_lm.cpp)
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\")
@@ -35,8 +35,8 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)

set(TARGET_NAME generate_pipeline)
add_executable(${TARGET_NAME} generate_pipeline/main.cpp)
set(TARGET_NAME generate_sample)
add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp)
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\")
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
@@ -9,7 +9,7 @@
#include <filesystem>
#include "group_beam_searcher.hpp"

using GenerationResult = std::vector<std::vector<int64_t>>;
using GenerationResult = std::vector<std::pair<float, std::vector<int64_t>>>;
using namespace std;

std::pair<ov::Tensor, ov::Tensor> pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2) {
@@ -124,7 +124,8 @@ class LLMPipeline {
if (!is_xml(full_path))
full_path += "/openvino_model.xml";
m_model_runner = core.compile_model(full_path, device, config).create_infer_request();


// todo: add loading EOS_TOKEN_ID from IR
core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
// tokenizer and detokenizer work on CPU only
full_path = tokenizer_path;
@@ -205,6 +206,11 @@ class LLMPipeline {

return {m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")};
}

std::pair<ov::Tensor, ov::Tensor> tokenize(std::initializer_list<std::string> text) {
return tokenize(std::vector<std::string>(text.begin(), text.end()));
}


std::string detokenize(std::vector<int64_t> tokens) {
size_t batch_size = 1;
@@ -231,7 +237,7 @@ class LLMPipeline {
// todo: implement calling detokenizer in a single batch

std::vector<std::string> strings;
for (auto& line: lines){
for (auto& [score, line]: lines){
ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()};
m_detokenizer.set_input_tensor(tokens);
m_detokenizer.infer();
@@ -281,13 +287,14 @@ class LLMPipeline {
for (size_t batch = 0; batch < batch_size; ++batch) {
const float * logits_data = logits.data<const float>() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size;
int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data;
results[batch].emplace_back(out_token);
results[batch].second.emplace_back(out_token);
token_iter_results[batch] = out_token;
eos_met[batch] = (out_token == sampling_params.m_eos_token_id);

m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
m_model_runner.get_tensor("position_ids").data<int64_t>()[batch] = int64_t(prompt_len + i);
}
// place
sampling_params.m_callback(std::move(token_iter_results), *this);

// stop generation when EOS is met in all batches
@@ -348,13 +355,16 @@ class LLMPipeline {

m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});
std::fill_n(m_model_runner.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1);

// place
sampling_params.m_callback(std::move(next_tokens), *this);

}

std::vector<Beam> beams;
for (const std::vector<Beam>& group : finalize(std::move(group_beam_searcher))) {
for (const Beam& beam : group) {
beams.emplace_back(beam);
// results.emplace_back(beam.tokens);
}
}

@@ -363,7 +373,7 @@ class LLMPipeline {

GenerationResult results;
for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) {
results.emplace_back(beam->tokens);
results.emplace_back(std::pair(beam->score, beam->tokens));
}
return results;
}
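
Note on the GenerationResult change above: each entry now carries the beam score alongside the token ids, so callers iterate pairs instead of plain token vectors. A minimal consumption sketch (illustrative only; it assumes an existing LLMPipeline pipe, tokenized input_ids/attention_mask, and a beam-search config as in the sample below):

// GenerationResult is std::vector<std::pair<float, std::vector<int64_t>>>
GenerationResult results = pipe.generate(input_ids, attention_mask, config);
for (const auto& [score, tokens] : results) {
    // score is the beam's score; tokens are the generated token ids
    std::cout << score << ": " << pipe.detokenize(tokens) << '\n';
}
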
113 changes: 113 additions & 0 deletions text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
@@ -0,0 +1,113 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <openvino/openvino.hpp>
#include "generate_pipeline.hpp"


// The following reasons require TextStreamer to keep a cache of previous tokens:
// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a",
// but detokenize(tokenize("prefix a")) == "prefix a"
// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�"
struct TextStreamer {
LLMPipeline pipe;
std::vector<int64_t> token_cache;
size_t print_len = 0;

void put(int64_t token) {
token_cache.push_back(token);
std::string text = pipe.detokenize(token_cache);
if (!text.empty() && '\n' == text.back()) {
// Flush the cache after the new line symbol
std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
token_cache.clear();
print_len = 0;
return;
}
if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
// Don't print incomplete text
return;
}
std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
print_len = text.size();
}

void end() {
std::string text = pipe.detokenize(token_cache);
std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
token_cache.clear();
print_len = 0;
}
};

int main(int argc, char* argv[]) try {
if (argc < 2 || argc > 4)
throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\" <DEVICE>");

std::string prompt = "table is made of";
std::string device = "CPU"; // can be replaced with GPU

std::string model_path = argv[1];
if (argc > 2)
prompt = argv[2];
if (argc > 3)
device = argv[3];

// Example 1: TextStreaming example with greedy search
LLMPipeline pipe(model_path, device);
// Will try to load config from generation_config.json,
// but if it's not found, default values for greedy search will be used
GenerationConfig config = pipe.generation_config();

auto text_streamer = TextStreamer{pipe};
auto text_streamer_callback = [&text_streamer](std::vector<int64_t>&& tokens, LLMPipeline& pipe){
text_streamer.put(tokens[0]);
};

cout << "greedy generate streaming mode:" << endl;
config.max_new_tokens(20).set_callback(text_streamer_callback);
pipe(prompt, config);
text_streamer.end();

// Example 2: Grouped Beam Search decoding example
pipe = LLMPipeline(model_path, device);
config = pipe.generation_config();

// will return vector with num_return_sequences strings
auto num_return_sequences = 3;
config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences);

cout << endl << "grouped beam search generated candidates:" << endl;
auto generation_results = pipe({prompt}, config);
for (int i = 0; i < num_return_sequences; ++i)
cout << "candidate " << i << ": " << generation_results[i] << endl;

// Example 3: Greedy Decoding with multiple batch
pipe = LLMPipeline(model_path, device);
config = pipe.generation_config();

cout << endl << "greedy decoding with multiple batches:" << endl;
std::vector<std::string> prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"};
auto results = pipe(prompts, config.max_new_tokens(20));
for (int i = 0; i < prompts.size(); i++)
cout << prompts[i] << ": " << results[i] << endl;

// Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates
pipe = LLMPipeline(model_path);
auto [input_ids, attention_mask] = pipe.tokenize({prompt});
config = GenerationConfig::beam_search();
// config for grouped beam search
config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15);

cout << endl << "beam search with printing of all candidates:" << endl;
auto beams = pipe.generate(input_ids, attention_mask, config);
for (const auto& beam : beams)
std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl;

} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
return EXIT_FAILURE;
} catch (...) {
std::cerr << "Non-exception object thrown\n";
return EXIT_FAILURE;
}
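
The greedy-streaming example above wires the per-step callback to a TextStreamer. As a sketch of an alternative use of the same hook (illustrative only; collected and collecting_callback are hypothetical names, and the callback signature matches the one used in Example 1), the tokens could instead be accumulated and detokenized once at the end:

std::vector<int64_t> collected;
auto collecting_callback = [&collected](std::vector<int64_t>&& tokens, LLMPipeline& pipe) {
    // batch size is 1 in the greedy example, so tokens[0] is the newly generated id
    collected.push_back(tokens[0]);
};
config.max_new_tokens(20).set_callback(collecting_callback);
pipe(prompt, config);
std::cout << pipe.detokenize(collected) << '\n';
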
@@ -33,7 +33,6 @@ struct GenerationConfig {
size_t m_num_return_sequences = 3; // used by beam search; otherwise it is equal to the batch size
StopCriteria stop_criteria = StopCriteria::heuristic;


float m_repetition_penalty = 1.0f;
float m_length_penalty = 1.0f;
size_t m_no_repeat_ngram_size = std::numeric_limits<size_t>::max();
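
The fields above back the fluent setters used in generate_sample.cpp. A minimal configuration sketch using only the setters that appear in this commit (max_new_tokens, num_groups, group_size, num_return_sequences); every other field keeps its default:

GenerationConfig config = GenerationConfig::beam_search();
config.max_new_tokens(30)
      .num_groups(3)
      .group_size(5)
      .num_return_sequences(15);  // 15 = num_groups * group_size here
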
93 changes: 0 additions & 93 deletions text_generation/causal_lm/cpp/generate_pipeline/main.cpp

This file was deleted.

6 changes: 3 additions & 3 deletions text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -108,9 +108,9 @@ struct Group {
}

min_heap.push_back(std::move(beam));
std::push_heap(min_heap.begin(), min_heap.end(), greater);
std::push_heap(min_heap.begin(), min_heap.end(), ::greater);
if (min_heap.size() > parameters.group_size) {
std::pop_heap(min_heap.begin(), min_heap.end(), greater);
std::pop_heap(min_heap.begin(), min_heap.end(), ::greater);
min_heap.pop_back();
}
}
@@ -223,7 +223,7 @@ struct GroupBeamSearcher {
throw std::runtime_error("No beams left to search");
}
auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size);
std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater);
std::partial_sort(candidates.begin(), to_sort, candidates.end(), ::greater);
group->ongoing.clear();
for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) {
if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) {
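
Qualifying greater as ::greater above pins the comparator to the free function at global scope, which avoids ambiguity with std::greater from <functional> once a using namespace std; directive is in scope. A self-contained sketch of that collision (names here are illustrative, not from the repository):

#include <algorithm>
#include <functional>
#include <vector>
using namespace std;

struct Item { float score; };
bool greater(const Item& a, const Item& b) { return a.score > b.score; }  // free function

int main() {
    std::vector<Item> v{{0.1f}, {0.7f}, {0.3f}};
    // An unqualified 'greater' here could name either ::greater or std::greater
    // and fails to compile as ambiguous; '::' selects the free function.
    std::sort(v.begin(), v.end(), ::greater);
}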
