From d551da3913d2691c9a1bd300086b35eec1b51985 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Thu, 11 Apr 2024 15:18:15 +0200
Subject: [PATCH] cleanup generate_sample.cpp

---
 text_generation/causal_lm/cpp/CMakeLists.txt  |   6 +-
 .../generate_pipeline/generate_pipeline.hpp   |  22 +++-
 .../cpp/generate_pipeline/generate_sample.cpp | 113 ++++++++++++++++++
 .../generate_pipeline/generation_config.hpp   |   1 -
 .../causal_lm/cpp/generate_pipeline/main.cpp  |  93 --------------
 .../causal_lm/cpp/group_beam_searcher.hpp     |   6 +-
 6 files changed, 135 insertions(+), 106 deletions(-)
 create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
 delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/main.cpp

diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt
index 7c75aad0af..26e99843d1 100644
--- a/text_generation/causal_lm/cpp/CMakeLists.txt
+++ b/text_generation/causal_lm/cpp/CMakeLists.txt
@@ -17,7 +17,7 @@
 target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
 set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
 set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)
-set(TARGET_NAME beam_search_sample)
+set(TARGET_NAME beam_search_causal_lm)
 add_executable(${TARGET_NAME} beam_search_causal_lm.cpp)
 target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
 target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\")
@@ -35,8 +35,8 @@
 target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
 set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
 set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)
-set(TARGET_NAME generate_pipeline)
-add_executable(${TARGET_NAME} generate_pipeline/main.cpp)
+set(TARGET_NAME generate_sample)
+add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp)
 target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
 target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\")
 find_package(OpenVINO REQUIRED COMPONENTS Runtime)
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
index 66cb2f8eaa..5ed6d1b65d 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
@@ -9,7 +9,7 @@
 #include <openvino/openvino.hpp>
 #include "group_beam_searcher.hpp"
 
-using GenerationResult = std::vector<std::vector<int64_t>>;
+using GenerationResult = std::vector<std::pair<float, std::vector<int64_t>>>;
 using namespace std;
 
 std::pair<ov::Tensor, ov::Tensor> pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2) {
@@ -124,7 +124,8 @@
         if (!is_xml(full_path))
             full_path += "/openvino_model.xml";
         m_model_runner = core.compile_model(full_path, device, config).create_infer_request();
-        
+
+        // todo: add loading EOS_TOKEN_ID from IR
         core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
         // tokenizer and detokenizer work on CPU only
         full_path = tokenizer_path;
@@ -205,6 +206,11 @@
 
         return {m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")};
     }
+
+    std::pair<ov::Tensor, ov::Tensor> tokenize(std::initializer_list<std::string> text) {
+        return tokenize(std::vector<std::string>(text.begin(), text.end()));
+    }
+
 
     std::string detokenize(std::vector<int64_t> tokens) {
         size_t batch_size = 1;
@@ -231,7 +237,7 @@
 
         // todo: implement calling detokenizer in a single batch
         std::vector<std::string> strings;
-        for (auto& line: lines){
+        for (auto& [score, line]: lines){
             ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()};
             m_detokenizer.set_input_tensor(tokens);
             m_detokenizer.infer();
@@ -281,13 +287,14 @@
         for (size_t batch = 0; batch < batch_size; ++batch) {
             const float * logits_data = logits.data<float>() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size;
             int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data;
-            results[batch].emplace_back(out_token);
+            results[batch].second.emplace_back(out_token);
             token_iter_results[batch] = out_token;
             eos_met[batch] != (out_token == sampling_params.m_eos_token_id);
 
             m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
             m_model_runner.get_tensor("position_ids").data<int64_t>()[batch] = int64_t(prompt_len + i);
         }
+        // place
         sampling_params.m_callback(std::move(token_iter_results), *this);
 
         // stop generation when EOS is met in all batches
@@ -348,13 +355,16 @@
 
             m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});
             std::fill_n(m_model_runner.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1);
+
+            // place
+            sampling_params.m_callback(std::move(next_tokens), *this);
+
         }
 
         std::vector<Beam> beams;
         for (const std::vector<Beam>& group : finalize(std::move(group_beam_searcher))) {
             for (const Beam& beam : group) {
                 beams.emplace_back(beam);
-                // results.emplace_back(beam.tokens);
             }
         }
 
@@ -363,7 +373,7 @@
 
         GenerationResult results;
         for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) {
-            results.emplace_back(beam->tokens);
+            results.emplace_back(std::pair(beam->score, beam->tokens));
         }
         return results;
     }
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
new file mode 100644
index 0000000000..9af5e474a6
--- /dev/null
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
@@ -0,0 +1,113 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <openvino/openvino.hpp>
+#include "generate_pipeline.hpp"
+
+
+// The following reasons require TextStreamer to keep a cache of previous tokens:
+// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a",
+// but detokenize(tokenize("prefix a")) == "prefix a"
+// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�"
+struct TextStreamer {
+    LLMPipeline pipe;
+    std::vector<int64_t> token_cache;
+    size_t print_len = 0;
+
+    void put(int64_t token) {
+        token_cache.push_back(token);
+        std::string text = pipe.detokenize(token_cache);
+        if (!text.empty() && '\n' == text.back()) {
+            // Flush the cache after the new line symbol
+            std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
+            token_cache.clear();
+            print_len = 0;
+            return;
+        }
+        if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
+            // Don't print incomplete text
+            return;
+        }
+        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
+        print_len = text.size();
+    }
+
+    void end() {
+        std::string text = pipe.detokenize(token_cache);
+        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
+        token_cache.clear();
+        print_len = 0;
+    }
+};
+
+int main(int argc, char* argv[]) try {
+    if (argc < 2 || argc > 4)
+        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\" <DEVICE>");
+
+    std::string prompt = "table is made of";
+    std::string device = "CPU"; // can be replaced with GPU
+
+    std::string model_path = argv[1];
+    if (argc > 2)
+        prompt = argv[2];
+    if (argc > 3)
+        device = argv[3];
+
+    // Example 1: TextStreaming example with greedy search
+    LLMPipeline pipe(model_path, device);
+    // Will try to load config from generation_config.json,
+    // but if not found default values for greedy search will be used.
+    GenerationConfig config = pipe.generation_config();
+
+    auto text_streamer = TextStreamer{pipe};
+    auto text_streamer_callback = [&text_streamer](std::vector<int64_t>&& tokens, LLMPipeline& pipe){
+        text_streamer.put(tokens[0]);
+    };
+
+    cout << "greedy generate streaming mode:" << endl;
+    config.max_new_tokens(20).set_callback(text_streamer_callback);
+    pipe(prompt, config);
+    text_streamer.end();
+
+    // Example 2: Grouped Beam Search decoding example
+    pipe = LLMPipeline(model_path, device);
+    config = pipe.generation_config();
+
+    // will return a vector with num_return_sequences strings
+    auto num_return_sequences = 3;
+    config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences);
+
+    cout << endl << "grouped beam search generated candidates:" << endl;
+    auto generation_results = pipe({prompt}, config);
+    for (int i = 0; i < num_return_sequences; ++i)
+        cout << "candidate " << i << ": " << generation_results[i] << endl;
+
+    // Example 3: Greedy decoding with multiple batches
+    pipe = LLMPipeline(model_path, device);
+    config = pipe.generation_config();
+
+    cout << endl << "greedy decoding with multiple batches:" << endl;
+    std::vector<std::string> prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"};
+    auto results = pipe(prompts, config.max_new_tokens(20));
+    for (int i = 0; i < prompts.size(); i++)
+        cout << prompts[i] << ": " << results[i] << endl;
+
+    // Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates
+    pipe = LLMPipeline(model_path);
+    auto [input_ids, attention_mask] = pipe.tokenize({prompt});
+    config = GenerationConfig::beam_search();
+    // config for grouped beam search
+    config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15);
+
+    cout << endl << "beam search with printing of all candidates:" << endl;
+    auto beams = pipe.generate(input_ids, attention_mask, config);
+    for (const auto& beam : beams)
+        std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl;
+
+} catch (const std::exception& error) {
+    std::cerr << error.what() << '\n';
+    return EXIT_FAILURE;
+} catch (...) {
+    std::cerr << "Non-exception object thrown\n";
+    return EXIT_FAILURE;
+}
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp
index 348559f3a0..ce250696e4 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp
@@ -33,7 +33,6 @@ struct GenerationConfig {
 
     size_t m_num_return_sequences = 3;  // is used by beam search, in other case is equal to batch size
     StopCriteria stop_criteria = StopCriteria::heuristic;
-    float m_repetition_penalty = 1.0f;
     float m_length_penalty = 1.0f;
     size_t m_no_repeat_ngram_size = std::numeric_limits<size_t>::max();
 
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
deleted file mode 100644
index 61aa4e274b..0000000000
--- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (C) 2023-2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include <openvino/openvino.hpp>
-#include "generate_pipeline.hpp"
-
-namespace {
-
-constexpr size_t BATCH_SIZE = 1;
-
-}  // namespace
-
-using namespace std;
-
-struct TextStreamer {
-    LLMPipeline pipe;
-    std::vector<int64_t> token_cache;
-    size_t print_len = 0;
-
-    void put(int64_t token) {
-        token_cache.push_back(token);
-        std::string text = pipe.detokenize(token_cache);
-        if (!text.empty() && '\n' == text.back()) {
-            // Flush the cache after the new line symbol
-            std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
-            token_cache.clear();
-            print_len = 0;
-            return;
-        }
-        if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
-            // Don't print incomplete text
-            return;
-        }
-        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
-        print_len = text.size();
-    }
-
-    void end() {
-        std::string text = pipe.detokenize(token_cache);
-        std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
-        token_cache.clear();
-        print_len = 0;
-    }
-};
-
-int main(int argc, char* argv[]) try {
-    {
-        // PIPELINE Ex.1
-        std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
-        LLMPipeline pipe(model_path, "CPU");
-        GenerationConfig config = pipe.generation_config();
-
-        auto text_streamer = TextStreamer{pipe};
-        auto print_text_callback = [&text_streamer](std::vector<int64_t>&& tokens, LLMPipeline& pipe){
-            text_streamer.put(tokens[0]);
-        };
-
-        pipe("table is made of", config.max_new_tokens(100).set_callback(print_text_callback));
-        text_streamer.end();
-        cout << endl << "------------- END OF GENERATE -------------" << endl;
-    }
-
-    {
-        // PIPELINE Ex.2
-        std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
-        LLMPipeline pipe(model_path, "CPU");
-        GenerationConfig config = pipe.generation_config();
-        // batched inputs
-        auto results = pipe({"table is made of",
-                             "Alan Turing was a",
-                             "1 + 1 = ",
-                             "Why is the Sun yellow?"
-                             }, config.do_sample(false).max_new_tokens(100));
-
-        for (const auto& res: results) {
-            cout << res << endl;
-            cout << "-------------------" << endl;
-        }
-    }
-
-    // GENERATE
-    std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
-    LLMPipeline pipe(model_path);
-    auto [input_ids, attention_mask] = pipe.tokenize("table is made of");
-    auto res = pipe.generate(input_ids, attention_mask);
-    std::cout << pipe.detokenize(res)[0];
-} catch (const std::exception& error) {
-    std::cerr << error.what() << '\n';
-    return EXIT_FAILURE;
-} catch (...) {
-    std::cerr << "Non-exception object thrown\n";
-    return EXIT_FAILURE;
-}
diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
index 2128f988ca..e23c277e52 100644
--- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp
+++ b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -108,9 +108,9 @@ struct Group {
         }
 
         min_heap.push_back(std::move(beam));
-        std::push_heap(min_heap.begin(), min_heap.end(), greater);
+        std::push_heap(min_heap.begin(), min_heap.end(), ::greater);
         if (min_heap.size() > parameters.group_size) {
-            std::pop_heap(min_heap.begin(), min_heap.end(), greater);
+            std::pop_heap(min_heap.begin(), min_heap.end(), ::greater);
             min_heap.pop_back();
         }
     }
@@ -223,7 +223,7 @@ struct GroupBeamSearcher {
                 throw std::runtime_error("No beams left to search");
             }
             auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size);
-            std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater);
+            std::partial_sort(candidates.begin(), to_sort, candidates.end(), ::greater);
             group->ongoing.clear();
             for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) {
                 if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) {