Skip to content

Commit

Permalink
reorder tokenizer.cpp, add comments to BaseStreamer
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed May 13, 2024
1 parent af22a8a commit a111a3f
Show file tree
Hide file tree
Showing 9 changed files with 168 additions and 136 deletions.
43 changes: 32 additions & 11 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param streamer optional streamer
* @return std::string decoded resulting text
*/
std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer);
std::string generate(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt);


template <typename... Properties>
util::EnableIfAllStringAny<std::string, Properties...> generate(
std::string text,
Expand Down Expand Up @@ -124,22 +123,21 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
*/
EncodedResults generate(ov::Tensor input_ids,
std::optional<ov::Tensor> attention_mask,
OptionalGenerationConfig generation_config,
OptionalStreamerVariant streamer);
OptionalGenerationConfig generation_config=nullopt,
OptionalStreamerVariant streamer=nullopt);

template <typename InputsType, typename... Properties>
util::EnableIfAllStringAny<std::string, Properties...> operator()(
InputsType text,
Properties&&... properties) {
return generate(text, AnyMap{std::forward<Properties>(properties)...});
}
std::string operator()(std::string text, OptionalGenerationConfig generation_config={});

DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config);
DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config);
DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config=nullopt);
DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config=nullopt);

// generate with streamers
std::string operator()(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer);
std::string operator()(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt);
std::string operator()(std::string text, OptionalStreamerVariant streamer);

ov::Tokenizer get_tokenizer();
Expand All @@ -162,10 +160,33 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* All names match to names in config except streamer.
*/
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};

static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
static constexpr ov::Property<size_t> num_beams{"num_beams"};
static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
static constexpr ov::Property<float> length_penalty{"length_penalty"};
static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};

static constexpr ov::Property<float> temperature{"temperature"};
static constexpr ov::Property<float> top_p{"top_p"};
static constexpr ov::Property<int> top_k{"top_k"};
static constexpr ov::Property<bool> do_sample{"do_sample"};
static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};


static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
static constexpr ov::Property<int64_t> bos_token_id{"bos_token_id"};
static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};

static constexpr ov::Property<std::string> bos_token{"bos_token"};
static constexpr ov::Property<std::string> eos_token{"eos_token"};

// It's problematic to store and automatically convert std::variant in AnyMap
static constexpr ov::Property<std::function<void (std::string)>> streamer_lambda{"streamer_lambda"};
static constexpr ov::Property<std::shared_ptr<StreamerBase>> streamer{"streamer"};
// only a lambda streamer can be set via ov::streamer(),... syntactic sugar,
// because std::variant<StreamerBase, std::function<>> cannot be stored in AnyMap
static constexpr ov::Property<std::function<void (std::string)>> streamer_lambda{"streamer"};

} // namespace ov
13 changes: 12 additions & 1 deletion src/cpp/include/openvino/genai/streamer_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,21 @@

namespace ov {

/**
 * @brief Base class for streamers. To use it, inherit from this class and implement
 * the put() and end() methods.
 *
 * @param m_tokenizer tokenizer used by streamer implementations to decode tokens into text
 */
class StreamerBase {
public:
    Tokenizer m_tokenizer;

    StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {}
    StreamerBase() = default;

    // Instances are owned and destroyed through base-class pointers
    // (e.g. std::shared_ptr<StreamerBase>); deleting a derived object through
    // a base pointer without a virtual destructor is undefined behavior.
    virtual ~StreamerBase() = default;

    /// @brief put is called every time a new token is decoded
    virtual void put(int64_t token) = 0;

    /// @brief end is called at the end of generation. It can be used to flush the cache
    /// if your own streamer has one
    virtual void end() = 0;
};

Expand Down
4 changes: 2 additions & 2 deletions src/cpp/include/openvino/genai/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
namespace ov {

/**
* @brief class used to encode prompts and decode resulting tokens
* @brief class is used to encode prompts and decode resulting tokens
*/
class OPENVINO_GENAI_EXPORTS Tokenizer {
public:
Expand All @@ -27,7 +27,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @brief encode a single prompt
* @return pair of [input_ids, attention_mask]
*/
std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt); // todo: passing by reference fails
std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt);

/**
* @brief encode batch of prompts. Left padding will be applied by default
Expand Down
2 changes: 2 additions & 0 deletions src/cpp/src/greedy_decoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner,
if (!generation_config.ignore_eos && all_are_eos)
break;
}
if (streamer)
streamer->end();
return results;
}

Expand Down
23 changes: 5 additions & 18 deletions src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,6 @@ ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector<std::s
return {m_tokenizer.decode(generate_results.tokens), generate_results.scores};
}

std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config) {
OptionalStreamerVariant empty_streamer;
return generate(text, generation_config, empty_streamer);
}

ov::DecodedResults ov::LLMPipeline::operator()(std::vector<std::string> texts, OptionalGenerationConfig generation_config) {
return m_pimpl-> generate(texts, generation_config);
}
Expand Down Expand Up @@ -245,16 +240,11 @@ std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig
return m_pimpl->generate(text, generation_config, streamer);
}


std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) {
OptionalStreamerVariant streamer;
auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map);

// todo: get attentions from properties?
if (config_map.count("streamer_lambda")) {
streamer = config_map.at("streamer_lambda").as<std::function<void (std::string)>>();
} else if (config_map.count("streamer")) {
streamer = config_map.at("streamer").as<std::shared_ptr<StreamerBase>>();
if (config_map.count("streamer")) {
streamer = config_map.at("streamer").as<std::function<void (std::string)>>();
}

return m_pimpl->generate(text, config, streamer);
Expand All @@ -263,13 +253,10 @@ std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config
ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) {
OptionalStreamerVariant streamer;
auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map);

// todo: get attentions from properties?
if (config_map.count("streamer_lambda")) {
streamer = config_map.at("streamer_lambda").as<std::function<void (std::string)>>();
} else if (config_map.count("streamer")) {
streamer = config_map.at("streamer").as<std::shared_ptr<StreamerBase>>();
if (config_map.count("streamer")) {
streamer = config_map.at("streamer").as<std::function<void (std::string)>>();
}

std::optional<ov::Tensor> attention_mask;
return m_pimpl->generate(input_ids, attention_mask, config, streamer);
}
Expand Down
7 changes: 2 additions & 5 deletions src/cpp/src/text_callback_streamer.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "text_callback_streamer.hpp"

namespace ov {


TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function<void (std::string)> callback, bool print_eos_token) {
m_tokenizer = tokenizer;
Expand All @@ -17,11 +16,9 @@ TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool prin

void TextCallbackStreamer::put(int64_t token) {
std::stringstream res;
// do not print anything and flush cache if EOS token is met
if (token == m_tokenizer.get_eos_token_id()) {
end();
// do nothing if <eos> token is met and if print_eos_token=false
if (!m_print_eos_token && token == m_tokenizer.get_eos_token_id())
return;
}

m_tokens_cache.push_back(token);
std::string text = m_tokenizer.decode(m_tokens_cache);
Expand Down
Loading

0 comments on commit a111a3f

Please sign in to comment.