Skip to content

Commit

Permalink
reorder tokenizer.cpp, add comments to BaseStreamer
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed May 13, 2024
1 parent af22a8a commit a111a3f
Show file tree
Hide file tree
Showing 9 changed files with 168 additions and 136 deletions.
43 changes: 32 additions & 11 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param streamer optional streamer
* @return std::string decoded resulting text
*/
std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer);
std::string generate(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt);


template <typename... Properties>
util::EnableIfAllStringAny<std::string, Properties...> generate(
std::string text,
Expand Down Expand Up @@ -124,22 +123,21 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
*/
EncodedResults generate(ov::Tensor input_ids,
std::optional<ov::Tensor> attention_mask,
OptionalGenerationConfig generation_config,
OptionalStreamerVariant streamer);
OptionalGenerationConfig generation_config=nullopt,
OptionalStreamerVariant streamer=nullopt);

template <typename InputsType, typename... Properties>
util::EnableIfAllStringAny<std::string, Properties...> operator()(
InputsType text,
Properties&&... properties) {
return generate(text, AnyMap{std::forward<Properties>(properties)...});
}
std::string operator()(std::string text, OptionalGenerationConfig generation_config={});

DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config);
DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config);
DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config=nullopt);
DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config=nullopt);

// generate with streamers
std::string operator()(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer);
std::string operator()(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt);
std::string operator()(std::string text, OptionalStreamerVariant streamer);

ov::Tokenizer get_tokenizer();
Expand All @@ -162,10 +160,33 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* All names match to names in config except streamer.
*/
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};

static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
static constexpr ov::Property<size_t> num_beams{"num_beams"};
static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
static constexpr ov::Property<float> length_penalty{"length_penalty"};
static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};

static constexpr ov::Property<float> temperature{"temperature"};
static constexpr ov::Property<float> top_p{"top_p"};
static constexpr ov::Property<int> top_k{"top_k"};
static constexpr ov::Property<bool> do_sample{"do_sample"};
static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};


static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
static constexpr ov::Property<int64_t> bos_token_id{"bos_token_id"};
static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};

static constexpr ov::Property<std::string> bos_token{"bos_token"};
static constexpr ov::Property<std::string> eos_token{"eos_token"};

// It's problematic to store and automatically convert std::variant in AnyMap
static constexpr ov::Property<std::function<void (std::string)>> streamer_lambda{"streamer_lambda"};
static constexpr ov::Property<std::shared_ptr<StreamerBase>> streamer{"streamer"};
// only a lambda streamer can be set via ov::streamer(),... syntactic sugar,
// because std::variant<StreamerBase, std::function<>> cannot be stored in AnyMap
static constexpr ov::Property<std::function<void (std::string)>> streamer_lambda{"streamer"};

} // namespace ov
13 changes: 12 additions & 1 deletion src/cpp/include/openvino/genai/streamer_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,21 @@

namespace ov {

/**
 * @brief Base class for streamers. To use it, inherit from this class and implement
 * the put() and end() methods.
 *
 * @param m_tokenizer tokenizer used by streamer implementations to decode tokens into text
 */
class StreamerBase {
public:
    Tokenizer m_tokenizer;

    StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {}
    StreamerBase() = default;

    // Instances are owned and destroyed through base-class pointers
    // (e.g. std::shared_ptr<StreamerBase>); deleting a derived object through
    // a base pointer without a virtual destructor is undefined behavior.
    virtual ~StreamerBase() = default;

    /// @brief put is called every time a new token is decoded
    virtual void put(int64_t token) = 0;

    /// @brief end is called at the end of generation. It can be used to flush the cache
    /// if your own streamer has one
    virtual void end() = 0;
};

Expand Down
4 changes: 2 additions & 2 deletions src/cpp/include/openvino/genai/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
namespace ov {

/**
* @brief class used to encode prompts and decode resulting tokens
* @brief class is used to encode prompts and decode resulting tokens
*/
class OPENVINO_GENAI_EXPORTS Tokenizer {
public:
Expand All @@ -27,7 +27,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @brief encode a single prompt
* @return pair of [input_ids, attention_mask]
*/
std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt); // todo: passing by reference fails
std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt);

/**
* @brief encode batch of prompts. Left padding will be applied by default
Expand Down
2 changes: 2 additions & 0 deletions src/cpp/src/greedy_decoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner,
if (!generation_config.ignore_eos && all_are_eos)
break;
}
if (streamer)
streamer->end();
return results;
}

Expand Down
23 changes: 5 additions & 18 deletions src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,6 @@ ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector<std::s
return {m_tokenizer.decode(generate_results.tokens), generate_results.scores};
}

std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config) {
OptionalStreamerVariant empty_streamer;
return generate(text, generation_config, empty_streamer);
}

ov::DecodedResults ov::LLMPipeline::operator()(std::vector<std::string> texts, OptionalGenerationConfig generation_config) {
return m_pimpl-> generate(texts, generation_config);
}
Expand Down Expand Up @@ -245,16 +240,11 @@ std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig
return m_pimpl->generate(text, generation_config, streamer);
}


std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) {
OptionalStreamerVariant streamer;
auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map);

// todo: get attentions from properties?
if (config_map.count("streamer_lambda")) {
streamer = config_map.at("streamer_lambda").as<std::function<void (std::string)>>();
} else if (config_map.count("streamer")) {
streamer = config_map.at("streamer").as<std::shared_ptr<StreamerBase>>();
if (config_map.count("streamer")) {
streamer = config_map.at("streamer").as<std::function<void (std::string)>>();
}

return m_pimpl->generate(text, config, streamer);
Expand All @@ -263,13 +253,10 @@ std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config
ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) {
OptionalStreamerVariant streamer;
auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map);

// todo: get attentions from properties?
if (config_map.count("streamer_lambda")) {
streamer = config_map.at("streamer_lambda").as<std::function<void (std::string)>>();
} else if (config_map.count("streamer")) {
streamer = config_map.at("streamer").as<std::shared_ptr<StreamerBase>>();
if (config_map.count("streamer")) {
streamer = config_map.at("streamer").as<std::function<void (std::string)>>();
}

std::optional<ov::Tensor> attention_mask;
return m_pimpl->generate(input_ids, attention_mask, config, streamer);
}
Expand Down
7 changes: 2 additions & 5 deletions src/cpp/src/text_callback_streamer.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "text_callback_streamer.hpp"

namespace ov {


TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function<void (std::string)> callback, bool print_eos_token) {
m_tokenizer = tokenizer;
Expand All @@ -17,11 +16,9 @@ TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool prin

void TextCallbackStreamer::put(int64_t token) {
std::stringstream res;
// do not print anything and flush cache if EOS token is met
if (token == m_tokenizer.get_eos_token_id()) {
end();
// do nothing if <eos> token is met and if print_eos_token=false
if (!m_print_eos_token && token == m_tokenizer.get_eos_token_id())
return;
}

m_tokens_cache.push_back(token);
std::string text = m_tokenizer.decode(m_tokens_cache);
Expand Down
Loading

0 comments on commit a111a3f

Please sign in to comment.