cleanup generate_sample.cpp
pavel-esir committed Apr 11, 2024
1 parent 4cc0c64 commit d551da3
Showing 6 changed files with 135 additions and 106 deletions.
6 changes: 3 additions & 3 deletions text_generation/causal_lm/cpp/CMakeLists.txt
@@ -17,7 +17,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)

set(TARGET_NAME beam_search_sample)
set(TARGET_NAME beam_search_causal_lm)
add_executable(${TARGET_NAME} beam_search_causal_lm.cpp)
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\")
@@ -35,8 +35,8 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)

set(TARGET_NAME generate_pipeline)
add_executable(${TARGET_NAME} generate_pipeline/main.cpp)
set(TARGET_NAME generate_sample)
add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp)
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<TARGET_FILE:openvino_tokenizers>\")
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
@@ -9,7 +9,7 @@
#include <filesystem>
#include "group_beam_searcher.hpp"

using GenerationResult = std::vector<std::vector<int64_t>>;
using GenerationResult = std::vector<std::pair<float, std::vector<int64_t>>>;
using namespace std;

std::pair<ov::Tensor, ov::Tensor> pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2) {
@@ -124,7 +124,8 @@ class LLMPipeline {
if (!is_xml(full_path))
full_path += "/openvino_model.xml";
m_model_runner = core.compile_model(full_path, device, config).create_infer_request();


// todo: add loading EOS_TOKEN_ID from IR
core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
// tokenizer and detokenizer work on CPU only
full_path = tokenizer_path;
@@ -205,6 +206,11 @@ class LLMPipeline {

return {m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")};
}

std::pair<ov::Tensor, ov::Tensor> tokenize(std::initializer_list<std::string> text) {
return tokenize(std::vector<std::string>(text.begin(), text.end()));
}


std::string detokenize(std::vector<int64_t> tokens) {
size_t batch_size = 1;
@@ -231,7 +237,7 @@ class LLMPipeline {
// todo: implement calling detokenizer in a single batch

std::vector<std::string> strings;
for (auto& line: lines){
for (auto& [score, line]: lines){
ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()};
m_detokenizer.set_input_tensor(tokens);
m_detokenizer.infer();
@@ -281,13 +287,14 @@ class LLMPipeline {
for (size_t batch = 0; batch < batch_size; ++batch) {
const float * logits_data = logits.data<const float>() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size;
int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data;
results[batch].emplace_back(out_token);
results[batch].second.emplace_back(out_token);
token_iter_results[batch] = out_token;
eos_met[batch] = (out_token == sampling_params.m_eos_token_id);

m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
m_model_runner.get_tensor("position_ids").data<int64_t>()[batch] = int64_t(prompt_len + i);
}
// place
sampling_params.m_callback(std::move(token_iter_results), *this);

// stop generation when EOS is met in all batches
@@ -348,13 +355,16 @@ class LLMPipeline {

m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});
std::fill_n(m_model_runner.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1);

// place
sampling_params.m_callback(std::move(next_tokens), *this);

}

std::vector<Beam> beams;
for (const std::vector<Beam>& group : finalize(std::move(group_beam_searcher))) {
for (const Beam& beam : group) {
beams.emplace_back(beam);
// results.emplace_back(beam.tokens);
}
}

@@ -363,7 +373,7 @@ class LLMPipeline {

GenerationResult results;
for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) {
results.emplace_back(beam->tokens);
results.emplace_back(std::pair(beam->score, beam->tokens));
}
return results;
}
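
Note on the GenerationResult change above: each entry now carries the beam score alongside the token ids, so callers iterate pairs instead of plain token vectors. A minimal consumption sketch (illustrative only; it assumes an existing LLMPipeline pipe, tokenized input_ids/attention_mask, and a beam-search config as in the sample below):

// GenerationResult is std::vector<std::pair<float, std::vector<int64_t>>>
GenerationResult results = pipe.generate(input_ids, attention_mask, config);
for (const auto& [score, tokens] : results) {
    // score is the beam's score; tokens are the generated token ids
    std::cout << score << ": " << pipe.detokenize(tokens) << '\n';
}
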
113 changes: 113 additions & 0 deletions text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
@@ -0,0 +1,113 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <openvino/openvino.hpp>
#include "generate_pipeline.hpp"


// The following reasons require TextStreamer to keep a cache of previous tokens:
// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a",
// but detokenize(tokenize("prefix a")) == "prefix a"
// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�"
struct TextStreamer {
LLMPipeline pipe;
std::vector<int64_t> token_cache;
size_t print_len = 0;

void put(int64_t token) {
token_cache.push_back(token);
std::string text = pipe.detokenize(token_cache);
if (!text.empty() && '\n' == text.back()) {
// Flush the cache after the new line symbol
std::cout << std::string_view{text.data() + print_len, text.size() - print_len};
token_cache.clear();
print_len = 0;
return;
}
if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) {
// Don't print incomplete text
return;
}
std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush;
print_len = text.size();
}

void end() {
std::string text = pipe.detokenize(token_cache);
std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n';
token_cache.clear();
print_len = 0;
}
};

int main(int argc, char* argv[]) try {
if (argc < 2 || argc > 4)
throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\" <DEVICE>");

std::string prompt = "table is made of";
std::string device = "CPU"; // can be replaced with GPU

std::string model_path = argv[1];
if (argc > 2)
prompt = argv[2];
if (argc > 3)
device = argv[3];

// Example 1: TextStreaming example with greedy search
LLMPipeline pipe(model_path, device);
// Will try to load config from generation_config.json,
// but if it's not found, default values for greedy search will be used
GenerationConfig config = pipe.generation_config();

auto text_streamer = TextStreamer{pipe};
auto text_streamer_callback = [&text_streamer](std::vector<int64_t>&& tokens, LLMPipeline& pipe){
text_streamer.put(tokens[0]);
};

cout << "greedy generate streaming mode:" << endl;
config.max_new_tokens(20).set_callback(text_streamer_callback);
pipe(prompt, config);
text_streamer.end();

// Example 2: Grouped Beam Search decoding example
pipe = LLMPipeline(model_path, device);
config = pipe.generation_config();

// will return vector with num_return_sequences strings
auto num_return_sequences = 3;
config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences);

cout << endl << "grouped beam search generated candidates:" << endl;
auto generation_results = pipe({prompt}, config);
for (int i = 0; i < num_return_sequences; ++i)
cout << "candidate " << i << ": " << generation_results[i] << endl;

// Example 3: Greedy Decoding with multiple batch
pipe = LLMPipeline(model_path, device);
config = pipe.generation_config();

cout << endl << "greedy decoding with multiple batches:" << endl;
std::vector<std::string> prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"};
auto results = pipe(prompts, config.max_new_tokens(20));
for (int i = 0; i < prompts.size(); i++)
cout << prompts[i] << ": " << results[i] << endl;

// Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates
pipe = LLMPipeline(model_path);
auto [input_ids, attention_mask] = pipe.tokenize({prompt});
config = GenerationConfig::beam_search();
// config for grouped beam search
config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15);

cout << endl << "beam search with printing of all candidates:" << endl;
auto beams = pipe.generate(input_ids, attention_mask, config);
for (const auto& beam : beams)
std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl;

} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
return EXIT_FAILURE;
} catch (...) {
std::cerr << "Non-exception object thrown\n";
return EXIT_FAILURE;
}
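
The greedy-streaming example above wires the per-step callback to a TextStreamer. As a sketch of an alternative use of the same hook (illustrative only; collected and collecting_callback are hypothetical names, and the callback signature matches the one used in Example 1), the tokens could instead be accumulated and detokenized once at the end:

std::vector<int64_t> collected;
auto collecting_callback = [&collected](std::vector<int64_t>&& tokens, LLMPipeline& pipe) {
    // batch size is 1 in the greedy example, so tokens[0] is the newly generated id
    collected.push_back(tokens[0]);
};
config.max_new_tokens(20).set_callback(collecting_callback);
pipe(prompt, config);
std::cout << pipe.detokenize(collected) << '\n';
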
@@ -33,7 +33,6 @@ struct GenerationConfig {
size_t m_num_return_sequences = 3; // used by beam search; otherwise it is equal to the batch size
StopCriteria stop_criteria = StopCriteria::heuristic;


float m_repetition_penalty = 1.0f;
float m_length_penalty = 1.0f;
size_t m_no_repeat_ngram_size = std::numeric_limits<size_t>::max();
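
The fields above back the fluent setters used in generate_sample.cpp. A minimal configuration sketch using only the setters that appear in this commit (max_new_tokens, num_groups, group_size, num_return_sequences); every other field keeps its default:

GenerationConfig config = GenerationConfig::beam_search();
config.max_new_tokens(30)
      .num_groups(3)
      .group_size(5)
      .num_return_sequences(15);  // 15 = num_groups * group_size here
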
93 changes: 0 additions & 93 deletions text_generation/causal_lm/cpp/generate_pipeline/main.cpp

This file was deleted.

6 changes: 3 additions & 3 deletions text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -108,9 +108,9 @@ struct Group {
}

min_heap.push_back(std::move(beam));
std::push_heap(min_heap.begin(), min_heap.end(), greater);
std::push_heap(min_heap.begin(), min_heap.end(), ::greater);
if (min_heap.size() > parameters.group_size) {
std::pop_heap(min_heap.begin(), min_heap.end(), greater);
std::pop_heap(min_heap.begin(), min_heap.end(), ::greater);
min_heap.pop_back();
}
}
@@ -223,7 +223,7 @@ struct GroupBeamSearcher {
throw std::runtime_error("No beams left to search");
}
auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size);
std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater);
std::partial_sort(candidates.begin(), to_sort, candidates.end(), ::greater);
group->ongoing.clear();
for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) {
if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) {
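
Qualifying greater as ::greater above pins the comparator to the free function at global scope, which avoids ambiguity with std::greater from <functional> once a using namespace std; directive is in scope. A self-contained sketch of that collision (names here are illustrative, not from the repository):

#include <algorithm>
#include <functional>
#include <vector>
using namespace std;

struct Item { float score; };
bool greater(const Item& a, const Item& b) { return a.score > b.score; }  // free function

int main() {
    std::vector<Item> v{{0.1f}, {0.7f}, {0.3f}};
    // An unqualified 'greater' here could name either ::greater or std::greater
    // and fails to compile as ambiguous; '::' selects the free function.
    std::sort(v.begin(), v.end(), ::greater);
}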
