Greedy search works
pavel-esir committed Apr 3, 2024
1 parent c39d1e3 commit b139e69
Showing 4 changed files with 125 additions and 44 deletions.
@@ -6,19 +6,20 @@
#include <openvino/openvino.hpp>
#include "sampling_parameters.hpp"
#include <experimental/filesystem>
#include "group_beam_searcher.hpp"

using GenerationResult = ov::Tensor;
// using GenerationResult = ov::Tensor;
using GenerationResult = std::vector<std::vector<int64_t>>;

class LLMEngine {
ov::InferRequest m_model_runner;

GenerationResult greedy_search(ov::Tensor prompts, SamplingParameters sampling_params) {
ov::Shape prompts_shape = prompts.get_shape();
size_t batch_size = prompts_shape[0];
// todo: implement for batch > 1
OPENVINO_ASSERT(batch_size == 1);

GenerationResult results = ov::Tensor{ov::element::i64, {batch_size, sampling_params.max_new_tokens}};
GenerationResult results(batch_size);

auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()};
std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1);
@@ -45,12 +46,13 @@ class LLMEngine {
std::fill_n(m_model_runner.get_tensor("attention_mask").data<int64_t>(), m_model_runner.get_tensor("attention_mask").get_size(), 1);

m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});

for (size_t batch = 0; batch < batch_size; ++batch) {
const float * logits_data = logits.data<const float>() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size;
int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data;
results.data<int64_t>()[sampling_params.max_new_tokens * batch + i] = out_token;
results[batch].emplace_back(out_token);

// todo: add exit criteria when pad or EOS is met
m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
m_model_runner.get_tensor("position_ids").data<int64_t>()[batch] = int64_t(initial_seq_len + i);
}
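// Sketch (not part of this commit): the todo above still lacks an EOS exit criterion.
// Assuming the ignore_eos / eos_token fields that SamplingParameters declares further
// down in this diff, a minimal check could look like:
inline bool is_stop_token(int64_t out_token, const SamplingParameters& sampling_params) {
    return !sampling_params.ignore_eos && out_token == sampling_params.eos_token;
}
// Inside the token loop this could be used as: if (is_stop_token(out_token, sampling_params)) break;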
@@ -59,8 +61,72 @@
}

GenerationResult beam_search(ov::Tensor prompts, SamplingParameters sampling_params) {
// todo: implement
ov::Shape prompts_shape = prompts.get_shape();
size_t batch_size = prompts_shape[0];
// todo: implement for batch > 1
OPENVINO_ASSERT(batch_size == 1);

// initialize inputs
auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()};
std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1);
auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()};
std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
auto initial_seq_len = prompts.get_shape()[1];

m_model_runner.set_tensor("input_ids", prompts);
m_model_runner.set_tensor("attention_mask", attention_mask);
m_model_runner.set_tensor("position_ids", position_ids);

// set beam_idx for the stateful model: before the first inference there is nothing to reorder, so a single zero index is used for BATCH_SIZE = 1
m_model_runner.get_tensor("beam_idx").set_shape({batch_size});
m_model_runner.get_tensor("beam_idx").data<int32_t>()[0] = 0;

const int64_t* prompt_data = prompts.data<const int64_t>();

// todo: remove this duplication and use the same SamplingParameters for both greedy and beam search
Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + prompts.get_size()}};
parameters.n_groups = sampling_params.n_groups;
parameters.diversity_penalty = sampling_params.diversity_penalty;
parameters.group_size = sampling_params.group_size;

GroupBeamSearcher group_beam_searcher{parameters};
std::vector<int64_t> next_tokens;
std::vector<int32_t> next_beams;
for (size_t length_count = 0; length_count < sampling_params.max_new_tokens; ++length_count) {
m_model_runner.infer();
std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits"));
if (next_tokens.empty()) {
break;
}
size_t batch_size = next_tokens.size();
// Set pointers
m_model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()});
m_model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()});
// Set auxiliary inputs
ov::Tensor attention_mask = m_model_runner.get_tensor("attention_mask");
ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1};
attention_mask.set_shape(mask_shape);
std::fill_n(attention_mask.data<int64_t>(), ov::shape_size(mask_shape), 1);

m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});
std::fill_n(m_model_runner.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1);
}

std::vector<Beam> beams;
for (const std::vector<Beam>& group : finalize(std::move(group_beam_searcher))) {
for (const Beam& beam : group) {
beams.emplace_back(beam);
// results.emplace_back(beam.tokens);
}
}

// sort beams so that the highest-scoring candidates come first
auto compare_scores = [](const Beam& left, const Beam& right) { return left.score > right.score; };
std::sort(beams.begin(), beams.end(), compare_scores);

GenerationResult results;
for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.num_return_sequences; ++beam) {
results.emplace_back(beam->tokens);
}
return results;
}
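// Sketch (not part of this commit): one way to resolve the duplication todo above is a
// small conversion helper; the field names assume the Parameters struct shown in
// group_beam_searcher.hpp at the end of this commit.
static Parameters to_beam_parameters(ov::Tensor prompts, const SamplingParameters& sp) {
    const int64_t* prompt_data = prompts.data<const int64_t>();
    Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + prompts.get_size()}};
    parameters.n_groups = sp.n_groups;
    parameters.group_size = sp.group_size;
    parameters.diversity_penalty = sp.diversity_penalty;
    parameters.max_new_tokens = sp.max_new_tokens;
    return parameters;
}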

@@ -124,6 +190,23 @@ std::vector<std::string> detokenize(ov::InferRequest& detokenizer, ov::Tensor to
return strings;
}

std::vector<std::string> detokenize(ov::InferRequest& detokenizer,
std::vector<std::vector<int64_t>> lines,
int64_t pad_token_idx) {
// todo: implement calling detokenizer in a single batch

std::vector<std::string> strings;
for (auto& line: lines){
ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()};
detokenizer.set_input_tensor(tokens);
detokenizer.infer();
auto res = detokenizer.get_output_tensor();
strings.emplace_back(res.data<std::string>()[0]);
}

return strings;
}
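// Sketch (not part of this commit): the todo above asks for a single batched detokenizer
// call; that would require padding the ragged sequences into one [batch, max_len] tensor,
// which is what the pad_token_idx parameter is for. A possible padding helper:
ov::Tensor pad_to_batch(const std::vector<std::vector<int64_t>>& lines, int64_t pad_token_idx) {
    size_t max_len = 0;
    for (const auto& line : lines) {
        if (line.size() > max_len)
            max_len = line.size();
    }
    ov::Tensor tokens{ov::element::i64, {lines.size(), max_len}};
    int64_t* data = tokens.data<int64_t>();
    for (size_t i = 0; i < lines.size(); ++i) {
        for (size_t j = 0; j < max_len; ++j) {
            data[i * max_len + j] = j < lines[i].size() ? lines[i][j] : pad_token_idx;
        }
    }
    return tokens;
}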

// The following reasons require TextStreamer to keep a cache of previous tokens:
// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a",
// but detokenize(tokenize("prefix a")) == "prefix a"
@@ -171,6 +254,7 @@ class LLMPipeline {
if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) {
m_sampling_parameters = SamplingParameters(m_path + "/generation_config.json");
}
m_sampling_parameters = SamplingParameters(m_path + "/generation_config_beam.json");

ov::Core core;
// The model can be compiled for GPU as well
@@ -188,6 +272,6 @@

auto generate_results = m_model_runner.generate(input_ids, m_sampling_parameters);

return detokenize(m_detokenizer, generate_results)[0];
return detokenize(m_detokenizer, generate_results, 0)[0];
}
};
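// Sketch (not part of this commit): minimal end-to-end use of the LLMPipeline defined
// above; the model directory path is illustrative and is expected to contain
// openvino_model.xml, openvino_tokenizer.xml, openvino_detokenizer.xml and the generation config.
LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/");
std::cout << pipe.call("Alan Turing was a") << '\n';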
43 changes: 16 additions & 27 deletions text_generation/causal_lm/cpp/generate_pipeline/main.cpp
@@ -11,38 +11,27 @@ constexpr size_t BATCH_SIZE = 1;

} // namespace

// void print_generation_results(GenerationResult results, ov::InferRequest& detokenizer) {
// TextStreamer text_streamer{std::move(detokenizer)};
// for (const auto& result: results) {
// text_streamer.put(result);
// }
// text_streamer.end();
// }

int main(int argc, char* argv[]) try {
// PIPELINE
std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";

LLMPipeline pipe(model_path);
std::cout << pipe.call("Alan Turing was a");
// std::cout << pipe.call("Alan Turing was a");

// ov::Core core;
// // core.add_extension("libuser_ov_extensions.so");
// core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
// // tokenizer and detokenizer work on CPU only
// ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
// ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();

// // The model can be compiled for GPU as well
// std::shared_ptr<ov::Model> model = core.read_model(model_path + "/openvino_model.xml");
// ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();
// GENERATE
ov::Core core;
core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();

// todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted
std::shared_ptr<ov::Model> model = core.read_model(model_path + "/openvino_model.xml");
ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();

// auto [input_ids, attention_mask] = tokenize(tokenizer, argv[1]);

// SamplingParameters sampling_params = SamplingParameters::greedy();

// LLMEngine engine(request);
// GenerationResult generation_results = engine.generate(input_ids, sampling_params);
// print_generation_results(generation_results, detokenizer);
auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
SamplingParameters sampling_params = SamplingParameters::beam_search();
LLMEngine engine(request);
GenerationResult generation_results = engine.generate(input_ids, sampling_params);
std::cout << detokenize(detokenizer, generation_results[0]);
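// Sketch (not part of this commit): only the first returned sequence is printed above; the
// vector-of-sequences detokenize overload added in this commit could print them all
// (the pad token id of 0 is an assumption):
for (const std::string& text : detokenize(detokenizer, generation_results, /*pad_token_idx=*/0)) {
    std::cout << text << '\n';
}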

} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
@@ -1,4 +1,3 @@

// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

@@ -8,8 +7,7 @@
#include <functional>
#include <nlohmann/json.hpp>
#include <fstream>

enum class StopCriteria {early, heuristic, never};
#include <group_beam_searcher.hpp> // used only for StopCriteria

// forward declaration
class Sequence;
@@ -18,15 +16,17 @@ class Sequence;
// and has parameters that are not present in the original SamplingParameters for continuous batching
struct SamplingParameters {
// Generic
size_t max_new_tokens = 100;
size_t max_new_tokens = 10;
size_t max_length = 100; // max_new_tokens should take priority over max_length
bool ignore_eos = false;
int64_t eos_token = 2; // There's no way to extract special token values from the tokenizer for now
size_t num_return_sequences = 3;

// Beam search specific
size_t n_groups = 1;
size_t group_size = 1; // beam_width
float diversity_penalty = 1.0f; // 0.0 means no diversity

StopCriteria stop_criteria = StopCriteria::heuristic;
float length_penalty = 1.0f;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
@@ -53,10 +53,17 @@ struct SamplingParameters {
eos_token_id = data.value("eos_token_id", 0);
max_length = data.value("max_length", 0);
pad_token_id = data.value("pad_token_id", 0);
num_return_sequences = data.value("num_return_sequences", 1);

temperature = data.value("temperature", 0.0f);
do_sample = data.value("do_sample", false);
top_p = data.value("top_p", 0.0f);

// beam_search_params
n_groups = data.value("num_beam_groups", 1);
diversity_penalty = data.value("diversity_penalty", 1.0f);
int num_beams = data.value("num_beams", 1);
group_size = num_beams / n_groups;
}
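// Sketch (not part of this commit): group_size is derived by integer division, so
// num_beams = 6 with num_beam_groups = 3 gives 2 beams per group. A config of the kind
// this constructor reads (field names from the value() calls above, values illustrative):
nlohmann::json example_config = {
    {"eos_token_id", 2}, {"pad_token_id", 0}, {"max_length", 100},
    {"num_return_sequences", 3},
    {"num_beams", 6}, {"num_beam_groups", 3},  // -> group_size = 6 / 3 = 2
    {"diversity_penalty", 1.0}
};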

static SamplingParameters greedy() {
@@ -68,9 +75,9 @@

static SamplingParameters beam_search() {
SamplingParameters beam_search;
beam_search.n_groups = 2;
beam_search.group_size = 2;
beam_search.max_new_tokens = 100;
beam_search.n_groups = 3;
beam_search.group_size = 5;
beam_search.max_new_tokens = 10;
beam_search.diversity_penalty = 2.0f;
return beam_search;
}
7 changes: 4 additions & 3 deletions text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -1,5 +1,6 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once

#include <openvino/runtime/tensor.hpp>

@@ -84,9 +85,9 @@ enum class StopCriteria { early, heuristic, never };

struct Parameters {
std::vector<int64_t> prompt;
size_t n_groups = 2;
size_t group_size = 2;
float diversity_penalty = 2.0;
size_t n_groups = 3;
size_t group_size = 5;
float diversity_penalty = 1.0;
size_t max_new_tokens = 100;
StopCriteria stop_criteria = StopCriteria::heuristic;
float length_penalty = 1.0;
