StaticLLMPipeline: Support multinomial sampling #1431

Open

TolyaTalamanov wants to merge 25 commits into base: master

Commits (25)
f7a63e6  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 17, 2024)
f87b049  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 17, 2024)
d584e5d  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 18, 2024)
66e384c  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 18, 2024)
614da55  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 20, 2024)
3acec5b  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 21, 2024)
2470613  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 22, 2024)
e640af3  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 25, 2024)
13ce329  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 30, 2024)
3f318be  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 30, 2024)
fbd14c3  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 31, 2024)
d4fd072  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Oct 31, 2024)
01647e2  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Nov 6, 2024)
5f72fa9  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Nov 13, 2024)
5d77dea  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Nov 14, 2024)
8ac7c7d  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Dec 9, 2024)
40497d9  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Dec 10, 2024)
8038422  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Dec 17, 2024)
1e6998e  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Dec 19, 2024)
93644f2  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Dec 19, 2024)
b800531  Experimental snapshot with greedy decoding (TolyaTalamanov, Dec 23, 2024)
55ead2d  Add multinomial support (TolyaTalamanov, Dec 23, 2024)
e01ccdf  Merge branch 'master' of https://github.com/openvinotoolkit/openvino.… (TolyaTalamanov, Dec 24, 2024)
188575d  Handle rng seed (TolyaTalamanov, Dec 24, 2024)
ab84950  Update sampler.cpp (TolyaTalamanov, Dec 24, 2024)
44 changes: 40 additions & 4 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -3,6 +3,9 @@
 
 #include "llm_pipeline_static.hpp"
 
+#include "logit_processor.hpp"
+#include "sampler.hpp"
+
 #include <fstream>
 #include <regex>
 
@@ -938,6 +941,30 @@ DecodedResults StaticLLMPipeline::generate(
     return decoded_results;
 }
 
+int64_t sample_next_token(const ov::Tensor& logits,
+                          const GenerationConfig& config,
+                          std::mt19937& rng_engine,
+                          LogitProcessor& logit_processor) {
+    const size_t vocab_size = logits.get_shape()[2];
+    const size_t seq_len_size = logits.get_shape()[1];
+    const size_t offset = (seq_len_size - 1) * vocab_size;
+    // NB: Slice out and take probabilities only for the last token
+    Logits logit_vector(logits.data<float>() + offset, vocab_size);
+    logit_processor.apply(logit_vector);
+    int64_t last_token = -1;
+    if (config.is_greedy_decoding()) {
+        last_token = ov::genai::greedy_sample(logit_vector, config.logprobs).m_index;
+    } else if (config.is_multinomial()) {
+        last_token = ov::genai::multinomial_sample(logit_vector, 1u, rng_engine)[0].m_index;
+    } else {
+        // NB: Only greedy and multinomial sampling are supported;
+        // the appropriate check is performed before this point
+        OPENVINO_ASSERT(false);
+    }
+    logit_processor.register_new_generated_token(last_token);
+    return last_token;
+}
+
 EncodedResults StaticLLMPipeline::generate(
     const EncodedInputs& inputs,
     OptionalGenerationConfig generation_config,
@@ -974,10 +1001,16 @@ EncodedResults StaticLLMPipeline::generate(
         streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
     }
 
-    if (!config.is_greedy_decoding()) {
-        OPENVINO_THROW("Currently only greedy decoding is supported");
+    if (!config.is_greedy_decoding() && !config.is_multinomial()) {
+        OPENVINO_THROW("Currently only greedy and multinomial decoding are supported");
     }
 
+    std::vector<int64_t> input_ids_vec;
+    input_ids_vec.reserve(input_ids.get_size());
+    std::copy_n(input_ids.data<int64_t>(), input_ids.get_size(), std::back_inserter(input_ids_vec));
+    LogitProcessor logit_processor(config, input_ids_vec);
+    m_rng_engine.seed(config.rng_seed);
+
     ov::Shape prompts_shape = input_ids.get_shape();
     const size_t batch_size = prompts_shape[0];
     ov::genai::EncodedResults results;
@@ -1016,7 +1049,9 @@
 
     // NB: Now there are prompt_len tokens in KV-cache
    m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(prompt_len);
-    int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0);
+
+    auto last_token = sample_next_token(
+        m_prefill_request.get_tensor("logits"), config, m_rng_engine, logit_processor);
     results.tokens[0].push_back(last_token);
     if (streamer_ptr && streamer_ptr->put(last_token)) {
         return results;
@@ -1070,7 +1105,8 @@
         m_kvcache_request.infer();
         m_kvcache_desc.num_stored_tokens += 1;
 
-        last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0);
+        last_token = sample_next_token(
+            m_kvcache_request.get_tensor("logits"), config, m_rng_engine, logit_processor);
         results.tokens[0].push_back(last_token);
 
         raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
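Both call sites (the prefill token and every token of the decode loop) now go through sample_next_token, so greedy and multinomial share one path. For context, a minimal client-side sketch of the feature (not part of the diff; the model path, device, and prompt are placeholders, and the GenerationConfig fields follow the public openvino.genai API):

#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main() {
    // Placeholder model path and device; NPU is the target of StaticLLMPipeline.
    ov::genai::LLMPipeline pipe("./TinyLlama-1.1B-ov", "NPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 64;
    config.do_sample = true;  // is_multinomial() becomes true, is_greedy_decoding() false
    config.top_k = 50;
    config.top_p = 0.9f;
    config.rng_seed = 42;     // seeds m_rng_engine for reproducible draws

    std::string result = pipe.generate("What is OpenVINO?", config);
    std::cout << result << std::endl;
    return 0;
}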
4 changes: 4 additions & 0 deletions src/cpp/src/llm_pipeline_static.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <filesystem>
+#include <random>
 
 #include "llm_pipeline_base.hpp"
 
@@ -83,6 +84,9 @@ class StaticLLMPipeline final : public LLMPipelineImplBase {
 
     bool m_is_chat_conversation = false;
     ChatHistory m_history;
+
+    // NB: For multinomial sampling
+    std::mt19937 m_rng_engine;
 };
 
 } // namespace genai
124 changes: 67 additions & 57 deletions src/cpp/src/sampler.cpp
@@ -67,6 +67,71 @@ std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     return tokens;
 }
 
+Token greedy_sample(const Logits& logits, size_t top_logprobs) {
+    // For greedy sampling we do not expect sorting or shrinking the considered tokens,
+    // so we can operate directly on the data buffer
+    size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1
+    std::vector<float> top_values(m, -std::numeric_limits<float>::infinity());
+    std::vector<size_t> top_indexes(m, 0);
+
+    for (size_t i = 0; i < logits.m_size; ++i) {
+        if (logits.m_data[i] > top_values.back()) {
+            top_values.back() = logits.m_data[i];
+            top_indexes.back() = i;
+
+            for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) {
+                std::swap(top_values[j], top_values[j - 1]);
+                std::swap(top_indexes[j], top_indexes[j - 1]);
+            }
+        }
+    }
+
+    size_t max_index = top_indexes.front();
+    float max_value = 0.0;
+
+    if (top_logprobs) {
+        // apply log softmax to the max value
+        max_value = top_values.front();
+        float log_sum = std::log(std::accumulate(
+            logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) {
+                return accumulated + std::exp(to_add - max_value);
+            }));
+        max_value = -log_sum;
+    }
+
+    return Token(max_value, max_index);
+}
+
+std::vector<Token> multinomial_sample(const Logits& logits,
+                                      size_t num_tokens_per_sequence,
+                                      std::mt19937& rng_engine) {
+    // If top_p or top_k was applied we use the sorted vector; otherwise we use the original buffer.
+    std::vector<float> multinomial_weights;
+    multinomial_weights.reserve(logits.m_size);
+    if (logits.is_vector_initialized())
+        for (auto& logit : logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob);
+    else
+        multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size);
+
+    // std::discrete_distribution returns corrupted results when applied to log probabilities,
+    // which results in NAN-only logprobs,
+    // so log() is applied only after an element has been picked
+    auto dist = std::discrete_distribution<size_t>(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1
+
+    std::vector<Token> out_tokens;
+    for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) {
+        size_t element_to_pick = dist(rng_engine);
+        if (logits.is_vector_initialized()) {
+            auto logit = logits.m_vector[element_to_pick];
+            logit.m_log_prob = std::log(logit.m_log_prob);
+            out_tokens.push_back(logit);
+        }
+        else
+            out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick);
+    }
+    return out_tokens;
+}
+
 std::vector<int64_t> wrap_tokens(const std::vector<int64_t>& tokens, const std::vector<int64_t>& prefix_tokens, const std::vector<int64_t>& suffix_tokens) {
     std::vector<int64_t> all_tokens = prefix_tokens;
     all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end());
@@ -481,66 +546,11 @@ Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t to
 }
 
 Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const {
-    // For greedy sampling we do not expect sorting or shrinking considered tokens
-    // so we can operate directly on the data buffer
-    size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1
-    std::vector<float> top_values(m, -std::numeric_limits<float>::infinity());
-    std::vector<size_t> top_indexes(m, 0);
-
-    for (size_t i = 0; i < logits.m_size; ++i) {
-        if (logits.m_data[i] > top_values.back()) {
-            top_values.back() = logits.m_data[i];
-            top_indexes.back() = i;
-
-            for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) {
-                std::swap(top_values[j], top_values[j - 1]);
-                std::swap(top_indexes[j], top_indexes[j - 1]);
-            }
-        }
-    }
-
-    size_t max_index = top_indexes.front();
-    float max_value = 0.0;
-
-    if (top_logprobs) {
-        // apply log softmax to max value
-        max_value = top_values.front();
-        float log_sum = std::log(std::accumulate(
-            logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) {
-                return accumulated + std::exp(to_add - max_value);
-            }));
-        max_value = -log_sum;
-    }
-
-    return Token(max_value, max_index);
+    return greedy_sample(logits, top_logprobs);
 }
 
 std::vector<Token> Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) {
-    // If top_p or top_k was applied we use sorted vector, if not we go with original buffer.
-    std::vector<float> multinomial_weights;
-    multinomial_weights.reserve(logits.m_size);
-    if (logits.is_vector_initialized())
-        for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob);
-    else
-        multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size);
-
-    // std::discrete_distribution returns corrupted results when applied to log probabilities
-    // which result returning NAN only logprobs.
-    // so log() is applied after this line
-    auto dist = std::discrete_distribution<size_t>(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1
-
-    std::vector<Token> out_tokens;
-    for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) {
-        size_t element_to_pick = dist(rng_engine);
-        if (logits.is_vector_initialized()) {
-            auto logit = logits.m_vector[element_to_pick];
-            logit.m_log_prob = std::log(logit.m_log_prob);
-            out_tokens.push_back(logit);
-        }
-        else
-            out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick);
-    }
-    return out_tokens;
+    return multinomial_sample(logits, num_tokens_per_sequence, rng_engine);
 }
 
 std::vector<int64_t> Sampler::_try_finish_generation(SequenceGroup::Ptr & sequence_group) {
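The comment above the std::discrete_distribution call is the subtle point of multinomial_sample: the distribution's weights must be non-negative probabilities, not log probabilities, so log() is taken only after an index has been drawn. A standalone sketch of that contract (not part of the PR, plain C++):

#include <cmath>
#include <iostream>
#include <random>
#include <vector>

int main() {
    // Post-softmax probabilities for a toy 3-token vocabulary.
    std::vector<float> probs = {0.7f, 0.2f, 0.1f};

    // One call to dist(rng) is one multinomial trial: index i is drawn
    // with probability probs[i] / sum(probs). Feeding log probabilities
    // here would mean negative weights and meaningless draws.
    std::mt19937 rng(42);
    std::discrete_distribution<size_t> dist(probs.begin(), probs.end());

    size_t picked = dist(rng);
    float log_prob = std::log(probs[picked]);  // log only after the draw
    std::cout << "token " << picked << ", logprob " << log_prob << "\n";
    return 0;
}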
6 changes: 6 additions & 0 deletions src/cpp/src/sampler.hpp
@@ -32,6 +32,12 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set<int64_t
 
 std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx);
 
+Token greedy_sample(const Logits& logits, size_t top_logprobs);
+
+std::vector<Token> multinomial_sample(const Logits& logits,
+                                      size_t num_tokens_per_sequence,
+                                      std::mt19937& rng_engine);
+
 struct SamplerOutput {
     // IDs of sequences that need to be dropped
     std::vector<uint64_t> m_dropped_sequences;
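With these declarations exported, the static pipeline (and any other caller) can reuse the sampler's primitives. A sketch of driving them directly (not part of the PR; it assumes the Logits(float*, size_t) constructor used in llm_pipeline_static.cpp and applies softmax by hand, which the pipeline normally delegates to LogitProcessor):

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

#include "sampler.hpp"

int64_t pick_token(std::vector<float> logits, bool greedy, std::mt19937& rng) {
    if (greedy) {
        // greedy_sample scans raw logits; top_logprobs == 0 skips log-softmax.
        ov::genai::Logits view(logits.data(), logits.size());
        return ov::genai::greedy_sample(view, 0).m_index;
    }
    // multinomial_sample hands its weights to std::discrete_distribution,
    // so they must already be probabilities: softmax the logits first.
    float max = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (float& v : logits) { v = std::exp(v - max); sum += v; }
    for (float& v : logits) v /= sum;
    ov::genai::Logits view(logits.data(), logits.size());
    return ov::genai::multinomial_sample(view, 1u, rng)[0].m_index;
}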