diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt
index 9a1b21632f..bcf171f3a8 100644
--- a/samples/cpp/visual_language_chat/CMakeLists.txt
+++ b/samples/cpp/visual_language_chat/CMakeLists.txt
@@ -8,6 +8,11 @@ find_package(OpenVINOGenAI REQUIRED
     NO_CMAKE_FIND_ROOT_PATH
 )
 
+FetchContent_Declare(cxxopts
+    URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
+    URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
+FetchContent_MakeAvailable(cxxopts)
+
 file(DOWNLOAD
     https://raw.githubusercontent.com/nothings/stb/f75e8d1cad7d90d72ef7a4661f1b994ef78b4e31/stb_image.h
     ${CMAKE_BINARY_DIR}/stb_image.h
@@ -15,7 +20,7 @@ file(DOWNLOAD
 
 add_executable(visual_language_chat visual_language_chat.cpp load_image.cpp)
 target_include_directories(visual_language_chat PRIVATE "${CMAKE_CURRENT_SOUCE_DIR}" "${CMAKE_BINARY_DIR}")
-target_link_libraries(visual_language_chat PRIVATE openvino::genai)
+target_link_libraries(visual_language_chat PRIVATE openvino::genai cxxopts::cxxopts)
 set_target_properties(visual_language_chat PROPERTIES
     COMPILE_PDB_NAME visual_language_chat
diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp
index b1522903ef..4486376f48 100644
--- a/samples/cpp/visual_language_chat/visual_language_chat.cpp
+++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp
@@ -5,23 +5,84 @@
 #include "load_image.hpp"
 #include <openvino/genai/vlm_pipeline.hpp>
+#include <filesystem>
+
+namespace fs = std::filesystem;
+
+#include <cxxopts.hpp>
+
 bool print_subword(std::string&& subword) {
     return !(std::cout << subword << std::flush);
 }
 
 int main(int argc, char* argv[]) try {
-    if (3 != argc) {
-        throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE>");
+
+    cxxopts::Options options("visual_language_chat", "Visual language chat sample");
+
+    options.add_options()
+    ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value(""))
+    ("i,inputs", "Path to image or to directory with images", cxxopts::value<std::string>()->default_value(""))
+    ("d,device", "Target device to run the model", cxxopts::value<std::string>()->default_value("CPU"))
+    ("s,sampling", "Sampling method: [greedy|multinomial|beam_search]. Optional, 'greedy' by default.", cxxopts::value<std::string>()->default_value("greedy"))
+    ("h,help", "Print usage");
+
+    cxxopts::ParseResult result;
+    try {
+        result = options.parse(argc, argv);
+    } catch (const cxxopts::exceptions::exception& e) {
+        std::cout << e.what() << "\n\n";
+        std::cout << options.help() << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    if (result.count("help")) {
+        std::cout << options.help() << std::endl;
+        return EXIT_SUCCESS;
     }
-    ov::Tensor image = utils::load_image(argv[2]);
-    std::string device = "CPU"; // GPU can be used as well
+
+    const std::string model_path = result["model"].as<std::string>();
+    const std::string device = result["device"].as<std::string>();
+    const std::string input_path = result["inputs"].as<std::string>();
+    const std::string sampling_method = result["sampling"].as<std::string>();
+
+    ov::AnyMap properties;
+    if (sampling_method == "greedy") {
+        properties.insert(ov::genai::generation_config(ov::genai::greedy()));
+        properties.insert(ov::genai::streamer(print_subword));
+    } else if (sampling_method == "beam_search") {
+        properties.insert(ov::genai::generation_config(ov::genai::beam_search()));
+    } else if (sampling_method == "multinomial") {
+        properties.insert(ov::genai::generation_config(ov::genai::multinomial()));
+        properties.insert(ov::genai::streamer(print_subword));
+    } else {
+        throw std::runtime_error("Sampling method must be one of [greedy|multinomial|beam_search]; if omitted, greedy decoding is used.");
+    }
+
+    std::vector<ov::Tensor> images;
+    if (!input_path.empty() && fs::exists(input_path)) {
+        if (fs::is_directory(input_path)) {
+            for (const auto& dir_entry : fs::directory_iterator(input_path)) {
+                ov::Tensor image = utils::load_image(dir_entry.path());
+                images.push_back(std::move(image));
+            }
+        } else if (fs::is_regular_file(input_path)) {
+            ov::Tensor image = utils::load_image(input_path);
+            images.push_back(std::move(image));
+        }
+    }
+
+    if (images.empty())
+        throw std::runtime_error("No images were found at path " + input_path);
+    else
+        properties.insert(images.size() == 1 ? ov::genai::image(images.at(0)) : ov::genai::images(images));
+
     ov::AnyMap enable_compile_cache;
     if ("GPU" == device) {
         // Cache compiled models on disk for GPU to save time on the
         // next run. It's not beneficial for CPU.
         enable_compile_cache.insert({ov::cache_dir("vlm_cache")});
     }
-    ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);
+    ov::genai::VLMPipeline pipe(model_path, device, enable_compile_cache);
     std::string prompt;
 
     pipe.start_chat();
@@ -29,22 +90,17 @@ int main(int argc, char* argv[]) try {
     if (!std::getline(std::cin, prompt)) {
         throw std::runtime_error("std::cin failed");
     }
-    pipe.generate(
-        prompt,
-        // ov::genai::image(std::move(image)),
-        ov::genai::generation_config(ov::genai::beam_search()),
-        // ov::genai::generation_config(ov::genai::greedy()),
-        // ov::genai::generation_config(ov::genai::multinomial()),
-        ov::genai::streamer(print_subword)
-    );
+    auto results = pipe.generate(prompt, properties);
+    if (sampling_method == "beam_search") {
+        std::cout << results.texts.at(0) << std::endl;
+    }
     std::cout << "\n----------\n"
         "question:\n";
     while (std::getline(std::cin, prompt)) {
-        pipe.generate(prompt,
-            ov::genai::generation_config(ov::genai::beam_search()),
-            // ov::genai::generation_config(ov::genai::greedy()),
-            // ov::genai::generation_config(ov::genai::multinomial()),
-            ov::genai::streamer(print_subword));
+        results = pipe.generate(prompt, properties);
+        if (sampling_method == "beam_search") {
+            std::cout << results.texts.at(0) << std::endl;
+        }
         std::cout << "\n----------\n"
             "question:\n";
     }
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index aa3e2b1d5a..7fa36a9b04 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -577,13 +577,13 @@ std::vector<int64_t> Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen
 }
 
-std::vector<int32_t> Sampler::get_beam_idxs(uint64_t request_id) {
-    std::vector<int32_t> beams;
-    if (m_beam_search_info.find(request_id) != m_beam_search_info.end()) {
-        GroupBeamSearcher beam_searcher = m_beam_search_info.at(request_id);
-        std::vector<int32_t> beams = beam_searcher.get_beam_idxs();
+std::vector<int32_t> Sampler::get_beam_idxs(SequenceGroup::CPtr request) {
+    uint64_t request_id = request->get_request_id();
+    auto beam_searcher = m_beam_search_info.find(request_id);
+    if (beam_searcher == m_beam_search_info.end()) {
+        return std::vector<int32_t>(request->num_running_seqs(), 0);
     }
-    return beams;
+    return beam_searcher->second.get_beam_idxs();
 }
 
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index f664bc16dc..e66fc4b700 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -61,7 +61,7 @@ class Sampler {
     SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits);
     void set_seed(size_t seed) { rng_engine.seed(seed); }
     void clear_beam_search_info(uint64_t request_id);
-    std::vector<int32_t> get_beam_idxs(uint64_t request_id);
+    std::vector<int32_t> get_beam_idxs(SequenceGroup::CPtr request);
 };
 
 class Sampler::GroupBeamSearcher {
diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp
index 0d24ddffe7..37c5001bf5 100644
--- a/src/cpp/src/vlm_pipeline.cpp
+++ b/src/cpp/src/vlm_pipeline.cpp
@@ -3,7 +3,7 @@
 
 #include "openvino/genai/vlm_pipeline.hpp"
 #include "openvino/genai/tokenizer.hpp"
-#include "vlm_sampling.hpp"
+#include "sampler.hpp"
 #include "clip.hpp"
 #include <openvino/openvino.hpp>
 #include "../src/text_callback_streamer.hpp"
@@ -11,10 +11,6 @@
 #include <optional>
 #include <random>
-#include "sampler.hpp"
-
-#include "debug_utils.hpp"
-
 using namespace ov::genai;
 
 namespace {
@@ -31,57 +27,6 @@ struct Args {
     float repeat_penalty = 1.0f;
 };
 
-int64_t get_out_token_id(const std::vector<int>& input_ids, float* logits, size_t vocab_size, Args args) {
-    int64_t out_token;
-
-    // logits pre-process
-    if (args.repeat_penalty != 1.f) {
-        sampling_repetition_penalty(logits, logits + vocab_size, input_ids, args.repeat_penalty);
-    }
-
-    if (args.do_sample)
-    {
-        if (args.temp > 0) {
-            sampling_temperature(logits, logits + vocab_size, args.temp);
-        }
-
-        std::vector<TokenIdScore> token_scores(vocab_size);
-        for (int i = 0; i < vocab_size; i++) {
-            token_scores[i] = TokenIdScore(i, logits[i]);
-        }
-
-        // top_k sampling
-        if (0 < args.top_k && args.top_k < (int)token_scores.size()) {
-            sampling_top_k(token_scores.data(), token_scores.data() + args.top_k,
-                token_scores.data() + token_scores.size());
-            token_scores.resize(args.top_k);
-        }
-
-        // top_p sampling
-        if (0.f < args.top_p && args.top_p < 1.f) {
-            auto pos = sampling_top_p(token_scores.data(), token_scores.data() + token_scores.size(), args.top_p);
-            token_scores.resize(pos - token_scores.data());
-        }
-
-        // sample next token
-        sampling_softmax_inplace(token_scores.data(), token_scores.data() + token_scores.size());
-        for (size_t i = 0; i < token_scores.size(); i++) {
-            logits[i] = token_scores[i].score;
-        }
-
-        thread_local std::random_device rd;
-        thread_local std::mt19937 gen(rd());
-
-        std::discrete_distribution<> dist(logits, logits + token_scores.size());
-        out_token = token_scores[dist(gen)].id;
-    }
-    else {
-        out_token = std::max_element(logits, logits + vocab_size) - logits;
-    }
-
-    return out_token;
-}
-
 ov::Tensor process_prompt(ov::InferRequest& embedding, const ov::Tensor& prompt, float scale_emb) {
     embedding.set_input_tensor(prompt);
     embedding.infer();
@@ -298,90 +243,6 @@ ov::Tensor resample(VLMPipeline& pipe, const ov::Tensor& encoded_image, const st
     pipe.m_resampler.infer();
     return pipe.m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size]
 }
-}
-
-
-void forward_embedings_and_lm(SequenceGroup::CPtr sequence_group, ov::InferRequest& embedding, ov::InferRequest& language, const VLMConfig m_vlm_config, const std::shared_ptr<Sampler> sampler) {
-    // compute aggregated values
-    size_t num_sequences = sequence_group->num_running_seqs();
-    size_t batch_size_in_sequences = num_sequences;
-    size_t total_num_tokens = sequence_group->get_num_scheduled_tokens() * num_sequences;
-    size_t total_num_blocks = sequence_group->get_num_blocks() * num_sequences;
-    size_t max_context_len_val = std::max(max_context_len_val, sequence_group->get_context_len());
-
-    ov::Tensor
-        input_ids(ov::element::i64, {total_num_tokens, 1}),
-        position_ids(ov::element::i64, {total_num_tokens, 1}),
-        beam_idx(ov::element::i32, { total_num_tokens });
-
-    // get raw pointers to copy to
-    int64_t
-        * input_ids_data = input_ids.data<int64_t>(),
-        * position_ids_data = position_ids.data<int64_t>();
-    int32_t
-        * beam_idx_data = beam_idx.data<int32_t>();
-
-    std::vector<Sequence::CPtr> running_sequences = sequence_group->get_running_sequences();
-    size_t num_running_sequences = running_sequences.size();
-    size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens();
-    size_t group_position_id = sequence_group->get_num_processed_tokens();
-
-    // spec: In case of multiple input tokens for current sequence (prompt_len > 1),
-    // context_len corresponds to first token within subgroup of scheduled tokens
-    size_t group_context_len = group_position_id;
-
-    for (size_t seq_id = 0; seq_id < num_running_sequences; ++seq_id) {
-        Sequence::CPtr sequence = running_sequences[seq_id];
-
-        for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id) {
-            // compute token for current sequence
-            input_ids_data[token_id] = position_id < sequence_group->get_prompt_len() ?
-                sequence_group->get_prompt_ids()[position_id] :
-                sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()];
-
-            position_ids_data[token_id] = position_id;
-        }
-
-        // apply strides to shift to a next sequence
-        input_ids_data += num_scheduled_tokens;
-        position_ids_data += num_scheduled_tokens;
-    }
-
-    embedding.set_input_tensor(input_ids);
-
-    embedding.infer();
-    const ov::Tensor& embed_prompt_tensor = embedding.get_output_tensor();
-    float* embed_data = embed_prompt_tensor.data<float>();
-    for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) {
-        embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb;
-    }
-
-    language.set_tensor("inputs_embeds", embed_prompt_tensor);
-
-    language.get_tensor("attention_mask").set_shape({ total_num_tokens, language.get_tensor("attention_mask").get_shape()[1] + 1 });
-    std::fill_n(language.get_tensor("attention_mask").data<int64_t>(), language.get_tensor("attention_mask").get_size(), 1);
-
-    language.set_tensor("position_ids", position_ids);
-    std::vector<int32_t> beam_idxs = sampler->get_beam_idxs(sequence_group->get_request_id());
-    if (beam_idxs.empty()) {
-        for (size_t i = 0; i < num_sequences; i++) {
-            beam_idx_data[i] = 0;
-        }
-    } else {
-        for (size_t i = 0; i < beam_idxs.size(); i++) {
-            beam_idx_data[i] = beam_idxs.at(i);
-        }
-    }
-    language.set_tensor("beam_idx", beam_idx);
-
-    // print_tensor("input_ids", input_ids);
-    // print_tensor("position_ids", position_ids);
-    // print_tensor("attention_mask", language.get_tensor("attention_mask"));
-    // print_tensor("beam_idx", beam_idx);
-
-    language.infer();
-}
-
-
 EncodedGenerationResult get_lm_encoded_results(
     ov::InferRequest& language,
@@ -389,7 +250,7 @@ EncodedGenerationResult get_lm_encoded_results(
     ov::Tensor inputs_embeds,
     const VLMConfig m_vlm_config,
     const std::shared_ptr<StreamerBase> streamer_ptr,
-    const std::shared_ptr<Sampler> sampler,
+    Sampler& sampler,
     std::vector<SequenceGroup::Ptr> requests
 ) {
     SequenceGroup::Ptr request = requests.back();
@@ -412,26 +273,75 @@ EncodedGenerationResult get_lm_encoded_results(
     int64_t sequence_len = language.get_tensor("logits").get_shape().at(1);
     request->schedule_tokens(sequence_len);
 
-    SamplerOutput sampler_output = sampler->sample(requests, language.get_tensor("logits"));
+    SamplerOutput sampler_output = sampler.sample(requests, language.get_tensor("logits"));
 
     language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
     language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
-
     while (!request->has_finished()) {
         request->schedule_tokens(1);
+        size_t num_sequences = request->num_running_seqs();
+        size_t total_num_tokens = request->get_num_scheduled_tokens() * num_sequences;
+
+        ov::Tensor
+            input_ids(ov::element::i64, {total_num_tokens, 1}),
+            position_ids(ov::element::i64, {total_num_tokens, 1}),
+            beam_idx(ov::element::i32, { total_num_tokens });
+
+        int64_t
+            * input_ids_data = input_ids.data<int64_t>(),
+            * position_ids_data = position_ids.data<int64_t>();
+
+        size_t num_scheduled_tokens = request->get_num_scheduled_tokens();
+        size_t group_position_id = request->get_num_processed_tokens();
+        for (Sequence::Ptr& sequence : request->get_running_sequences()) {
+            for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id) {
+                // compute token for current sequence
+                input_ids_data[token_id] = position_id < request->get_prompt_len() ?
+                    request->get_prompt_ids()[position_id] :
+                    sequence->get_generated_ids()[position_id - request->get_prompt_len()];
+
+                position_ids_data[token_id] = position_id;
+            }
+            // apply strides to shift to a next sequence
+            input_ids_data += num_scheduled_tokens;
+            position_ids_data += num_scheduled_tokens;
+        }
+
+        embedding.set_input_tensor(input_ids);
+
+        embedding.infer();
+        const ov::Tensor& embed_prompt_tensor = embedding.get_output_tensor();
+        float* embed_data = embed_prompt_tensor.data<float>();
+        for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) {
+            embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb;
+        }
+
+        language.set_tensor("inputs_embeds", embed_prompt_tensor);
+
+        language.get_tensor("attention_mask").set_shape({ total_num_tokens, language.get_tensor("attention_mask").get_shape()[1] + 1 });
+        std::fill_n(language.get_tensor("attention_mask").data<int64_t>(), language.get_tensor("attention_mask").get_size(), 1);
+
+        language.set_tensor("position_ids", position_ids);
 
-        forward_embedings_and_lm(request, embedding, language, m_vlm_config, sampler);
+        std::vector<int32_t> beam_idxs = sampler.get_beam_idxs(request);
+        int32_t* beam_idx_data = beam_idx.data<int32_t>();
+        for (size_t i = 0; i < beam_idxs.size(); i++) {
+            beam_idx_data[i] = beam_idxs.at(i);
+        }
+        language.set_tensor("beam_idx", beam_idx);
+
+        language.infer();
 
         if (streamer_ptr) {
-            // first sequences
+            // first sequence
            int64_t out_token = request.get()->operator[](0)->get_generated_ids().back();
            if (streamer_ptr->put(out_token)) {
                break;
            }
         }
 
-        sampler_output = sampler->sample(requests, language.get_tensor("logits"));
+        sampler_output = sampler.sample(requests, language.get_tensor("logits"));
     }
 
     if (streamer_ptr) {
@@ -455,6 +365,7 @@ EncodedGenerationResult get_lm_encoded_results(
 
     return result;
 }
+} // anonymous
 
 class ov::genai::VLMPipeline::VLMPipelineImpl {
@@ -608,11 +519,11 @@ DecodedResults VLMPipeline::generate(
             }
         }
     }
-
-    std::shared_ptr<Sampler> sampler = std::make_shared<Sampler>(m_tokenizer);
+
+    Sampler sampler = Sampler(m_tokenizer);
 
     std::vector<SequenceGroup::Ptr> requests;
-    auto attention_size = m_language.get_tensor("attention_mask").get_size(); // request_id, input_ids, generation_config, block_size, enable_prefix_caching
+    // request_id, input_ids, generation_config, block_size, enable_prefix_caching
     // now we have one prompt as input, so we need one request
     SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, encoded_input, generation_config, 1, false);
     sequence_group->set_sequence_group_ptr(sequence_group);
@@ -632,6 +543,10 @@ DecodedResults VLMPipeline::generate(
         },
     }, streamer);
 
+    if ((!(generation_config.is_greedy_decoding() || generation_config.is_multinomial())) && streamer_ptr) {
+        OPENVINO_THROW("Currently streaming is possible only for greedy or multinomial decoding");
+    }
+
     EncodedGenerationResult encoded_result = get_lm_encoded_results(m_language, m_embedding, inputs_embeds, m_vlm_config, streamer_ptr, sampler, requests);
 
     DecodedResults decoded;
@@ -679,10 +594,6 @@ DecodedResults VLMPipeline::generate(
     if (config.eos_token_id == -1)
         config.set_eos_token_id(m_tokenizer.get_eos_token_id());
 
-    // if (is_chat_conversation && config.num_return_sequences > 1) {
-    //     config.num_return_sequences = 1;
-    // }
-
     return generate(
         prompt,
         rgbs,
diff --git a/src/cpp/src/vlm_sampling.hpp b/src/cpp/src/vlm_sampling.hpp
deleted file mode 100644
index b0a7d2341f..0000000000
--- a/src/cpp/src/vlm_sampling.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (C) 2023-2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <iostream>
-#include <numeric>
-#include <vector>
-
-struct TokenIdScore {
-    int id;
-    float score;
-
-    TokenIdScore() = default;
-    TokenIdScore(int id, float score) : id(id), score(score) {}
-
-    bool operator<(const TokenIdScore& other) const { return score < other.score; }
-    bool operator>(const TokenIdScore& other) const { return score > other.score; }
-
-    friend std::ostream& operator<<(std::ostream& os, const TokenIdScore& self) {
-        return os << "TokenIdScore(id=" << self.id << ", score=" << self.score << ")";
-    }
-};
-
-void sampling_softmax_inplace(TokenIdScore* first, TokenIdScore* last) {
-    float max_score = std::max_element(first, last)->score;
-    float sum = 0.f;
-    for (TokenIdScore* p = first; p != last; p++) {
-        float s = std::exp(p->score - max_score);
-        p->score = s;
-        sum += s;
-    }
-    float inv_sum = 1.f / sum;
-    for (TokenIdScore* p = first; p != last; p++) {
-        p->score *= inv_sum;
-    }
-}
-
-void sampling_top_k(TokenIdScore* first, TokenIdScore* kth, TokenIdScore* last) {
-    std::nth_element(first, kth, last, std::greater<TokenIdScore>());
-}
-
-TokenIdScore* sampling_top_p(TokenIdScore* first, TokenIdScore* last, float top_p) {
-    // fast top_p in expected O(n) time complexity
-    sampling_softmax_inplace(first, last);
-
-    while (first + 1 < last) {
-        const float pivot_score = (last - 1)->score; // use mid score?
-        TokenIdScore* mid =
-            std::partition(first, last - 1, [pivot_score](const TokenIdScore& x) { return x.score > pivot_score; });
-        std::swap(*mid, *(last - 1));
-
-        const float prefix_sum =
-            std::accumulate(first, mid, 0.f, [](float sum, const TokenIdScore& x) { return sum + x.score; });
-        if (prefix_sum >= top_p) {
-            last = mid;
-        }
-        else if (prefix_sum + mid->score < top_p) {
-            first = mid + 1;
-            top_p -= prefix_sum + mid->score;
-        }
-        else {
-            return mid + 1;
-        }
-    }
-    return last;
-}
-
-void sampling_repetition_penalty(float* first, float* last, const std::vector<int>& input_ids,
-    float penalty) {
-    if (penalty < 0) {
-        std::cout << "penalty must be a positive float, but got " << penalty;
-        return;
-    }
-    const float inv_penalty = 1.f / penalty;
-    const ptrdiff_t vocab_size = last - first;
-    std::vector<bool> occurrence(vocab_size, false);
-    for (const int id : input_ids) {
-        if (!occurrence[id]) {
-            first[id] *= (first[id] > 0) ? inv_penalty : penalty;
-        }
-        occurrence[id] = true;
-    }
-}
-
-void sampling_temperature(float* first, float* last, float temp) {
-    const float inv_temp = 1.f / temp;
-    for (float* it = first; it != last; it++) {
-        *it *= inv_temp;
-    }
-}
-
-
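
Usage note (not part of the patch): a minimal sketch of how the reworked sample drives the pipeline once the cxxopts options above have produced a model path, a device string and an image path. It mirrors the greedy branch of visual_language_chat.cpp; the helper name run_once and the hard-coded prompt are illustrative only, and passing an explicit empty ov::AnyMap as the third VLMPipeline constructor argument is an assumption based on the call shown in the patch.

// Sketch only: assumes model_path, device and image_path come from the
// cxxopts parsing added in visual_language_chat.cpp.
#include <cstdlib>
#include <iostream>
#include <string>

#include <openvino/genai/vlm_pipeline.hpp>
#include "load_image.hpp"

static bool print_subword(std::string&& subword) {
    return !(std::cout << subword << std::flush);
}

int run_once(const std::string& model_path, const std::string& device, const std::string& image_path) {
    // Pack the generation config, the streamer and the image into one ov::AnyMap,
    // exactly as the sample's greedy branch does.
    ov::AnyMap properties;
    properties.insert(ov::genai::generation_config(ov::genai::greedy()));
    properties.insert(ov::genai::streamer(print_subword));
    properties.insert(ov::genai::image(utils::load_image(image_path)));

    ov::genai::VLMPipeline pipe(model_path, device, ov::AnyMap{});
    pipe.start_chat();
    // With greedy decoding the streamer prints tokens as they are generated,
    // so the returned texts are only inspected for beam search.
    auto results = pipe.generate("Describe the image.", properties);
    (void)results;
    return EXIT_SUCCESS;
}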