Merge branch 'master' into feature/nodejs-bindings
vishniakov-nikolai authored Dec 23, 2024
2 parents 7f8ee36 + 3496d45 commit 6434daf
Showing 15 changed files with 184 additions and 77 deletions.
16 changes: 11 additions & 5 deletions src/cpp/src/image_generation/models/clip_text_model.cpp
@@ -118,13 +118,20 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
     const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
 
     auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-
         ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-        std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<std::int32_t>());
+
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int64_t>());
+        }
     };
 
-    ov::Tensor input_ids(ov::element::i32, {text_embedding_batch_size, m_config.max_position_embeddings});
+    ov::Tensor input_ids = m_request.get_input_tensor();
+    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
 
     size_t current_batch_idx = 0;
 
     if (do_classifier_free_guidance) {
@@ -141,7 +148,6 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
         {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp

@@ -109,13 +109,20 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con
     const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
 
     auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
-
         ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-        std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<std::int64_t>());
+
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), input_ids_token.get_size(), input_ids.data<int64_t>());
+        }
     };
 
-    ov::Tensor input_ids(ov::element::i64, {text_embedding_batch_size, m_config.max_position_embeddings});
+    ov::Tensor input_ids = m_request.get_input_tensor();
+    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
 
     size_t current_batch_idx = 0;
 
     if (do_classifier_free_guidance) {
@@ -132,7 +139,6 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con
         {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
10 changes: 7 additions & 3 deletions src/cpp/src/image_generation/models/t5_encoder_model.cpp
@@ -80,8 +80,13 @@ ov::Tensor T5EncoderModel::infer(const std::string& pos_prompt, const std::strin
         ov::Tensor input_ids_token = m_tokenizer.encode(prompt).input_ids;
         size_t min_length = std::min(input_ids.get_size(), input_ids_token.get_size());
 
-        std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-        std::copy_n(input_ids_token.data<std::int64_t>(), min_length, input_ids.data<std::int32_t>());
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int64_t>());
+        }
     };
 
     ov::Tensor input_ids = m_request.get_input_tensor();
@@ -114,7 +119,6 @@ ov::Tensor T5EncoderModel::infer(const std::string& pos_prompt, const std::strin
         {current_batch_idx + 1, input_ids.get_shape()[1]}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
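The element-type branch added to all three text encoders above exists because the tokenizer always returns input_ids as i64, while the compiled encoder's input tensor, now fetched via get_input_tensor(), may be i32 or i64 depending on how the model was converted. A minimal self-contained sketch of the same pad-then-copy pattern; the shapes, token values, and the i32 input tensor here are illustrative assumptions, not library code:

#include <openvino/openvino.hpp>

#include <algorithm>
#include <cstdint>

int main() {
    const int64_t pad_token_id = 0;

    // Stand-in for tokenizer output: always i64.
    ov::Tensor tokens(ov::element::i64, {1, 3});
    tokens.data<int64_t>()[0] = 101;
    tokens.data<int64_t>()[1] = 7592;
    tokens.data<int64_t>()[2] = 102;

    // Model input whose element type is only known at runtime (i32 here).
    ov::Tensor input_ids(ov::element::i32, {1, 8});

    if (input_ids.get_element_type() == ov::element::i32) {
        // Fill with padding, then overwrite the prefix with a narrowing copy.
        std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), static_cast<int32_t>(pad_token_id));
        std::copy_n(tokens.data<int64_t>(), tokens.get_size(), input_ids.data<int32_t>());
    } else {
        std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
        std::copy_n(tokens.data<int64_t>(), tokens.get_size(), input_ids.data<int64_t>());
    }
    return 0;
}

Reusing the request's own input tensor and reshaping it in place also avoids allocating a fresh tensor per call, which is why the explicit set_tensor("input_ids", ...) before infer() could be dropped in all three files.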
16 changes: 4 additions & 12 deletions src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
@@ -12,32 +12,26 @@ namespace genai {
 
 
 class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::UNetInference {
-
 public:
-
-    virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override
-    {
+    virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override {
         ov::Core core = utils::singleton_core();
 
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
         ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition dynamic model");
         m_request = compiled_model.create_infer_request();
     }
 
-    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override
-    {
+    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
         m_request.set_tensor(tensor_name, encoder_hidden_states);
     }
 
-    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters) override
-    {
+    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
         adapter_controller.apply(m_request, adapters);
     }
 
-    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override
-    {
+    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first. Cannot infer non-compiled model");
 
         m_request.set_tensor("sample", sample);
@@ -49,10 +43,8 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::
     }
 
 private:
-
     ov::InferRequest m_request;
 };
-
 
 } // namespace genai
 } // namespace ov
src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp

@@ -42,8 +42,7 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
         ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition batch-1 model");
 
-        for (int i = 0; i < m_native_batch_size; i++)
-        {
+        for (int i = 0; i < m_native_batch_size; i++) {
             m_requests[i] = compiled_model.create_infer_request();
         }
     }
2 changes: 1 addition & 1 deletion src/cpp/src/perf_metrics.cpp
@@ -101,7 +101,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
 
     auto ttft = tok_times[0] - start_time_val;
     raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
-    raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
+    raw_metrics.m_times_to_first_token.emplace_back(ttft);
    num_generated_tokens = batch_sizes[0];
 
     // The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens.
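The one-line change above alters what time-to-first-token reports: previously ttft was divided by the first batch size, yielding a per-token average, whereas now the full wall-clock duration until the first token batch arrives is stored. A tiny sketch of the numeric difference, with made-up values (200 ms, batch of 4) rather than real measurements:

#include <chrono>
#include <cstddef>
#include <iostream>

int main() {
    // Mirrors the float-microseconds duration the metrics use.
    using MicroSeconds = std::chrono::duration<float, std::micro>;

    const MicroSeconds ttft{200000.0f};   // hypothetical: 200 ms to first batch
    const std::size_t first_batch_size = 4;

    std::cout << "old TTFT: " << (ttft / first_batch_size).count() << " us\n";  // 50000
    std::cout << "new TTFT: " << ttft.count() << " us\n";                       // 200000
}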
37 changes: 4 additions & 33 deletions src/cpp/src/whisper/whisper.cpp
@@ -18,6 +18,7 @@
 #include "whisper_config.hpp"
 #include "whisper_feature_extractor.hpp"
 #include "whisper_models.hpp"
+#include "whisper_utils.hpp"
 
 using ov::genai::MicroSeconds;
 
@@ -79,17 +80,6 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
     }
 }
 
-void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
-    const auto infer_start = std::chrono::steady_clock::now();
-    request.infer();
-    const auto infer_end = std::chrono::steady_clock::now();
-    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
-    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
-    raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
-    raw_metrics.m_new_token_times.emplace_back(infer_end);
-    raw_metrics.m_batch_sizes.emplace_back(1);
-}
-
 int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
                std::vector<int64_t>& input_ids,
@@ -102,7 +92,7 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
     ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
     decoder.set_tensor("input_ids", input_ids_tensor);
 
-    infer_with_perf_metrics(decoder, raw_metrics);
+    ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -138,7 +128,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
     cache_position_tensor.set_shape({1});
     cache_position_tensor.data<int64_t>()[0] = cache_position;
 
-    infer_with_perf_metrics(decoder_with_past, raw_metrics);
+    ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);
 
     auto output_tensor = decoder_with_past.get_tensor("logits");
 
@@ -265,25 +255,6 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
     return {false, output_tokens};
 }
 
-template <typename T>
-void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
-    OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
-    std::vector<T> result{value.begin(), value.begin() + offset};
-    for (auto [start, end] : ranges) {
-        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
-    }
-
-    value = result;
-}
-
-void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
-                                size_t offset,
-                                std::vector<std::pair<size_t, size_t>>& ranges) {
-    filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
-    filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
-    filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
-}
-
 } // namespace
 
 namespace ov {
@@ -362,7 +333,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
                                        feature_extractor.nb_max_frames,
                                        time_precision);
 
-    filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
+    ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
 
     segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());
 
46 changes: 46 additions & 0 deletions src/cpp/src/whisper/whisper_utils.cpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "whisper_utils.hpp"
+
+namespace {
+
+template <typename T>
+void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
+    OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
+    std::vector<T> result{value.begin(), value.begin() + offset};
+    for (auto [start, end] : ranges) {
+        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
+    }
+
+    value = result;
+}
+
+}  // namespace
+
+namespace ov {
+namespace genai {
+namespace utils {
+
+void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
+    const auto infer_start = std::chrono::steady_clock::now();
+    request.infer();
+    const auto infer_end = std::chrono::steady_clock::now();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
+    raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
+    raw_metrics.m_new_token_times.emplace_back(infer_end);
+    raw_metrics.m_batch_sizes.emplace_back(1);
+}
+
+void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
+                                size_t offset,
+                                std::vector<std::pair<size_t, size_t>>& ranges) {
+    filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
+    filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
+    filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
+}
+
+}  // namespace utils
+}  // namespace genai
+}  // namespace ov
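To make the new helper concrete, a standalone worked example of filter_by_ranges follows; the template body is copied from the file above, with OPENVINO_ASSERT swapped for plain assert so the sketch compiles on its own, and the metric values invented:

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

template <typename T>
void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
    assert(ranges.empty() || value.size() >= (offset + ranges.back().second));
    std::vector<T> result{value.begin(), value.begin() + offset};
    for (auto [start, end] : ranges) {
        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
    }
    value = result;
}

int main() {
    // The first `offset` entries are kept unconditionally; each (start, end)
    // range is indexed relative to `offset`. Values are hypothetical metrics.
    std::vector<int> metrics{10, 11, 20, 21, 22, 23};
    std::vector<std::pair<size_t, size_t>> ranges{{0, 2}, {3, 4}};

    filter_by_ranges(metrics, 2, ranges);

    // Kept: prefix {10, 11}, then [2, 4) -> {20, 21} and [5, 6) -> {23}.
    assert((metrics == std::vector<int>{10, 11, 20, 21, 23}));
    return 0;
}

This is the mechanism whisper_generate uses to drop metrics for tokens that fall outside the extracted segment ranges while keeping everything recorded before segmentation.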
22 changes: 22 additions & 0 deletions src/cpp/src/whisper/whisper_utils.hpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <openvino/openvino.hpp>
+
+#include "openvino/genai/perf_metrics.hpp"
+
+namespace ov {
+namespace genai {
+namespace utils {
+
+void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics);
+
+void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
+                                size_t offset,
+                                std::vector<std::pair<size_t, size_t>>& ranges);
+
+}  // namespace utils
+}  // namespace genai
+}  // namespace ov