Commit

Merge branch 'master' into eltwise_blocked_mix_type
clee30 authored Nov 29, 2024
2 parents 914ecb0 + 90aacf2 commit 9fd5f7e
Showing 35 changed files with 576 additions and 128 deletions.
docs/articles_en/about-openvino/performance-benchmarks.rst (2 changes: 1 addition & 1 deletion)
@@ -163,7 +163,7 @@ For a listing of all platforms and configurations used for testing, refer to the
2024.5, as of November 20, 2024.

* OpenVINO Model Server performance results are based on release
- 2024.4, as of Sept. 30, 2024.
+ 2024.5, as of November 20, 2024.

The results may not reflect all publicly available updates. Intel technologies' features and
benefits depend on system configuration and may require enabled hardware, software, or service
@@ -146,6 +146,8 @@ offer a limited set of supported OpenVINO features.
ov::intel_npu::turbo
ov::intel_npu::tiles
ov::intel_npu::max_tiles
+ ov::intel_npu::bypass_umd_caching
+ ov::intel_npu::defer_weights_load
.. tab-item:: Read-only properties

@@ -168,7 +170,6 @@ offer a limited set of supported OpenVINO features.
ov::intel_npu::device_alloc_mem_size
ov::intel_npu::device_total_mem_size
ov::intel_npu::driver_version
- ov::intel_npu::bypass_umd_caching
.. note::
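
For readers of the property tables above: these keys are set and queried through the regular OpenVINO property API. The sketch below is illustrative only and not part of this change; it assumes the openvino/runtime/intel_npu/properties.hpp header and an available NPU device.

#include <iostream>
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_npu/properties.hpp>

int main() {
    ov::Core core;
    // Writable options (first tab above) are set before compiling a model.
    core.set_property("NPU", ov::intel_npu::turbo(true));
    core.set_property("NPU", ov::intel_npu::bypass_umd_caching(true));
    // Read-only properties (second tab above) can only be queried.
    std::cout << "NPU driver version: " << core.get_property("NPU", ov::intel_npu::driver_version) << std::endl;
    return 0;
}
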
@@ -6,7 +6,7 @@
"whats_new_model": false,
"PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "OpenVINO Model Server": {
+ "Ovms": {
"Precisions": [
{
"Throughput": {
@@ -30,7 +30,7 @@
}
]
},
- "vLLM with OpenVINO backend": {
+ "Vllm": {
"Precisions": [
{
"Throughput": {
@@ -63,7 +63,7 @@
"whats_new_model": false,
"PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "OpenVINO Model Server": {
+ "Ovms": {
"Precisions": [
{
"Throughput": {
@@ -87,7 +87,7 @@
}
]
},
- "vLLM with OpenVINO backend": {
+ "Vllm": {
"Precisions": [
{
"Throughput": {
@@ -120,7 +120,7 @@
"whats_new_model": false,
"PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "OpenVINO Model Server": {
+ "Ovms": {
"Precisions": [
{
"Throughput": {
@@ -144,7 +144,7 @@
}
]
},
- "vLLM with OpenVINO backend": {
+ "Vllm": {
"Precisions": [
{
"Throughput": {
@@ -177,7 +177,7 @@
"whats_new_model": true,
"PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "OpenVINO Model Server": {
+ "Ovms": {
"Precisions": [
{
"Throughput": {
@@ -201,7 +201,7 @@
}
]
},
- "vLLM with OpenVINO backend": {
+ "Vllm": {
"Precisions": [
{
"Throughput": {
@@ -228,7 +228,7 @@
"whats_new_model": true,
"PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "OpenVINO Model Server": {
+ "Ovms": {
"Precisions": [
{
"Throughput": {
@@ -252,7 +252,7 @@
}
]
},
- "vLLM with OpenVINO backend": {
+ "Vllm": {
"Precisions": [
{
"Throughput": {
@@ -283,7 +283,7 @@
"whats_new_model": true,
"PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "OpenVINO Model Server": {
+ "Ovms": {
"Precisions": [
{
"Throughput": {
@@ -307,7 +307,7 @@
}
]
},
- "vLLM with OpenVINO backend": {
+ "Vllm": {
"Precisions": [
{
"Throughput": {
@@ -338,7 +338,7 @@
"whats_new_model": false,
"PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "OpenVINO Model Server": {
+ "Ovms": {
"Precisions": [
{
"Throughput": {
@@ -362,7 +362,7 @@
}
]
},
- "vLLM with OpenVINO backend": {
+ "Vllm": {
"Precisions": [
{
"Throughput": {
@@ -393,7 +393,7 @@
"whats_new_model": false,
"PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "OpenVINO Model Server": {
+ "Ovms": {
"Precisions": [
{
"Throughput": {
@@ -417,7 +417,7 @@
}
]
},
- "vLLM with OpenVINO backend": {
+ "Vllm": {
"Precisions": [
{
"Throughput": {
@@ -450,7 +450,7 @@
"whats_new_model": false,
"PlatformType": "Server Platforms (Intel® Xeon®)",
"Parameters": {
- "OpenVINO Model Server": {
+ "Ovms": {
"Precisions": [
{
"Throughput": {
@@ -474,7 +474,7 @@
}
]
},
- "vLLM with OpenVINO backend": {
+ "Vllm": {
"Precisions": [
{
"Throughput": {
@@ -72,7 +72,7 @@ ov::matcher_pass_callback ConvertReduceBase::convert_reduce_to_pooling() {
return [&](ov::pass::pattern::Matcher& m) {
auto reduce = std::dynamic_pointer_cast<T>(m.get_match_root());

- if (!reduce || transformation_callback(reduce)) {
+ if (!reduce || transformation_callback(reduce) || ov::shape_size(reduce->input_value(0).get_shape()) == 0) {
return false;
}

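
The new condition in the hunk above skips the Reduce-to-Pooling conversion for empty inputs: ov::shape_size returns the product of the dimensions, so any zero-sized dimension yields 0. A small standalone illustration (not part of the patch; assumes the openvino/core/shape.hpp header):

#include <iostream>
#include <openvino/core/shape.hpp>

int main() {
    ov::Shape empty_input{2, 0, 4};  // a zero dimension makes the tensor empty
    ov::Shape dense_input{2, 3, 4};
    std::cout << ov::shape_size(empty_input) << std::endl;  // prints 0 -> conversion is skipped
    std::cout << ov::shape_size(dense_input) << std::endl;  // prints 24 -> conversion can proceed
    return 0;
}
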
@@ -11,10 +11,30 @@
#include "openvino/op/tensor_iterator.hpp"
#include "openvino/op/util/multi_subgraph_base.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
+ #include "openvino/util/common_util.hpp"
#include "transformations/utils/utils.hpp"

using namespace ov::op::util;

+ namespace {
+ /** @brief Value to mark that input idx has been removed (at least one removed so last idx will be always available) */
+ constexpr auto mark_removed = std::numeric_limits<uint64_t>::max();
+
+ constexpr bool is_not_removed_idx(const decltype(mark_removed) idx) {
+ return mark_removed != idx;
+ }
+
+ uint64_t get_updated_idx(uint64_t idx, uint64_t removed_idx) {
+ if (idx == removed_idx) {
+ return mark_removed;
+ } else if (is_not_removed_idx(idx) && idx > removed_idx) {
+ return idx - 1;
+ } else {
+ return idx;
+ }
+ };
+ } // namespace
+
bool ov::pass::RemoveMultiSubGraphOpDanglingParamsResults::run_on_model(const std::shared_ptr<ov::Model>& m) {
RUN_ON_MODEL_SCOPE(RemoveMultiSubGraphOpDanglingParamsResults);
bool is_changed = false;
@@ -117,7 +137,6 @@ bool ov::pass::RemoveMultiSubGraphOpDanglingParamsResults::run_on_model(const st
// Remove inputs
bool pass_required = false;
std::set<uint64_t> required_inputs_indices;
- auto op_inputs = multi_subgraph_op->input_values();
std::vector<std::vector<size_t>> to_remove_descriptors_indexes;
to_remove_descriptors_indexes.resize(subgraphs_size);
for (size_t body_idx = 0; body_idx < subgraphs_size; ++body_idx) {
@@ -142,64 +161,57 @@ bool ov::pass::RemoveMultiSubGraphOpDanglingParamsResults::run_on_model(const st
using DescType = op::util::MultiSubGraphOp::MultiSubgraphInputDescriptionVector;
auto update_body_param_desc = [](DescType& descriptors, uint64_t removed_body_idx) {
for (auto& desc : descriptors) {
- if (desc->m_body_parameter_index > removed_body_idx) {
- desc->m_body_parameter_index--;
- }
+ desc->m_body_parameter_index = get_updated_idx(desc->m_body_parameter_index, removed_body_idx);
}
};
auto update_op_inputs_desc = [&subgraphs_size](const std::shared_ptr<op::util::MultiSubGraphOp>& op,
- std::set<uint64_t>& required_inputs_indices,
uint64_t removed_loop_idx) {
- std::set<uint64_t> new_required_inputs_indices;
for (size_t body_idx = 0; body_idx < subgraphs_size; ++body_idx) {
auto& descriptors = op->get_input_descriptions(static_cast<int>(body_idx));
for (auto& desc : descriptors) {
- if (desc->m_input_index > removed_loop_idx) {
- desc->m_input_index--;
- }
+ desc->m_input_index = get_updated_idx(desc->m_input_index, removed_loop_idx);
}
}
- for (auto input_index : required_inputs_indices) {
- if (input_index > removed_loop_idx) {
- new_required_inputs_indices.insert(input_index - 1);
- } else {
- new_required_inputs_indices.insert(input_index);
- }
+ };
+
+ const auto update_required_input_indicies = [](std::set<uint64_t>& required_inputs_indices,
+ uint64_t removed_input_idx) {
+ std::set<uint64_t> new_required_inputs_indices;
+ for (const auto& input_index : required_inputs_indices) {
+ new_required_inputs_indices.insert(input_index > removed_input_idx ? input_index - 1 : input_index);
}
- required_inputs_indices = new_required_inputs_indices;
+ required_inputs_indices = std::move(new_required_inputs_indices);
};
// Remove dangling body params and input and update input descriptors
+ auto op_inputs = multi_subgraph_op->input_values();
for (size_t body_idx = 0; body_idx < subgraphs_size; ++body_idx) {
auto& body_in_descriptors = multi_subgraph_op->get_input_descriptions(static_cast<int>(body_idx));
- auto& body_func = multi_subgraph_op->get_function(static_cast<int>(body_idx));
- auto& body_params = body_func->get_parameters();
op::util::MultiSubGraphOp::MultiSubgraphInputDescriptionVector updated_body_in_descriptors;
+
for (size_t desc_idx = 0; desc_idx < body_in_descriptors.size(); ++desc_idx) {
- if (std::count(std::begin(to_remove_descriptors_indexes[body_idx]),
- std::end(to_remove_descriptors_indexes[body_idx]),
- desc_idx) > 0) {
- if (body_in_descriptors[desc_idx]->m_body_parameter_index < body_params.size()) {
- auto& body_param = body_params[body_in_descriptors[desc_idx]->m_body_parameter_index];
- body_func->remove_parameter(body_param);
- // Move all body indexes which are after these indicated by to_remove_descriptors_indexes
- update_body_param_desc(body_in_descriptors,
- body_in_descriptors[desc_idx]->m_body_parameter_index);
- }
- // remove dangling input of MultiSubGraphOp which was not removed earlier
- auto current_input_idx = body_in_descriptors[desc_idx]->m_input_index;
- // the same input tensor can go to different input ports
- if (current_input_idx < op_inputs.size() &&
- std::count(std::begin(required_inputs_indices),
- std::end(required_inputs_indices),
- current_input_idx) == 0 &&
- std::count(std::begin(op_inputs), std::end(op_inputs), op_inputs[current_input_idx]) > 0) {
- op_inputs.erase(std::next(op_inputs.begin(), current_input_idx));
- // Move all input indexes (in all bodies) which are after these indicated by
- // to_remove_descriptors_indexes and are not used in any body
- update_op_inputs_desc(multi_subgraph_op, required_inputs_indices, current_input_idx);
- }
- } else {
- updated_body_in_descriptors.emplace_back(body_in_descriptors[desc_idx]);
+ auto& current_body_desc = body_in_descriptors[desc_idx];
+ const auto current_body_parameter_idx = current_body_desc->m_body_parameter_index;
+ if (!util::contains(to_remove_descriptors_indexes[body_idx], desc_idx)) {
+ updated_body_in_descriptors.emplace_back(current_body_desc);
+ } else if (is_not_removed_idx(current_body_parameter_idx)) {
+ auto& body_func = multi_subgraph_op->get_function(body_idx);
+ const auto& body_params = body_func->get_parameters();
+
+ body_func->remove_parameter(body_params[current_body_parameter_idx]);
+ // Move all body indexes which are after these indicated by to_remove_descriptors_indexes
+ update_body_param_desc(body_in_descriptors, current_body_parameter_idx);
}
+
+ const auto current_input_idx = current_body_desc->m_input_index;
+ // remove dangling input of MultiSubGraphOp which was not removed earlier
+ // the same input tensor can go to different input ports
+ if (!util::contains(required_inputs_indices, current_input_idx) &&
+ is_not_removed_idx(current_input_idx)) {
+ op_inputs.erase(op_inputs.begin() + current_input_idx);
+ // Move all input indexes (in all bodies) which are after these indicated by
+ // to_remove_descriptors_indexes and are not used in any body
+ update_op_inputs_desc(multi_subgraph_op, current_input_idx);
+ update_required_input_indicies(required_inputs_indices, current_input_idx);
+ }
}
multi_subgraph_op->set_input_descriptions(static_cast<int>(body_idx), updated_body_in_descriptors);
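
To illustrate the sentinel-based index bookkeeping introduced in this file: get_updated_idx marks a descriptor index as removed when it pointed at the removed input, and shifts every higher index down by one. The sketch below re-declares the helpers locally for demonstration; it is not part of the patch.

#include <cstdint>
#include <iostream>
#include <limits>
#include <string>

// Local copies of the helpers added by this change, for demonstration only.
constexpr auto mark_removed = std::numeric_limits<uint64_t>::max();

constexpr bool is_not_removed_idx(const decltype(mark_removed) idx) {
    return mark_removed != idx;
}

uint64_t get_updated_idx(uint64_t idx, uint64_t removed_idx) {
    if (idx == removed_idx) {
        return mark_removed;  // the descriptor pointed at the removed input
    }
    if (is_not_removed_idx(idx) && idx > removed_idx) {
        return idx - 1;  // indices after the removed one shift down
    }
    return idx;  // already-removed or lower indices stay as they are
}

int main() {
    const uint64_t indices[] = {0, 1, 2, mark_removed};
    for (auto idx : indices) {
        const auto updated = get_updated_idx(idx, 1);  // input index 1 is being removed
        std::cout << (is_not_removed_idx(updated) ? std::to_string(updated) : "removed") << "\n";
    }
    // Output: 0, removed, 1, removed
    return 0;
}
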