diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
index 0fe26c7560ac55..eb103c493e4ef4 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -15,14 +15,10 @@
 #include "zero_pipeline.hpp"
 #include "zero_profiling.hpp"
 #include "zero_remote_tensor.hpp"
+#include "zero_tensor.hpp"
 
 namespace intel_npu {
 
-struct TensorInfo {
-    bool tensorCreatedLocally;
-    uint64_t originalMemoryId;
-};
-
 class ZeroInferRequest final : public SyncInferRequest {
 public:
     explicit ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
@@ -67,12 +63,9 @@ class ZeroInferRequest final : public SyncInferRequest {
     std::shared_ptr<ov::ITensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
     std::vector<std::shared_ptr<ov::ITensor>>& get_level_zero_inputs(size_t index) const;
 
-    std::shared_ptr<ov::ITensor> allocate_tensor(
-        const IODescriptor& descriptor,
-        const size_t index,
-        const bool isInput,
-        const ov::Allocator& allocator = {},
-        const std::optional<std::size_t> batchSize = std::nullopt) const override;
+    std::shared_ptr<ov::ITensor> create_tensor(ov::element::Type type,
+                                               const ov::Shape& shape,
+                                               const ov::Allocator& allocator = {}) const override;
 
     const std::shared_ptr<ZeroInitStructsHolder> _initStructs;
     const std::shared_ptr<IGraph> _graph;
@@ -84,9 +77,6 @@ class ZeroInferRequest final : public SyncInferRequest {
     mutable std::vector<std::vector<std::shared_ptr<ov::ITensor>>> _levelZeroInputTensors;
     mutable std::vector<std::shared_ptr<ov::ITensor>> _levelZeroOutputTensors;
 
-    mutable std::vector<TensorInfo> _levelZeroInputTensorInfo;
-    mutable std::vector<TensorInfo> _levelZeroOutputTensorInfo;
-
     ze_device_properties_t _properties = {};
     std::shared_ptr<const zeroMemory::HostMemAllocator> _inputAllocator;
     std::shared_ptr<const zeroMemory::HostMemAllocator> _outputAllocator;
diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
index bfea560e907967..7bf598dc407915 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -7,9 +7,9 @@
 #include "intel_npu/common/igraph.hpp"
 #include "intel_npu/utils/zero/zero_utils.hpp"
 #include "intel_npu/utils/zero/zero_wrappers.hpp"
-#include "openvino/runtime/itensor.hpp"
 #include "zero_memory.hpp"
 #include "zero_profiling.hpp"
+#include "zero_tensor.hpp"
 
 namespace intel_npu {
 
diff --git a/src/plugins/intel_npu/src/backend/include/zero_tensor.hpp b/src/plugins/intel_npu/src/backend/include/zero_tensor.hpp
index 8eedda4475b38a..9cae2b98425a37 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_tensor.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_tensor.hpp
@@ -22,7 +22,7 @@ class ZeroTensor final : public ov::ITensor {
                const ov::Shape& shape,
                const ov::Allocator& allocator);
 
-    void* data(const ov::element::Type& element_type) const override;
+    void* data(const ov::element::Type& type = {}) const override;
 
     const ov::element::Type& get_element_type() const override;
 
@@ -32,6 +32,9 @@
 
     const ov::Strides& get_strides() const override;
 
+    bool memory_address_changed();
+    void reset_memory_flag();
+
     ~ZeroTensor();
 
 private:
@@ -51,6 +54,7 @@ class ZeroTensor final : public ov::ITensor {
     mutable std::once_flag _strides_once;
     ov::Allocator _allocator;
     void* _ptr = nullptr;
+    bool _reset_tensor_memory = false;
 };
 
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
index af35229903de5d..1ae7d0a759cb6e 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -13,7 +13,6 @@
 #include "openvino/op/util/op_types.hpp"
 #include "openvino/runtime/intel_npu/remote_properties.hpp"
 #include "zero_memory.hpp"
-#include "zero_tensor.hpp"
 
 using namespace intel_npu;
 
@@ -92,17 +91,6 @@ bool memory_was_allocated_in_the_same_l0_context(ze_context_handle_t hContext, c
     return false;
 }
 
-uint64_t get_memory_id(ze_context_handle_t hContext, const void* ptr) {
-    ze_memory_allocation_properties_t desc = {};
-    desc.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES;
-    auto res = intel_npu::zeMemGetAllocProperties(hContext, ptr, &desc, nullptr);
-    if (res != ZE_RESULT_SUCCESS) {
-        return 0;
-    }
-
-    return desc.id;
-}
-
 }  // namespace
 
 //------------------------------------------------------------------------------
@@ -116,8 +104,6 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
       _logger("ZeroInferRequest", config.get<LOG_LEVEL>()),
       _levelZeroInputTensors(_metadata.inputs.size(), std::vector<std::shared_ptr<ov::ITensor>>(1, nullptr)),
       _levelZeroOutputTensors(_metadata.outputs.size(), nullptr),
-      _levelZeroInputTensorInfo(_metadata.inputs.size(), TensorInfo{false, 0}),
-      _levelZeroOutputTensorInfo(_metadata.outputs.size(), TensorInfo{false, 0}),
       _profilingPool(_initStructs, _graph, zeroProfiling::POOL_SIZE),
       _profilingQuery(_initStructs, 0) {
     _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest");
@@ -196,7 +182,6 @@ void ZeroInferRequest::create_pipeline() {
                                                            INPUT,
                                                            *_inputAllocator,
                                                            _graph->get_batch_size());
-        _levelZeroInputTensorInfo.at(inputIndex).tensorCreatedLocally = true;
     }
 
     for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) {
@@ -212,7 +197,6 @@ void ZeroInferRequest::create_pipeline() {
                                                               OUTPUT,
                                                               *_outputAllocator,
                                                               _graph->get_batch_size());
-        _levelZeroOutputTensorInfo.at(outputIndex).tensorCreatedLocally = true;
     }
 
     if (_initStructs->getMutableCommandListVersion()) {
@@ -229,9 +213,6 @@ void ZeroInferRequest::create_pipeline() {
             if (std::dynamic_pointer_cast<ZeroRemoteTensor>(get_level_zero_input(inputIndex)) != nullptr) {
                 continue;
             }
-
-            _levelZeroInputTensorInfo.at(inputIndex).originalMemoryId =
-                get_memory_id(_initStructs->getContext(), get_level_zero_input(inputIndex)->data());
         }
 
         for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) {
@@ -243,9 +224,6 @@ void ZeroInferRequest::create_pipeline() {
             if (std::dynamic_pointer_cast<ZeroRemoteTensor>(_levelZeroOutputTensors.at(outputIndex)) != nullptr) {
                 continue;
             }
-
-            _levelZeroOutputTensorInfo.at(outputIndex).originalMemoryId =
-                get_memory_id(_initStructs->getContext(), _levelZeroOutputTensors.at(outputIndex)->data());
         }
     }
 
@@ -275,24 +253,15 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
                                        const bool isInput) {
     OV_ITT_TASK_CHAIN(ZERO_SET_TENSOR, itt::domains::LevelZeroBackend, "set_tensor", "set_tensor_data");
     auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index);
-    auto& tensorCreatedLocally = isInput ? _levelZeroInputTensorInfo.at(index).tensorCreatedLocally
-                                         : _levelZeroOutputTensorInfo.at(index).tensorCreatedLocally;
-
-    bool setTensorData = false;
-    bool levelZeroTensorCreatedLocally = true;
-
-    OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "check_data_allocation");
-    if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensor->data())) {
-        _logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context");
-        levelZeroTensors = tensor;
-        levelZeroTensorCreatedLocally = false;
-        setTensorData = true;
-    }
-    if (!setTensorData) {
-        // make sure that the L0 tensor was allocated locally and is not received from the user when receiving
-        // random tensor
-        if (!tensorCreatedLocally) {
+    const auto& zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(tensor);
+
+    if (zeroTensor == nullptr) {
+        OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "check_data_allocation");
+        if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensor->data())) {
+            _logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context");
+            levelZeroTensors = tensor;
+        } else {
             _logger.debug("ZeroInferRequest::set_tensor_data - create locally L0 tensor");
             OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor");
 
@@ -301,28 +270,16 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
                                                isInput,
                                                isInput ? *_inputAllocator : *_outputAllocator,
                                                _graph->get_batch_size());
-
-            setTensorData = true;
-            levelZeroTensorCreatedLocally = true;
         }
-    }
-
-    if (setTensorData) {
-        tensorCreatedLocally = levelZeroTensorCreatedLocally;
 
         if (_pipelineIsCreated) {
             _logger.debug("ZeroInferRequest::infer_async - update command list");
 
-            auto& updateOriginalAddress = isInput ? _levelZeroInputTensorInfo.at(index).originalMemoryId
-                                                  : _levelZeroOutputTensorInfo.at(index).originalMemoryId;
-
             OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList");
             _pipeline->updateCommandList(levelZeroTensors->data(),
                                          levelZeroTensors->get_byte_size(),
                                          isInput ? _graph->get_input_descriptors().at(index).idx
                                                  : _graph->get_output_descriptors().at(index).idx);
-
-            updateOriginalAddress = get_memory_id(_initStructs->getContext(), levelZeroTensors->data());
         }
     }
 }
@@ -344,11 +301,7 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptr<ZeroRemoteTe
     OPENVINO_ASSERT(data, "Empty buffer");
 
     auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index);
-    auto& tensorCreatedLocally = isInput ? _levelZeroInputTensorInfo.at(index).tensorCreatedLocally
-                                         : _levelZeroOutputTensorInfo.at(index).tensorCreatedLocally;
-
     levelZeroTensors = tensor;
-    tensorCreatedLocally = false;
 
     if (_pipelineIsCreated) {
@@ -363,7 +316,5 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::N
                                        isInput,
                                        isInput ? *_inputAllocator : *_outputAllocator,
                                        _graph->get_batch_size());
-    tensorCreatedLocally = true;
-
     return levelZeroTensors;
 }
@@ -646,15 +595,20 @@ void ZeroInferRequest::infer_async() {
             continue;
         }
 
-        auto memoryId = get_memory_id(_initStructs->getContext(), levelZeroTensor.at(SINGLE_TENSOR)->data());
+        auto zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(levelZeroTensor.at(SINGLE_TENSOR));
 
-        if (_levelZeroInputTensorInfo.at(inputIndex).originalMemoryId != memoryId) {
+        if (zeroTensor == nullptr) {
+            ++inputIndex;
+            continue;
+        }
+
+        if (zeroTensor->memory_address_changed()) {
             _logger.debug("Update input graph descriptor with the new tensor");
-            _pipeline->updateCommandList(levelZeroTensor.at(SINGLE_TENSOR)->data(),
-                                         levelZeroTensor.at(SINGLE_TENSOR)->get_byte_size(),
+            _pipeline->updateCommandList(zeroTensor->data(),
+                                         zeroTensor->get_byte_size(),
                                          _graph->get_input_descriptors().at(inputIndex).idx);
 
-            _levelZeroInputTensorInfo.at(inputIndex).originalMemoryId = memoryId;
+            zeroTensor->reset_memory_flag();
         }
 
         ++inputIndex;
@@ -674,15 +628,20 @@ void ZeroInferRequest::infer_async() {
             continue;
         }
 
-        auto memoryId = get_memory_id(_initStructs->getContext(), levelZeroTensor->data());
+        auto zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(levelZeroTensor);
+
+        if (zeroTensor == nullptr) {
+            ++outputIndex;
+            continue;
+        }
 
-        if (_levelZeroOutputTensorInfo.at(outputIndex).originalMemoryId != memoryId) {
+        if (zeroTensor->memory_address_changed()) {
             _logger.debug("Update output graph descriptor with the new tensor");
-            _pipeline->updateCommandList(levelZeroTensor->data(),
-                                         levelZeroTensor->get_byte_size(),
+            _pipeline->updateCommandList(zeroTensor->data(),
+                                         zeroTensor->get_byte_size(),
                                          _graph->get_output_descriptors().at(outputIndex).idx);
 
-            _levelZeroOutputTensorInfo.at(outputIndex).originalMemoryId = memoryId;
+            zeroTensor->reset_memory_flag();
         }
 
         ++outputIndex;
@@ -810,45 +769,12 @@ std::vector<ov::ProfilingInfo> ZeroInferRequest::get_profiling_info() const {
     }
 }
 
-std::shared_ptr<ov::ITensor> ZeroInferRequest::allocate_tensor(const IODescriptor& descriptor,
-                                                               const size_t index,
-                                                               const bool isInput,
-                                                               const ov::Allocator& allocator,
-                                                               const std::optional<std::size_t> batchSize) const {
-    check_network_precision(descriptor.precision);
-
-    std::shared_ptr<ov::ITensor> tensor;
-    ov::Shape allocatedTensorShape = descriptor.shapeFromCompiler.get_max_shape();
-
-    if (batchSize.has_value()) {
-        allocatedTensorShape[BATCH_AXIS] = *batchSize;
-    }
-
-    if (descriptor.isStateOutput) {
-        // Only one buffer is required for each (state input, state output) pair, acting as an input before running the
-        // inference and as an output after performing it. Thus both the "state input" and "state output" entries shall
-        // point to the same buffer.
-        OPENVINO_ASSERT(descriptor.relatedDescriptorIndex.has_value(),
-                        "The link between state descriptors is missing, state name: ",
-                        descriptor.nameFromCompiler);
-        tensor = get_user_input(*descriptor.relatedDescriptorIndex)._ptr;
-    } else {
-        tensor = std::make_shared<ZeroTensor>(_initStructs, descriptor.precision, allocatedTensorShape, allocator);
-    }
-
-    if (isInput) {
-        if (get_user_input(index) == nullptr) {
-            get_user_input(index) = tensor;
-        }
-
-        if (descriptor.isStateInput) {
-            _variableStates.push_back(std::make_shared<VariableState>(descriptor.nameFromCompiler, tensor));
-        }
-    } else if (_userOutputTensors.at(index) == nullptr) {
-        _userOutputTensors.at(index) = tensor;
-    }
+std::shared_ptr<ov::ITensor> ZeroInferRequest::create_tensor(ov::element::Type type,
+                                                             const ov::Shape& shape,
+                                                             const ov::Allocator& allocator) const {
+    OPENVINO_ASSERT(allocator, "Allocator must be provided when creating a zero tensor!");
 
-    return tensor;
+    return std::make_shared<ZeroTensor>(_initStructs, type, shape, allocator);
 }
 
 std::vector<uint8_t> ZeroInferRequest::get_raw_profiling_data() const {
diff --git a/src/plugins/intel_npu/src/backend/src/zero_tensor.cpp b/src/plugins/intel_npu/src/backend/src/zero_tensor.cpp
index b98628f3f23c93..084ff4b200b837 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_tensor.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_tensor.cpp
@@ -116,10 +116,6 @@ void ZeroTensor::set_shape(ov::Shape new_shape) {
     _shape = std::move(new_shape);
 
     if (get_size() > get_capacity()) {
-#ifdef __linux__
-        OPENVINO_THROW("Re-shaping the tensor with a larger shape is not available.");
-#endif
-
         if (!_init_structs->getMutableCommandListVersion()) {
             OPENVINO_THROW("Re-shaping the tensor with a larger shape is not available using this driver version. "
                            "Please update the driver.");
@@ -131,12 +127,22 @@ void ZeroTensor::set_shape(ov::Shape new_shape) {
         _capacity = _shape;
         _ptr = _allocator.allocate(get_bytes_capacity());
         initialize_elements(_ptr, _element_type, _shape);
+
+        _reset_tensor_memory = true;
     }
 
     _strides.clear();
     update_strides();
 }
 
+bool ZeroTensor::memory_address_changed() {
+    return _reset_tensor_memory;
+}
+
+void ZeroTensor::reset_memory_flag() {
+    _reset_tensor_memory = false;
+}
+
 ZeroTensor::~ZeroTensor() {
     destroy_memory();
 }
diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp
index dfcb37a0043ce6..8b96629c46240f 100644
--- a/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp
+++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp
@@ -163,12 +163,15 @@ class SyncInferRequest : public ov::IInferRequest {
      * @param batchSize If provided, the value of the shape on the 0th axis is overriden with this value.
      * @return Pointer towards the allocated tensor
      */
-    virtual std::shared_ptr<ov::ITensor> allocate_tensor(
-        const IODescriptor& descriptor,
-        const size_t index,
-        const bool isInput,
-        const ov::Allocator& allocator = {},
-        const std::optional<std::size_t> batchSize = std::nullopt) const;
+    std::shared_ptr<ov::ITensor> allocate_tensor(const IODescriptor& descriptor,
+                                                 const size_t index,
+                                                 const bool isInput,
+                                                 const ov::Allocator& allocator = {},
+                                                 const std::optional<std::size_t> batchSize = std::nullopt) const;
+
+    virtual std::shared_ptr<ov::ITensor> create_tensor(ov::element::Type type,
+                                                       const ov::Shape& shape,
+                                                       const ov::Allocator& allocator = {}) const;
 
     bool is_batched_input(size_t idx) const;
 
diff --git a/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp b/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp
index 09f00b43c840c1..b2fd8f457a6d1e 100644
--- a/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp
@@ -310,10 +310,8 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::allocate_tensor(const IODescripto
                         "The link between state descriptors is missing, state name: ",
                         descriptor.nameFromCompiler);
         tensor = get_user_input(*descriptor.relatedDescriptorIndex)._ptr;
-    } else if (allocator) {
-        tensor = ov::make_tensor(descriptor.precision, allocatedTensorShape, allocator);
     } else {
-        tensor = ov::make_tensor(descriptor.precision, allocatedTensorShape);
+        tensor = create_tensor(descriptor.precision, allocatedTensorShape, allocator);
     }
 
     if (isInput) {
@@ -331,6 +329,12 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::allocate_tensor(const IODescripto
     return tensor;
 }
 
+std::shared_ptr<ov::ITensor> SyncInferRequest::create_tensor(ov::element::Type type,
+                                                             const ov::Shape& shape,
+                                                             const ov::Allocator& allocator) const {
+    return ov::make_tensor(type, shape, allocator);
+}
+
 bool SyncInferRequest::is_batched_input(size_t idx) const {
     return _userInputTensors.at(idx).size() > 1;
 }
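
Reviewer note on the reallocation tracking introduced above: the diff replaces the driver-queried `originalMemoryId` bookkeeping (`zeMemGetAllocProperties` on every run) with a dirty flag owned by `ZeroTensor` — `set_shape()` raises `_reset_tensor_memory` when growth forces a fresh allocation, and `infer_async()` re-patches the command list and calls `reset_memory_flag()`. The sketch below is illustrative only, assuming nothing beyond that contract; `Buffer`, `CommandList`, and `bind` are invented stand-ins, not OpenVINO or Level Zero APIs.

```cpp
#include <cstdio>
#include <vector>

class Buffer {
    std::vector<char> _storage;
    bool _reset_memory = false;

public:
    explicit Buffer(size_t bytes) : _storage(bytes) {}

    // Grow-only reshape: a reallocation invalidates any registered address,
    // so raise the flag; contents are not preserved, mirroring how
    // ZeroTensor::set_shape() destroys and reallocates its memory.
    void resize(size_t bytes) {
        if (bytes > _storage.capacity()) {
            _storage = std::vector<char>(bytes);
            _reset_memory = true;
        } else {
            _storage.resize(bytes);
        }
    }

    void* data() { return _storage.data(); }
    size_t byte_size() const { return _storage.size(); }

    // Same contract as ZeroTensor::memory_address_changed()/reset_memory_flag().
    bool memory_address_changed() const { return _reset_memory; }
    void reset_memory_flag() { _reset_memory = false; }
};

struct CommandList {
    void update(void* ptr, size_t bytes, size_t argIndex) {
        std::printf("patch arg %zu -> %p (%zu bytes)\n", argIndex, ptr, bytes);
    }
};

// Mirrors the infer_async() pattern: re-patch only buffers whose backing
// memory actually moved, then clear the flag so the next run is a no-op.
void bind(CommandList& cl, Buffer& buf, size_t argIndex) {
    if (buf.memory_address_changed()) {
        cl.update(buf.data(), buf.byte_size(), argIndex);
        buf.reset_memory_flag();
    }
}

int main() {
    CommandList cl;
    Buffer input(64);
    bind(cl, input, 0);  // no-op: nothing changed yet
    input.resize(256);   // grows past capacity -> reallocation -> flag set
    bind(cl, input, 0);  // patches the command list once
    bind(cl, input, 0);  // no-op again
}
```

The design point shows in the last two `bind` calls: the tensor itself reports when its address changed, so the request loop re-patches exactly once instead of querying allocation properties from the driver on every inference.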
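
A similarly hedged sketch of the `allocate_tensor`/`create_tensor` split in `SyncInferRequest`: it is a template-method refactor in which the shared allocation policy (max-shape deduction, batch override, state-tensor reuse, user-tensor registration) stays non-virtual in the base class, while backends override only the creation step — `ZeroInferRequest::create_tensor` returns a `ZeroTensor`, the default returns a plain host tensor. All class and method names below are simplified stand-ins, not the real plugin types.

```cpp
#include <iostream>
#include <memory>
#include <string>

struct Tensor {
    virtual ~Tensor() = default;
    virtual std::string kind() const { return "host tensor"; }
};

struct LevelZeroTensor final : Tensor {
    std::string kind() const override { return "level-zero tensor"; }
};

class BaseRequest {
public:
    virtual ~BaseRequest() = default;

    // Non-virtual: the shared bookkeeping lives once in the base class,
    // as SyncInferRequest::allocate_tensor() does after this PR.
    std::shared_ptr<Tensor> allocate() const {
        // ...common shape/state/registration logic would go here...
        return create();  // only the creation step is backend-specific
    }

protected:
    // Default mirrors SyncInferRequest::create_tensor(): a plain host tensor.
    virtual std::shared_ptr<Tensor> create() const {
        return std::make_shared<Tensor>();
    }
};

class ZeroRequest final : public BaseRequest {
protected:
    // Mirrors ZeroInferRequest::create_tensor(): a device-friendly tensor.
    std::shared_ptr<Tensor> create() const override {
        return std::make_shared<LevelZeroTensor>();
    }
};

int main() {
    ZeroRequest req;
    std::cout << req.allocate()->kind() << '\n';  // prints "level-zero tensor"
}
```

This removes the duplicated `allocate_tensor` override in the backend and explains why the previous `allocator`-vs-no-`allocator` branching in `sync_infer_request.cpp` collapses into a single `create_tensor` call.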