Add method for checking memory address changes in the zero tensor
Signed-off-by: Bogdan Pereanu <[email protected]>
pereanub committed Jan 7, 2025
1 parent ac35818 commit 9b66d23
Showing 7 changed files with 69 additions and 136 deletions.
18 changes: 4 additions & 14 deletions src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -15,14 +15,10 @@
#include "zero_pipeline.hpp"
#include "zero_profiling.hpp"
#include "zero_remote_tensor.hpp"
#include "zero_tensor.hpp"

namespace intel_npu {

struct TensorInfo {
bool tensorCreatedLocally;
uint64_t originalMemoryId;
};

class ZeroInferRequest final : public SyncInferRequest {
public:
explicit ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
@@ -67,12 +63,9 @@ class ZeroInferRequest final : public SyncInferRequest {
std::shared_ptr<ov::ITensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
std::vector<std::shared_ptr<ov::ITensor>>& get_level_zero_inputs(size_t index) const;

std::shared_ptr<ov::ITensor> allocate_tensor(
const IODescriptor& descriptor,
const size_t index,
const bool isInput,
const ov::Allocator& allocator = {},
const std::optional<std::size_t> batchSize = std::nullopt) const override;
std::shared_ptr<ov::ITensor> create_tensor(ov::element::Type type,
const ov::Shape& shape,
const ov::Allocator& allocator = {}) const override;

const std::shared_ptr<ZeroInitStructsHolder> _initStructs;
const std::shared_ptr<IGraph> _graph;
@@ -84,9 +77,6 @@ class ZeroInferRequest final : public SyncInferRequest {
mutable std::vector<std::vector<std::shared_ptr<ov::ITensor>>> _levelZeroInputTensors;
mutable std::vector<std::shared_ptr<ov::ITensor>> _levelZeroOutputTensors;

mutable std::vector<TensorInfo> _levelZeroInputTensorInfo;
mutable std::vector<TensorInfo> _levelZeroOutputTensorInfo;

ze_device_properties_t _properties = {};
std::shared_ptr<const zeroMemory::HostMemAllocator> _inputAllocator;
std::shared_ptr<const zeroMemory::HostMemAllocator> _outputAllocator;
src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -7,9 +7,9 @@
#include "intel_npu/common/igraph.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"
#include "intel_npu/utils/zero/zero_wrappers.hpp"
#include "openvino/runtime/itensor.hpp"
#include "zero_memory.hpp"
#include "zero_profiling.hpp"
#include "zero_tensor.hpp"

namespace intel_npu {

6 changes: 5 additions & 1 deletion src/plugins/intel_npu/src/backend/include/zero_tensor.hpp
@@ -22,7 +22,7 @@ class ZeroTensor final : public ov::ITensor {
const ov::Shape& shape,
const ov::Allocator& allocator);

void* data(const ov::element::Type& element_type) const override;
void* data(const ov::element::Type& type = {}) const override;

const ov::element::Type& get_element_type() const override;

@@ -32,6 +32,9 @@

const ov::Strides& get_strides() const override;

bool memory_address_changed();
void reset_memory_flag();

~ZeroTensor();

private:
@@ -51,6 +54,7 @@ class ZeroTensor final : public ov::ITensor {
mutable std::once_flag _strides_once;
ov::Allocator _allocator;
void* _ptr = nullptr;
bool _reset_tensor_memory = false;
};

} // namespace intel_npu
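
For readers skimming the diff: the two new methods form a small dirty-flag protocol. set_shape() raises the flag when it has to reallocate, and the infer request polls and clears it before execution. Below is a minimal, self-contained sketch of that protocol; ToyTensor and its main() driver are illustrative stand-ins, not plugin code.

#include <cstddef>
#include <cstdlib>
#include <iostream>

// Illustrative stand-in for ZeroTensor's reallocation tracking: the flag is
// raised whenever the backing storage moves, and the consumer clears it once
// it has re-registered the new address.
class ToyTensor {
public:
    explicit ToyTensor(std::size_t bytes) : _ptr(std::malloc(bytes)), _capacity(bytes) {}
    ~ToyTensor() { std::free(_ptr); }

    void resize(std::size_t bytes) {
        if (bytes > _capacity) {
            std::free(_ptr);
            _ptr = std::malloc(bytes);    // storage moved to a new address
            _capacity = bytes;
            _reset_tensor_memory = true;  // mirrors ZeroTensor::set_shape
        }
    }

    bool memory_address_changed() const { return _reset_tensor_memory; }
    void reset_memory_flag() { _reset_tensor_memory = false; }
    void* data() const { return _ptr; }

private:
    void* _ptr = nullptr;
    std::size_t _capacity = 0;
    bool _reset_tensor_memory = false;
};

int main() {
    ToyTensor t(64);
    t.resize(256);  // forces a reallocation
    if (t.memory_address_changed()) {
        std::cout << "re-register " << t.data() << " with the command list\n";
        t.reset_memory_flag();  // acknowledge, as infer_async() does below
    }
}

Keeping the flag inside the tensor lets the infer request drop the per-inference driver query (the get_memory_id() helper built on zeMemGetAllocProperties) that the zero_infer_request.cpp changes below delete.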
140 changes: 33 additions & 107 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -13,7 +13,6 @@
#include "openvino/op/util/op_types.hpp"
#include "openvino/runtime/intel_npu/remote_properties.hpp"
#include "zero_memory.hpp"
#include "zero_tensor.hpp"

using namespace intel_npu;

@@ -92,17 +91,6 @@ bool memory_was_allocated_in_the_same_l0_context(ze_context_handle_t hContext, const void* ptr) {
return false;
}

uint64_t get_memory_id(ze_context_handle_t hContext, const void* ptr) {
ze_memory_allocation_properties_t desc = {};
desc.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES;
auto res = intel_npu::zeMemGetAllocProperties(hContext, ptr, &desc, nullptr);
if (res != ZE_RESULT_SUCCESS) {
return 0;
}

return desc.id;
}

} // namespace

//------------------------------------------------------------------------------
@@ -116,8 +104,6 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
_logger("ZeroInferRequest", config.get<LOG_LEVEL>()),
_levelZeroInputTensors(_metadata.inputs.size(), std::vector<std::shared_ptr<ov::ITensor>>(1, nullptr)),
_levelZeroOutputTensors(_metadata.outputs.size(), nullptr),
_levelZeroInputTensorInfo(_metadata.inputs.size(), TensorInfo{false, 0}),
_levelZeroOutputTensorInfo(_metadata.outputs.size(), TensorInfo{false, 0}),
_profilingPool(_initStructs, _graph, zeroProfiling::POOL_SIZE),
_profilingQuery(_initStructs, 0) {
_logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest");
@@ -196,7 +182,6 @@ void ZeroInferRequest::create_pipeline() {
INPUT,
*_inputAllocator,
_graph->get_batch_size());
_levelZeroInputTensorInfo.at(inputIndex).tensorCreatedLocally = true;
}

for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) {
@@ -212,7 +197,6 @@
OUTPUT,
*_outputAllocator,
_graph->get_batch_size());
_levelZeroOutputTensorInfo.at(outputIndex).tensorCreatedLocally = true;
}

if (_initStructs->getMutableCommandListVersion()) {
@@ -229,9 +213,6 @@
if (std::dynamic_pointer_cast<ZeroRemoteTensor>(get_level_zero_input(inputIndex)) != nullptr) {
continue;
}

_levelZeroInputTensorInfo.at(inputIndex).originalMemoryId =
get_memory_id(_initStructs->getContext(), get_level_zero_input(inputIndex)->data());
}

for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) {
Expand All @@ -243,9 +224,6 @@ void ZeroInferRequest::create_pipeline() {
if (std::dynamic_pointer_cast<ZeroRemoteTensor>(_levelZeroOutputTensors.at(outputIndex)) != nullptr) {
continue;
}

_levelZeroOutputTensorInfo.at(outputIndex).originalMemoryId =
get_memory_id(_initStructs->getContext(), _levelZeroOutputTensors.at(outputIndex)->data());
}
}

@@ -275,24 +253,15 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tensor,
const bool isInput) {
OV_ITT_TASK_CHAIN(ZERO_SET_TENSOR, itt::domains::LevelZeroBackend, "set_tensor", "set_tensor_data");
auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index);
auto& tensorCreatedLocally = isInput ? _levelZeroInputTensorInfo.at(index).tensorCreatedLocally
: _levelZeroOutputTensorInfo.at(index).tensorCreatedLocally;

bool setTensorData = false;
bool levelZeroTensorCreatedLocally = true;

OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "check_data_allocation");
if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensor->data())) {
_logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context");
levelZeroTensors = tensor;
levelZeroTensorCreatedLocally = false;
setTensorData = true;
}

if (!setTensorData) {
// make sure that the L0 tensor was allocated locally and is not received from the user when receiving
// random tensor
if (!tensorCreatedLocally) {
const auto& zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(tensor);

if (zeroTensor == nullptr) {
OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "check_data_allocation");
if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensor->data())) {
_logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context");
levelZeroTensors = tensor;
} else {
_logger.debug("ZeroInferRequest::set_tensor_data - create locally L0 tensor");
OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor");

Expand All @@ -301,28 +270,16 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
_graph->get_batch_size());

setTensorData = true;
levelZeroTensorCreatedLocally = true;
}
}

if (setTensorData) {
tensorCreatedLocally = levelZeroTensorCreatedLocally;

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");

auto& updateOriginalAddress = isInput ? _levelZeroInputTensorInfo.at(index).originalMemoryId
: _levelZeroOutputTensorInfo.at(index).originalMemoryId;

OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList");
_pipeline->updateCommandList(levelZeroTensors->data(),
levelZeroTensors->get_byte_size(),
isInput ? _graph->get_input_descriptors().at(index).idx
: _graph->get_output_descriptors().at(index).idx);

updateOriginalAddress = get_memory_id(_initStructs->getContext(), levelZeroTensors->data());
}
}
}
@@ -344,11 +301,7 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptr<ZeroRemoteTensor>& tensor,
}

auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index);
auto& tensorCreatedLocally = isInput ? _levelZeroInputTensorInfo.at(index).tensorCreatedLocally
: _levelZeroOutputTensorInfo.at(index).tensorCreatedLocally;

levelZeroTensors = tensor;
tensorCreatedLocally = false;

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");
@@ -500,17 +453,13 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
metadata.nodeFriendlyName.c_str());

auto& levelZeroTensors = isInput ? get_level_zero_input(ioIndex) : _levelZeroOutputTensors.at(ioIndex);
auto& tensorCreatedLocally = isInput ? _levelZeroInputTensorInfo.at(ioIndex).tensorCreatedLocally
: _levelZeroOutputTensorInfo.at(ioIndex).tensorCreatedLocally;

levelZeroTensors = allocate_tensor(metadata,
ioIndex,
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
_graph->get_batch_size());

tensorCreatedLocally = true;

return levelZeroTensors;
}

@@ -646,15 +595,20 @@ void ZeroInferRequest::infer_async() {
continue;
}

auto memoryId = get_memory_id(_initStructs->getContext(), levelZeroTensor.at(SINGLE_TENSOR)->data());
auto zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(levelZeroTensor.at(SINGLE_TENSOR));

if (_levelZeroInputTensorInfo.at(inputIndex).originalMemoryId != memoryId) {
if (zeroTensor == nullptr) {
++inputIndex;
continue;
}

if (zeroTensor->memory_address_changed()) {
_logger.debug("Update input graph descriptor with the new tensor");
_pipeline->updateCommandList(levelZeroTensor.at(SINGLE_TENSOR)->data(),
levelZeroTensor.at(SINGLE_TENSOR)->get_byte_size(),
_pipeline->updateCommandList(zeroTensor->data(),
zeroTensor->get_byte_size(),
_graph->get_input_descriptors().at(inputIndex).idx);

_levelZeroInputTensorInfo.at(inputIndex).originalMemoryId = memoryId;
zeroTensor->reset_memory_flag();
}

++inputIndex;
@@ -674,15 +628,20 @@
continue;
}

auto memoryId = get_memory_id(_initStructs->getContext(), levelZeroTensor->data());
auto zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(levelZeroTensor);

if (zeroTensor == nullptr) {
++outputIndex;
continue;
}

if (_levelZeroOutputTensorInfo.at(outputIndex).originalMemoryId != memoryId) {
if (zeroTensor->memory_address_changed()) {
_logger.debug("Update output graph descriptor with the new tensor");
_pipeline->updateCommandList(levelZeroTensor->data(),
levelZeroTensor->get_byte_size(),
_pipeline->updateCommandList(zeroTensor->data(),
zeroTensor->get_byte_size(),
_graph->get_output_descriptors().at(outputIndex).idx);

_levelZeroOutputTensorInfo.at(outputIndex).originalMemoryId = memoryId;
zeroTensor->reset_memory_flag();
}

++outputIndex;
@@ -810,45 +769,12 @@ std::vector<ov::ProfilingInfo> ZeroInferRequest::get_profiling_info() const {
}
}

std::shared_ptr<ov::ITensor> ZeroInferRequest::allocate_tensor(const IODescriptor& descriptor,
const size_t index,
const bool isInput,
const ov::Allocator& allocator,
const std::optional<std::size_t> batchSize) const {
check_network_precision(descriptor.precision);

std::shared_ptr<ov::ITensor> tensor;
ov::Shape allocatedTensorShape = descriptor.shapeFromCompiler.get_max_shape();

if (batchSize.has_value()) {
allocatedTensorShape[BATCH_AXIS] = *batchSize;
}

if (descriptor.isStateOutput) {
// Only one buffer is required for each (state input, state output) pair, acting as an input before running the
// inference and as an output after performing it. Thus both the "state input" and "state output" entries shall
// point to the same buffer.
OPENVINO_ASSERT(descriptor.relatedDescriptorIndex.has_value(),
"The link between state descriptors is missing, state name: ",
descriptor.nameFromCompiler);
tensor = get_user_input(*descriptor.relatedDescriptorIndex)._ptr;
} else {
tensor = std::make_shared<ZeroTensor>(_initStructs, descriptor.precision, allocatedTensorShape, allocator);
}

if (isInput) {
if (get_user_input(index) == nullptr) {
get_user_input(index) = tensor;
}

if (descriptor.isStateInput) {
_variableStates.push_back(std::make_shared<VariableState>(descriptor.nameFromCompiler, tensor));
}
} else if (_userOutputTensors.at(index) == nullptr) {
_userOutputTensors.at(index) = tensor;
}
std::shared_ptr<ov::ITensor> ZeroInferRequest::create_tensor(ov::element::Type type,
const ov::Shape& shape,
const ov::Allocator& allocator) const {
OPENVINO_ASSERT(allocator, "Allocator must be provided when creating a zero tensor!");

return tensor;
return std::make_shared<ZeroTensor>(_initStructs, type, shape, allocator);
}

std::vector<uint8_t> ZeroInferRequest::get_raw_profiling_data() const {
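The consumer side of the protocol, condensed: infer_async() now downcasts each level-zero tensor, skips anything that is not a ZeroTensor (user-provided or remote memory), and re-registers the pointer with the command list only while the flag is up. A hypothetical, self-contained sketch of that control flow; the types and the update_command_list() helper are simplified stand-ins, not the plugin's API.

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct ITensor { virtual ~ITensor() = default; };

struct ZeroTensorLike : ITensor {
    bool dirty = true;  // pretend the buffer moved since the pipeline was built
    bool memory_address_changed() const { return dirty; }
    void reset_memory_flag() { dirty = false; }
};

void update_command_list(std::size_t idx) {
    std::cout << "update graph argument " << idx << '\n';
}

void refresh_bindings(const std::vector<std::shared_ptr<ITensor>>& tensors) {
    for (std::size_t i = 0; i < tensors.size(); ++i) {
        auto zero = std::dynamic_pointer_cast<ZeroTensorLike>(tensors[i]);
        if (zero == nullptr) {
            continue;  // non-ZeroTensor entries are skipped, as in the diff
        }
        if (zero->memory_address_changed()) {
            update_command_list(i);    // re-register the new address once
            zero->reset_memory_flag();
        }
    }
}

int main() {
    std::vector<std::shared_ptr<ITensor>> tensors = {std::make_shared<ZeroTensorLike>(),
                                                     std::make_shared<ITensor>()};
    refresh_bindings(tensors);  // updates argument 0 only
    refresh_bindings(tensors);  // flag already acknowledged: nothing to do
}
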
14 changes: 10 additions & 4 deletions src/plugins/intel_npu/src/backend/src/zero_tensor.cpp
@@ -116,10 +116,6 @@ void ZeroTensor::set_shape(ov::Shape new_shape) {
_shape = std::move(new_shape);

if (get_size() > get_capacity()) {
#ifdef __linux__
OPENVINO_THROW("Re-shaping the tensor with a larger shape is not available.");
#endif

if (!_init_structs->getMutableCommandListVersion()) {
OPENVINO_THROW("Re-shaping the tensor with a larger shape is not available using this driver version. "
"Please update the driver.");
Expand All @@ -131,12 +127,22 @@ void ZeroTensor::set_shape(ov::Shape new_shape) {
_capacity = _shape;
_ptr = _allocator.allocate(get_bytes_capacity());
initialize_elements(_ptr, _element_type, _shape);

_reset_tensor_memory = true;
}

_strides.clear();
update_strides();
}

bool ZeroTensor::memory_address_changed() {
return _reset_tensor_memory;
}

void ZeroTensor::reset_memory_flag() {
_reset_tensor_memory = false;
}

ZeroTensor::~ZeroTensor() {
destroy_memory();
}
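
The grow-only rule in set_shape() is worth spelling out: reallocation, and therefore the flag, happens only when the requested element count exceeds the current capacity, so shrinking a tensor never invalidates an address already recorded in the command list. Note the diff also drops the Linux-only OPENVINO_THROW, so growing a tensor is no longer rejected outright on Linux when the driver supports mutable command lists. A small sketch under that assumption; GrowOnlyBuffer is illustrative, not plugin code.

#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

// Mirrors the rule in ZeroTensor::set_shape: reallocate (and raise the flag)
// only when the new element count exceeds the current capacity.
struct GrowOnlyBuffer {
    std::vector<std::size_t> capacity_shape;
    bool reset_tensor_memory = false;

    static std::size_t count(const std::vector<std::size_t>& shape) {
        return std::accumulate(shape.begin(), shape.end(), std::size_t{1},
                               [](std::size_t a, std::size_t b) { return a * b; });
    }

    void set_shape(const std::vector<std::size_t>& new_shape) {
        if (count(new_shape) > count(capacity_shape)) {
            capacity_shape = new_shape;  // stands in for free + reallocate
            reset_tensor_memory = true;  // consumers must re-register the pointer
        }
    }
};

int main() {
    GrowOnlyBuffer buf{{1, 3, 224, 224}};
    buf.set_shape({1, 3, 112, 112});  // smaller: allocation is reused
    assert(!buf.reset_tensor_memory);
    buf.set_shape({2, 3, 224, 224});  // larger: reallocation, flag raised
    assert(buf.reset_tensor_memory);
    return 0;
}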