From 6eefc3d0ead2c6360eec248daab0cae66a737ea0 Mon Sep 17 00:00:00 2001 From: Joe Mayer <114769929+jomayeri@users.noreply.github.com> Date: Thu, 17 Oct 2024 19:58:06 -0700 Subject: [PATCH] Fix Memory Leak In AIO (#6630) Fixing a memory leak in AIO pinned tensor as well as an incorrect function type for gds op. --------- Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> --- csrc/aio/py_lib/deepspeed_cpu_op.cpp | 5 ++++- csrc/aio/py_lib/deepspeed_pin_tensor.cpp | 2 ++ csrc/gds/py_lib/deepspeed_gds_op.cpp | 3 +-- csrc/gds/py_lib/deepspeed_gds_op.h | 2 +- csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 2 +- csrc/gds/py_lib/deepspeed_py_gds_handle.h | 2 +- 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp index da1a52d9c6e3..da2ff568d74b 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -38,7 +38,10 @@ void cpu_op_desc_t::finish() { if (_use_bounce_buffer) { if (_read_op) { - if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } + if (_buffer.is_cuda()) { + _buffer.copy_(_cpu_buffer.to(torch::Device(torch::kCUDA, _buffer.get_device()), + /*non_blocking=*/true)); + } if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } if (_buffer.is_cpu()) { _buffer.copy_(_cpu_buffer); } #if defined(__ENABLE_CANN__) diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp index 6d2800468e06..a97a4ac18ba8 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp @@ -15,6 +15,7 @@ deepspeed_pin_tensor_t::~deepspeed_pin_tensor_t() { for (auto iter = _locked_tensors.begin(); iter != _locked_tensors.end(); ++iter) { munlock(iter->first, iter->second); + std::free((void*)iter->first); } _locked_tensors.clear(); } @@ -43,6 +44,7 @@ bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor) auto addr = locked_tensor.data_ptr(); if (_locked_tensors.find(addr) != _locked_tensors.end()) { munlock(addr, _locked_tensors[addr]); + std::free(addr); _locked_tensors.erase(addr); return true; } diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp index dae2eef21c6f..f49f74394374 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.cpp +++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp @@ -58,7 +58,6 @@ void gds_op_desc_t::add_buffer_to_registry(const torch::Tensor& buffer) const int64_t device = buffer.get_device(); void* reg_ptr = buffer.data_ptr(); - // std::cout << "REG PTR " << reg_ptr << std::endl; // TODO: add checking to make sure pointer isn't already in set const auto it = base_ptr_registry.find(device); if (it == base_ptr_registry.end()) { @@ -94,7 +93,7 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op, const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const int intra_op_parallelism, const bool validate) : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate) diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h index c9d4c076f189..380bb0b9b6ae 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.h +++ b/csrc/gds/py_lib/deepspeed_gds_op.h @@ -22,7 +22,7 @@ struct gds_op_desc_t : io_op_desc_t { const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const int intra_op_parallelism, const bool validate); diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp index 43705939dc3e..c052144a0190 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -106,7 +106,7 @@ std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc( const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const bool validate) { if (buffer.is_cuda()) { diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h index a3c10a4f6467..131e83e7b838 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -41,7 +41,7 @@ struct deepspeed_gds_handle_t : deepspeed_io_handle_t { const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const bool validate); static int s_cuFile_init;