From 844d7b6c23822a80985d279888483231bf8ac4c9 Mon Sep 17 00:00:00 2001 From: rdeodhar Date: Thu, 27 Jan 2022 11:47:10 -0700 Subject: [PATCH] [SYCL] Experimental support for L0 host pointer import (#4891) This change adds support for an experimental L0 API for host pointer import into USM. Signed-off-by: Rajiv Deodhar --- sycl/doc/EnvironmentVariables.md | 1 + sycl/plugins/level_zero/pi_level_zero.cpp | 141 +++++++++++++++++++--- sycl/plugins/level_zero/pi_level_zero.hpp | 15 ++- 3 files changed, 132 insertions(+), 25 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 5f1c386a93cac..89c9341b772ca 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -21,6 +21,7 @@ compiler and runtime. | `SYCL_CACHE_MIN_DEVICE_IMAGE_SIZE` | Positive integer | Minimum size of device code image in bytes which is reasonable to cache on disk because disk access operation may take more time than do JIT compilation for it. Default value is 0 to cache all images. | | `SYCL_CACHE_MAX_DEVICE_IMAGE_SIZE` | Positive integer | Maximum size of device image in bytes which is cached. Too big kernels may overload disk too fast. Default value is 1 GB. | | `SYCL_ENABLE_DEFAULT_CONTEXTS` | '1' or '0' | Enable ('1') or disable ('0') creation of default platform contexts in SYCL runtime. The default context for each platform contains all devices in the platform. Refer to [Platform Default Contexts](extensions/PlatformContext/PlatformContext.adoc) extension to learn more. Enabled by default on Linux and disabled on Windows. | +| `SYCL_USM_HOSTPTR_IMPORT` | Integer | Enable by specifying non-zero value. Buffers created with a host pointer will result in host data promotion to USM, improving data transfer performance. To use this feature, also set SYCL_HOST_UNIFIED_MEMORY=1. 
| `(*) Note: Any means this environment variable is effective when set to any non-null value.` diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index bdd324971f665..d1476d407528b 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1700,6 +1700,54 @@ static bool setEnvVar(const char *name, const char *value) { return true; } +static class ZeUSMImportExtension { + // Pointers to functions that import/release host memory into USM + ze_result_t (*zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, + void *, size_t); + ze_result_t (*zexDriverReleaseImportedPointer)(ze_driver_handle_t, void *); + +public: + // Whether user has requested Import/Release, and platform supports it. + bool Enabled; + + ZeUSMImportExtension() : Enabled{false} {} + + void setZeUSMImport(pi_platform Platform) { + // Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting + // host ptr import during buffer creation. + const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT"); + if (!USMHostPtrImportStr || std::atoi(USMHostPtrImportStr) == 0) + return; + + // Check if USM hostptr import feature is available. + ze_driver_handle_t driverHandle = Platform->ZeDriver; + if (ZE_CALL_NOCHECK(zeDriverGetExtensionFunctionAddress, + (driverHandle, "zexDriverImportExternalPointer", + reinterpret_cast<void **>( + &zexDriverImportExternalPointer))) == 0) { + ZE_CALL_NOCHECK( + zeDriverGetExtensionFunctionAddress, + (driverHandle, "zexDriverReleaseImportedPointer", + reinterpret_cast<void **>(&zexDriverReleaseImportedPointer))); + // Hostptr import/release is turned on because it has been requested + // by the env var, and this platform supports the APIs. + Enabled = true; + // Hostptr import is only possible if piMemBufferCreate receives a + // hostptr as an argument. The SYCL runtime passes a host ptr + // only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on.
+ setEnvVar("SYCL_HOST_UNIFIED_MEMORY", "1"); + } + } + void doZeUSMImport(ze_driver_handle_t driverHandle, void *HostPtr, + size_t Size) { + ZE_CALL_NOCHECK(zexDriverImportExternalPointer, + (driverHandle, HostPtr, Size)); + } + void doZeUSMRelease(ze_driver_handle_t driverHandle, void *HostPtr) { + ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (driverHandle, HostPtr)); + } +} ZeUSMImport; + pi_result _pi_platform::initialize() { // Cache driver properties ZeStruct<ze_driver_properties_t> ZeDriverProperties; @@ -1745,6 +1793,10 @@ pi_result _pi_platform::initialize() { zeDriverExtensionMap[extension.name] = extension.version; } + // Check if import user ptr into USM feature has been requested. + // If yes, then set up L0 API pointers if the platform supports it. + ZeUSMImport.setZeUSMImport(this); + return PI_SUCCESS; } @@ -1854,8 +1906,9 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms); } - if (NumPlatforms) + if (NumPlatforms) { *NumPlatforms = PiPlatformsCache->size(); + } zePrint("Using events scope: %s\n", EventsScope == AllHostVisible ? "all host-visible" @@ -3360,21 +3413,52 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, else Alignment = 1UL; - pi_result Result = PI_SUCCESS; + // If USM Import feature is enabled and hostptr is supplied, + // import the hostptr if not already imported into USM. + // Data transfer rate is maximized when both source and destination + // are USM pointers. Promotion of the host pointer to USM thus + // optimizes data transfer performance.
+ bool HostPtrImported = false; + if (ZeUSMImport.Enabled && HostPtr != nullptr && + (Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) { + // Query memory type of the host pointer + ze_device_handle_t ZeDeviceHandle; + ZeStruct<ze_memory_allocation_properties_t> ZeMemoryAllocationProperties; + ZE_CALL(zeMemGetAllocProperties, + (Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + // If not shared of any type, we can import the ptr + if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) { + // Promote the host ptr to USM host memory + ze_driver_handle_t driverHandle = Context->Devices[0]->Platform->ZeDriver; + ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size); + HostPtrImported = true; + } + } + + pi_result Result; if (DeviceIsIntegrated) { - if (enableBufferPooling()) { - PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment)); - } else - Result = ZeHostMemAllocHelper(&Ptr, Context, Size); + if (HostPtrImported) { + // When HostPtr is imported we use it for the buffer. + Ptr = HostPtr; + } else { + if (enableBufferPooling()) { + PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment)); + } else { + Result = ZeHostMemAllocHelper(&Ptr, Context, Size); + } + } } else if (Context->SingleRootDevice) { // If we have a single discrete device or all devices in the context are // sub-devices of the same device then we can allocate on device if (enableBufferPooling()) { PI_CALL(piextUSMDeviceAlloc(&Ptr, Context, Context->SingleRootDevice, nullptr, Size, Alignment)); - } else + } else { Result = ZeDeviceMemAllocHelper(&Ptr, Context, Context->SingleRootDevice, Size); + } } else { // Context with several gpu cards. Temporarily use host allocation because // it is accessible by all devices.
But it is not good in terms of @@ -3382,10 +3466,16 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, // TODO: We need to either allow remote access to device memory using IPC, // or do explicit memory transfers from one device to another using host // resources as backing buffers to allow those transfers. - if (enableBufferPooling()) { - PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment)); - } else - Result = ZeHostMemAllocHelper(&Ptr, Context, Size); + if (HostPtrImported) { + // When HostPtr is imported we use it for the buffer. + Ptr = HostPtr; + } else { + if (enableBufferPooling()) { + PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment)); + } else { + Result = ZeHostMemAllocHelper(&Ptr, Context, Size); + } + } } if (Result != PI_SUCCESS) @@ -3396,8 +3486,10 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { // Initialize the buffer with user data if (DeviceIsIntegrated) { - // Do a host to host copy - memcpy(Ptr, HostPtr, Size); + // Do a host to host copy. + // For an imported HostPtr the copy is unneeded. + if (!HostPtrImported) + memcpy(Ptr, HostPtr, Size); } else if (Context->SingleRootDevice) { // Initialize the buffer synchronously with immediate offload ZE_CALL(zeCommandListAppendMemoryCopy, @@ -3406,7 +3498,9 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, } else { // Multiple root devices, do a host to host copy because we use a host // allocation for this case. - memcpy(Ptr, HostPtr, Size); + // For an imported HostPtr the copy is unneeded. + if (!HostPtrImported) + memcpy(Ptr, HostPtr, Size); } } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) { // Nothing more to do. 
@@ -3421,7 +3515,7 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, *RetMem = new _pi_buffer( Context, pi_cast<char *>(Ptr) /* Level Zero Memory Handle */, HostPtrOrNull, nullptr, 0, 0, - DeviceIsIntegrated /* allocation in host memory */); + DeviceIsIntegrated /* allocation in host memory */, HostPtrImported); } catch (const std::bad_alloc &) { return PI_OUT_OF_HOST_MEMORY; } catch (...) { @@ -3491,11 +3585,17 @@ pi_result piMemRelease(pi_mem Mem) { } else { auto Buf = static_cast<_pi_buffer *>(Mem); if (!Buf->isSubBuffer()) { - if (enableBufferPooling()) { - PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle())); + if (Mem->HostPtrImported) { + ze_driver_handle_t driverHandle = + Mem->Context->Devices[0]->Platform->ZeDriver; + ZeUSMImport.doZeUSMRelease(driverHandle, Mem->MapHostPtr); } else { - if (auto Res = ZeMemFreeHelper(Mem->Context, Mem->getZeHandle())) - return Res; + if (enableBufferPooling()) { + PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle())); + } else { + if (auto Res = ZeMemFreeHelper(Mem->Context, Mem->getZeHandle())) + return Res; + } } } } @@ -6020,7 +6120,8 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, if (Buffer->MapHostPtr) { *RetMap = Buffer->MapHostPtr + Offset; - if (!(MapFlags & PI_MAP_WRITE_INVALIDATE_REGION)) + if (!Buffer->HostPtrImported && + !(MapFlags & PI_MAP_WRITE_INVALIDATE_REGION)) memcpy(*RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset, Size); } else { *RetMap = pi_cast<char *>(Buffer->getZeHandle()) + Offset; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 62e042772af19..2892bd52febeb 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -811,6 +811,9 @@ struct _pi_mem : _pi_object { // Flag to indicate that this memory is allocated in host memory bool OnHost; + // Flag to indicate that the host ptr has been imported into USM + bool HostPtrImported; + // Supplementary data
to keep track of the mappings of this memory // created with piEnqueueMemBufferMap and piEnqueueMemImageMap. struct Mapping { @@ -838,8 +841,10 @@ struct _pi_mem : _pi_object { pi_result removeMapping(void *MappedTo, Mapping &MapInfo); protected: - _pi_mem(pi_context Ctx, char *HostPtr, bool MemOnHost = false) - : Context{Ctx}, MapHostPtr{HostPtr}, OnHost{MemOnHost}, Mappings{} {} + _pi_mem(pi_context Ctx, char *HostPtr, bool MemOnHost = false, + bool ImportedHostPtr = false) + : Context{Ctx}, MapHostPtr{HostPtr}, OnHost{MemOnHost}, + HostPtrImported{ImportedHostPtr}, Mappings{} {} private: // The key is the host pointer representing an active mapping. @@ -856,9 +861,9 @@ struct _pi_buffer final : _pi_mem { // Buffer/Sub-buffer constructor _pi_buffer(pi_context Ctx, char *Mem, char *HostPtr, _pi_mem *Parent = nullptr, size_t Origin = 0, size_t Size = 0, - bool MemOnHost = false) - : _pi_mem(Ctx, HostPtr, MemOnHost), ZeMem{Mem}, SubBuffer{Parent, Origin, - Size} {} + bool MemOnHost = false, bool ImportedHostPtr = false) + : _pi_mem(Ctx, HostPtr, MemOnHost, ImportedHostPtr), ZeMem{Mem}, + SubBuffer{Parent, Origin, Size} {} void *getZeHandle() override { return ZeMem; }