From 936b2da6ac5a2f54f02831c62ac2306642b25290 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Fri, 16 Jul 2021 22:04:27 +0000 Subject: [PATCH 1/4] [SYCL][L0] Add temporary option to allow user to use copy engine for device to device copy --- sycl/plugins/level_zero/pi_level_zero.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 9d458a2cb2c55..2575ba7aaa144 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -4946,6 +4946,17 @@ pi_result piEnqueueMemBufferWriteRect( EventWaitList, Event); } +// This is an experimental option to test performance of device to device copy +// operations on copy engines (versus compute engine) +static bool UseCopyEngineForD2DCopy(bool HasCopyEngine) { + if (HasCopyEngine) { + const char *CopyEngineForD2DCopy = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY"); + return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0)); + } + return false; +} + pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcBuffer, pi_mem DstBuffer, size_t SrcOffset, size_t DstOffset, size_t Size, @@ -4957,6 +4968,11 @@ pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcBuffer, // Copy engine is preferred only for host to device transfer. // Device to device transfers run faster on compute engines. bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); + + // Temporary option added to use copy engine for D2D copy + // This is an experimental option and will be removed soon + PreferCopyEngine |= UseCopyEngineForD2DCopy(Queue->Device->hasCopyEngine()); + return enqueueMemCopyHelper( PI_COMMAND_TYPE_MEM_BUFFER_COPY, Queue, pi_cast(DstBuffer->getZeHandle()) + DstOffset, @@ -6204,6 +6220,11 @@ pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, // (versus compute engine). 
bool PreferCopyEngine = !IsDevicePointer(Queue->Context, SrcPtr) || !IsDevicePointer(Queue->Context, DstPtr); + + // Temporary option added to use copy engine for D2D copy + // This is an experimental option and will be removed soon + PreferCopyEngine |= UseCopyEngineForD2DCopy(Queue->Device->hasCopyEngine()); + return enqueueMemCopyHelper( // TODO: do we need a new command type for this? PI_COMMAND_TYPE_MEM_BUFFER_COPY, Queue, DstPtr, Blocking, Size, SrcPtr, From c26e9772b55ea52c30e5b79db22dfefcc792434e Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Sun, 18 Jul 2021 18:39:51 +0000 Subject: [PATCH 2/4] Addressing comments to modify how environment variable is accessed Signed-off-by: Arvind Sudarsanam --- sycl/plugins/level_zero/pi_level_zero.cpp | 25 +++++++++-------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 2575ba7aaa144..f244438f60426 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -46,6 +46,14 @@ static const pi_uint32 ZeSerialize = [] { return SerializeModeValue; }(); +// This is an experimental option to test performance of device to device copy +// operations on copy engines (versus compute engine) +static const bool UseCopyEngineForD2DCopy = [] { + const char *CopyEngineForD2DCopy = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY"); + return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0)); +}(); + // This class encapsulates actions taken along with a call to Level Zero API. 
class ZeCall { private: @@ -4946,17 +4954,6 @@ pi_result piEnqueueMemBufferWriteRect( EventWaitList, Event); } -// This is an experimental option to test performance of device to device copy -// operations on copy engines (versus compute engine) -static bool UseCopyEngineForD2DCopy(bool HasCopyEngine) { - if (HasCopyEngine) { - const char *CopyEngineForD2DCopy = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY"); - return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0)); - } - return false; -} - pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcBuffer, pi_mem DstBuffer, size_t SrcOffset, size_t DstOffset, size_t Size, @@ -4970,8 +4967,7 @@ pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcBuffer, bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); // Temporary option added to use copy engine for D2D copy - // This is an experimental option and will be removed soon - PreferCopyEngine |= UseCopyEngineForD2DCopy(Queue->Device->hasCopyEngine()); + PreferCopyEngine |= UseCopyEngineForD2DCopy; return enqueueMemCopyHelper( PI_COMMAND_TYPE_MEM_BUFFER_COPY, Queue, @@ -6222,8 +6218,7 @@ pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, !IsDevicePointer(Queue->Context, DstPtr); // Temporary option added to use copy engine for D2D copy - // This is an experimental option and will be removed soon - PreferCopyEngine |= UseCopyEngineForD2DCopy(Queue->Device->hasCopyEngine()); + PreferCopyEngine |= UseCopyEngineForD2DCopy; return enqueueMemCopyHelper( // TODO: do we need a new command type for this? 
From d3d4a517af5cba9ee468524a5d79ea5c4907b06b Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Tue, 20 Jul 2021 13:58:34 +0000 Subject: [PATCH 3/4] Add documentation for environment variable Signed-off-by: Arvind Sudarsanam --- sycl/doc/EnvironmentVariables.md | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 752231f0e4aef..80b8d300d0177 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -32,6 +32,7 @@ subject to change. Do not rely on these variables in production code. | `SYCL_PI_LEVEL_ZERO_BATCH_SIZE` | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. | | `SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST` | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. | | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The default is 1. | +| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY` | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin for device to device copy operations. The default is 0. This option is experimental and will be removed once heuristics are added to make a decision about use of copy engine for device to device copy oeprations. | | `SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY` | Any(\*) | Enable support of the kernels with indirect access and corresponding deferred release of memory allocations in the Level Zero plugin. 
| | `SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE` | Any(\*) | Enables tracing of `parallel_for` invocations with rounded-up ranges. | | `SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING` | Any(\*) | Disables automatic rounding-up of `parallel_for` invocation ranges. | From 6f09bc3307aa5897c5a36069a0761a4a00883ed1 Mon Sep 17 00:00:00 2001 From: asudarsa Date: Tue, 20 Jul 2021 16:02:33 -0400 Subject: [PATCH 4/4] Update sycl/doc/EnvironmentVariables.md Co-authored-by: Pavel Chupin --- sycl/doc/EnvironmentVariables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 5de0219ae982b..2fd2acf037367 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -32,7 +32,7 @@ subject to change. Do not rely on these variables in production code. | `SYCL_PI_LEVEL_ZERO_BATCH_SIZE` | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. | | `SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST` | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. | | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The default is 1. | -| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY` | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin for device to device copy operations. The default is 0. 
This option is experimental and will be removed once heuristics are added to make a decision about use of copy engine for device to device copy oeprations. | +| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY` (experimental) | Integer | When set to a non-zero value, allows the Level Zero plugin to use the copy engine, if available on the device, for device-to-device copy operations. The default is 0. This option is experimental and will be removed once heuristics are added to decide when to use the copy engine for device-to-device copy operations. | | `SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY` | Any(\*) | Enable support of the kernels with indirect access and corresponding deferred release of memory allocations in the Level Zero plugin. | | `SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE` | Any(\*) | Enables tracing of `parallel_for` invocations with rounded-up ranges. | | `SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING` | Any(\*) | Disables automatic rounding-up of `parallel_for` invocation ranges. |