diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index e6afe27d58bae..b35d5e6dbf71f 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -184,11 +184,40 @@ lowerDynamicLocalMemCallDirect(CallInst *CI, Triple TT, static void lowerLocalMemCall(Function *LocalMemAllocFunc, std::function TransformCall) { + static SmallPtrSet FuncsCache; SmallVector DelCalls; for (User *U : LocalMemAllocFunc->users()) { auto *CI = cast(U); TransformCall(CI); DelCalls.push_back(CI); + // Now, take each kernel that calls the builtins that allocate local memory, + // either directly or through a series of function calls that eventually end + // up in a direct call to the builtin, and attach the + // work-group-memory-static attribute to the kernel if not already attached. + // This is needed because free function kernels do not have the attribute + // added by the library as is the case with other types of kernels. + if (!FuncsCache.insert(CI->getFunction()).second) + continue; // We have already traversed call graph from this function. + + SmallVector WorkList; + WorkList.push_back(CI->getFunction()); + while (!WorkList.empty()) { + Function *F = WorkList.back(); + WorkList.pop_back(); + + // Mark kernel as using scratch memory if it isn't marked already. + if (F->getCallingConv() == CallingConv::SPIR_KERNEL && + !F->hasFnAttribute(WORK_GROUP_STATIC_ATTR)) + F->addFnAttr(WORK_GROUP_STATIC_ATTR); + + for (auto *FU : F->users()) { + if (auto *UCI = dyn_cast(FU)) { + if (FuncsCache.insert(UCI->getFunction()).second) + WorkList.push_back(UCI->getFunction()); + } // Even though there could be other uses of a Function, we don't + // care about them because we are only concerned about call graph. + } + } } for (auto *CI : DelCalls) { diff --git a/llvm/test/SYCLLowerIR/work_group_static.ll b/llvm/test/SYCLLowerIR/work_group_static.ll index 39eb0cfa56cff..397a34aa6e98c 100644 --- a/llvm/test/SYCLLowerIR/work_group_static.ll +++ b/llvm/test/SYCLLowerIR/work_group_static.ll @@ -22,9 +22,29 @@ entry: ret void } +; Function Attrs: convergent norecurse +; CHECK: @__sycl_kernel_B{{.*}} #[[ATTRS:[0-9]+]] +define weak_odr dso_local spir_kernel void @__sycl_kernel_B(ptr addrspace(1) %0) local_unnamed_addr #1 !kernel_arg_addr_space !5 { +entry: + %1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1 + ret void +} + +; Function Attrs: convergent norecurse +; CHECK: @__sycl_kernel_C{{.*}} #[[ATTRS]] +define weak_odr dso_local spir_kernel void @__sycl_kernel_C(ptr addrspace(1) %0) local_unnamed_addr #1 !kernel_arg_addr_space !5 { +entry: + %1 = tail call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 128, i64 4) #1 + ret void +} + +; Function Attrs: convergent +declare dso_local spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64, i64) local_unnamed_addr #1 + ; Function Attrs: convergent declare dso_local spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64) local_unnamed_addr #1 +; CHECK: #[[ATTRS]] = {{.*}} "sycl-work-group-static" attributes #0 = { convergent norecurse "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" "sycl-work-group-static"="1" } attributes #1 = { convergent norecurse } diff --git a/sycl/test-e2e/FreeFunctionKernels/free_function_kernel_local_memory.cpp b/sycl/test-e2e/FreeFunctionKernels/free_function_kernel_local_memory.cpp new file mode 100644 index 0000000000000..c5cfed9da311e --- /dev/null +++ b/sycl/test-e2e/FreeFunctionKernels/free_function_kernel_local_memory.cpp @@ -0,0 +1,114 @@ +// REQUIRES: aspect-usm_shared_allocations +// UNSUPPORTED: target-amd +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// This test verifies that we can compile, run and get correct results when +// using a free function kernel that allocates shared local memory in a kernel +// either by way of the work group scratch memory extension or the work group +// static memory extension. + +#include "helpers.hpp" + +#include +#include +#include +#include +#include +#include + +namespace syclext = sycl::ext::oneapi; +namespace syclexp = sycl::ext::oneapi::experimental; + +constexpr int SIZE = 16; + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +void scratchKernel(float *Src, float *Dst) { + size_t Lid = syclext::this_work_item::get_nd_item<1>().get_local_linear_id(); + float *LocalMem = + reinterpret_cast(syclexp::get_work_group_scratch_memory()); + LocalMem[Lid] = 2 * Src[Lid]; + Dst[Lid] = LocalMem[Lid]; +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +void staticKernel(float *Src, float *Dst) { + sycl::nd_item<1> Item = syclext::this_work_item::get_nd_item<1>(); + size_t Lid = Item.get_local_linear_id(); + syclexp::work_group_static LocalMem; + LocalMem[Lid] = Src[Lid] * Src[Lid]; + sycl::group_barrier(Item.get_group()); + if (Item.get_group().leader()) { // Check that memory is indeed shared between + // the work group. + for (int I = 0; I < SIZE; ++I) + assert(LocalMem[I] == Src[I] * Src[I]); + } + Dst[Lid] = LocalMem[Lid]; +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +void scratchStaticKernel(float *Src, float *Dst) { + size_t Lid = syclext::this_work_item::get_nd_item<1>().get_local_linear_id(); + float *ScratchMem = + reinterpret_cast(syclexp::get_work_group_scratch_memory()); + syclexp::work_group_static StaticMem; + ScratchMem[Lid] = Src[Lid]; + StaticMem[Lid] = Src[Lid]; + Dst[Lid] = ScratchMem[Lid] + StaticMem[Lid]; +} + +int main() { + sycl::queue Q; + float *Src = sycl::malloc_shared(SIZE, Q); + float *Dst = sycl::malloc_shared(SIZE, Q); + + for (int I = 0; I < SIZE; I++) { + Src[I] = I; + } + + auto ScratchBndl = + syclexp::get_kernel_bundle( + Q.get_context()); + auto StaticBndl = + syclexp::get_kernel_bundle( + Q.get_context()); + auto ScratchStaticBndl = syclexp::get_kernel_bundle< + scratchStaticKernel, sycl::bundle_state::executable>(Q.get_context()); + + sycl::kernel ScratchKrn = + ScratchBndl.template ext_oneapi_get_kernel(); + sycl::kernel StaticKrn = + StaticBndl.template ext_oneapi_get_kernel(); + sycl::kernel ScratchStaticKrn = + ScratchStaticBndl.template ext_oneapi_get_kernel(); + syclexp::launch_config ScratchKernelcfg{ + ::sycl::nd_range<1>(::sycl::range<1>(SIZE), ::sycl::range<1>(SIZE)), + syclexp::properties{ + syclexp::work_group_scratch_size(SIZE * sizeof(float))}}; + syclexp::launch_config StaticKernelcfg{ + ::sycl::nd_range<1>(::sycl::range<1>(SIZE), ::sycl::range<1>(SIZE))}; + + syclexp::nd_launch(Q, ScratchKernelcfg, ScratchKrn, Src, Dst); + Q.wait(); + for (int I = 0; I < SIZE; I++) { + assert(Dst[I] == 2 * Src[I]); + } + + syclexp::nd_launch(Q, StaticKernelcfg, StaticKrn, Src, Dst); + Q.wait(); + for (int I = 0; I < SIZE; I++) { + assert(Dst[I] == Src[I] * Src[I]); + } + + syclexp::nd_launch(Q, ScratchKernelcfg, ScratchStaticKrn, Src, Dst); + Q.wait(); + for (int I = 0; I < SIZE; I++) { + assert(Dst[I] == 2 * Src[I]); + } + + sycl::free(Src, Q); + sycl::free(Dst, Q); + return 0; +}