Skip to content

Commit bede0e9

Browse files
authored
[SYCL][SYCLLowerWGLocalMemoryPass] Remove implicit dependency on AlwaysInlinerPass and move to PipelineStart (#16356)
Currently SYCLLowerWGLocalMemoryPass must run after AlwaysInlinerPass because in sycl header __sycl_allocateLocalMemory call is wrapped in group_local_memory/group_local_memory_for_overwrite function. Each call to __sycl_allocateLocalMemory represents a unique local memory, so group_local_memory/group_local_memory_for_overwrite must be inlined. The dependency is implicit and prohibits SYCLLowerWGLocalMemoryPass being moved around in the pass pipeline. Since the pass transforms __sycl_allocateLocalMemory call to access of global variable @WGLocalMem, moving the pass to beginning of pipeline could enable more optimization than the function call does. We can't assume backend compiler lowers the global variable after AlwaysInlinerPass.
1 parent 7a4bb2c commit bede0e9

File tree

9 files changed

+207
-51
lines changed

9 files changed

+207
-51
lines changed

clang/lib/CodeGen/BackendUtil.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,6 +1044,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
10441044
/*FP64ConvEmu=*/CodeGenOpts.FP64ConvEmu,
10451045
/*ExcludeAspects=*/{"fp64"}));
10461046
MPM.addPass(SYCLPropagateJointMatrixUsagePass());
1047+
// Lowers static/dynamic local memory builtin calls.
1048+
MPM.addPass(SYCLLowerWGLocalMemoryPass());
10471049
});
10481050
else if (LangOpts.SYCLIsHost && !LangOpts.SYCLESIMDBuildHostCode)
10491051
PB.registerPipelineStartEPCallback(
@@ -1206,10 +1208,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
12061208
MPM.addPass(SPIRITTAnnotationsPass());
12071209
}
12081210

1209-
// Allocate static local memory in SYCL kernel scope for each allocation
1210-
// call.
1211-
MPM.addPass(SYCLLowerWGLocalMemoryPass());
1212-
12131211
// Process properties and annotations
12141212
MPM.addPass(CompileTimePropertiesPass());
12151213

clang/test/CodeGenSYCL/group-local-memory.cpp

Lines changed: 0 additions & 29 deletions
This file was deleted.

clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,24 @@
22
// SYCL device target, and can be disabled with -fno-sycl-early-optimizations.
33
// New pass manager doesn't print all passes tree, only module level.
44
//
5-
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-EARLYOPT
6-
// CHECK-NEWPM-EARLYOPT: ConstantMergePass
7-
// CHECK-NEWPM-EARLYOPT: SYCLMutatePrintfAddrspacePass
5+
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s
6+
// CHECK: SYCLVirtualFunctionsAnalysisPass
7+
// CHECK: ESIMDVerifierPass
8+
// CHECK: SYCLConditionalCallOnDevicePass
9+
// CHECK: SYCLPropagateAspectsUsagePass
10+
// CHECK: SYCLPropagateJointMatrixUsagePass
11+
// CHECK: SYCLLowerWGLocalMemoryPass
12+
// CHECK: InferFunctionAttrsPass
13+
// CHECK: AlwaysInlinerPass
14+
// CHECK: ModuleInlinerWrapperPass
15+
// CHECK: ConstantMergePass
16+
// CHECK: SYCLMutatePrintfAddrspacePass
17+
// CHECK: SYCLPropagateAspectsUsagePass
18+
// CHECK: SYCLAddOptLevelAttributePass
19+
// CHECK: CompileTimePropertiesPass
20+
// CHECK: RecordSYCLAspectNamesPass
21+
// CHECK: CleanupSYCLMetadataPass
822
//
9-
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-NOEARLYOPT
10-
// CHECK-NEWPM-NOEARLYOPT-NOT: ConstantMergePass
11-
// CHECK-NEWPM-NOEARLYOPT: SYCLMutatePrintfAddrspacePass
12-
13-
// Checks that the compile time properties pass is added into the compilation pipeline
14-
//
15-
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-COMPTIMEPROPS
16-
// CHECK-COMPTIMEPROPS: Running pass: CompileTimePropertiesPass on [module]
23+
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NOEARLYOPT
24+
// CHECK-NOEARLYOPT-NOT: ConstantMergePass1
25+
// CHECK-NOEARLYOPT: SYCLMutatePrintfAddrspacePass

llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,17 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include "llvm/SYCLLowerIR/LowerWGLocalMemory.h"
12+
#include "llvm/ADT/DenseSet.h"
1213
#include "llvm/IR/Function.h"
1314
#include "llvm/IR/IRBuilder.h"
1415
#include "llvm/IR/InstIterator.h"
1516
#include "llvm/Pass.h"
1617
#include "llvm/TargetParser/Triple.h"
18+
#include "llvm/Transforms/Utils/Cloning.h"
1719

1820
using namespace llvm;
1921

20-
#define DEBUG_TYPE "LowerWGLocalMemory"
22+
#define DEBUG_TYPE "sycllowerwglocalmemory"
2123

2224
static constexpr char SYCL_ALLOCLOCALMEM_CALL[] = "__sycl_allocateLocalMemory";
2325
static constexpr char SYCL_DYNAMIC_LOCALMEM_CALL[] =
@@ -88,6 +90,42 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() {
8890
return new SYCLLowerWGLocalMemoryLegacy();
8991
}
9092

93+
// In sycl header __sycl_allocateLocalMemory builtin call is wrapped in
94+
// group_local_memory/group_local_memory_for_overwrite functions, which must be
95+
// inlined first before each __sycl_allocateLocalMemory call can be lowered to a
96+
// distinct global variable. Inlining them here so that this pass doesn't have
97+
// implicit dependency on AlwaysInlinerPass.
98+
//
99+
// syclcompat::local_mem, which represents a distinct allocation, calls
100+
// group_local_memory_for_overwrite. So local_mem should be inlined as well.
101+
static bool inlineGroupLocalMemoryFunc(Module &M) {
102+
Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL);
103+
if (!ALMFunc || ALMFunc->use_empty())
104+
return false;
105+
106+
SmallVector<Function *, 4> WorkList{ALMFunc};
107+
DenseSet<Function *> Visited;
108+
while (!WorkList.empty()) {
109+
auto *F = WorkList.pop_back_val();
110+
for (auto *U : make_early_inc_range(F->users())) {
111+
auto *CI = cast<CallInst>(U);
112+
auto *Caller = CI->getFunction();
113+
if (Caller->hasFnAttribute("sycl-forceinline") &&
114+
Visited.insert(Caller).second)
115+
WorkList.push_back(Caller);
116+
if (F != ALMFunc) {
117+
InlineFunctionInfo IFI;
118+
[[maybe_unused]] auto Result = InlineFunction(*CI, IFI);
119+
assert(Result.isSuccess() && "inlining failed");
120+
}
121+
}
122+
if (F != ALMFunc)
123+
F->eraseFromParent();
124+
}
125+
126+
return !Visited.empty();
127+
}
128+
91129
// TODO: It should be checked that __sycl_allocateLocalMemory (or its source
92130
// form - group_local_memory) does not occur:
93131
// - in a function (other than user lambda/functor)
@@ -322,9 +360,8 @@ static bool dynamicWGLocalMemory(Module &M) {
322360

323361
PreservedAnalyses SYCLLowerWGLocalMemoryPass::run(Module &M,
324362
ModuleAnalysisManager &) {
325-
bool MadeChanges = allocaWGLocalMemory(M);
326-
MadeChanges = dynamicWGLocalMemory(M) || MadeChanges;
327-
if (MadeChanges)
328-
return PreservedAnalyses::none();
329-
return PreservedAnalyses::all();
363+
bool Changed = inlineGroupLocalMemoryFunc(M);
364+
Changed |= allocaWGLocalMemory(M);
365+
Changed |= dynamicWGLocalMemory(M);
366+
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
330367
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
; RUN: opt < %s -passes=sycllowerwglocalmemory -S | FileCheck %s
2+
3+
; Check group_local_memory_for_overwrite and group_local_memory functions are inlined.
4+
; Check __sycl_allocateLocalMemory calls are lowered to four separate allocations.
5+
6+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
7+
target triple = "spir64-unknown-unknown"
8+
9+
%"class.sycl::_V1::multi_ptr" = type { ptr addrspace(3) }
10+
%"class.sycl::_V1::group" = type { %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::id" }
11+
%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" }
12+
%"class.sycl::_V1::detail::array" = type { [1 x i64] }
13+
%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" }
14+
15+
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
16+
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
17+
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
18+
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
19+
20+
; Function Attrs: alwaysinline
21+
define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_() #0 {
22+
entry:
23+
; CHECK: define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_(
24+
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
25+
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
26+
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
27+
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
28+
29+
%Ptr = alloca %"class.sycl::_V1::multi_ptr", align 8
30+
%agg = alloca %"class.sycl::_V1::group", align 8
31+
%Ptr.ascast = addrspacecast ptr %Ptr to ptr addrspace(4)
32+
call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
33+
call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
34+
call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
35+
call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
36+
ret void
37+
}
38+
39+
; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(
40+
41+
; Function Attrs: alwaysinline
42+
define spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 {
43+
entry:
44+
%AllocatedMem = alloca ptr addrspace(3), align 8
45+
%AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4)
46+
%call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4)
47+
store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8
48+
ret void
49+
}
50+
51+
; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(
52+
53+
; Function Attrs: alwaysinline
54+
define spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 {
55+
entry:
56+
%AllocatedMem = alloca ptr addrspace(3), align 8
57+
%AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4)
58+
%call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4)
59+
store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8
60+
ret void
61+
}
62+
63+
declare spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef, i64 noundef)
64+
65+
attributes #0 = { alwaysinline }
66+
attributes #1 = { "sycl-forceinline"="true" }

sycl/include/sycl/ext/oneapi/group_local_memory.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ namespace sycl {
2121
inline namespace _V1 {
2222
namespace ext::oneapi {
2323
template <typename T, typename Group>
24+
#ifdef __SYCL_DEVICE_ONLY__
25+
[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
26+
#endif
2427
std::enable_if_t<
2528
std::is_trivially_destructible_v<T> && sycl::detail::is_group<Group>::value,
2629
multi_ptr<T, access::address_space::local_space, access::decorated::legacy>>
@@ -44,6 +47,9 @@ std::enable_if_t<
4447
}
4548

4649
template <typename T, typename Group, typename... Args>
50+
#ifdef __SYCL_DEVICE_ONLY__
51+
[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
52+
#endif
4753
std::enable_if_t<
4854
std::is_trivially_destructible_v<T> && sycl::detail::is_group<Group>::value,
4955
multi_ptr<T, access::address_space::local_space, access::decorated::legacy>>

sycl/include/syclcompat/memory.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,11 @@
6868

6969
namespace syclcompat {
7070

71-
template <typename AllocT> auto *local_mem() {
71+
template <typename AllocT>
72+
#ifdef __SYCL_DEVICE_ONLY__
73+
[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
74+
#endif
75+
__SYCL_ALWAYS_INLINE auto *local_mem() {
7276
sycl::multi_ptr<AllocT, sycl::access::address_space::local_space>
7377
As_multi_ptr =
7478
sycl::ext::oneapi::group_local_memory_for_overwrite<AllocT>(
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s
2+
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -fno-sycl-early-optimizations -o - | FileCheck %s
3+
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -O0 -o - | FileCheck %s
4+
5+
// The test checks that multiple calls to the same template instantiation of a
6+
// group local memory function result in separate allocations.
7+
8+
// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
9+
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
10+
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
11+
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
12+
13+
#include <sycl/detail/core.hpp>
14+
#include <sycl/ext/oneapi/group_local_memory.hpp>
15+
#include <sycl/usm.hpp>
16+
17+
using namespace sycl;
18+
19+
int main() {
20+
queue Q;
21+
22+
int **Out = malloc_shared<int *>(4, Q);
23+
24+
Q.submit([&](handler &Cgh) {
25+
Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) {
26+
auto Ptr0 =
27+
ext::oneapi::group_local_memory_for_overwrite<int>(Item.get_group());
28+
auto Ptr1 =
29+
ext::oneapi::group_local_memory_for_overwrite<int>(Item.get_group());
30+
auto Ptr2 = ext::oneapi::group_local_memory<int>(Item.get_group());
31+
auto Ptr3 = ext::oneapi::group_local_memory<int>(Item.get_group());
32+
Out[0] = Ptr0;
33+
Out[1] = Ptr1;
34+
Out[2] = Ptr2;
35+
Out[3] = Ptr3;
36+
});
37+
});
38+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s
2+
3+
// The test checks that multiple calls to the same template instantiation of
4+
// syclcompat local_mem function result in separate allocations.
5+
6+
// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
7+
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
8+
9+
#include <sycl/detail/core.hpp>
10+
#include <syclcompat/memory.hpp>
11+
12+
using namespace sycl;
13+
14+
int main() {
15+
queue Q;
16+
17+
int **Out = malloc_shared<int *>(2, Q);
18+
19+
Q.submit([&](handler &Cgh) {
20+
Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) {
21+
auto Ptr0 = syclcompat::local_mem<int[1]>();
22+
auto Ptr1 = syclcompat::local_mem<int[1]>();
23+
Out[0] = Ptr0;
24+
Out[1] = Ptr1;
25+
});
26+
});
27+
}

0 commit comments

Comments
 (0)