Skip to content

Commit 7503f1b

Browse files
dhruvachakronlieb
authored andcommitted
[OpenMP] Allow more flexibility in XteamReduction blocksize.
The blocksize will not be fixed any more at compile-time. Instead, an upper limit on the blocksize will be determined during codegen. This allows reducing the blocksize at runtime with the environment variable OMP_TEAMS_THREAD_LIMIT. Blocksizes between [1, 1024] will now be allowed for XteamReduction. While the above flexibility is allowed, codegen and the runtime will round down the blocksize to a power of 2. The DeviceRTL reduction function assumes the blocksize to be a power of 2 for performance reasons. This change adds a runtime query to obtain the dynamic blocksize in the DeviceRTL XteamReduction function whereas previously the blocksize would be a compile-time constant. The compiler will now honor blocksizes in the range [1, 1024] using the option -fopenmp-target-xteam-reduction-blocksize but the compiler will round it to a power-of-2 if not so already. The default blocksize for XteamReduction remains at 1024. Change-Id: I7d7bf800a86c420ba76261bec1175f6576ffc68a
1 parent c3fd312 commit 7503f1b

File tree

11 files changed

+435
-138
lines changed

11 files changed

+435
-138
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3779,11 +3779,6 @@ llvm::Value *CGOpenMPRuntimeGPU::getGPUBlockID(CodeGenFunction &CGF) {
37793779
return Bld.CreateCall(F, std::nullopt, "gpu_block_id");
37803780
}
37813781

3782-
llvm::Value *CGOpenMPRuntimeGPU::getXteamRedBlockSize(CodeGenFunction &CGF,
3783-
int BlockSize) {
3784-
return llvm::ConstantInt::get(CGF.Int32Ty, BlockSize);
3785-
}
3786-
37873782
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumBlocks(CodeGenFunction &CGF) {
37883783
return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
37893784
CGM.getModule(), OMPRTL___kmpc_get_hardware_num_blocks));
@@ -3872,10 +3867,22 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
38723867
unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
38733868
assert(WarpSize == 32 || WarpSize == 64);
38743869

3870+
assert(BlockSize > 0 && BlockSize <= llvm::omp::xteam_red::MaxBlockSize &&
3871+
"XTeam Reduction blocksize outside expected range");
3872+
assert(((BlockSize & (BlockSize - 1)) == 0) &&
3873+
"XTeam Reduction blocksize must be a power of two");
3874+
38753875
if (SumType->isIntegerTy()) {
38763876
if (SumType->getPrimitiveSizeInBits() == 32) {
38773877
if (WarpSize == 32) {
38783878
switch (BlockSize) {
3879+
default:
3880+
return CGF.EmitRuntimeCall(
3881+
OMPBuilder.getOrCreateRuntimeFunction(
3882+
CGM.getModule(), IsFast
3883+
? OMPRTL___kmpc_xteamr_ui_1x32_fast_sum
3884+
: OMPRTL___kmpc_xteamr_ui_1x32),
3885+
Args);
38793886
case 64:
38803887
return CGF.EmitRuntimeCall(
38813888
OMPBuilder.getOrCreateRuntimeFunction(
@@ -3914,7 +3921,7 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
39143921
}
39153922
} else {
39163923
switch (BlockSize) {
3917-
case 64:
3924+
default:
39183925
return CGF.EmitRuntimeCall(
39193926
OMPBuilder.getOrCreateRuntimeFunction(
39203927
CGM.getModule(), IsFast
@@ -3955,6 +3962,13 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
39553962
if (SumType->getPrimitiveSizeInBits() == 64) {
39563963
if (WarpSize == 32) {
39573964
switch (BlockSize) {
3965+
default:
3966+
return CGF.EmitRuntimeCall(
3967+
OMPBuilder.getOrCreateRuntimeFunction(
3968+
CGM.getModule(), IsFast
3969+
? OMPRTL___kmpc_xteamr_ul_1x32_fast_sum
3970+
: OMPRTL___kmpc_xteamr_ul_1x32),
3971+
Args);
39583972
case 64:
39593973
return CGF.EmitRuntimeCall(
39603974
OMPBuilder.getOrCreateRuntimeFunction(
@@ -3993,7 +4007,7 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
39934007
}
39944008
} else {
39954009
switch (BlockSize) {
3996-
case 64:
4010+
default:
39974011
return CGF.EmitRuntimeCall(
39984012
OMPBuilder.getOrCreateRuntimeFunction(
39994013
CGM.getModule(), IsFast
@@ -4035,6 +4049,12 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
40354049
if (SumType->isFloatTy()) {
40364050
if (WarpSize == 32) {
40374051
switch (BlockSize) {
4052+
default:
4053+
return CGF.EmitRuntimeCall(
4054+
OMPBuilder.getOrCreateRuntimeFunction(
4055+
CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_f_1x32_fast_sum
4056+
: OMPRTL___kmpc_xteamr_f_1x32),
4057+
Args);
40384058
case 64:
40394059
return CGF.EmitRuntimeCall(
40404060
OMPBuilder.getOrCreateRuntimeFunction(
@@ -4068,7 +4088,7 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
40684088
}
40694089
} else {
40704090
switch (BlockSize) {
4071-
case 64:
4091+
default:
40724092
return CGF.EmitRuntimeCall(
40734093
OMPBuilder.getOrCreateRuntimeFunction(
40744094
CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_f_1x64_fast_sum
@@ -4104,6 +4124,12 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
41044124
if (SumType->isDoubleTy()) {
41054125
if (WarpSize == 32) {
41064126
switch (BlockSize) {
4127+
default:
4128+
return CGF.EmitRuntimeCall(
4129+
OMPBuilder.getOrCreateRuntimeFunction(
4130+
CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_d_1x32_fast_sum
4131+
: OMPRTL___kmpc_xteamr_d_1x32),
4132+
Args);
41074133
case 64:
41084134
return CGF.EmitRuntimeCall(
41094135
OMPBuilder.getOrCreateRuntimeFunction(
@@ -4137,7 +4163,7 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedSum(
41374163
}
41384164
} else {
41394165
switch (BlockSize) {
4140-
case 64:
4166+
default:
41414167
return CGF.EmitRuntimeCall(
41424168
OMPBuilder.getOrCreateRuntimeFunction(
41434169
CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_d_1x64_fast_sum

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
168168
/// Get the number of blocks on the GPU
169169
llvm::Value *getGPUNumBlocks(CodeGenFunction &CGF);
170170

171-
/// Get the number of blocks on the GPU for special reduction
172-
llvm::Value *getXteamRedBlockSize(CodeGenFunction &CGF, int BlockSize);
173-
174171
std::pair<llvm::Value *, llvm::Value *>
175172
getXteamRedFunctionPtrs(CodeGenFunction &CGF, llvm::Type *RedVarType);
176173

clang/lib/CodeGen/CGStmt.cpp

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,7 @@ CodeGenFunction::EmitBigJumpLoopStartingIndex(const ForStmt &FStmt) {
8888
llvm::Value *GpuThreadId = RT.getGPUThreadID(*this);
8989

9090
// workgroup_size
91-
llvm::Value *WorkGroupSize =
92-
CGM.isXteamRedKernel(&FStmt)
93-
? RT.getXteamRedBlockSize(*this, CGM.getXteamRedBlockSize(&FStmt))
94-
: RT.getGPUNumThreads(*this);
91+
llvm::Value *WorkGroupSize = RT.getGPUNumThreads(*this);
9592

9693
// workgroup_id
9794
llvm::Value *WorkGroupId = RT.getGPUBlockID(*this);
@@ -147,11 +144,7 @@ void CodeGenFunction::EmitBigJumpLoopInc(const ForStmt &FStmt,
147144
const OMPLoopDirective &LD = *(cast<OMPLoopDirective>(Directives.back()));
148145

149146
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
150-
llvm::Value *BlockSize =
151-
CGM.isXteamRedKernel(&FStmt)
152-
? RT.getXteamRedBlockSize(*this, CGM.getXteamRedBlockSize(&FStmt))
153-
: RT.getGPUNumThreads(*this);
154-
147+
llvm::Value *BlockSize = RT.getGPUNumThreads(*this);
155148
llvm::Value *NumBlocks = CGM.isXteamRedKernel(&FStmt)
156149
? CGM.getXteamRedNumTeams(&FStmt)
157150
: RT.getGPUNumBlocks(*this);

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8281,20 +8281,23 @@ int CodeGenModule::getOptKernelWorkGroupSize(
82818281
? llvm::omp::xteam_red::DefaultBlockSize
82828282
: getTarget().getGridValue().GV_Default_WG_Size;
82838283

8284+
int ThreadLimit = isXteamRed ? llvm::omp::xteam_red::MaxBlockSize
8285+
: getTarget().getGridValue().GV_Max_WG_Size;
8286+
82848287
// Allow command-line option override clauses on the OpenMP construct.
82858288
// Exception: If the command line value is the same as the default, the clause
82868289
// overrides.
82878290
int CmdLineOption = isXteamRed
82888291
? getLangOpts().OpenMPTargetXteamReductionBlockSize
82898292
: getLangOpts().OpenMPGPUThreadsPerTeam;
8290-
if (CmdLineOption != WGSizeDefault)
8293+
if (CmdLineOption > 0 && CmdLineOption <= ThreadLimit &&
8294+
CmdLineOption != WGSizeDefault)
82918295
return CmdLineOption;
82928296

82938297
// The blocksize used by optimized kernels is the minimum of the
82948298
// max_wg_size and any thread_limit or num_threads specified on any OpenMP
82958299
// clauses.
8296-
int WGSize = isXteamRed ? llvm::omp::xteam_red::MaxBlockSize
8297-
: getTarget().getGridValue().GV_Max_WG_Size;
8300+
int WGSize = ThreadLimit;
82988301
for (const auto &Dir : NestDirs)
82998302
WGSize = std::min(WGSize, getWorkGroupSizeSPMDHelper(*Dir));
83008303
return WGSize;
@@ -8305,17 +8308,8 @@ int CodeGenModule::computeOptKernelBlockSize(
83058308
int InitialBlockSize = getOptKernelWorkGroupSize(NestDirs, isXteamRed);
83068309
if (!isXteamRed)
83078310
return InitialBlockSize;
8308-
// We support block sizes 64, 128, 256, 512, and 1024 only for Xteam
8309-
// reduction.
8310-
if (InitialBlockSize < 128)
8311-
return 64;
8312-
if (InitialBlockSize < 256)
8313-
return 128;
8314-
if (InitialBlockSize < 512)
8315-
return 256;
8316-
if (InitialBlockSize < 1024)
8317-
return 512;
8318-
return 1024;
8311+
// We support block sizes that are a power of 2 for Xteam reduction.
8312+
return llvm::omp::getBlockSizeAsPowerOfTwo(InitialBlockSize);
83198313
}
83208314

83218315
std::pair<CodeGenModule::NoLoopXteamErr, bool>
@@ -8746,8 +8740,7 @@ CodeGenModule::checkAndSetXteamRedKernel(const OMPExecutableDirective &D) {
87468740
RedVarMapPair.second.second, isFastXteamSumReduction())));
87478741

87488742
// The blocksize has to be computed after adding this kernel to the metadata
8749-
// above, since the computation below depends on that metadata. Compute
8750-
// block size during device compilation only.
8743+
// above, since the computation below depends on that metadata.
87518744
int BlockSize = computeOptKernelBlockSize(NestDirs, /*isXteamRed=*/true);
87528745
if (BlockSize > 0)
87538746
updateXteamRedKernel(FStmt, BlockSize);

0 commit comments

Comments
 (0)