Skip to content

Commit fbcce33

Browse files
committed
[OpenMP] Honor thread_limit value when choosing grid size
D152014 introduced an optimization that favors more, smaller blocks over fewer, larger blocks, even if the user sets `thread_limit` explicitly. This patch changes the behavior to honor the user-specified value. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D158802
1 parent 35c9072 commit fbcce33

File tree

2 files changed

+10
-5
lines changed

2 files changed

+10
-5
lines changed

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -374,8 +374,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
374374
KernelArgs.NumArgs, Args, Ptrs);
375375

376376
uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit);
377-
uint64_t NumBlocks = getNumBlocks(GenericDevice, KernelArgs.NumTeams,
378-
KernelArgs.Tripcount, NumThreads);
377+
uint64_t NumBlocks =
378+
getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount,
379+
NumThreads, KernelArgs.ThreadLimit[0] > 0);
379380

380381
if (auto Err =
381382
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
@@ -418,7 +419,8 @@ uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
418419
uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
419420
uint32_t NumTeamsClause[3],
420421
uint64_t LoopTripCount,
421-
uint32_t &NumThreads) const {
422+
uint32_t &NumThreads,
423+
bool IsNumThreadsFromUser) const {
422424
assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 &&
423425
"Multi dimensional launch not supported yet.");
424426

@@ -443,7 +445,8 @@ uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
443445

444446
// Honor the thread_limit clause; only lower the number of threads.
445447
[[maybe_unused]] auto OldNumThreads = NumThreads;
446-
if (LoopTripCount >= DefaultNumBlocks * NumThreads) {
448+
if (LoopTripCount >= DefaultNumBlocks * NumThreads ||
449+
IsNumThreadsFromUser) {
447450
// Enough parallelism for teams and threads.
448451
TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
449452
assert(TripCountNumBlocks >= DefaultNumBlocks &&

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,9 +335,11 @@ struct GenericKernelTy {
335335
uint32_t ThreadLimitClause[3]) const;
336336

337337
/// The number of threads \p NumThreads can be adjusted by this method.
338+
/// \p IsNumThreadsFromUser is true if \p NumThreads is defined by the user via
339+
/// thread_limit clause.
338340
uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
339341
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
340-
uint32_t &NumThreads) const;
342+
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
341343

342344
/// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
343345
bool isGenericSPMDMode() const {

0 commit comments

Comments
 (0)