Skip to content

Commit 0136a44

Browse files
committed
[OpenMP] Add an option to limit shared memory usage in OpenMPOpt
One of the optimizations performed in OpenMPOpt pushes globalized variables to static shared memory. This is preferable to keeping the runtime call in all cases; however, if too many variables are pushed to shared memory the kernel will crash. Since this is an optimization and not something the user specified explicitly, there should be an option to limit this optimization in those cases. This patch introduces the `-openmp-opt-shared-limit=` option to limit the amount of bytes that will be placed in shared memory from HeapToShared. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D120079
1 parent 0870a4f commit 0136a44

File tree

2 files changed

+20
-1
lines changed

2 files changed

+20
-1
lines changed

llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,11 @@ static cl::opt<unsigned>
129129
cl::desc("Maximal number of attributor iterations."),
130130
cl::init(256));
131131

132+
static cl::opt<unsigned>
133+
SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
134+
cl::desc("Maximum amount of shared memory to use."),
135+
cl::init(std::numeric_limits<unsigned>::max()));
136+
132137
STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
133138
"Number of OpenMP runtime calls deduplicated");
134139
STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -3000,6 +3005,14 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
30003005

30013006
auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0));
30023007

3008+
if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {
3009+
LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB
3010+
<< " with shared memory."
3011+
<< " Shared memory usage is limited to "
3012+
<< SharedMemoryLimit << " bytes\n");
3013+
continue;
3014+
}
3015+
30033016
LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
30043017
<< " with " << AllocSize->getZExtValue()
30053018
<< " bytes of shared memory\n");
@@ -3034,7 +3047,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
30343047
A.deleteAfterManifest(*CB);
30353048
A.deleteAfterManifest(*FreeCalls.front());
30363049

3037-
NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
3050+
SharedMemoryUsed += AllocSize->getZExtValue();
3051+
NumBytesMovedToSharedMemory = SharedMemoryUsed;
30383052
Changed = ChangeStatus::CHANGED;
30393053
}
30403054

@@ -3070,6 +3084,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
30703084
SmallSetVector<CallBase *, 4> MallocCalls;
30713085
/// Collection of potentially removed free calls in a function.
30723086
SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
3087+
/// The total amount of shared memory that has been used for HeapToShared.
3088+
unsigned SharedMemoryUsed = 0;
30733089
};
30743090

30753091
struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {

llvm/test/Transforms/OpenMP/replace_globalization.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
22
; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s
33
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
4+
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -disable-output -openmp-opt-shared-limit=4 < %s 2>&1 | FileCheck %s -check-prefix=CHECK-LIMIT
45
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
56
target triple = "nvptx64"
67

78
; UTC_ARGS: --disable
89
; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory
910
; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
1011
; CHECK-REMARKS-NOT: 6 bytes
12+
; CHECK-LIMIT: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
13+
; CHECK-LIMIT: remark: replace_globalization.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization
1114
; UTC_ARGS: --enable
1215

1316
%struct.ident_t = type { i32, i32, i32, i32, i8* }

0 commit comments

Comments
 (0)