Skip to content

Commit fa4780f

Browse files
authored
[OpenMP][USM] Introduces -fopenmp-force-usm flag (llvm#76571)
This flag forces the compiler to generate code for OpenMP target regions as if the user specified the #pragma omp requires unified_shared_memory in each source file. The option does not have a -fno-* friend since OpenMP requires the unified_shared_memory clause to be present in all source files. Since this flag does no harm if the clause is present, it can be used in conjunction. My understanding is that USM should not be turned off selectively, hence, no -fno- version. This adds a basic test to check the correct generation of double indirect access to declare target globals in USM mode vs non-USM mode. Which I think is the only difference observable in code generation. This runtime test checks for the (non-)occurence of data movement between host and device. It does one run without the flag and one with the flag to also see that both versions behave as expected. In the case w/o the new flag data movement between host and device is expected. In the case with the flag such data movement should not be present / reported.
1 parent 3f3a3e8 commit fa4780f

File tree

7 files changed

+160
-0
lines changed

7 files changed

+160
-0
lines changed

clang/include/clang/Basic/LangOptions.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have mo
260260
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
261261
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
262262
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
263+
LANGOPT(OpenMPForceUSM , 1, 0, "Enable OpenMP unified shared memory mode via compiler.")
263264
LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.")
264265
LANGOPT(RenderScript , 1, 0, "RenderScript")
265266

clang/include/clang/Driver/Options.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3459,6 +3459,10 @@ def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<
34593459
Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
34603460
HelpText<"Do not create a host fallback if offloading to the device fails.">,
34613461
MarshallingInfoFlag<LangOpts<"OpenMPOffloadMandatory">>;
3462+
def fopenmp_force_usm : Flag<["-"], "fopenmp-force-usm">, Group<f_Group>,
3463+
Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
3464+
HelpText<"Force behvaior as if the user specified pragma omp requires unified_shared_memory.">,
3465+
MarshallingInfoFlag<LangOpts<"OpenMPForceUSM">>;
34623466
def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group<f_Group>,
34633467
Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CLOption]>,
34643468
HelpText<"Emit code that can be JIT compiled for OpenMP offloading. Implies -foffload-lto=full">;

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,6 +1044,13 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
10441044
? CGM.getLangOpts().OMPHostIRFile
10451045
: StringRef{});
10461046
OMPBuilder.setConfig(Config);
1047+
1048+
// The user forces the compiler to behave as if omp requires
1049+
// unified_shared_memory was given.
1050+
if (CGM.getLangOpts().OpenMPForceUSM) {
1051+
HasRequiresUnifiedSharedMemory = true;
1052+
OMPBuilder.Config.setHasRequiresUnifiedSharedMemory(true);
1053+
}
10471054
}
10481055

10491056
void CGOpenMPRuntime::clear() {

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6460,6 +6460,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
64606460
CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
64616461
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
64626462
CmdArgs.push_back("-fopenmp-offload-mandatory");
6463+
if (Args.hasArg(options::OPT_fopenmp_force_usm))
6464+
CmdArgs.push_back("-fopenmp-force-usm");
64636465
break;
64646466
default:
64656467
// By default, if Clang doesn't know how to generate useful OpenMP code

clang/test/OpenMP/force-usm.c

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 3
2+
// REQUIRES: amdgpu-registered-target
3+
4+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-force-usm -emit-llvm-bc %s -o %t-ppc-host.bc
5+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-force-usm -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-USM %s
6+
7+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
8+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-DEFAULT %s
9+
// expected-no-diagnostics
10+
11+
extern "C" void *malloc(unsigned int b);
12+
13+
int GI;
14+
#pragma omp declare target
15+
int *pGI;
16+
#pragma omp end declare target
17+
18+
int main(void) {
19+
20+
GI = 0;
21+
22+
pGI = (int *) malloc(sizeof(int));
23+
*pGI = 42;
24+
25+
#pragma omp target map(pGI[:1], GI)
26+
{
27+
GI = 1;
28+
*pGI = 2;
29+
}
30+
31+
return 0;
32+
}
33+
34+
// CHECK-USM-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25(
35+
// CHECK-USM-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] {
36+
// CHECK-USM-NEXT: entry:
37+
// CHECK-USM-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
38+
// CHECK-USM-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
39+
// CHECK-USM-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
40+
// CHECK-USM-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr
41+
// CHECK-USM-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
42+
// CHECK-USM-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8
43+
// CHECK-USM-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8
44+
// CHECK-USM-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]])
45+
// CHECK-USM-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
46+
// CHECK-USM-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
47+
// CHECK-USM: user_code.entry:
48+
// CHECK-USM-NEXT: store i32 1, ptr [[TMP0]], align 4
49+
// CHECK-USM-NEXT: [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8
50+
// CHECK-USM-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
51+
// CHECK-USM-NEXT: store i32 2, ptr [[TMP3]], align 4
52+
// CHECK-USM-NEXT: call void @__kmpc_target_deinit()
53+
// CHECK-USM-NEXT: ret void
54+
// CHECK-USM: worker.exit:
55+
// CHECK-USM-NEXT: ret void
56+
//
57+
//
58+
// CHECK-DEFAULT-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25(
59+
// CHECK-DEFAULT-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] {
60+
// CHECK-DEFAULT-NEXT: entry:
61+
// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
62+
// CHECK-DEFAULT-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
63+
// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
64+
// CHECK-DEFAULT-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr
65+
// CHECK-DEFAULT-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
66+
// CHECK-DEFAULT-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8
67+
// CHECK-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8
68+
// CHECK-DEFAULT-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]])
69+
// CHECK-DEFAULT-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
70+
// CHECK-DEFAULT-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
71+
// CHECK-DEFAULT: user_code.entry:
72+
// CHECK-DEFAULT-NEXT: store i32 1, ptr [[TMP0]], align 4
73+
// CHECK-DEFAULT-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @pGI to ptr), align 8
74+
// CHECK-DEFAULT-NEXT: store i32 2, ptr [[TMP2]], align 4
75+
// CHECK-DEFAULT-NEXT: call void @__kmpc_target_deinit()
76+
// CHECK-DEFAULT-NEXT: ret void
77+
// CHECK-DEFAULT: worker.exit:
78+
// CHECK-DEFAULT-NEXT: ret void
79+
//

openmp/libomptarget/test/lit.cfg

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,8 @@ for libomptarget_target in config.libomptarget_all_targets:
190190
"%libomptarget-compile-and-run-" + libomptarget_target))
191191
config.substitutions.append(("%libomptarget-compilexx-generic",
192192
"%libomptarget-compilexx-" + libomptarget_target))
193+
config.substitutions.append(("%libomptarget-compilexxx-generic-force-usm",
194+
"%libomptarget-compilexxx-force-usm-" + libomptarget_target))
193195
config.substitutions.append(("%libomptarget-compile-generic",
194196
"%libomptarget-compile-" + libomptarget_target))
195197
config.substitutions.append(("%libomptarget-compile-fortran-generic",
@@ -247,6 +249,9 @@ for libomptarget_target in config.libomptarget_all_targets:
247249
config.substitutions.append(("%libomptarget-compilexx-" + \
248250
libomptarget_target, \
249251
"%clangxx-" + libomptarget_target + add_libraries(" %s -o %t")))
252+
config.substitutions.append(("%libomptarget-compilexxx-force-usm-" +
253+
libomptarget_target, "%clangxxx-force-usm-" + libomptarget_target + \
254+
add_libraries(" %s -o %t")))
250255
config.substitutions.append(("%libomptarget-compile-" + \
251256
libomptarget_target, \
252257
"%clang-" + libomptarget_target + add_libraries(" %s -o %t")))
@@ -284,6 +289,9 @@ for libomptarget_target in config.libomptarget_all_targets:
284289
config.substitutions.append(("%clangxx-" + libomptarget_target, \
285290
"%clangxx %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
286291
remove_suffix_if_present(libomptarget_target)))
292+
config.substitutions.append(("%clangxxx-force-usm-" + libomptarget_target, \
293+
"%clangxx %openmp_flags -fopenmp-force-usm %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
294+
remove_suffix_if_present(libomptarget_target)))
287295
config.substitutions.append(("%clang-" + libomptarget_target, \
288296
"%clang %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
289297
remove_suffix_if_present(libomptarget_target)))
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compilexx-generic
3+
// RUN: env LIBOMPTARGET_INFO=32 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=NO-USM
4+
//
5+
// RUN: %libomptarget-compilexxx-generic-force-usm
6+
// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=32 \
7+
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=FORCE-USM
8+
//
9+
// UNSUPPORTED: nvptx64-nvidia-cuda
10+
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
11+
// clang-format on
12+
13+
#include <cassert>
14+
#include <cstdio>
15+
#include <cstdlib>
16+
17+
int GI;
18+
#pragma omp declare target
19+
int *pGI;
20+
#pragma omp end declare target
21+
22+
int main(void) {
23+
24+
GI = 0;
25+
// Implicit mappings
26+
int alpha = 1;
27+
int beta[3] = {2, 5, 8};
28+
29+
// Require map clauses for non-USM execution
30+
pGI = (int *)malloc(sizeof(int));
31+
*pGI = 42;
32+
33+
#pragma omp target map(pGI[ : 1], GI)
34+
{
35+
GI = 1 * alpha;
36+
*pGI = 2 * beta[1];
37+
}
38+
39+
assert(GI == 1);
40+
assert(*pGI == 10);
41+
42+
printf("SUCCESS\n");
43+
44+
return 0;
45+
}
46+
47+
// clang-format off
48+
// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
49+
// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12
50+
// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
51+
// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI
52+
// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
53+
// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=12
54+
// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
55+
// NO-USM-NEXT: SUCCESS
56+
57+
// FORCE-USM: SUCCESS
58+
//
59+
// clang-format on

0 commit comments

Comments
 (0)