Skip to content

[OpenMP][USM] Introduces -fopenmp-force-usm flag #76571

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/LangOptions.def
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have mo
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
LANGOPT(OpenMPForceUSM , 1, 0, "Enable OpenMP unified shared memory mode via compiler.")
LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.")
LANGOPT(RenderScript , 1, 0, "RenderScript")

Expand Down
4 changes: 4 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -3451,6 +3451,10 @@ def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<
Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
HelpText<"Do not create a host fallback if offloading to the device fails.">,
MarshallingInfoFlag<LangOpts<"OpenMPOffloadMandatory">>;
def fopenmp_force_usm : Flag<["-"], "fopenmp-force-usm">, Group<f_Group>,
Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
HelpText<"Force behvaior as if the user specified pragma omp requires unified_shared_memory.">,
MarshallingInfoFlag<LangOpts<"OpenMPForceUSM">>;
def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group<f_Group>,
Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CLOption]>,
HelpText<"Emit code that can be JIT compiled for OpenMP offloading. Implies -foffload-lto=full">;
Expand Down
7 changes: 7 additions & 0 deletions clang/lib/CodeGen/CGOpenMPRuntime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1044,6 +1044,13 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
? CGM.getLangOpts().OMPHostIRFile
: StringRef{});
OMPBuilder.setConfig(Config);

// The user forces the compiler to behave as if omp requires
// unified_shared_memory was given.
if (CGM.getLangOpts().OpenMPForceUSM) {
HasRequiresUnifiedSharedMemory = true;
OMPBuilder.Config.setHasRequiresUnifiedSharedMemory(true);
}
}

void CGOpenMPRuntime::clear() {
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6382,6 +6382,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_force_usm))
CmdArgs.push_back("-fopenmp-force-usm");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code
Expand Down
79 changes: 79 additions & 0 deletions clang/test/OpenMP/force-usm.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 3
// REQUIRES: amdgpu-registered-target

// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-force-usm -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-force-usm -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-USM %s

// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-DEFAULT %s
// expected-no-diagnostics

extern "C" void *malloc(unsigned int b);

int GI;
#pragma omp declare target
int *pGI;
#pragma omp end declare target

int main(void) {

GI = 0;

pGI = (int *) malloc(sizeof(int));
*pGI = 42;

#pragma omp target map(pGI[:1], GI)
{
GI = 1;
*pGI = 2;
}

return 0;
}

// CHECK-USM-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25(
// CHECK-USM-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-USM-NEXT: entry:
// CHECK-USM-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-USM-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-USM-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-USM-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr
// CHECK-USM-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-USM-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8
// CHECK-USM-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8
// CHECK-USM-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-USM-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-USM-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK-USM: user_code.entry:
// CHECK-USM-NEXT: store i32 1, ptr [[TMP0]], align 4
// CHECK-USM-NEXT: [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8
// CHECK-USM-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
// CHECK-USM-NEXT: store i32 2, ptr [[TMP3]], align 4
// CHECK-USM-NEXT: call void @__kmpc_target_deinit()
// CHECK-USM-NEXT: ret void
// CHECK-USM: worker.exit:
// CHECK-USM-NEXT: ret void
//
//
// CHECK-DEFAULT-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25(
// CHECK-DEFAULT-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-DEFAULT-NEXT: entry:
// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-DEFAULT-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-DEFAULT-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr
// CHECK-DEFAULT-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-DEFAULT-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8
// CHECK-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8
// CHECK-DEFAULT-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-DEFAULT-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-DEFAULT-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK-DEFAULT: user_code.entry:
// CHECK-DEFAULT-NEXT: store i32 1, ptr [[TMP0]], align 4
// CHECK-DEFAULT-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @pGI to ptr), align 8
// CHECK-DEFAULT-NEXT: store i32 2, ptr [[TMP2]], align 4
// CHECK-DEFAULT-NEXT: call void @__kmpc_target_deinit()
// CHECK-DEFAULT-NEXT: ret void
// CHECK-DEFAULT: worker.exit:
// CHECK-DEFAULT-NEXT: ret void
//
8 changes: 8 additions & 0 deletions openmp/libomptarget/test/lit.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ for libomptarget_target in config.libomptarget_all_targets:
"%libomptarget-compile-and-run-" + libomptarget_target))
config.substitutions.append(("%libomptarget-compilexx-generic",
"%libomptarget-compilexx-" + libomptarget_target))
config.substitutions.append(("%libomptarget-compilexxx-generic-force-usm",
"%libomptarget-compilexxx-force-usm-" + libomptarget_target))
config.substitutions.append(("%libomptarget-compile-generic",
"%libomptarget-compile-" + libomptarget_target))
config.substitutions.append(("%libomptarget-compile-fortran-generic",
Expand Down Expand Up @@ -242,6 +244,9 @@ for libomptarget_target in config.libomptarget_all_targets:
config.substitutions.append(("%libomptarget-compilexx-" + \
libomptarget_target, \
"%clangxx-" + libomptarget_target + add_libraries(" %s -o %t")))
config.substitutions.append(("%libomptarget-compilexxx-force-usm-" +
libomptarget_target, "%clangxxx-force-usm-" + libomptarget_target + \
add_libraries(" %s -o %t")))
config.substitutions.append(("%libomptarget-compile-" + \
libomptarget_target, \
"%clang-" + libomptarget_target + add_libraries(" %s -o %t")))
Expand Down Expand Up @@ -279,6 +284,9 @@ for libomptarget_target in config.libomptarget_all_targets:
config.substitutions.append(("%clangxx-" + libomptarget_target, \
"%clangxx %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%clangxxx-force-usm-" + libomptarget_target, \
"%clangxx %openmp_flags -fopenmp-force-usm %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%clang-" + libomptarget_target, \
"%clang %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
remove_suffix_if_present(libomptarget_target)))
Expand Down
59 changes: 59 additions & 0 deletions openmp/libomptarget/test/offloading/force-usm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// clang-format off
// RUN: %libomptarget-compilexx-generic
// RUN: env LIBOMPTARGET_INFO=32 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=NO-USM
//
// RUN: %libomptarget-compilexxx-generic-force-usm
// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=32 \
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=FORCE-USM
//
// UNSUPPORTED: nvptx64-nvidia-cuda
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
// clang-format on

#include <cassert>
#include <cstdio>
#include <cstdlib>

int GI;
#pragma omp declare target
int *pGI;
#pragma omp end declare target

int main(void) {

GI = 0;
// Implicit mappings
int alpha = 1;
int beta[3] = {2, 5, 8};

// Require map clauses for non-USM execution
pGI = (int *)malloc(sizeof(int));
*pGI = 42;

#pragma omp target map(pGI[ : 1], GI)
{
GI = 1 * alpha;
*pGI = 2 * beta[1];
}

assert(GI == 1);
assert(*pGI == 10);

printf("SUCCESS\n");

return 0;
}

// clang-format off
// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12
// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI
// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=12
// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
// NO-USM-NEXT: SUCCESS

// FORCE-USM: SUCCESS
//
// clang-format on