-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[OpenMP] Allow GPUs to be targeted directly via -fopenmp
.
#122149
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-clang Author: Joseph Huber (jhuber6) ChangesSummary:
This patch lifts this restriction, allowing individual files to be This patch allows the following to work.
Full diff: https://github.com/llvm/llvm-project/pull/122149.diff 6 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 756f0482b8ea72..1ad4b4b0e8a7fc 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -870,9 +870,6 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
OMPBuilder.setConfig(Config);
- if (!CGM.getLangOpts().OpenMPIsTargetDevice)
- llvm_unreachable("OpenMP can only handle device code.");
-
if (CGM.getLangOpts().OpenMPCUDAMode)
CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 6cb37b20b7aeee..950ed173aecf3a 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -6801,7 +6801,8 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF,
CodeGenModule &CGM = CGF.CGM;
// On device emit this construct as inlined code.
- if (CGM.getLangOpts().OpenMPIsTargetDevice) {
+ if (CGM.getLangOpts().OpenMPIsTargetDevice ||
+ CGM.getOpenMPRuntime().isGPU()) {
OMPLexicalScope Scope(CGF, S, OMPD_target);
CGM.getOpenMPRuntime().emitInlinedDirective(
CGF, OMPD_target, [&S](CodeGenFunction &CGF, PrePostActionTy &) {
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 5f15f0f48c54e4..26abd9a60632ae 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -483,8 +483,6 @@ void CodeGenModule::createOpenMPRuntime() {
case llvm::Triple::nvptx:
case llvm::Triple::nvptx64:
case llvm::Triple::amdgcn:
- assert(getLangOpts().OpenMPIsTargetDevice &&
- "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
break;
default:
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index d711df02ce9503..d2df51593ff62b 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -4210,19 +4210,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
Args, OPT_fopenmp_version_EQ,
(IsSimdSpecified || IsTargetSpecified) ? 51 : Opts.OpenMP, Diags))
Opts.OpenMP = Version;
- // Provide diagnostic when a given target is not expected to be an OpenMP
- // device or host.
- if (!Opts.OpenMPIsTargetDevice) {
- switch (T.getArch()) {
- default:
- break;
- // Add unsupported host targets here:
- case llvm::Triple::nvptx:
- case llvm::Triple::nvptx64:
- Diags.Report(diag::err_drv_omp_host_target_not_supported) << T.str();
- break;
- }
- }
}
// Set the flag to prevent the implementation from emitting device exception
diff --git a/clang/test/OpenMP/gpu_target.cpp b/clang/test/OpenMP/gpu_target.cpp
new file mode 100644
index 00000000000000..3d5a47d7050436
--- /dev/null
+++ b/clang/test/OpenMP/gpu_target.cpp
@@ -0,0 +1,220 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --version 5
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=AMDGCN
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix=NVPTX
+
+typedef enum omp_allocator_handle_t {
+ omp_null_allocator = 0,
+ omp_default_mem_alloc = 1,
+ omp_large_cap_mem_alloc = 2,
+ omp_const_mem_alloc = 3,
+ omp_high_bw_mem_alloc = 4,
+ omp_low_lat_mem_alloc = 5,
+ omp_cgroup_mem_alloc = 6,
+ omp_pteam_mem_alloc = 7,
+ omp_thread_mem_alloc = 8,
+ KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
+} omp_allocator_handle_t;
+
+int d = 0;
+#pragma omp allocate(d) allocator(omp_default_mem_alloc)
+
+int g = 0;
+#pragma omp allocate(g) allocator(omp_cgroup_mem_alloc)
+
+extern const int c = 0;
+#pragma omp allocate(c) allocator(omp_const_mem_alloc)
+
+
+int foo() {
+ int t = 0;
+#pragma omp allocate(t) allocator(omp_thread_mem_alloc)
+ return t;
+}
+
+void bar() {
+#pragma omp target
+ ;
+#pragma omp parallel
+ ;
+}
+
+void baz(int *p) {
+#pragma omp atomic
+ *p += 1;
+}
+
+int qux() {
+#if defined(__NVPTX__)
+ return 1;
+#elif defined(__AMDGPU__)
+ return 2;
+#endif
+}
+//.
+// AMDGCN: @c = addrspace(4) constant i32 0, align 4
+// AMDGCN: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+// AMDGCN: @[[GLOB1:[0-9]+]] = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// AMDGCN: @d = global i32 0, align 4
+// AMDGCN: @g = global i32 0, align 4
+// AMDGCN: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+//.
+// NVPTX: @d = global i32 0, align 4
+// NVPTX: @g = global i32 0, align 4
+// NVPTX: @c = addrspace(4) constant i32 0, align 4
+// NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+// NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+//.
+// AMDGCN-LABEL: define dso_local noundef i32 @_Z3foov(
+// AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[T:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN-NEXT: [[T_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[T]] to ptr
+// AMDGCN-NEXT: store i32 0, ptr [[T_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_ASCAST]], align 4
+// AMDGCN-NEXT: ret i32 [[TMP0]]
+//
+//
+// AMDGCN-LABEL: define dso_local void @_Z3barv(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
+// AMDGCN-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0)
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined(
+// AMDGCN-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// AMDGCN-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// AMDGCN-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined_wrapper(
+// AMDGCN-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// AMDGCN-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// AMDGCN-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// AMDGCN-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// AMDGCN-NEXT: [[GLOBAL_ARGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr
+// AMDGCN-NEXT: store i16 [[TMP0]], ptr [[DOTADDR_ASCAST]], align 2
+// AMDGCN-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// AMDGCN-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
+// AMDGCN-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_ASCAST]])
+// AMDGCN-NEXT: call void @_Z3barv_omp_outlined(ptr [[DOTADDR1_ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]]) #[[ATTR3:[0-9]+]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @_Z3bazPi(
+// AMDGCN-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
+// AMDGCN-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local noundef i32 @_Z3quxv(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN-NEXT: ret i32 2
+//
+//
+// NVPTX-LABEL: define dso_local noundef i32 @_Z3foov(
+// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[T:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: store i32 0, ptr [[T]], align 4
+// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[T]], align 4
+// NVPTX-NEXT: ret i32 [[TMP0]]
+//
+//
+// NVPTX-LABEL: define dso_local void @_Z3barv(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
+// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
+// NVPTX-NEXT: ret void
+//
+//
+// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined(
+// NVPTX-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// NVPTX-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// NVPTX-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// NVPTX-NEXT: ret void
+//
+//
+// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined_wrapper(
+// NVPTX-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+// NVPTX-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2
+// NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
+// NVPTX-NEXT: call void @_Z3barv_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3:[0-9]+]]
+// NVPTX-NEXT: ret void
+//
+//
+// NVPTX-LABEL: define dso_local void @_Z3bazPi(
+// NVPTX-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
+// NVPTX-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 8
+// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// NVPTX-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4
+// NVPTX-NEXT: ret void
+//
+//
+// NVPTX-LABEL: define dso_local noundef i32 @_Z3quxv(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: ret i32 1
+//
+//.
+// AMDGCN: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// AMDGCN: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// AMDGCN: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// AMDGCN: attributes #[[ATTR3]] = { nounwind }
+// AMDGCN: attributes #[[ATTR4:[0-9]+]] = { alwaysinline }
+//.
+// NVPTX: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// NVPTX: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// NVPTX: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// NVPTX: attributes #[[ATTR3]] = { nounwind }
+// NVPTX: attributes #[[ATTR4:[0-9]+]] = { alwaysinline }
+//.
+// AMDGCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
+// AMDGCN: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 45}
+// AMDGCN: [[META3:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// NVPTX: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// NVPTX: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 45}
+// NVPTX: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/OpenMP/target_messages.cpp b/clang/test/OpenMP/target_messages.cpp
index 62ab5817ba28da..7e68adddc949e0 100644
--- a/clang/test/OpenMP/target_messages.cpp
+++ b/clang/test/OpenMP/target_messages.cpp
@@ -8,7 +8,6 @@
// CHECK: error: OpenMP target is invalid: 'aaa-bbb-ccc-ddd'
// RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx64-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s
// RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s
-// CHECK-UNSUPPORTED-HOST-TARGET: error: target '{{nvptx64-nvidia-cuda|nvptx-nvidia-cuda}}' is not a supported OpenMP host target
// RUN: not %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=hexagon-linux-gnu -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-DEVICE-TARGET %s
// CHECK-UNSUPPORTED-DEVICE-TARGET: OpenMP target is invalid: 'hexagon-linux-gnu'
|
Maybe just turn on OpenMPIsTargetDevice if |
I'll give it a try. |
What code generation path would be used in this case? The GPU code generation or regular host OpenMP? |
The GPU path, I'm treating that as the code generation path that created correct runtime code for the GPU. I.e. you can link it with your OpenMP offloading program and it'll work. |
gpu device code |
Doesn't work, it causes all definitions to be stripped as they are not declared on the device, which is not what we want. |
|
it needs to be, otherwise the code generation for things like |
Summary: Currently we prevent the following from working. However, it is completely reasonable to be able to target the files individually. ``` $ clang --target=amdgcn-amd-amdhsa -fopenmp ``` This patch lifts this restriction, allowing individual files to be compiled as standalone OpenMP without the extra offloading overhead. The main motivation behind this is to update the build of the OpenMP DeviceRTL. Currently, we do `--offload-device-only -S -emit-llvm` which is just a hackier version of `-fopenmp -flto -c`. This patch allows the following to work. ``` $ clang omp.c -fopenmp --target=amdgcn-amd-amdhsa -flto -c $ clang offload.c -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xoffload-linker omp.o $ ./a.out ```
Yes, the example in the description is how I can see this being used. It basically lets us emit GPU code without the build system complications introduced by offloading languages (i.e. we no longer need variants or declare target.) |
I think that is a misuse of OpenMP semantics. We can't expect to have regular OpenMP code working in the same way as OpenMP offloading code when targeting a GPU meanwhile the code is not wrapped into |
It should maintain the normal semantics you'd get with |
That is the key difference.
+1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can't expect to have regular OpenMP code working in the same way as OpenMP offloading code when targeting a GPU meanwhile the code is not wrapped into target region or declare target
The way I see this is:
If the target is a GPU, and OpenMP is enabled (-fopenmp), what we should do is codegen it targeting the GPU OpenMP runtime. This is what this patch does and I don't think it violates anything. What we define as the internal OpenMP runtime could reasonably be different on every target, or even depending on the day of the week ;). No runtime would also be an option if we codegen all the semantics, though we really don't want that.
That said, this makes sense to me. LG.
P.S. I'd suggest discussing the device runtime in the actual follow up.
I see it in a different way. Apparently the compiler skips non-target OpenMP regions when we compile for a GPU in the regular offloading compilation pipeline. |
I guess I'll see how much I favor this approach depending on how much more difficult it is to build the DeviceRTL without OpenMP. I think the only thing we'd miss is the |
To me this looks like compilation for a host, except the GPU is the host. The only functions that could be called from such a CU would be the top-level ones, not any of the auto-generated one. Additionally, the host wouldn't support offload, so we'd need to do something about TARGET (or any other construct that cannot be inside of TARGET). We should probably ignore those with a diagnostic. |
This is not the case. This patch does not make omp parallel the same as omp target parallel. |
I get it, but that doesn't look like the case. If you look at the test case, the
The patch does make |
That is a good point. I think we can fix it in a follow up though.
we should see the store, and the ICV change inside the target task. All that said, there are two cases to consider wrt. the standard:
For 1), omp target, w/o ancestor, is disallowed, IIRC. |
SizeEmitter) { | ||
SmallString<256> Buffer; | ||
llvm::raw_svector_ostream Out(Buffer); | ||
Out << "Cannot emit a '#pragma omp target' on the GPU"; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be a Sema or CodeGen error?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we have a clear idea on if a construct can behave in a different manner if it is nested in a target region?
Unsure exactly, the target regions are just outlined, so it shouldn't affect anything on a codegen level. |
Option (2) summarizes my point of view. The use case that I have in mind for this is
|
That's what I had in mind, just another way to generate the GPU code that OpenMP wants. |
No, they are not. The standard defines the execution behavior and codegen has to conform with it. The current GPU CodeGen in this discussion assumes it is generating for constructs inside a
This patch basically wraps the entire TU into a giant |
Cannot do things differently. We also have host-offloading targets, There are differences between target and non-target code, even if it is only for |
This was mostly a hack around wanting to do #123673, but I just took the extra time to fix the bugs and do the C++ approach. I can close this. |
Summary:
Currently we prevent the following from working. However, it is
completely reasonable to be able to target the files individually.
This patch lifts this restriction, allowing individual files to be
compiled as standalone OpenMP without the extra offloading overhead. The
main motivation behind this is to update the build of the OpenMP
DeviceRTL. Currently, we do
--offload-device-only -S -emit-llvm
whichis just a hackier version of
-fopenmp -flto -c
.This patch allows the following to work.