Skip to content

[OpenMP] Migrate GPU Reductions CodeGen from Clang to OMPIRBuilder #80343

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,370 changes: 74 additions & 1,296 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
CodeGenFunction &CGF,
const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) override;

/// Get the GPU warp size.
llvm::Value *getGPUWarpSize(CodeGenFunction &CGF);

/// Get the id of the current thread on the GPU.
llvm::Value *getGPUThreadID(CodeGenFunction &CGF);

Expand Down
165 changes: 93 additions & 72 deletions clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp

Large diffs are not rendered by default.

Large diffs are not rendered by default.

258 changes: 144 additions & 114 deletions clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp

Large diffs are not rendered by default.

96 changes: 96 additions & 0 deletions clang/test/OpenMP/reduction_complex.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a filter here to restrict the check lines to the functions we care about.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've removed the checks which didn't pertain to the complex reduction section of the code.

// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
// RUN: -triple powerpc64le-unknown-unknown \
// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o \
// RUN: %t-ppc-host.bc

// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
// RUN: -triple nvptx64-unknown-unknown -DCUA \
// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s \
// RUN: -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc \
// RUN: -o - | FileCheck %s --check-prefix CHECK

// expected-no-diagnostics
int foo() {
int i;
int j;
_Complex float sum = 0;

#pragma omp target teams loop reduction(+:sum) collapse(2) bind(parallel) order(concurrent) lastprivate(j) map(tofrom:sum)

for(i=0; i<10; i++)
for(j=0; j<10; j++)
sum += i;

return 0;
}
// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: %[[VAL_228:.*]] = alloca ptr, align 8
// CHECK-NEXT: %[[VAL_229:.*]] = alloca i16, align 2
// CHECK-NEXT: %[[VAL_230:.*]] = alloca i16, align 2
// CHECK-NEXT: %[[VAL_231:.*]] = alloca i16, align 2
// CHECK-NEXT: %[[VAL_232:.*]] = alloca [1 x ptr], align 8
// CHECK-NEXT: %[[VAL_233:.*]] = alloca { float, float }, align 8
// CHECK-NEXT: store ptr %[[VAL_234:.*]], ptr %[[VAL_228]], align 8
// CHECK-NEXT: store i16 %[[VAL_235:.*]], ptr %[[VAL_229]], align 2
// CHECK-NEXT: store i16 %[[VAL_236:.*]], ptr %[[VAL_230]], align 2
// CHECK-NEXT: store i16 %[[VAL_237:.*]], ptr %[[VAL_231]], align 2
// CHECK-NEXT: %[[VAL_238:.*]] = load ptr, ptr %[[VAL_228]], align 8
// CHECK-NEXT: %[[VAL_239:.*]] = load i16, ptr %[[VAL_229]], align 2
// CHECK-NEXT: %[[VAL_240:.*]] = load i16, ptr %[[VAL_230]], align 2
// CHECK-NEXT: %[[VAL_241:.*]] = load i16, ptr %[[VAL_231]], align 2
// CHECK-NEXT: %[[VAL_242:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_238]], i64 0, i64 0
// CHECK-NEXT: %[[VAL_243:.*]] = load ptr, ptr %[[VAL_242]], align 8
// CHECK-NEXT: %[[VAL_244:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_232]], i64 0, i64 0
// CHECK-NEXT: %[[VAL_245:.*]] = getelementptr { float, float }, ptr %[[VAL_243]], i64 1
// CHECK-NEXT: %[[VAL_246:.*]] = load i64, ptr %[[VAL_243]], align 8
// CHECK-NEXT: %[[VAL_247:.*]] = call i32 @__kmpc_get_warp_size()
// CHECK-NEXT: %[[VAL_248:.*]] = trunc i32 %[[VAL_247]] to i16
// CHECK-NEXT: %[[VAL_249:.*]] = call i64 @__kmpc_shuffle_int64(i64 %[[VAL_246]], i16 %[[VAL_240]], i16 %[[VAL_248]])
// CHECK-NEXT: store i64 %[[VAL_249]], ptr %[[VAL_233]], align 8
// CHECK-NEXT: %[[VAL_250:.*]] = getelementptr i64, ptr %[[VAL_243]], i64 1
// CHECK-NEXT: %[[VAL_251:.*]] = getelementptr i64, ptr %[[VAL_233]], i64 1
// CHECK-NEXT: store ptr %[[VAL_233]], ptr %[[VAL_244]], align 8
// CHECK-NEXT: %[[VAL_252:.*]] = icmp eq i16 %[[VAL_241]], 0
// CHECK-NEXT: %[[VAL_253:.*]] = icmp eq i16 %[[VAL_241]], 1
// CHECK-NEXT: %[[VAL_254:.*]] = icmp ult i16 %[[VAL_239]], %[[VAL_240]]
// CHECK-NEXT: %[[VAL_255:.*]] = and i1 %[[VAL_253]], %[[VAL_254]]
// CHECK-NEXT: %[[VAL_256:.*]] = icmp eq i16 %[[VAL_241]], 2
// CHECK-NEXT: %[[VAL_257:.*]] = and i16 %[[VAL_239]], 1
// CHECK-NEXT: %[[VAL_258:.*]] = icmp eq i16 %[[VAL_257]], 0
// CHECK-NEXT: %[[VAL_259:.*]] = and i1 %[[VAL_256]], %[[VAL_258]]
// CHECK-NEXT: %[[VAL_260:.*]] = icmp sgt i16 %[[VAL_240]], 0
// CHECK-NEXT: %[[VAL_261:.*]] = and i1 %[[VAL_259]], %[[VAL_260]]
// CHECK-NEXT: %[[VAL_262:.*]] = or i1 %[[VAL_252]], %[[VAL_255]]
// CHECK-NEXT: %[[VAL_263:.*]] = or i1 %[[VAL_262]], %[[VAL_261]]
// CHECK-NEXT: br i1 %[[VAL_263]], label %[[VAL_264:.*]], label %[[VAL_265:.*]]
// CHECK: then: ; preds = %[[VAL_266:.*]]
// CHECK-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l{{[0-9]+}}_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr %[[VAL_238]], ptr %[[VAL_232]]) #2
// CHECK-NEXT: br label %[[VAL_267:.*]]
// CHECK: else: ; preds = %[[VAL_266]]
// CHECK-NEXT: br label %[[VAL_267]]
// CHECK: ifcont: ; preds = %[[VAL_265]], %[[VAL_264]]
// CHECK-NEXT: %[[VAL_268:.*]] = icmp eq i16 %[[VAL_241]], 1
// CHECK-NEXT: %[[VAL_269:.*]] = icmp uge i16 %[[VAL_239]], %[[VAL_240]]
// CHECK-NEXT: %[[VAL_270:.*]] = and i1 %[[VAL_268]], %[[VAL_269]]
// CHECK-NEXT: br i1 %[[VAL_270]], label %[[VAL_271:.*]], label %[[VAL_272:.*]]
// CHECK: then4: ; preds = %[[VAL_267]]
// CHECK-NEXT: %[[VAL_273:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_232]], i64 0, i64 0
// CHECK-NEXT: %[[VAL_274:.*]] = load ptr, ptr %[[VAL_273]], align 8
// CHECK-NEXT: %[[VAL_275:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_238]], i64 0, i64 0
// CHECK-NEXT: %[[VAL_276:.*]] = load ptr, ptr %[[VAL_275]], align 8
// CHECK-NEXT: %[[VAL_277:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_274]], i32 0, i32 0
// CHECK-NEXT: %[[VAL_278:.*]] = load float, ptr %[[VAL_277]], align 4
// CHECK-NEXT: %[[VAL_279:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_274]], i32 0, i32 1
// CHECK-NEXT: %[[VAL_280:.*]] = load float, ptr %[[VAL_279]], align 4
// CHECK-NEXT: %[[VAL_281:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_276]], i32 0, i32 0
// CHECK-NEXT: %[[VAL_282:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_276]], i32 0, i32 1
// CHECK-NEXT: store float %[[VAL_278]], ptr %[[VAL_281]], align 4
// CHECK-NEXT: store float %[[VAL_280]], ptr %[[VAL_282]], align 4
// CHECK-NEXT: br label %[[VAL_283:.*]]
// CHECK: else7: ; preds = %[[VAL_267]]
// CHECK-NEXT: br label %[[VAL_283]]
// CHECK: ifcont8: ; preds = %[[VAL_272]], %[[VAL_271]]
// CHECK-NEXT: ret void
9 changes: 5 additions & 4 deletions clang/test/OpenMP/reduction_implicit_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,6 @@ int main()
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
Expand All @@ -249,6 +248,7 @@ int main()
// CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2
// CHECK-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
// CHECK: body:
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
// CHECK-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
Expand All @@ -263,21 +263,22 @@ int main()
// CHECK: else:
// CHECK-NEXT: br label [[IFCONT]]
// CHECK: ifcont:
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
// CHECK-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK: then2:
// CHECK: then3:
// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
// CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
// CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
// CHECK-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
// CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
// CHECK-NEXT: br label [[IFCONT4:%.*]]
// CHECK: else3:
// CHECK: else4:
// CHECK-NEXT: br label [[IFCONT4]]
// CHECK: ifcont4:
// CHECK: ifcont5:
// CHECK-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
// CHECK-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
// CHECK-NEXT: br label [[PRECOND]]
Expand Down
Loading
Loading