Skip to content

Commit 21199f9

Browse files
[OpenMP][OMPIRBuilder] Fix LLVM IR codegen for collapsed device loop (#78708)
When we generate the loop body function, we need to be sure that all uses of the original loop counter are replaced by the new counter. To do that, we first save all users of the original loop counter and only then perform the replacement, because replacing uses while iterating over the user list modifies the list being traversed. If we don't do this, there is a risk that some values are not updated.
1 parent a2caa49 commit 21199f9

File tree

3 files changed

+87
-3
lines changed

3 files changed

+87
-3
lines changed

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2876,9 +2876,10 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
28762876
// We need to model loop body region as the function f(cnt, loop_arg).
28772877
// That's why we replace loop induction variable by the new counter
28782878
// which will be one of the loop body function arguments
2879-
for (auto Use = CLI->getIndVar()->user_begin();
2880-
Use != CLI->getIndVar()->user_end(); ++Use) {
2881-
if (Instruction *Inst = dyn_cast<Instruction>(*Use)) {
2879+
SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
2880+
CLI->getIndVar()->user_end());
2881+
for (auto Use : Users) {
2882+
if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
28822883
if (ParallelRegionBlockSet.count(Inst->getParent())) {
28832884
Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
28842885
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
2+
3+
// The aim of the test is to check the GPU LLVM IR codegen
4+
// for nested omp do loop with collapse clause inside omp target region
5+
6+
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
7+
llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) {
8+
%loop_ub = llvm.mlir.constant(99 : i32) : i32
9+
%loop_lb = llvm.mlir.constant(0 : i32) : i32
10+
%loop_step = llvm.mlir.constant(1 : index) : i32
11+
omp.wsloop for (%arg1, %arg2) : i32 = (%loop_lb, %loop_lb) to (%loop_ub, %loop_ub) inclusive step (%loop_step, %loop_step) {
12+
%1 = llvm.add %arg1, %arg2 : i32
13+
%2 = llvm.mul %arg2, %loop_ub overflow<nsw> : i32
14+
%3 = llvm.add %arg1, %2 :i32
15+
%4 = llvm.getelementptr %arg0[%3] : (!llvm.ptr, i32) -> !llvm.ptr, i32
16+
llvm.store %1, %4 : i32, !llvm.ptr
17+
omp.yield
18+
}
19+
llvm.return
20+
}
21+
}
22+
23+
// CHECK: define void @[[FUNC_COLLAPSED_WSLOOP:.*]](ptr %[[ARG0:.*]])
24+
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr),
25+
// CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 10000,
26+
// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i32 0)
27+
28+
// CHECK: define internal void @[[COLLAPSED_WSLOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
29+
// CHECK: %[[TMP0:.*]] = urem i32 %[[LOOP_CNT]], 100
30+
// CHECK: %[[TMP1:.*]] = udiv i32 %[[LOOP_CNT]], 100
31+
// CHECK: %[[TMP2:.*]] = mul i32 %[[TMP1]], 1
32+
// CHECK: %[[TMP3:.*]] = add i32 %[[TMP2]], 0
33+
// CHECK: %[[TMP4:.*]] = mul i32 %[[TMP0]], 1
34+
// CHECK: %[[TMP5:.*]] = add i32 %[[TMP4]], 0
35+
// CHECK: %[[TMP6:.*]] = add i32 %[[TMP3]], %[[TMP5]]
36+
// CHECK: %[[TMP7:.*]] = mul nsw i32 %[[TMP5]], 99
37+
// CHECK: %[[TMP8:.*]] = add i32 %[[TMP3]], %[[TMP7]]
38+
// CHECK: %[[TMP9:.*]] = getelementptr i32, ptr %[[ARRAY:.*]], i32 %[[TMP8]]
39+
// CHECK: store i32 %[[TMP6]], ptr %[[TMP9]], align 4
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
! Offloading test for a collapse(2) loop nest inside a target parallel do region
2+
! REQUIRES: flang
3+
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
4+
! UNSUPPORTED: aarch64-unknown-linux-gnu
5+
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
6+
! UNSUPPORTED: x86_64-pc-linux-gnu
7+
! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
8+
9+
! RUN: %libomptarget-compile-fortran-generic
10+
! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
11+
program main
12+
use omp_lib
13+
implicit none
14+
integer :: i,j
15+
integer :: array(10,10), errors = 0
16+
do i = 1, 10
17+
do j = 1, 10
18+
array(j, i) = 0
19+
end do
20+
end do
21+
22+
!$omp target parallel do map(from:array) collapse(2)
23+
do i = 1, 10
24+
do j = 1, 10
25+
array( j, i) = i + j
26+
end do
27+
end do
28+
!$omp end target parallel do
29+
30+
do i = 1, 10
31+
do j = 1, 10
32+
if ( array( j, i) .ne. (i + j) ) then
33+
errors = errors + 1
34+
end if
35+
end do
36+
end do
37+
38+
print *,"number of errors: ", errors
39+
40+
end program main
41+
42+
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
43+
! CHECK: number of errors: 0
44+

0 commit comments

Comments
 (0)