Skip to content

Commit c94833b

Browse files
authored
[Clang] [OpenMP] Fixed wrong type usage in Xteam Reduction codegen. (llvm#1197)
2 parents 7f95a73 + ce171a8 commit c94833b

File tree

2 files changed

+115
-2
lines changed

2 files changed

+115
-2
lines changed

clang/lib/CodeGen/CGStmt.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -593,10 +593,12 @@ bool CodeGenFunction::EmitXteamRedStmt(const Stmt *S) {
593593
}
594594
assert(RedRHSExpr != nullptr && "Did not find a valid reduction rhs");
595595
llvm::Value *RHSValue = EmitScalarExpr(RedRHSExpr);
596-
Address XteamRedLocalAddr = RedVarMap.find(RedVarDecl)->second.RedVarAddr;
596+
auto It = RedVarMap.find(RedVarDecl);
597+
assert(It != RedVarMap.end() && "Variable must be found in reduction map");
598+
Address XteamRedLocalAddr = It->second.RedVarAddr;
597599
// Compute *xteam_red_local_addr + rhs_value
598600
llvm::Value *RedRHS = nullptr;
599-
llvm::Type *RedVarType = ConvertTypeForMem(RedVarDecl->getType());
601+
llvm::Type *RedVarType = ConvertTypeForMem(It->second.RedVarExpr->getType());
600602
if (RedVarType->isFloatTy() || RedVarType->isDoubleTy() ||
601603
RedVarType->isHalfTy() || RedVarType->isBFloatTy()) {
602604
auto RHSOp = RHSValue->getType()->isIntegerTy()
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
3+
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
4+
// expected-no-diagnostics
5+
6+
void compute_reduced_sum(int n, int &x) {
7+
#pragma omp target teams distribute parallel for reduction(+ : x)
8+
for (int i = 0; i < n; ++i)
9+
x += i;
10+
}
11+
12+
int main()
13+
{
14+
int n = 1000;
15+
int sum = 0;
16+
compute_reduced_sum(n, sum);
17+
}
18+
19+
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z19compute_reduced_sumiRi_l7
20+
// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
21+
// CHECK-NEXT: entry:
22+
// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
23+
// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
24+
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
25+
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
26+
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8, addrspace(5)
27+
// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8, addrspace(5)
28+
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
29+
// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4, addrspace(5)
30+
// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4, addrspace(5)
31+
// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
32+
// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
33+
// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
34+
// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
35+
// CHECK-NEXT: [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
36+
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
37+
// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
38+
// CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
39+
// CHECK-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
40+
// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
41+
// CHECK-NEXT: [[DOTCAPTURE_EXPR__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCAPTURE_EXPR_]] to ptr
42+
// CHECK-NEXT: [[DOTCAPTURE_EXPR_2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCAPTURE_EXPR_2]] to ptr
43+
// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
44+
// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
45+
// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
46+
// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
47+
// CHECK-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8
48+
// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR_ASCAST]], align 8
49+
// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
50+
// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 8
51+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
52+
// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP_ASCAST]], align 8
53+
// CHECK-NEXT: [[TMP3:%.*]] = alloca i32, align 4, addrspace(5)
54+
// CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP3]], align 4
55+
// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4
56+
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
57+
// CHECK-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__ASCAST]], align 4
58+
// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ASCAST]], align 4
59+
// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0
60+
// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
61+
// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
62+
// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2_ASCAST]], align 4
63+
// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4
64+
// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
65+
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_ASCAST]], align 4
66+
// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB_ASCAST]], align 4
67+
// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
68+
// CHECK-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV_ASCAST]], align 4
69+
// CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
70+
// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
71+
// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
72+
// CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]]
73+
// CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP8]]
74+
// CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 1
75+
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
76+
// CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
77+
// CHECK-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_get_hardware_num_blocks()
78+
// CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
79+
// CHECK-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV_ASCAST]], align 4
80+
// CHECK-NEXT: br label [[FOR_COND:%.*]]
81+
// CHECK: for.cond:
82+
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
83+
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4
84+
// CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
85+
// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
86+
// CHECK: for.body:
87+
// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
88+
// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
89+
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
90+
// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4
91+
// CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[I_ASCAST]], align 4
92+
// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
93+
// CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], [[TMP19]]
94+
// CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(5) [[TMP3]], align 4
95+
// CHECK-NEXT: br label [[FOR_INC:%.*]]
96+
// CHECK: for.inc:
97+
// CHECK-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
98+
// CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[NVPTX_NUM_THREADS4]], [[TMP14]]
99+
// CHECK-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 1
100+
// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
101+
// CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP23]], [[TMP24]]
102+
// CHECK-NEXT: store i32 [[TMP25]], ptr [[DOTOMP_IV_ASCAST]], align 4
103+
// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
104+
// CHECK: for.end:
105+
// CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
106+
// CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
107+
// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP_ASCAST]], align 8
108+
// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
109+
// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64(i32 [[TMP29]], ptr [[TMP28]], ptr [[TMP26]], ptr [[TMP27]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP15]], i32 [[TMP14]])
110+
// CHECK-NEXT: ret void
111+
//

0 commit comments

Comments
 (0)