Skip to content

Commit 019ab61

Browse files
nouiz authored and Artem-B committed
[NVPTX, LSV] Move the LSV optimization pass to later when the graph is cleaner
This allows it to recognize more loads as being consecutive when the loads' addresses are complex at the start. Differential Revision: https://reviews.llvm.org/D74444
1 parent 3eb1b59 commit 019ab61

File tree

2 files changed

+45
-9
lines changed

2 files changed

+45
-9
lines changed

llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -276,8 +276,6 @@ void NVPTXPassConfig::addIRPasses() {
276276
addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
277277
if (getOptLevel() != CodeGenOpt::None) {
278278
addAddressSpaceInferencePasses();
279-
if (!DisableLoadStoreVectorizer)
280-
addPass(createLoadStoreVectorizerPass());
281279
addStraightLineScalarOptimizationPasses();
282280
}
283281

@@ -295,8 +293,11 @@ void NVPTXPassConfig::addIRPasses() {
295293
// %1 = shl %a, 2
296294
//
297295
// but EarlyCSE can do neither of them.
298-
if (getOptLevel() != CodeGenOpt::None)
296+
if (getOptLevel() != CodeGenOpt::None) {
299297
addEarlyCSEOrGVNPass();
298+
if (!DisableLoadStoreVectorizer)
299+
addPass(createLoadStoreVectorizerPass());
300+
}
300301
}
301302

302303
bool NVPTXPassConfig::addInstSelector() {

llvm/test/CodeGen/NVPTX/vector-loads.ll

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,26 +7,26 @@
77
;
88
; which will load two floats at once into scalar registers.
99

10+
; CHECK-LABEL foo
1011
define void @foo(<2 x float>* %a) {
11-
; CHECK: .func foo
1212
; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
1313
%t1 = load <2 x float>, <2 x float>* %a
1414
%t2 = fmul <2 x float> %t1, %t1
1515
store <2 x float> %t2, <2 x float>* %a
1616
ret void
1717
}
1818

19+
; CHECK-LABEL foo2
1920
define void @foo2(<4 x float>* %a) {
20-
; CHECK: .func foo2
2121
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
2222
%t1 = load <4 x float>, <4 x float>* %a
2323
%t2 = fmul <4 x float> %t1, %t1
2424
store <4 x float> %t2, <4 x float>* %a
2525
ret void
2626
}
2727

28+
; CHECK-LABEL foo3
2829
define void @foo3(<8 x float>* %a) {
29-
; CHECK: .func foo3
3030
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
3131
; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
3232
%t1 = load <8 x float>, <8 x float>* %a
@@ -37,30 +37,65 @@ define void @foo3(<8 x float>* %a) {
3737

3838

3939

40+
; CHECK-LABEL foo4
4041
define void @foo4(<2 x i32>* %a) {
41-
; CHECK: .func foo4
4242
; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
4343
%t1 = load <2 x i32>, <2 x i32>* %a
4444
%t2 = mul <2 x i32> %t1, %t1
4545
store <2 x i32> %t2, <2 x i32>* %a
4646
ret void
4747
}
4848

49+
; CHECK-LABEL foo5
4950
define void @foo5(<4 x i32>* %a) {
50-
; CHECK: .func foo5
5151
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
5252
%t1 = load <4 x i32>, <4 x i32>* %a
5353
%t2 = mul <4 x i32> %t1, %t1
5454
store <4 x i32> %t2, <4 x i32>* %a
5555
ret void
5656
}
5757

58+
; CHECK-LABEL foo6
5859
define void @foo6(<8 x i32>* %a) {
59-
; CHECK: .func foo6
6060
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
6161
; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
6262
%t1 = load <8 x i32>, <8 x i32>* %a
6363
%t2 = mul <8 x i32> %t1, %t1
6464
store <8 x i32> %t2, <8 x i32>* %a
6565
ret void
6666
}
67+
68+
; The following test wasn't passing previously as the address
69+
; computation was still too complex when LSV was called.
70+
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
71+
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
72+
; CHECK-LABEL foo_complex
73+
define void @foo_complex(i8* nocapture readonly align 16 dereferenceable(134217728) %alloc0) {
74+
%targ0.1.typed = bitcast i8* %alloc0 to [1024 x [131072 x i8]]*
75+
%t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1
76+
%t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
77+
%t2 = lshr i32 %t1, 8
78+
%t3 = shl nuw nsw i32 %t1, 9
79+
%ttile_origin.2 = and i32 %t3, 130560
80+
%tstart_offset_x_mul = shl nuw nsw i32 %t0, 1
81+
%t4 = or i32 %ttile_origin.2, %tstart_offset_x_mul
82+
%t6 = or i32 %t4, 1
83+
%t8 = or i32 %t4, 128
84+
%t9 = zext i32 %t8 to i64
85+
%t10 = or i32 %t4, 129
86+
%t11 = zext i32 %t10 to i64
87+
%t20 = zext i32 %t2 to i64
88+
%t27 = getelementptr inbounds [1024 x [131072 x i8]], [1024 x [131072 x i8]]* %targ0.1.typed, i64 0, i64 %t20, i64 %t9
89+
; CHECK: ld.v2.u8
90+
%t28 = load i8, i8* %t27, align 2
91+
%t31 = getelementptr inbounds [1024 x [131072 x i8]], [1024 x [131072 x i8]]* %targ0.1.typed, i64 0, i64 %t20, i64 %t11
92+
%t32 = load i8, i8* %t31, align 1
93+
%t33 = icmp ult i8 %t28, %t32
94+
%t34 = select i1 %t33, i8 %t32, i8 %t28
95+
store i8 %t34, i8* %t31
96+
; CHECK: ret
97+
ret void
98+
}
99+
100+
101+
!1 = !{i32 0, i32 64}

0 commit comments

Comments
 (0)