;
; which will load two floats at once into scalar registers.

+; CHECK-LABEL: foo
define void @foo(<2 x float>* %a) {
-; CHECK: .func foo
; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
  %t1 = load <2 x float>, <2 x float>* %a
  %t2 = fmul <2 x float> %t1, %t1
  store <2 x float> %t2, <2 x float>* %a
  ret void
}

+; CHECK-LABEL: foo2
define void @foo2(<4 x float>* %a) {
-; CHECK: .func foo2
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
  %t1 = load <4 x float>, <4 x float>* %a
  %t2 = fmul <4 x float> %t1, %t1
  store <4 x float> %t2, <4 x float>* %a
  ret void
}

+; CHECK-LABEL: foo3
define void @foo3(<8 x float>* %a) {
-; CHECK: .func foo3
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
  %t1 = load <8 x float>, <8 x float>* %a
@@ -37,30 +37,65 @@ define void @foo3(<8 x float>* %a) {

+; CHECK-LABEL: foo4
define void @foo4(<2 x i32>* %a) {
-; CHECK: .func foo4
; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
  %t1 = load <2 x i32>, <2 x i32>* %a
  %t2 = mul <2 x i32> %t1, %t1
  store <2 x i32> %t2, <2 x i32>* %a
  ret void
}

+; CHECK-LABEL: foo5
define void @foo5(<4 x i32>* %a) {
-; CHECK: .func foo5
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
  %t1 = load <4 x i32>, <4 x i32>* %a
  %t2 = mul <4 x i32> %t1, %t1
  store <4 x i32> %t2, <4 x i32>* %a
  ret void
}

+; CHECK-LABEL: foo6
define void @foo6(<8 x i32>* %a) {
-; CHECK: .func foo6
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
  %t1 = load <8 x i32>, <8 x i32>* %a
  %t2 = mul <8 x i32> %t1, %t1
  store <8 x i32> %t2, <8 x i32>* %a
  ret void
}
+
+; The following test wasn't passing previously because the address
+; computation was still too complex when the LoadStoreVectorizer (LSV) was called.
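+; (ctaid.x below is the block index and tid.x the thread index within the
+; block, read via the NVVM special-register intrinsics declared next.)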
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
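+; The row index is ctaid.x >> 8 and the in-row byte offset is
+; (ctaid.x & 0xff) * 512 + tid.x * 2 plus a small constant, so the two i8
+; loads below hit consecutive bytes and can still be combined into ld.v2.u8.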
+; CHECK-LABEL: foo_complex
+define void @foo_complex(i8* nocapture readonly align 16 dereferenceable(134217728) %alloc0) {
+  %targ0.1.typed = bitcast i8* %alloc0 to [1024 x [131072 x i8]]*
+  %t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1
+  %t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  %t2 = lshr i32 %t1, 8
+  %t3 = shl nuw nsw i32 %t1, 9
+  %ttile_origin.2 = and i32 %t3, 130560
+  %tstart_offset_x_mul = shl nuw nsw i32 %t0, 1
+  %t4 = or i32 %ttile_origin.2, %tstart_offset_x_mul
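+  ; tid.x < 64 (!range !1), so %t4 has bits 0 and 7 clear and the 'or's below
+  ; act as additions: the loads read bytes %t4 + 128 and %t4 + 129 of the row.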
+  %t6 = or i32 %t4, 1
+  %t8 = or i32 %t4, 128
+  %t9 = zext i32 %t8 to i64
+  %t10 = or i32 %t4, 129
+  %t11 = zext i32 %t10 to i64
+  %t20 = zext i32 %t2 to i64
+  %t27 = getelementptr inbounds [1024 x [131072 x i8]], [1024 x [131072 x i8]]* %targ0.1.typed, i64 0, i64 %t20, i64 %t9
+; CHECK: ld.v2.u8
+  %t28 = load i8, i8* %t27, align 2
+  %t31 = getelementptr inbounds [1024 x [131072 x i8]], [1024 x [131072 x i8]]* %targ0.1.typed, i64 0, i64 %t20, i64 %t11
+  %t32 = load i8, i8* %t31, align 1
+  %t33 = icmp ult i8 %t28, %t32
+  %t34 = select i1 %t33, i8 %t32, i8 %t28
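+  ; The icmp/select pair computes umax(%t28, %t32), stored back over the second byte.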
+  store i8 %t34, i8* %t31
+; CHECK: ret
+  ret void
+}
+
+
+!1 = !{i32 0, i32 64}