1
+ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
1
2
; RUN: opt -mtriple=amdgcn-- -S -passes=separate-const-offset-from-gep,gvn -reassociate-geps-verify-no-dead-code < %s | FileCheck -check-prefix=IR %s
2
3
3
4
; Pointer widths by address space: the default address space and addrspace 3/5
; use 32-bit pointers, while addrspace 1/2/4/24 use 64-bit pointers. This is why
; the tests below index @lds_array (addrspace 3) with i32 and the addrspace(4)
; arrays with i64.
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
4
5
5
6
; 4096 x 32 table of floats in address space 4, zero-initialized; indexed by
; @sum_of_array.
@array = internal addrspace(4) constant [4096 x [32 x float]] zeroinitializer, align 4
6
7
7
- ; IR-LABEL: @sum_of_array(
8
- ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
9
- ; IR: getelementptr inbounds float, ptr addrspace(4) [[BASE_PTR]], i64 1
10
- ; IR: getelementptr inbounds float, ptr addrspace(4) [[BASE_PTR]], i64 32
11
- ; IR: getelementptr inbounds float, ptr addrspace(4) [[BASE_PTR]], i64 33
12
8
define amdgpu_kernel void @sum_of_array (i32 %x , i32 %y , ptr addrspace (1 ) nocapture %output ) {
9
+ ; IR-LABEL: define amdgpu_kernel void @sum_of_array(
10
+ ; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) nocapture [[OUTPUT:%.*]]) {
11
+ ; IR-NEXT: [[TMP:%.*]] = sext i32 [[Y]] to i64
12
+ ; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64
13
+ ; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 [[TMP1]], i64 [[TMP]]
14
+ ; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP2]], i64 1
15
+ ; IR-NEXT: [[TMP144:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP2]], i64 32
16
+ ; IR-NEXT: [[TMP187:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP2]], i64 33
17
+ ; IR-NEXT: store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
18
+ ; IR-NEXT: ret void
19
+ ;
13
20
%tmp = sext i32 %y to i64
14
21
%tmp1 = sext i32 %x to i64
15
22
%tmp2 = getelementptr inbounds [4096 x [32 x float ]], ptr addrspace (4 ) @array , i64 0 , i64 %tmp1 , i64 %tmp
@@ -36,13 +43,22 @@ define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, ptr addrspace(1) nocaptu
36
43
37
44
; Some of the indices go over the maximum mubuf offset, so those GEPs must not be split.
38
45
39
- ; IR-LABEL: @sum_of_array_over_max_mubuf_offset(
40
- ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
41
- ; IR: getelementptr inbounds float, ptr addrspace(4) [[BASE_PTR]], i64 255
42
- ; IR: add i32 %x, 256
43
- ; IR: getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
44
- ; IR: getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
45
46
define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset (i32 %x , i32 %y , ptr addrspace (1 ) nocapture %output ) {
47
+ ; IR-LABEL: define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(
48
+ ; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) nocapture [[OUTPUT:%.*]]) {
49
+ ; IR-NEXT: [[TMP:%.*]] = sext i32 [[Y]] to i64
50
+ ; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64
51
+ ; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP1]], i64 [[TMP]]
52
+ ; IR-NEXT: [[TMP6:%.*]] = add i32 [[Y]], 255
53
+ ; IR-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
54
+ ; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP2]], i64 255
55
+ ; IR-NEXT: [[TMP12:%.*]] = add i32 [[X]], 256
56
+ ; IR-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
57
+ ; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP]]
58
+ ; IR-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP7]]
59
+ ; IR-NEXT: store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
60
+ ; IR-NEXT: ret void
61
+ ;
46
62
%tmp = sext i32 %y to i64
47
63
%tmp1 = sext i32 %x to i64
48
64
%tmp2 = getelementptr inbounds [4096 x [4 x float ]], ptr addrspace (4 ) @array2 , i64 0 , i64 %tmp1 , i64 %tmp
@@ -69,12 +85,24 @@ define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, pt
69
85
; 4096 x 4 table of floats in address space 3; indexed by
; @sum_of_lds_array_over_max_mubuf_offset.
@lds_array = internal addrspace(3) global [4096 x [4 x float]] undef, align 4
70
86
71
87
; DS instructions have a larger immediate offset than MUBUF, so make sure these larger constant offsets are still split out.
72
- ; IR-LABEL: @sum_of_lds_array_over_max_mubuf_offset(
73
- ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %{{[a-zA-Z0-9]+}}, i32 %{{[a-zA-Z0-9]+}}
74
- ; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i32 255
75
- ; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i32 16128
76
- ; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i32 16383
77
88
define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset (i32 %x , i32 %y , ptr addrspace (1 ) nocapture %output ) {
89
+ ; IR-LABEL: define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(
90
+ ; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) nocapture [[OUTPUT:%.*]]) {
91
+ ; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 [[X]], i32 [[Y]]
92
+ ; IR-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(3) [[TMP2]], align 4
93
+ ; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
94
+ ; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i32 255
95
+ ; IR-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(3) [[TMP82]], align 4
96
+ ; IR-NEXT: [[TMP11:%.*]] = fadd float [[TMP5]], [[TMP10]]
97
+ ; IR-NEXT: [[TMP144:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i32 16128
98
+ ; IR-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(3) [[TMP144]], align 4
99
+ ; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP11]], [[TMP16]]
100
+ ; IR-NEXT: [[TMP187:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i32 16383
101
+ ; IR-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(3) [[TMP187]], align 4
102
+ ; IR-NEXT: [[TMP21:%.*]] = fadd float [[TMP17]], [[TMP20]]
103
+ ; IR-NEXT: store float [[TMP21]], ptr addrspace(1) [[OUTPUT]], align 4
104
+ ; IR-NEXT: ret void
105
+ ;
78
106
%tmp2 = getelementptr inbounds [4096 x [4 x float ]], ptr addrspace (3 ) @lds_array , i32 0 , i32 %x , i32 %y
79
107
%tmp4 = load float , ptr addrspace (3 ) %tmp2 , align 4
80
108
%tmp5 = fadd float %tmp4 , 0 .000000e+00
@@ -93,11 +121,35 @@ define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y
93
121
ret void
94
122
}
95
123
96
- ; IR-LABEL: @keep_metadata(
97
- ; IR: getelementptr {{.*}} !amdgpu.uniform
98
- ; IR: getelementptr {{.*}} !amdgpu.uniform
99
- ; IR: getelementptr {{.*}} !amdgpu.uniform
100
124
define amdgpu_ps <{ i32 , i32 , i32 , i32 , i32 , float , float , float , float , float , float , float , float , float , float , float , float , float , float , float }> @keep_metadata (ptr addrspace (4 ) inreg noalias dereferenceable (18446744073709551615 ), ptr addrspace (4 ) inreg noalias dereferenceable (18446744073709551615 ), ptr addrspace (4 ) inreg noalias dereferenceable (18446744073709551615 ), ptr addrspace (4 ) inreg noalias dereferenceable (18446744073709551615 ), float inreg , i32 inreg , <2 x i32 >, <2 x i32 >, <2 x i32 >, <3 x i32 >, <2 x i32 >, <2 x i32 >, <2 x i32 >, float , float , float , float , float , i32 , i32 , float , i32 ) #5 {
125
+ ; IR-LABEL: define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata(
126
+ ; IR-SAME: ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP0:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP1:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP2:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP3:%.*]], float inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], <2 x i32> [[TMP6:%.*]], <2 x i32> [[TMP7:%.*]], <2 x i32> [[TMP8:%.*]], <3 x i32> [[TMP9:%.*]], <2 x i32> [[TMP10:%.*]], <2 x i32> [[TMP11:%.*]], <2 x i32> [[TMP12:%.*]], float [[TMP13:%.*]], float [[TMP14:%.*]], float [[TMP15:%.*]], float [[TMP16:%.*]], float [[TMP17:%.*]], i32 [[TMP18:%.*]], i32 [[TMP19:%.*]], float [[TMP20:%.*]], i32 [[TMP21:%.*]]) #[[ATTR0:[0-9]+]] {
127
+ ; IR-NEXT: main_body:
128
+ ; IR-NEXT: [[TMP22:%.*]] = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 [[TMP5]]) #[[ATTR3:[0-9]+]]
129
+ ; IR-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
130
+ ; IR-NEXT: [[TMP24:%.*]] = shl i32 [[TMP23]], 1
131
+ ; IR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP24]] to i64
132
+ ; IR-NEXT: [[TMP25:%.*]] = getelementptr [0 x <8 x i32>], ptr addrspace(4) [[TMP1]], i64 0, i64 [[IDXPROM1]], !amdgpu.uniform [[META0:![0-9]+]]
133
+ ; IR-NEXT: [[TMP26:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP25]], align 32, !invariant.load [[META0]]
134
+ ; IR-NEXT: [[TMP27:%.*]] = shl i32 [[TMP23]], 2
135
+ ; IR-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
136
+ ; IR-NEXT: [[TMP29:%.*]] = getelementptr [0 x <4 x i32>], ptr addrspace(4) [[TMP1]], i64 0, i64 [[TMP28]], !amdgpu.uniform [[META0]]
137
+ ; IR-NEXT: [[TMP30:%.*]] = getelementptr <4 x i32>, ptr addrspace(4) [[TMP29]], i64 3, !amdgpu.uniform [[META0]]
138
+ ; IR-NEXT: [[TMP31:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP30]], align 16, !invariant.load [[META0]]
139
+ ; IR-NEXT: [[TMP32:%.*]] = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> [[TMP26]], <4 x i32> [[TMP31]], i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #[[ATTR3]]
140
+ ; IR-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP32]], i32 0
141
+ ; IR-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP32]], i32 1
142
+ ; IR-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP32]], i32 2
143
+ ; IR-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[TMP32]], i32 3
144
+ ; IR-NEXT: [[TMP37:%.*]] = bitcast float [[TMP4]] to i32
145
+ ; IR-NEXT: [[TMP38:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 [[TMP37]], 4
146
+ ; IR-NEXT: [[TMP39:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP38]], float [[TMP33]], 5
147
+ ; IR-NEXT: [[TMP40:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP39]], float [[TMP34]], 6
148
+ ; IR-NEXT: [[TMP41:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP40]], float [[TMP35]], 7
149
+ ; IR-NEXT: [[TMP42:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP41]], float [[TMP36]], 8
150
+ ; IR-NEXT: [[TMP43:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP42]], float [[TMP20]], 19
151
+ ; IR-NEXT: ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP43]]
152
+ ;
101
153
main_body:
102
154
%22 = call nsz float @llvm.amdgcn.interp.mov (i32 2 , i32 0 , i32 0 , i32 %5 ) #8
103
155
%23 = bitcast float %22 to i32
@@ -136,3 +188,6 @@ attributes #5 = { "InitialPSInputAddr"="45175" }
136
188
; Attribute group #6. NOTE(review): its users are not visible in this excerpt.
attributes #6 = { nounwind readnone speculatable }
137
189
; Attribute group #7. NOTE(review): its users are not visible in this excerpt.
attributes #7 = { nounwind readonly }
138
190
; Attribute group #8; attached to the @llvm.amdgcn.interp.mov call in
; @keep_metadata.
attributes #8 = { nounwind readnone }
191
+ ;.
192
+ ; IR: [[META0]] = !{}
193
+ ;.
0 commit comments