@@ -23,6 +23,23 @@ define i8 @memcpy_constant_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias rea
23
23
ret i8 %load
24
24
}
25
25
26
; Simple memmove to alloca from constant address space argument.
; The CHECK lines show the alloca and the memmove are folded away: the
; load reads directly from the constant (addrspace 4) argument.
define i8 @memmove_constant_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memmove_constant_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
  %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)*
  call void @llvm.memmove.p5i8.p4i8.i32(i8 addrspace(5)* %alloca.cast, i8 addrspace(4)* %arg.cast, i32 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx
  %load = load i8, i8 addrspace(5)* %gep
  ret i8 %load
}
26
43
; Simple memcpy to alloca from byref constant address space argument.
27
44
define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca ([32 x i8 ] addrspace (4 )* noalias readonly align 4 byref([32 x i8 ]) %arg , i8 addrspace (1 )* %out , i32 %idx ) {
28
45
; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca(
@@ -87,9 +104,13 @@ define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(i8 addrspace(
87
104
; Alloca is written through a flat pointer
88
105
define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat ([31 x i8 ] addrspace (4 )* noalias readonly align 4 dereferenceable (32 ) %arg , i32 %idx ) {
89
106
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(
90
- ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
91
- ; CHECK-NEXT: [[GEP:%.*]] = getelementptr [31 x i8], [31 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]]
92
- ; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
107
+ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
108
+ ; CHECK-NEXT: [[ALLOCA_CAST:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 0
109
+ ; CHECK-NEXT: [[ALLOCA_CAST_ASC:%.*]] = addrspacecast i8 addrspace(5)* [[ALLOCA_CAST]] to i8*
110
+ ; CHECK-NEXT: [[ARG_CAST:%.*]] = getelementptr inbounds [31 x i8], [31 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 0
111
+ ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 1 dereferenceable(31) [[ALLOCA_CAST_ASC]], i8 addrspace(4)* align 4 dereferenceable(31) [[ARG_CAST]], i64 31, i1 false)
112
+ ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
113
+ ; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8 addrspace(5)* [[GEP]], align 1
93
114
; CHECK-NEXT: ret i8 [[LOAD]]
94
115
;
95
116
%alloca = alloca [32 x i8 ], align 4 , addrspace (5 )
@@ -125,9 +146,88 @@ define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2([32 x i8] ad
125
146
ret i8 %load
126
147
}
127
148
128
- declare void @llvm.memcpy.p5i8.p4i8.i64 (i8 addrspace (5 )* nocapture , i8 addrspace (4 )* nocapture , i64 , i1 ) #0
149
%struct.ty = type { [4 x i32] }

; Copy from a byref constant-address-space argument into an alloca, then out
; to %scratch. The CHECK lines show the alloca and the first memcpy are
; eliminated; the remaining memcpy reads the argument directly.
define amdgpu_kernel void @byref_infloop(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[CAST_ALLOCA:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p4i8.i32(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ALLOCA]], i32 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)*
  %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memcpy.p5i8.p4i8.i32(i8 addrspace(5)* align 4 %cast.alloca, i8 addrspace(4)* align 4 %cast.arg, i32 16, i1 false)
  call void @llvm.memcpy.p0i8.p5i8.i32(i8* align 4 %scratch, i8 addrspace(5)* align 4 %cast.alloca, i32 16, i1 false)
  ret void
}

; Same as @byref_infloop, but with !noalias metadata on the memcpys; the
; CHECK lines show the metadata of the surviving call is preserved.
define amdgpu_kernel void @byref_infloop_metadata(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_metadata(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[CAST_ALLOCA:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p4i8.i32(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ALLOCA]], i32 16, i1 false), !noalias !0
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)*
  %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memcpy.p5i8.p4i8.i32(i8 addrspace(5)* align 4 %cast.alloca, i8 addrspace(4)* align 4 %cast.arg, i32 16, i1 false), !noalias !0
  call void @llvm.memcpy.p0i8.p5i8.i32(i8* align 4 %scratch, i8 addrspace(5)* align 4 %cast.alloca, i32 16, i1 false), !noalias !1
  ret void
}

; Variant where the alloca is accessed through a flat (addrspacecast)
; pointer. The CHECK lines show the alloca is NOT eliminated here — the
; intermediate copy through the flat pointer remains.
define amdgpu_kernel void @byref_infloop_addrspacecast(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_addrspacecast(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4 x i32], align 4, addrspace(5)
; CHECK-NEXT:    [[CAST_ARG:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
; CHECK-NEXT:    [[CAST_ALLOCA:%.*]] = bitcast [4 x i32] addrspace(5)* [[ALLOCA]] to i8 addrspace(5)*
; CHECK-NEXT:    [[ADDRSPACECAST_ALLOCA:%.*]] = addrspacecast i8 addrspace(5)* [[CAST_ALLOCA]] to i8*
; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 4 dereferenceable(16) [[ADDRSPACECAST_ALLOCA]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ARG]], i64 16, i1 false)
; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8* nonnull align 4 dereferenceable(16) [[ADDRSPACECAST_ALLOCA]], i64 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)*
  %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
  %addrspacecast.alloca = addrspacecast i8 addrspace(5)* %cast.alloca to i8*
  call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 4 dereferenceable(16) %addrspacecast.alloca, i8 addrspace(4)* align 4 dereferenceable(16) %cast.arg, i64 16, i1 false)
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(16) %scratch, i8* nonnull align 4 dereferenceable(16) %addrspacecast.alloca, i64 16, i1 false)
  ret void
}

; Same as @byref_infloop, but using memmove instead of memcpy. The CHECK
; lines show the alloca and the first memmove are eliminated and the
; surviving call reads the argument directly.
define amdgpu_kernel void @byref_infloop_memmove(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_memmove(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[CAST_ALLOCA:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
; CHECK-NEXT:    call void @llvm.memmove.p0i8.p4i8.i32(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ALLOCA]], i32 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)*
  %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memmove.p5i8.p4i8.i32(i8 addrspace(5)* align 4 %cast.alloca, i8 addrspace(4)* align 4 %cast.arg, i32 16, i1 false)
  call void @llvm.memmove.p0i8.p5i8.i32(i8* align 4 %scratch, i8 addrspace(5)* align 4 %cast.alloca, i32 16, i1 false)
  ret void
}

; Declarations for the memcpy/memmove intrinsic variants used above.
declare void @llvm.memcpy.p0i8.p5i8.i32(i8* noalias nocapture writeonly, i8 addrspace(5)* noalias nocapture readonly, i32, i1 immarg) #0
declare void @llvm.memcpy.p5i8.p4i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i32, i1) #0
declare void @llvm.memcpy.p0i8.p4i8.i64(i8* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0
declare void @llvm.memmove.p5i8.p4i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i32, i1) #0
declare void @llvm.memmove.p0i8.p5i8.i32(i8* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0

declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #1

attributes #0 = { argmemonly nounwind willreturn }
attributes #1 = { nounwind readnone speculatable }

; Self-referential scope domains used by the !noalias metadata above.
!0 = !{!0}
!1 = !{!1}