|
| 1 | +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py |
| 2 | +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-aa-wrapper -amdgpu-aa -instcombine -o - %s | FileCheck %s |
| 3 | + |
| 4 | +; Make sure the optimization from memcpy-from-global.ll also happens when |
| 5 | +; the constant source is not a global variable. |
| 6 | + |
| 7 | +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" |
| 8 | + |
| 9 | +; Simple memcpy to alloca from a constant address space argument. The alloca and memcpy are expected to be removed, with the load reading directly from %arg. |
| 10 | +define i8 @memcpy_constant_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) { |
| 11 | +; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca( |
| 12 | +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64 |
| 13 | +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]] |
| 14 | +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1 |
| 15 | +; CHECK-NEXT: ret i8 [[LOAD]] |
| 16 | +; |
| 17 | + %alloca = alloca [32 x i8], align 4, addrspace(5) |
| 18 | + %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)* |
| 19 | + %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)* |
| 20 | + call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* %alloca.cast, i8 addrspace(4)* %arg.cast, i64 32, i1 false) |
| 21 | + %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx |
| 22 | + %load = load i8, i8 addrspace(5)* %gep |
| 23 | + ret i8 %load |
| 24 | +} |
| 25 | + |
| 26 | +; Simple memcpy to alloca from a constant address space pointer returned by an intrinsic call (llvm.amdgcn.kernarg.segment.ptr). The alloca and memcpy are expected to be removed, with the load reading directly from the intrinsic's result. |
| 27 | +define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(i8 addrspace(1)* %out, i32 %idx) { |
| 28 | +; CHECK-LABEL: @memcpy_constant_intrinsic_ptr_to_alloca( |
| 29 | +; CHECK-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() |
| 30 | +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64 |
| 31 | +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERNARG_SEGMENT_PTR]], i64 [[TMP1]] |
| 32 | +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1 |
| 33 | +; CHECK-NEXT: store i8 [[LOAD]], i8 addrspace(1)* [[OUT:%.*]], align 1 |
| 34 | +; CHECK-NEXT: ret void |
| 35 | +; |
| 36 | + %alloca = alloca [32 x i8], align 4, addrspace(5) |
| 37 | + %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)* |
| 38 | + %kernarg.segment.ptr = call dereferenceable(32) align 16 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() |
| 39 | + call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* %alloca.cast, i8 addrspace(4)* %kernarg.segment.ptr, i64 32, i1 false) |
| 40 | + %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx |
| 41 | + %load = load i8, i8 addrspace(5)* %gep |
| 42 | + store i8 %load, i8 addrspace(1)* %out |
| 43 | + ret void |
| 44 | +} |
| 45 | + |
| 46 | +; Alloca is written through a flat (addrspacecast) pointer but read through the original addrspace(5) pointer. The alloca and memcpy are still expected to be removed, with the load reading directly from %arg. |
| 47 | +define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) { |
| 48 | +; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat( |
| 49 | +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64 |
| 50 | +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]] |
| 51 | +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1 |
| 52 | +; CHECK-NEXT: ret i8 [[LOAD]] |
| 53 | +; |
| 54 | + %alloca = alloca [32 x i8], align 4, addrspace(5) |
| 55 | + %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)* |
| 56 | + %alloca.cast.asc = addrspacecast i8 addrspace(5)* %alloca.cast to i8* |
| 57 | + %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)* |
| 58 | + call void @llvm.memcpy.p0i8.p4i8.i64(i8* %alloca.cast.asc, i8 addrspace(4)* %arg.cast, i64 32, i1 false) |
| 59 | + %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx |
| 60 | + %load = load i8, i8 addrspace(5)* %gep |
| 61 | + ret i8 %load |
| 62 | +} |
| 63 | + |
| 64 | +; Alloca is only addressed through a flat pointer: both the memcpy and the load use the addrspacecast result. The CHECK lines below expect the alloca and memcpy to be retained in this case. |
| 65 | +define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) { |
| 66 | +; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2( |
| 67 | +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5) |
| 68 | +; CHECK-NEXT: [[ALLOCA_CAST1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 0 |
| 69 | +; CHECK-NEXT: [[ALLOCA_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[ALLOCA_CAST1]] to i8* |
| 70 | +; CHECK-NEXT: [[ARG_CAST:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 0 |
| 71 | +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 1 dereferenceable(32) [[ALLOCA_CAST]], i8 addrspace(4)* align 4 dereferenceable(32) [[ARG_CAST]], i64 32, i1 false) |
| 72 | +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[IDX:%.*]] |
| 73 | +; CHECK-NEXT: [[GEP:%.*]] = addrspacecast i8 addrspace(5)* [[GEP2]] to i8* |
| 74 | +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8* [[GEP]], align 1 |
| 75 | +; CHECK-NEXT: ret i8 [[LOAD]] |
| 76 | +; |
| 77 | + %alloca = alloca [32 x i8], align 4, addrspace(5) |
| 78 | + %alloca.cast.asc = addrspacecast [32 x i8] addrspace(5)* %alloca to [32 x i8]* |
| 79 | + %alloca.cast = bitcast [32 x i8]* %alloca.cast.asc to i8* |
| 80 | + %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)* |
| 81 | + call void @llvm.memcpy.p0i8.p4i8.i64(i8* %alloca.cast, i8 addrspace(4)* %arg.cast, i64 32, i1 false) |
| 82 | + %gep = getelementptr inbounds [32 x i8], [32 x i8]* %alloca.cast.asc, i32 0, i32 %idx |
| 83 | + %load = load i8, i8* %gep |
| 84 | + ret i8 %load |
| 85 | +} |
| 86 | + |
| 87 | +declare void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0 |
| 88 | +declare void @llvm.memcpy.p0i8.p4i8.i64(i8* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0 |
| 89 | +declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #1 |
| 90 | + |
| 91 | +attributes #0 = { argmemonly nounwind willreturn } |
| 92 | +attributes #1 = { nounwind readnone speculatable } |
0 commit comments