Skip to content

Commit 5afd162

Browse files
committed
Add HIP tests, plumb the intrinsic through infer-address-spaces
1 parent da53e5e commit 5afd162

File tree

4 files changed

+84
-1
lines changed

4 files changed

+84
-1
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
4+
5+
// COM: Most tests are in the OpenCL semastics, this is just a verification for HIP
6+
7+
#define __device__ __attribute__((device))
8+
#define __shared__ __attribute__((shared))
9+
10+
typedef unsigned int u32;
11+
12+
// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_u32PjS_(
13+
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
14+
// CHECK-NEXT: [[ENTRY:.*:]]
15+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
16+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
17+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
18+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
19+
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
20+
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
21+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
22+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
23+
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
24+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 4, i32 0, i32 0)
25+
// CHECK-NEXT: ret void
26+
//
27+
__device__ void test_load_to_lds_u32(u32* src, __shared__ u32 *dst) {
28+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
29+
}
30+
31+
// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_128PvS_(
32+
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0]] {
33+
// CHECK-NEXT: [[ENTRY:.*:]]
34+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
35+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
36+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
37+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
38+
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
39+
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
40+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
41+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
42+
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
43+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 16, i32 0, i32 0)
44+
// CHECK-NEXT: ret void
45+
//
46+
__device__ void test_load_to_lds_128(void* src, __shared__ void *dst) {
47+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
48+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2656,7 +2656,9 @@ class AMDGPULoadToLDS :
26562656
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
26572657
// bit 1 = sc1,
26582658
// bit 4 = scc))
2659-
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
2659+
[IntrWillReturn, IntrArgMemOnly,
2660+
NoCapture<ArgIndex<0>>, ReadOnly<ArgIndex<0>>,
2661+
NoCapture<ArgIndex<1>>, WriteOnly<ArgIndex<1>>,
26602662
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
26612663
"", [SDNPMemOperand]>;
26622664
def int_amdgcn_load_to_lds : AMDGPULoadToLDS;

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1043,6 +1043,7 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
10431043
case Intrinsic::amdgcn_is_private:
10441044
case Intrinsic::amdgcn_flat_atomic_fmax_num:
10451045
case Intrinsic::amdgcn_flat_atomic_fmin_num:
1046+
case Intrinsic::amdgcn_load_to_lds:
10461047
OpIndexes.push_back(0);
10471048
return true;
10481049
default:
@@ -1114,6 +1115,15 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
11141115
II->setCalledFunction(NewDecl);
11151116
return II;
11161117
}
1118+
case Intrinsic::amdgcn_load_to_lds: {
1119+
Type *SrcTy = NewV->getType();
1120+
Module *M = II->getModule();
1121+
Function *NewDecl =
1122+
Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1123+
II->setArgOperand(0, NewV);
1124+
II->setCalledFunction(NewDecl);
1125+
return II;
1126+
}
11171127
default:
11181128
return nullptr;
11191129
}

llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,11 +170,34 @@ define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(ptr %dest
170170
ret void
171171
}
172172

173+
define amdgpu_kernel void @load_to_lds_global_as_flat(ptr addrspace(1) %global.ptr, ptr addrspace(3) %group.ptr) #0 {
174+
; CHECK-LABEL: define amdgpu_kernel void @load_to_lds_global_as_flat(
175+
; CHECK-SAME: ptr addrspace(1) [[GLOBAL_PTR:%.*]], ptr addrspace(3) [[GROUP_PTR:%.*]]) #[[ATTR0]] {
176+
; CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[GLOBAL_PTR]], ptr addrspace(3) [[GROUP_PTR]], i32 4, i32 0, i32 0)
177+
; CHECK-NEXT: ret void
178+
;
179+
%cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
180+
call void @llvm.amdgcn.load.to.lds.p0(ptr %cast, ptr addrspace(3) %group.ptr, i32 4, i32 0, i32 0)
181+
ret void
182+
}
183+
184+
define amdgpu_kernel void @load_to_lds_fat_pointer_as_flat(ptr addrspace(7) %buffer.fat.ptr, ptr addrspace(3) %group.ptr) #0 {
185+
; CHECK-LABEL: define amdgpu_kernel void @load_to_lds_fat_pointer_as_flat(
186+
; CHECK-SAME: ptr addrspace(7) [[BUFFER_FAT_PTR:%.*]], ptr addrspace(3) [[GROUP_PTR:%.*]]) #[[ATTR0]] {
187+
; CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) [[BUFFER_FAT_PTR]], ptr addrspace(3) [[GROUP_PTR]], i32 4, i32 0, i32 0)
188+
; CHECK-NEXT: ret void
189+
;
190+
%cast = addrspacecast ptr addrspace(7) %buffer.fat.ptr to ptr
191+
call void @llvm.amdgcn.load.to.lds.p0(ptr %cast, ptr addrspace(3) %group.ptr, i32 4, i32 0, i32 0)
192+
ret void
193+
}
194+
173195
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) #1
174196
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #1
175197
declare void @llvm.memcpy.inline.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #1
176198
declare void @llvm.memcpy.p0.p3.i32(ptr nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1) #1
177199
declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #1
200+
declare void @llvm.amdgcn.load.to.lds.p0(ptr nocapture readonly, ptr addrspace(3) nocapture writeonly, i32 immarg, i32 immarg, i32 immarg) #1
178201

179202
attributes #0 = { nounwind }
180203
attributes #1 = { argmemonly nounwind }

0 commit comments

Comments
 (0)