
Commit 96e94b5

[AMDGPU] Add a new amdgcn.load.to.lds intrinsic
This PR adds an amdgcn.load.to.lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the same API and "gather from a pointer to LDS" is something of an abstract operation. This commit adds the intrinsic and its lowerings for address spaces 1 and 7, and updates the MLIR wrappers to use it (loosening up the restrictions on loads to LDS along the way to match the ground truth from target features). It also plumbs the intrinsic through to clang.
1 parent 77f8335 commit 96e94b5
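For orientation, here is a minimal LLVM IR sketch of how the new intrinsic is called, distilled from the tests added in this commit (the function and value names are illustrative, not part of the commit): the intrinsic is overloaded on the source pointer's address space, so global pointers (`.p1`) and buffer fat pointers (`.p7`) go through the same operation.

declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1), ptr addrspace(3), i32, i32, i32)
declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7), ptr addrspace(3), i32, i32, i32)

define amdgpu_ps void @copy_to_lds_sketch(ptr addrspace(1) %gptr, ptr addrspace(7) inreg %fatptr, ptr addrspace(3) inreg %lds) {
  ; Gather 12 bytes from a global pointer into LDS, immediate offset 16, aux = 1 (sc0).
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lds, i32 12, i32 16, i32 1)
  ; The same operation through a buffer fat pointer; this form is rewritten to buffer
  ; intrinsics by AMDGPULowerBufferFatPointers (see that file's diff below).
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %fatptr, ptr addrspace(3) %lds, i32 12, i32 16, i32 1)
  ret void
}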

File tree: 21 files changed (+598, -53 lines)


clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
@@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
 TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
 TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
 TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")

 //===----------------------------------------------------------------------===//

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 4 additions & 0 deletions
@@ -564,6 +564,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
     return Builder.CreateCall(F, {Addr});
   }
+  case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
+    return emitBuiltinWithOneOverloadedType<5>(*this, E,
+                                               Intrinsic::amdgcn_load_to_lds);
+  }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
     Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                    {llvm::Type::getInt64Ty(getLLVMContext())});

clang/lib/Sema/SemaAMDGPU.cpp

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,

   switch (BuiltinID) {
   case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
+  case AMDGPU::BI__builtin_amdgcn_load_to_lds:
   case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
     constexpr const int SizeIdx = 2;
     llvm::APSInt Size;

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 30 additions & 0 deletions
@@ -1766,6 +1766,36 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
   *out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
 }

+// CHECK-LABEL: @test_load_to_lds_96(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_96(global void* src, local void *dst) {
+  __builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_128(global void* src, local void *dst) {
+  __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
+}
+
 // CHECK-LABEL: @test_global_load_lds_96(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+// CHECK-LABEL: @test_load_to_lds_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u32(global u32* src, local u32 *dst) {
+  __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u16(global u16* src, local u16 *dst) {
+  __builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u8(global u8* src, local u8 *dst) {
+  __builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
+}

llvm/docs/ReleaseNotes.md

Lines changed: 8 additions & 0 deletions
@@ -102,6 +102,14 @@ Changes to the AMDGPU Backend

 * Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6.

+* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds
+  intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers
+  (`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be
+  represented in the IR without needing to use buffer resource intrinsics directly.
+  This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though
+  buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is
+  optional, and there are no plans to deprecate `amdgcn.global.load.lds`.
+
 Changes to the ARM Backend
 --------------------------

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 21 additions & 0 deletions
@@ -2641,6 +2641,27 @@ def int_amdgcn_perm :
 // GFX9 Intrinsics
 //===----------------------------------------------------------------------===//

+/// This is a general-purpose intrinsic for all operations that take a pointer,
+/// a base location in LDS, and a data size, and use them to perform a gather to LDS.
+/// This allows abstracting over both global pointers (address space 1) and
+/// the buffer-resource-wrapper pointers (address spaces 7 and 9).
+/// TODO: add support for address space 5 and scratch_load_lds.
+class AMDGPULoadToLDS :
+  ClangBuiltin<"__builtin_amdgcn_load_to_lds">,
+  Intrinsic <
+    [],
+    [llvm_anyptr_ty,          // Base pointer to load from. Varies per lane.
+     LLVMQualPointerType<3>,  // LDS base pointer to store to. Must be wave-uniform.
+     llvm_i32_ty,             // Data byte size: 1/2/4 (/12/16 for gfx950)
+     llvm_i32_ty,             // imm offset (applied to both input and LDS address)
+     llvm_i32_ty],            // auxiliary data (imm, cachepolicy (bit 0 = sc0,
+                              //                                   bit 1 = sc1,
+                              //                                   bit 4 = scc))
+    [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
+    "", [SDNPMemOperand]>;
+def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
+
 class AMDGPUGlobalLoadLDS :
   ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
   Intrinsic <
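Reading the operand list above against a concrete call taken from the GFX950 test later in this commit (the per-operand annotations are an editorial gloss, not part of the TableGen file):

; arg 0 (%gptr): per-lane source pointer; ptr addrspace(1) here, the overload also accepts addrspace(7)
; arg 1 (%lptr): wave-uniform LDS destination, ptr addrspace(3)
; arg 2 (i32 12): byte size, dwordx3; only 1/2/4 are available without gfx950's wider LDS loads
; arg 3 (i32 16): immediate offset applied to both the source and LDS addresses
; arg 4 (i32 1):  aux/cachepolicy bits, here bit 0 = sc0
call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)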

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 0 deletions
@@ -2335,6 +2335,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
   case Intrinsic::amdgcn_struct_buffer_load_lds:
   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
     return selectBufferLoadLds(I);
+  // Until we can store both the address space of the global and the LDS
+  // arguments by having two MachineMemOperands on an intrinsic, we just trust
+  // that the argument is a global pointer (buffer pointers have been handled by
+  // an LLVM IR-level lowering).
+  case Intrinsic::amdgcn_load_to_lds:
   case Intrinsic::amdgcn_global_load_lds:
     return selectGlobalLoadLds(I);
   case Intrinsic::amdgcn_exp_compr:

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 20 additions & 0 deletions
@@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
   case Intrinsic::memset:
   case Intrinsic::memset_inline:
   case Intrinsic::experimental_memset_pattern:
+  case Intrinsic::amdgcn_load_to_lds:
     return true;
   }
 }
@@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
     SplitUsers.insert(&I);
     return {NewRsrc, Off};
   }
+  case Intrinsic::amdgcn_load_to_lds: {
+    Value *Ptr = I.getArgOperand(0);
+    if (!isSplitFatPtr(Ptr->getType()))
+      return {nullptr, nullptr};
+    IRB.SetInsertPoint(&I);
+    auto [Rsrc, Off] = getPtrParts(Ptr);
+    Value *LDSPtr = I.getArgOperand(1);
+    Value *LoadSize = I.getArgOperand(2);
+    Value *ImmOff = I.getArgOperand(3);
+    Value *Aux = I.getArgOperand(4);
+    Value *SOffset = IRB.getInt32(0);
+    Instruction *NewLoad = IRB.CreateIntrinsic(
+        Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
+        {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
+    copyMetadata(NewLoad, &I);
+    SplitUsers.insert(&I);
+    I.replaceAllUsesWith(NewLoad);
+    return {nullptr, nullptr};
+  }
   }
   return {nullptr, nullptr};
 }
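A hedged IR-level sketch of the effect of this new case (the `%p.rsrc` and `%p.off` names are illustrative, not literal pass output): the pass has already split the buffer fat pointer into a resource part and a 32-bit offset, and the code above forwards those pieces to the raw-buffer form of the load-to-LDS intrinsic with a zero soffset.

;; Before the pass, on a buffer fat pointer:
;   call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %p, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
;; After splitting %p into resource %p.rsrc and offset %p.off, the call becomes:
;   call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %p.rsrc, ptr addrspace(3) %lds,
;                                                  i32 4,      ; size
;                                                  i32 %p.off, ; voffset (from the fat pointer)
;                                                  i32 0,      ; soffset
;                                                  i32 0,      ; imm offset
;                                                  i32 0)      ; aux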

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
@@ -3312,6 +3312,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     constrainOpWithReadfirstlane(B, MI, 6); // soffset
     return;
   }
+  case Intrinsic::amdgcn_load_to_lds:
   case Intrinsic::amdgcn_global_load_lds: {
     applyDefaultMapping(OpdMapper);
     constrainOpWithReadfirstlane(B, MI, 2);
@@ -5273,6 +5274,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
     break;
   }
+  case Intrinsic::amdgcn_load_to_lds:
   case Intrinsic::amdgcn_global_load_lds: {
     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 1 deletion
@@ -1450,6 +1450,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.flags |= MachineMemOperand::MOStore;
     return true;
   }
+  case Intrinsic::amdgcn_load_to_lds:
   case Intrinsic::amdgcn_global_load_lds: {
     Info.opc = ISD::INTRINSIC_VOID;
     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
@@ -1531,6 +1532,7 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
   case Intrinsic::amdgcn_global_load_tr_b128:
     Ptr = II->getArgOperand(0);
     break;
+  case Intrinsic::amdgcn_load_to_lds:
   case Intrinsic::amdgcn_global_load_lds:
     Ptr = II->getArgOperand(1);
     break;
@@ -10219,6 +10221,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

     return SDValue(Load, 0);
   }
+  // Buffers are handled by LowerBufferFatPointers, and we're going to go
+  // for "trust me" that the remaining cases are global pointers until
+  // such time as we can put two mem operands on an intrinsic.
+  case Intrinsic::amdgcn_load_to_lds:
   case Intrinsic::amdgcn_global_load_lds: {
     if (!Subtarget->hasVMemToLDSLoad())
       return SDValue();
@@ -10249,7 +10255,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       break;
     }

-    auto *M = cast<MemSDNode>(Op);
     SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));

     SmallVector<SDValue, 6> Ops;
@@ -10289,6 +10294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     Ops.push_back(M0Val.getValue(0)); // Chain
     Ops.push_back(M0Val.getValue(1)); // Glue

+    auto *M = cast<MemSDNode>(Op);
     MachineMemOperand *LoadMMO = M->getMemOperand();
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
     LoadPtrI.Offset = Op->getConstantOperandVal(5);
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
+
+; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.load.to.lds
+
+; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.load.to.lds),
+
+;; Note: this is a bare-bones test to make sure that amdgcn.load.to.lds lowers to
+;; the correct intrinsic.
+
+declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+
+;---------------------------------------------------------------------
+; dwordx3
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx3_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: global_load_lds_dwordx3_vaddr_saddr:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    global_load_lds_dwordx3 v[0:1], off offset:16 sc0
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX950-NEXT:    s_mov_b32 m0, s5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
+; GFX950-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
+  ret void
+}
+
+;---------------------------------------------------------------------
+; dwordx4
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx4_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: global_load_lds_dwordx4_vaddr_saddr:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    global_load_lds_dwordx4 v[0:1], off offset:16 sc0
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_vaddr_saddr:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX950-NEXT:    s_mov_b32 m0, s5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 offen offset:16 sc0 lds
+; GFX950-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX950-GISEL: {{.*}}
+; GFX950-SDAG: {{.*}}
